From 8ddc7d9cd0a00062247c732b96386ec2462bdbc7 Mon Sep 17 00:00:00 2001
From: Jan Schmidt <list.btrfs@jan-o-sch.net>
Date: Mon, 13 Jun 2011 20:02:58 +0200
Subject: btrfs: add mirror_num to extent_read_full_page

Currently, extent_read_full_page always assumes we are trying to read mirror
0, which generally is the best we can do. To add flexibility, pass it as a
parameter. This will be needed by scrub fixup code.

Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
---
 fs/btrfs/extent_io.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/btrfs/extent_io.h')

diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 7b2f0c3e7929..a9dd994bf826 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -185,7 +185,7 @@ int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
 int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
 		    gfp_t mask);
 int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
-			  get_extent_t *get_extent);
+			  get_extent_t *get_extent, int mirror_num);
 int __init extent_io_init(void);
 void extent_io_exit(void);
 
-- 
cgit v1.2.3


From 0ef8e45158f97dde4801b535e25f70f7caf01a27 Mon Sep 17 00:00:00 2001
From: Jan Schmidt <list.btrfs@jan-o-sch.net>
Date: Mon, 13 Jun 2011 20:04:15 +0200
Subject: btrfs scrub: add fixup code for errors on nodatasum files

This removes a FIXME comment and introduces the first part of nodatasum
fixup: It gets the corresponding inode for a logical address and triggers a
regular readpage for the corrupted sector.

Once we have on-the-fly error correction our error will be automatically
corrected. The correction code is expected to clear the newly introduced
EXTENT_DAMAGED flag, making scrub report that error as "corrected" instead
of "uncorrectable" eventually.

Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
---
 fs/btrfs/extent_io.h |   1 +
 fs/btrfs/scrub.c     | 188 +++++++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 183 insertions(+), 6 deletions(-)

(limited to 'fs/btrfs/extent_io.h')

diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index a9dd994bf826..435d454b9926 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -17,6 +17,7 @@
 #define EXTENT_NODATASUM (1 << 10)
 #define EXTENT_DO_ACCOUNTING (1 << 11)
 #define EXTENT_FIRST_DELALLOC (1 << 12)
+#define EXTENT_DAMAGED (1 << 13)
 #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
 #define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
 
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 41a01147b959..db09f01c0e4f 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -22,6 +22,7 @@
 #include "volumes.h"
 #include "disk-io.h"
 #include "ordered-data.h"
+#include "transaction.h"
 #include "backref.h"
 
 /*
@@ -89,6 +90,7 @@ struct scrub_dev {
 	int			first_free;
 	int			curr;
 	atomic_t		in_flight;
+	atomic_t		fixup_cnt;
 	spinlock_t		list_lock;
 	wait_queue_head_t	list_wait;
 	u16			csum_size;
@@ -102,6 +104,14 @@ struct scrub_dev {
 	spinlock_t		stat_lock;
 };
 
+struct scrub_fixup_nodatasum {
+	struct scrub_dev	*sdev;
+	u64			logical;
+	struct btrfs_root	*root;
+	struct btrfs_work	work;
+	int			mirror_num;
+};
+
 struct scrub_warning {
 	struct btrfs_path	*path;
 	u64			extent_item_size;
@@ -190,12 +200,13 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev)
 
 		if (i != SCRUB_BIOS_PER_DEV-1)
 			sdev->bios[i]->next_free = i + 1;
-		 else
+		else
 			sdev->bios[i]->next_free = -1;
 	}
 	sdev->first_free = 0;
 	sdev->curr = -1;
 	atomic_set(&sdev->in_flight, 0);
+	atomic_set(&sdev->fixup_cnt, 0);
 	atomic_set(&sdev->cancel_req, 0);
 	sdev->csum_size = btrfs_super_csum_size(&fs_info->super_copy);
 	INIT_LIST_HEAD(&sdev->csum_list);
@@ -347,6 +358,151 @@ out:
 	kfree(swarn.msg_buf);
 }
 
+static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx)
+{
+	struct page *page;
+	unsigned long index;
+	struct scrub_fixup_nodatasum *fixup = ctx;
+	int ret;
+	int corrected;
+	struct btrfs_key key;
+	struct inode *inode;
+	u64 end = offset + PAGE_SIZE - 1;
+	struct btrfs_root *local_root;
+
+	key.objectid = root;
+	key.type = BTRFS_ROOT_ITEM_KEY;
+	key.offset = (u64)-1;
+	local_root = btrfs_read_fs_root_no_name(fixup->root->fs_info, &key);
+	if (IS_ERR(local_root))
+		return PTR_ERR(local_root);
+
+	key.type = BTRFS_INODE_ITEM_KEY;
+	key.objectid = inum;
+	key.offset = 0;
+	inode = btrfs_iget(fixup->root->fs_info->sb, &key, local_root, NULL);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+
+	ret = set_extent_bit(&BTRFS_I(inode)->io_tree, offset, end,
+				EXTENT_DAMAGED, 0, NULL, NULL, GFP_NOFS);
+
+	/* set_extent_bit should either succeed or give proper error */
+	WARN_ON(ret > 0);
+	if (ret)
+		return ret < 0 ? ret : -EFAULT;
+
+	index = offset >> PAGE_CACHE_SHIFT;
+
+	page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
+	if (!page)
+		return -ENOMEM;
+
+	ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page,
+					btrfs_get_extent, fixup->mirror_num);
+	wait_on_page_locked(page);
+	corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset, end,
+					EXTENT_DAMAGED, 0, NULL);
+
+	if (corrected)
+		WARN_ON(!PageUptodate(page));
+	else
+		clear_extent_bit(&BTRFS_I(inode)->io_tree, offset, end,
+					EXTENT_DAMAGED, 0, 0, NULL, GFP_NOFS);
+
+	put_page(page);
+	iput(inode);
+
+	if (ret < 0)
+		return ret;
+
+	if (ret == 0 && corrected) {
+		/*
+		 * we only need to call readpage for one of the inodes belonging
+		 * to this extent. so make iterate_extent_inodes stop
+		 */
+		return 1;
+	}
+
+	return -EIO;
+}
+
+static void scrub_fixup_nodatasum(struct btrfs_work *work)
+{
+	int ret;
+	struct scrub_fixup_nodatasum *fixup;
+	struct scrub_dev *sdev;
+	struct btrfs_trans_handle *trans = NULL;
+	struct btrfs_fs_info *fs_info;
+	struct btrfs_path *path;
+	int uncorrectable = 0;
+
+	fixup = container_of(work, struct scrub_fixup_nodatasum, work);
+	sdev = fixup->sdev;
+	fs_info = fixup->root->fs_info;
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		spin_lock(&sdev->stat_lock);
+		++sdev->stat.malloc_errors;
+		spin_unlock(&sdev->stat_lock);
+		uncorrectable = 1;
+		goto out;
+	}
+
+	trans = btrfs_join_transaction(fixup->root);
+	if (IS_ERR(trans)) {
+		uncorrectable = 1;
+		goto out;
+	}
+
+	/*
+	 * the idea is to trigger a regular read through the standard path. we
+	 * read a page from the (failed) logical address by specifying the
+	 * corresponding copynum of the failed sector. thus, that readpage is
+	 * expected to fail.
+	 * that is the point where on-the-fly error correction will kick in
+	 * (once it's finished) and rewrite the failed sector if a good copy
+	 * can be found.
+	 */
+	ret = iterate_inodes_from_logical(fixup->logical, fixup->root->fs_info,
+						path, scrub_fixup_readpage,
+						fixup);
+	if (ret < 0) {
+		uncorrectable = 1;
+		goto out;
+	}
+	WARN_ON(ret != 1);
+
+	spin_lock(&sdev->stat_lock);
+	++sdev->stat.corrected_errors;
+	spin_unlock(&sdev->stat_lock);
+
+out:
+	if (trans && !IS_ERR(trans))
+		btrfs_end_transaction(trans, fixup->root);
+	if (uncorrectable) {
+		spin_lock(&sdev->stat_lock);
+		++sdev->stat.uncorrectable_errors;
+		spin_unlock(&sdev->stat_lock);
+		printk_ratelimited(KERN_ERR "btrfs: unable to fixup "
+					"(nodatasum) error at logical %llu\n",
+					fixup->logical);
+	}
+
+	btrfs_free_path(path);
+	kfree(fixup);
+
+	/* see caller why we're pretending to be paused in the scrub counters */
+	mutex_lock(&fs_info->scrub_lock);
+	atomic_dec(&fs_info->scrubs_running);
+	atomic_dec(&fs_info->scrubs_paused);
+	mutex_unlock(&fs_info->scrub_lock);
+	atomic_dec(&sdev->fixup_cnt);
+	wake_up(&fs_info->scrub_pause_wait);
+	wake_up(&sdev->list_wait);
+}
+
 /*
  * scrub_recheck_error gets called when either verification of the page
  * failed or the bio failed to read, e.g. with EIO. In the latter case,
@@ -417,6 +573,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
 	struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
 	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
 	struct btrfs_multi_bio *multi = NULL;
+	struct scrub_fixup_nodatasum *fixup;
 	u64 logical = sbio->logical + ix * PAGE_SIZE;
 	u64 length;
 	int i;
@@ -425,12 +582,30 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
 
 	if ((sbio->spag[ix].flags & BTRFS_EXTENT_FLAG_DATA) &&
 	    (sbio->spag[ix].have_csum == 0)) {
+		fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
+		if (!fixup)
+			goto uncorrectable;
+		fixup->sdev = sdev;
+		fixup->logical = logical;
+		fixup->root = fs_info->extent_root;
+		fixup->mirror_num = sbio->spag[ix].mirror_num;
 		/*
-		 * nodatasum, don't try to fix anything
-		 * FIXME: we can do better, open the inode and trigger a
-		 * writeback
+		 * increment scrubs_running to prevent cancel requests from
+		 * completing as long as a fixup worker is running. we must also
+		 * increment scrubs_paused to prevent deadlocking on pause
+		 * requests used for transactions commits (as the worker uses a
+		 * transaction context). it is safe to regard the fixup worker
+		 * as paused for all matters practical. effectively, we only
+		 * avoid cancellation requests from completing.
 		 */
-		goto uncorrectable;
+		mutex_lock(&fs_info->scrub_lock);
+		atomic_inc(&fs_info->scrubs_running);
+		atomic_inc(&fs_info->scrubs_paused);
+		mutex_unlock(&fs_info->scrub_lock);
+		atomic_inc(&sdev->fixup_cnt);
+		fixup->work.func = scrub_fixup_nodatasum;
+		btrfs_queue_worker(&fs_info->scrub_workers, &fixup->work);
+		return;
 	}
 
 	length = PAGE_SIZE;
@@ -1425,10 +1600,11 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
 		ret = scrub_enumerate_chunks(sdev, start, end);
 
 	wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0);
-
 	atomic_dec(&fs_info->scrubs_running);
 	wake_up(&fs_info->scrub_pause_wait);
 
+	wait_event(sdev->list_wait, atomic_read(&sdev->fixup_cnt) == 0);
+
 	if (progress)
 		memcpy(progress, &sdev->stat, sizeof(*progress));
 
-- 
cgit v1.2.3


From 4a54c8c165b66300830a67349fc7595e3fc442f7 Mon Sep 17 00:00:00 2001
From: Jan Schmidt <list.btrfs@jan-o-sch.net>
Date: Fri, 22 Jul 2011 15:41:52 +0200
Subject: btrfs: Moved repair code from inode.c to extent_io.c

The raid-retry code in inode.c can be generalized so that it works for
metadata as well. Thus, this patch moves it to extent_io.c and makes the
raid-retry code a raid-repair code.

Repair works that way: Whenever a read error occurs and we have more
mirrors to try, note the failed mirror, and retry another. If we find a
good one, check if we did note a failure earlier and if so, do not allow
the read to complete until after the bad sector was written with the good
data we just fetched. As we have the extent locked while reading, no one
can change the data in between.

Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
---
 fs/btrfs/extent_io.c | 387 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/btrfs/extent_io.h |  10 +-
 fs/btrfs/inode.c     | 155 +--------------------
 3 files changed, 393 insertions(+), 159 deletions(-)

(limited to 'fs/btrfs/extent_io.h')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index afebb95e3490..624ef10d36cc 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -17,6 +17,7 @@
 #include "compat.h"
 #include "ctree.h"
 #include "btrfs_inode.h"
+#include "volumes.h"
 
 static struct kmem_cache *extent_state_cache;
 static struct kmem_cache *extent_buffer_cache;
@@ -1599,6 +1600,368 @@ static int check_page_writeback(struct extent_io_tree *tree,
 	return 0;
 }
 
+/*
+ * When IO fails, either with EIO or csum verification fails, we
+ * try other mirrors that might have a good copy of the data.  This
+ * io_failure_record is used to record state as we go through all the
+ * mirrors.  If another mirror has good data, the page is set up to date
+ * and things continue.  If a good mirror can't be found, the original
+ * bio end_io callback is called to indicate things have failed.
+ */
+struct io_failure_record {
+	struct page *page;
+	u64 start;
+	u64 len;
+	u64 logical;
+	unsigned long bio_flags;
+	int this_mirror;
+	int failed_mirror;
+	int in_validation;
+};
+
+static int free_io_failure(struct inode *inode, struct io_failure_record *rec,
+				int did_repair)
+{
+	int ret;
+	int err = 0;
+	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
+
+	set_state_private(failure_tree, rec->start, 0);
+	ret = clear_extent_bits(failure_tree, rec->start,
+				rec->start + rec->len - 1,
+				EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
+	if (ret)
+		err = ret;
+
+	if (did_repair) {
+		ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start,
+					rec->start + rec->len - 1,
+					EXTENT_DAMAGED, GFP_NOFS);
+		if (ret && !err)
+			err = ret;
+	}
+
+	kfree(rec);
+	return err;
+}
+
+static void repair_io_failure_callback(struct bio *bio, int err)
+{
+	complete(bio->bi_private);
+}
+
+/*
+ * this bypasses the standard btrfs submit functions deliberately, as
+ * the standard behavior is to write all copies in a raid setup. here we only
+ * want to write the one bad copy. so we do the mapping for ourselves and issue
+ * submit_bio directly.
+ * to avoid any synchonization issues, wait for the data after writing, which
+ * actually prevents the read that triggered the error from finishing.
+ * currently, there can be no more than two copies of every data bit. thus,
+ * exactly one rewrite is required.
+ */
+int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
+			u64 length, u64 logical, struct page *page,
+			int mirror_num)
+{
+	struct bio *bio;
+	struct btrfs_device *dev;
+	DECLARE_COMPLETION_ONSTACK(compl);
+	u64 map_length = 0;
+	u64 sector;
+	struct btrfs_bio *bbio = NULL;
+	int ret;
+
+	BUG_ON(!mirror_num);
+
+	bio = bio_alloc(GFP_NOFS, 1);
+	if (!bio)
+		return -EIO;
+	bio->bi_private = &compl;
+	bio->bi_end_io = repair_io_failure_callback;
+	bio->bi_size = 0;
+	map_length = length;
+
+	ret = btrfs_map_block(map_tree, WRITE, logical,
+			      &map_length, &bbio, mirror_num);
+	if (ret) {
+		bio_put(bio);
+		return -EIO;
+	}
+	BUG_ON(mirror_num != bbio->mirror_num);
+	sector = bbio->stripes[mirror_num-1].physical >> 9;
+	bio->bi_sector = sector;
+	dev = bbio->stripes[mirror_num-1].dev;
+	kfree(bbio);
+	if (!dev || !dev->bdev || !dev->writeable) {
+		bio_put(bio);
+		return -EIO;
+	}
+	bio->bi_bdev = dev->bdev;
+	bio_add_page(bio, page, length, start-page_offset(page));
+	submit_bio(WRITE_SYNC, bio);
+	wait_for_completion(&compl);
+
+	if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
+		/* try to remap that extent elsewhere? */
+		bio_put(bio);
+		return -EIO;
+	}
+
+	printk(KERN_INFO "btrfs read error corrected: ino %lu off %llu (dev %s "
+			"sector %llu)\n", page->mapping->host->i_ino, start,
+			dev->name, sector);
+
+	bio_put(bio);
+	return 0;
+}
+
+/*
+ * each time an IO finishes, we do a fast check in the IO failure tree
+ * to see if we need to process or clean up an io_failure_record
+ */
+static int clean_io_failure(u64 start, struct page *page)
+{
+	u64 private;
+	u64 private_failure;
+	struct io_failure_record *failrec;
+	struct btrfs_mapping_tree *map_tree;
+	struct extent_state *state;
+	int num_copies;
+	int did_repair = 0;
+	int ret;
+	struct inode *inode = page->mapping->host;
+
+	private = 0;
+	ret = count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
+				(u64)-1, 1, EXTENT_DIRTY, 0);
+	if (!ret)
+		return 0;
+
+	ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, start,
+				&private_failure);
+	if (ret)
+		return 0;
+
+	failrec = (struct io_failure_record *)(unsigned long) private_failure;
+	BUG_ON(!failrec->this_mirror);
+
+	if (failrec->in_validation) {
+		/* there was no real error, just free the record */
+		pr_debug("clean_io_failure: freeing dummy error at %llu\n",
+			 failrec->start);
+		did_repair = 1;
+		goto out;
+	}
+
+	spin_lock(&BTRFS_I(inode)->io_tree.lock);
+	state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
+					    failrec->start,
+					    EXTENT_LOCKED);
+	spin_unlock(&BTRFS_I(inode)->io_tree.lock);
+
+	if (state && state->start == failrec->start) {
+		map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree;
+		num_copies = btrfs_num_copies(map_tree, failrec->logical,
+						failrec->len);
+		if (num_copies > 1)  {
+			ret = repair_io_failure(map_tree, start, failrec->len,
+						failrec->logical, page,
+						failrec->failed_mirror);
+			did_repair = !ret;
+		}
+	}
+
+out:
+	if (!ret)
+		ret = free_io_failure(inode, failrec, did_repair);
+
+	return ret;
+}
+
+/*
+ * this is a generic handler for readpage errors (default
+ * readpage_io_failed_hook). if other copies exist, read those and write back
+ * good data to the failed position. does not investigate in remapping the
+ * failed extent elsewhere, hoping the device will be smart enough to do this as
+ * needed
+ */
+
+static int bio_readpage_error(struct bio *failed_bio, struct page *page,
+				u64 start, u64 end, int failed_mirror,
+				struct extent_state *state)
+{
+	struct io_failure_record *failrec = NULL;
+	u64 private;
+	struct extent_map *em;
+	struct inode *inode = page->mapping->host;
+	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
+	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	struct bio *bio;
+	int num_copies;
+	int ret;
+	int read_mode;
+	u64 logical;
+
+	BUG_ON(failed_bio->bi_rw & REQ_WRITE);
+
+	ret = get_state_private(failure_tree, start, &private);
+	if (ret) {
+		failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
+		if (!failrec)
+			return -ENOMEM;
+		failrec->start = start;
+		failrec->len = end - start + 1;
+		failrec->this_mirror = 0;
+		failrec->bio_flags = 0;
+		failrec->in_validation = 0;
+
+		read_lock(&em_tree->lock);
+		em = lookup_extent_mapping(em_tree, start, failrec->len);
+		if (!em) {
+			read_unlock(&em_tree->lock);
+			kfree(failrec);
+			return -EIO;
+		}
+
+		if (em->start > start || em->start + em->len < start) {
+			free_extent_map(em);
+			em = NULL;
+		}
+		read_unlock(&em_tree->lock);
+
+		if (!em || IS_ERR(em)) {
+			kfree(failrec);
+			return -EIO;
+		}
+		logical = start - em->start;
+		logical = em->block_start + logical;
+		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
+			logical = em->block_start;
+			failrec->bio_flags = EXTENT_BIO_COMPRESSED;
+			extent_set_compress_type(&failrec->bio_flags,
+						 em->compress_type);
+		}
+		pr_debug("bio_readpage_error: (new) logical=%llu, start=%llu, "
+			 "len=%llu\n", logical, start, failrec->len);
+		failrec->logical = logical;
+		free_extent_map(em);
+
+		/* set the bits in the private failure tree */
+		ret = set_extent_bits(failure_tree, start, end,
+					EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
+		if (ret >= 0)
+			ret = set_state_private(failure_tree, start,
+						(u64)(unsigned long)failrec);
+		/* set the bits in the inode's tree */
+		if (ret >= 0)
+			ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED,
+						GFP_NOFS);
+		if (ret < 0) {
+			kfree(failrec);
+			return ret;
+		}
+	} else {
+		failrec = (struct io_failure_record *)(unsigned long)private;
+		pr_debug("bio_readpage_error: (found) logical=%llu, "
+			 "start=%llu, len=%llu, validation=%d\n",
+			 failrec->logical, failrec->start, failrec->len,
+			 failrec->in_validation);
+		/*
+		 * when data can be on disk more than twice, add to failrec here
+		 * (e.g. with a list for failed_mirror) to make
+		 * clean_io_failure() clean all those errors at once.
+		 */
+	}
+	num_copies = btrfs_num_copies(
+			      &BTRFS_I(inode)->root->fs_info->mapping_tree,
+			      failrec->logical, failrec->len);
+	if (num_copies == 1) {
+		/*
+		 * we only have a single copy of the data, so don't bother with
+		 * all the retry and error correction code that follows. no
+		 * matter what the error is, it is very likely to persist.
+		 */
+		pr_debug("bio_readpage_error: cannot repair, num_copies == 1. "
+			 "state=%p, num_copies=%d, next_mirror %d, "
+			 "failed_mirror %d\n", state, num_copies,
+			 failrec->this_mirror, failed_mirror);
+		free_io_failure(inode, failrec, 0);
+		return -EIO;
+	}
+
+	if (!state) {
+		spin_lock(&tree->lock);
+		state = find_first_extent_bit_state(tree, failrec->start,
+						    EXTENT_LOCKED);
+		if (state && state->start != failrec->start)
+			state = NULL;
+		spin_unlock(&tree->lock);
+	}
+
+	/*
+	 * there are two premises:
+	 *	a) deliver good data to the caller
+	 *	b) correct the bad sectors on disk
+	 */
+	if (failed_bio->bi_vcnt > 1) {
+		/*
+		 * to fulfill b), we need to know the exact failing sectors, as
+		 * we don't want to rewrite any more than the failed ones. thus,
+		 * we need separate read requests for the failed bio
+		 *
+		 * if the following BUG_ON triggers, our validation request got
+		 * merged. we need separate requests for our algorithm to work.
+		 */
+		BUG_ON(failrec->in_validation);
+		failrec->in_validation = 1;
+		failrec->this_mirror = failed_mirror;
+		read_mode = READ_SYNC | REQ_FAILFAST_DEV;
+	} else {
+		/*
+		 * we're ready to fulfill a) and b) alongside. get a good copy
+		 * of the failed sector and if we succeed, we have setup
+		 * everything for repair_io_failure to do the rest for us.
+		 */
+		if (failrec->in_validation) {
+			BUG_ON(failrec->this_mirror != failed_mirror);
+			failrec->in_validation = 0;
+			failrec->this_mirror = 0;
+		}
+		failrec->failed_mirror = failed_mirror;
+		failrec->this_mirror++;
+		if (failrec->this_mirror == failed_mirror)
+			failrec->this_mirror++;
+		read_mode = READ_SYNC;
+	}
+
+	if (!state || failrec->this_mirror > num_copies) {
+		pr_debug("bio_readpage_error: (fail) state=%p, num_copies=%d, "
+			 "next_mirror %d, failed_mirror %d\n", state,
+			 num_copies, failrec->this_mirror, failed_mirror);
+		free_io_failure(inode, failrec, 0);
+		return -EIO;
+	}
+
+	bio = bio_alloc(GFP_NOFS, 1);
+	bio->bi_private = state;
+	bio->bi_end_io = failed_bio->bi_end_io;
+	bio->bi_sector = failrec->logical >> 9;
+	bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
+	bio->bi_size = 0;
+
+	bio_add_page(bio, page, failrec->len, start - page_offset(page));
+
+	pr_debug("bio_readpage_error: submitting new read[%#x] to "
+		 "this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode,
+		 failrec->this_mirror, num_copies, failrec->in_validation);
+
+	tree->ops->submit_bio_hook(inode, read_mode, bio, failrec->this_mirror,
+					failrec->bio_flags, 0);
+	return 0;
+}
+
 /* lots and lots of room for performance fixes in the end_bio funcs */
 
 /*
@@ -1697,6 +2060,9 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 		struct extent_state *cached = NULL;
 		struct extent_state *state;
 
+		pr_debug("end_bio_extent_readpage: bi_vcnt=%d, idx=%d, err=%d, "
+			 "mirror=%ld\n", bio->bi_vcnt, bio->bi_idx, err,
+			 (long int)bio->bi_bdev);
 		tree = &BTRFS_I(page->mapping->host)->io_tree;
 
 		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
@@ -1727,11 +2093,19 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 							      state);
 			if (ret)
 				uptodate = 0;
+			else
+				clean_io_failure(start, page);
 		}
-		if (!uptodate && tree->ops &&
-		    tree->ops->readpage_io_failed_hook) {
-			ret = tree->ops->readpage_io_failed_hook(bio, page,
-							 start, end, NULL);
+		if (!uptodate) {
+			u64 failed_mirror;
+			failed_mirror = (u64)bio->bi_bdev;
+			if (tree->ops && tree->ops->readpage_io_failed_hook)
+				ret = tree->ops->readpage_io_failed_hook(
+						bio, page, start, end,
+						failed_mirror, NULL);
+			else
+				ret = bio_readpage_error(bio, page, start, end,
+							 failed_mirror, NULL);
 			if (ret == 0) {
 				uptodate =
 					test_bit(BIO_UPTODATE, &bio->bi_flags);
@@ -1811,6 +2185,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
 					   mirror_num, bio_flags, start);
 	else
 		submit_bio(rw, bio);
+
 	if (bio_flagged(bio, BIO_EOPNOTSUPP))
 		ret = -EOPNOTSUPP;
 	bio_put(bio);
@@ -2926,7 +3301,7 @@ out:
 	return ret;
 }
 
-static inline struct page *extent_buffer_page(struct extent_buffer *eb,
+inline struct page *extent_buffer_page(struct extent_buffer *eb,
 					      unsigned long i)
 {
 	struct page *p;
@@ -2951,7 +3326,7 @@ static inline struct page *extent_buffer_page(struct extent_buffer *eb,
 	return p;
 }
 
-static inline unsigned long num_extent_pages(u64 start, u64 len)
+inline unsigned long num_extent_pages(u64 start, u64 len)
 {
 	return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
 		(start >> PAGE_CACHE_SHIFT);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 435d454b9926..a8e20b672922 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -68,7 +68,7 @@ struct extent_io_ops {
 			      unsigned long bio_flags);
 	int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
 	int (*readpage_io_failed_hook)(struct bio *bio, struct page *page,
-				       u64 start, u64 end,
+				       u64 start, u64 end, u64 failed_mirror,
 				       struct extent_state *state);
 	int (*writepage_io_failed_hook)(struct bio *bio, struct page *page,
 					u64 start, u64 end,
@@ -252,6 +252,8 @@ void free_extent_buffer(struct extent_buffer *eb);
 int read_extent_buffer_pages(struct extent_io_tree *tree,
 			     struct extent_buffer *eb, u64 start, int wait,
 			     get_extent_t *get_extent, int mirror_num);
+unsigned long num_extent_pages(u64 start, u64 len);
+struct page *extent_buffer_page(struct extent_buffer *eb, unsigned long i);
 
 static inline void extent_buffer_get(struct extent_buffer *eb)
 {
@@ -301,4 +303,10 @@ int extent_clear_unlock_delalloc(struct inode *inode,
 struct bio *
 btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
 		gfp_t gfp_flags);
+
+struct btrfs_mapping_tree;
+
+int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
+			u64 length, u64 logical, struct page *page,
+			int mirror_num);
 #endif
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 936a6fabaa9f..9327f45434e8 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -45,10 +45,10 @@
 #include "btrfs_inode.h"
 #include "ioctl.h"
 #include "print-tree.h"
-#include "volumes.h"
 #include "ordered-data.h"
 #include "xattr.h"
 #include "tree-log.h"
+#include "volumes.h"
 #include "compression.h"
 #include "locking.h"
 #include "free-space-cache.h"
@@ -1818,154 +1818,10 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
 	return btrfs_finish_ordered_io(page->mapping->host, start, end);
 }
 
-/*
- * When IO fails, either with EIO or csum verification fails, we
- * try other mirrors that might have a good copy of the data.  This
- * io_failure_record is used to record state as we go through all the
- * mirrors.  If another mirror has good data, the page is set up to date
- * and things continue.  If a good mirror can't be found, the original
- * bio end_io callback is called to indicate things have failed.
- */
-struct io_failure_record {
-	struct page *page;
-	u64 start;
-	u64 len;
-	u64 logical;
-	unsigned long bio_flags;
-	int last_mirror;
-};
-
-static int btrfs_io_failed_hook(struct bio *failed_bio,
-			 struct page *page, u64 start, u64 end,
-			 struct extent_state *state)
-{
-	struct io_failure_record *failrec = NULL;
-	u64 private;
-	struct extent_map *em;
-	struct inode *inode = page->mapping->host;
-	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
-	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
-	struct bio *bio;
-	int num_copies;
-	int ret;
-	int rw;
-	u64 logical;
-
-	ret = get_state_private(failure_tree, start, &private);
-	if (ret) {
-		failrec = kmalloc(sizeof(*failrec), GFP_NOFS);
-		if (!failrec)
-			return -ENOMEM;
-		failrec->start = start;
-		failrec->len = end - start + 1;
-		failrec->last_mirror = 0;
-		failrec->bio_flags = 0;
-
-		read_lock(&em_tree->lock);
-		em = lookup_extent_mapping(em_tree, start, failrec->len);
-		if (em->start > start || em->start + em->len < start) {
-			free_extent_map(em);
-			em = NULL;
-		}
-		read_unlock(&em_tree->lock);
-
-		if (IS_ERR_OR_NULL(em)) {
-			kfree(failrec);
-			return -EIO;
-		}
-		logical = start - em->start;
-		logical = em->block_start + logical;
-		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
-			logical = em->block_start;
-			failrec->bio_flags = EXTENT_BIO_COMPRESSED;
-			extent_set_compress_type(&failrec->bio_flags,
-						 em->compress_type);
-		}
-		failrec->logical = logical;
-		free_extent_map(em);
-		set_extent_bits(failure_tree, start, end, EXTENT_LOCKED |
-				EXTENT_DIRTY, GFP_NOFS);
-		set_state_private(failure_tree, start,
-				 (u64)(unsigned long)failrec);
-	} else {
-		failrec = (struct io_failure_record *)(unsigned long)private;
-	}
-	num_copies = btrfs_num_copies(
-			      &BTRFS_I(inode)->root->fs_info->mapping_tree,
-			      failrec->logical, failrec->len);
-	failrec->last_mirror++;
-	if (!state) {
-		spin_lock(&BTRFS_I(inode)->io_tree.lock);
-		state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
-						    failrec->start,
-						    EXTENT_LOCKED);
-		if (state && state->start != failrec->start)
-			state = NULL;
-		spin_unlock(&BTRFS_I(inode)->io_tree.lock);
-	}
-	if (!state || failrec->last_mirror > num_copies) {
-		set_state_private(failure_tree, failrec->start, 0);
-		clear_extent_bits(failure_tree, failrec->start,
-				  failrec->start + failrec->len - 1,
-				  EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
-		kfree(failrec);
-		return -EIO;
-	}
-	bio = bio_alloc(GFP_NOFS, 1);
-	bio->bi_private = state;
-	bio->bi_end_io = failed_bio->bi_end_io;
-	bio->bi_sector = failrec->logical >> 9;
-	bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
-	bio->bi_size = 0;
-
-	bio_add_page(bio, page, failrec->len, start - page_offset(page));
-	if (failed_bio->bi_rw & REQ_WRITE)
-		rw = WRITE;
-	else
-		rw = READ;
-
-	ret = BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
-						      failrec->last_mirror,
-						      failrec->bio_flags, 0);
-	return ret;
-}
-
-/*
- * each time an IO finishes, we do a fast check in the IO failure tree
- * to see if we need to process or clean up an io_failure_record
- */
-static int btrfs_clean_io_failures(struct inode *inode, u64 start)
-{
-	u64 private;
-	u64 private_failure;
-	struct io_failure_record *failure;
-	int ret;
-
-	private = 0;
-	if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
-			     (u64)-1, 1, EXTENT_DIRTY, 0)) {
-		ret = get_state_private(&BTRFS_I(inode)->io_failure_tree,
-					start, &private_failure);
-		if (ret == 0) {
-			failure = (struct io_failure_record *)(unsigned long)
-				   private_failure;
-			set_state_private(&BTRFS_I(inode)->io_failure_tree,
-					  failure->start, 0);
-			clear_extent_bits(&BTRFS_I(inode)->io_failure_tree,
-					  failure->start,
-					  failure->start + failure->len - 1,
-					  EXTENT_DIRTY | EXTENT_LOCKED,
-					  GFP_NOFS);
-			kfree(failure);
-		}
-	}
-	return 0;
-}
-
 /*
  * when reads are done, we need to check csums to verify the data is correct
- * if there's a match, we allow the bio to finish.  If not, we go through
- * the io_failure_record routines to find good copies
+ * if there's a match, we allow the bio to finish.  If not, the code in
+ * extent_io.c will try to find good copies for us.
  */
 static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 			       struct extent_state *state)
@@ -2011,10 +1867,6 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 
 	kunmap_atomic(kaddr, KM_USER0);
 good:
-	/* if the io failure tree for this inode is non-empty,
-	 * check to see if we've recovered from a failed IO
-	 */
-	btrfs_clean_io_failures(inode, start);
 	return 0;
 
 zeroit:
@@ -7420,7 +7272,6 @@ static struct extent_io_ops btrfs_extent_io_ops = {
 	.readpage_end_io_hook = btrfs_readpage_end_io_hook,
 	.writepage_end_io_hook = btrfs_writepage_end_io_hook,
 	.writepage_start_hook = btrfs_writepage_start_hook,
-	.readpage_io_failed_hook = btrfs_io_failed_hook,
 	.set_bit_hook = btrfs_set_bit_hook,
 	.clear_bit_hook = btrfs_clear_bit_hook,
 	.merge_extent_hook = btrfs_merge_extent_hook,
-- 
cgit v1.2.3