Diffstat (limited to 'fs/btrfs/raid56.c')
-rw-r--r--   fs/btrfs/raid56.c   2052
1 file changed, 1082 insertions(+), 970 deletions(-)
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 82c8e991300e..6a2cf754912d 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -13,12 +13,15 @@
#include <linux/list_sort.h>
#include <linux/raid/xor.h>
#include <linux/mm.h>
+#include "messages.h"
#include "misc.h"
#include "ctree.h"
#include "disk-io.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
+#include "file-item.h"
+#include "btrfs_inode.h"
/* set when additional merges to this rbio are not allowed */
#define RBIO_RMW_LOCKED_BIT 1
@@ -63,19 +66,45 @@ struct sector_ptr {
unsigned int uptodate:8;
};
-static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
-static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
-static void rmw_work(struct work_struct *work);
-static void read_rebuild_work(struct work_struct *work);
-static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
-static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
-static void __free_raid_bio(struct btrfs_raid_bio *rbio);
+static void rmw_rbio_work(struct work_struct *work);
+static void rmw_rbio_work_locked(struct work_struct *work);
static void index_rbio_pages(struct btrfs_raid_bio *rbio);
static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
-static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
- int need_check);
-static void scrub_parity_work(struct work_struct *work);
+static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check);
+static void scrub_rbio_work_locked(struct work_struct *work);
+
+static void free_raid_bio_pointers(struct btrfs_raid_bio *rbio)
+{
+ bitmap_free(rbio->error_bitmap);
+ kfree(rbio->stripe_pages);
+ kfree(rbio->bio_sectors);
+ kfree(rbio->stripe_sectors);
+ kfree(rbio->finish_pointers);
+}
+
+static void free_raid_bio(struct btrfs_raid_bio *rbio)
+{
+ int i;
+
+ if (!refcount_dec_and_test(&rbio->refs))
+ return;
+
+ WARN_ON(!list_empty(&rbio->stripe_cache));
+ WARN_ON(!list_empty(&rbio->hash_list));
+ WARN_ON(!bio_list_empty(&rbio->bio_list));
+
+ for (i = 0; i < rbio->nr_pages; i++) {
+ if (rbio->stripe_pages[i]) {
+ __free_page(rbio->stripe_pages[i]);
+ rbio->stripe_pages[i] = NULL;
+ }
+ }
+
+ btrfs_put_bioc(rbio->bioc);
+ free_raid_bio_pointers(rbio);
+ kfree(rbio);
+}
static void start_async_work(struct btrfs_raid_bio *rbio, work_func_t work_func)
{
@@ -146,8 +175,16 @@ static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
for (i = 0; i < rbio->nr_sectors; i++) {
/* Some range not covered by bio (partial write), skip it */
- if (!rbio->bio_sectors[i].page)
+ if (!rbio->bio_sectors[i].page) {
+ /*
+ * Even if the sector is not covered by a bio, a data
+ * sector should still be uptodate, as it has been read
+ * from disk.
+ */
+ if (i < rbio->nr_data * rbio->stripe_nsectors)
+ ASSERT(rbio->stripe_sectors[i].uptodate);
continue;
+ }
ASSERT(rbio->stripe_sectors[i].page);
memcpy_page(rbio->stripe_sectors[i].page,
@@ -234,6 +271,21 @@ static void steal_rbio_page(struct btrfs_raid_bio *src,
dest->stripe_sectors[i].uptodate = true;
}
+static bool is_data_stripe_page(struct btrfs_raid_bio *rbio, int page_nr)
+{
+ const int sector_nr = (page_nr << PAGE_SHIFT) >>
+ rbio->bioc->fs_info->sectorsize_bits;
+
+ /*
+ * We have ensured PAGE_SIZE is aligned with sectorsize, thus
+ * we won't have a page which is half data half parity.
+ *
+ * Thus if the first sector of the page belongs to data stripes, then
+ * the full page belongs to data stripes.
+ */
+ return (sector_nr < rbio->nr_data * rbio->stripe_nsectors);
+}
+
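For illustration only (the sizes below are assumed, not taken from the patch), the page-to-sector mapping used by is_data_stripe_page() works out as follows:

	/*
	 * Assumed sizes: PAGE_SIZE = 64K (PAGE_SHIFT = 16), sectorsize = 4K
	 * (sectorsize_bits = 12), two data stripes, stripe_nsectors = 16,
	 * so data sectors occupy [0, 32):
	 *
	 *   page_nr 0 -> sector_nr  0 -> data page
	 *   page_nr 1 -> sector_nr 16 -> data page
	 *   page_nr 2 -> sector_nr 32 -> P/Q page (32 is not < 32)
	 */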
/*
* Stealing an rbio means taking all the uptodate pages from the stripe array
* in the source rbio and putting them into the destination rbio.
@@ -244,16 +296,26 @@ static void steal_rbio_page(struct btrfs_raid_bio *src,
static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
{
int i;
- struct page *s;
if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
return;
for (i = 0; i < dest->nr_pages; i++) {
- s = src->stripe_pages[i];
- if (!s || !full_page_sectors_uptodate(src, i))
+ struct page *p = src->stripe_pages[i];
+
+ /*
+ * We don't need to steal P/Q pages as they will always be
+ * regenerated for RMW or full write anyway.
+ */
+ if (!is_data_stripe_page(src, i))
continue;
+ /*
+ * If @src already has RBIO_CACHE_READY_BIT, it should have
+ * all data stripe pages present and uptodate.
+ */
+ ASSERT(p);
+ ASSERT(full_page_sectors_uptodate(src, i));
steal_rbio_page(src, dest, i);
}
index_stripe_sectors(dest);
@@ -336,7 +398,7 @@ static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
spin_unlock(&h->lock);
if (freeit)
- __free_raid_bio(rbio);
+ free_raid_bio(rbio);
}
/*
@@ -526,28 +588,10 @@ static int rbio_can_merge(struct btrfs_raid_bio *last,
if (last->operation == BTRFS_RBIO_PARITY_SCRUB)
return 0;
- if (last->operation == BTRFS_RBIO_REBUILD_MISSING)
+ if (last->operation == BTRFS_RBIO_REBUILD_MISSING ||
+ last->operation == BTRFS_RBIO_READ_REBUILD)
return 0;
- if (last->operation == BTRFS_RBIO_READ_REBUILD) {
- int fa = last->faila;
- int fb = last->failb;
- int cur_fa = cur->faila;
- int cur_fb = cur->failb;
-
- if (last->faila >= last->failb) {
- fa = last->failb;
- fb = last->faila;
- }
-
- if (cur->faila >= cur->failb) {
- cur_fa = cur->failb;
- cur_fb = cur->faila;
- }
-
- if (fa != cur_fa || fb != cur_fb)
- return 0;
- }
return 1;
}
@@ -684,10 +728,12 @@ out:
if (cache_drop)
remove_rbio_from_cache(cache_drop);
if (freeit)
- __free_raid_bio(freeit);
+ free_raid_bio(freeit);
return ret;
}
+static void recover_rbio_work_locked(struct work_struct *work);
+
/*
* called as rmw or parity rebuild is completed. If the plug list has more
* rbios waiting for this stripe, the next one on the list will be started
@@ -745,16 +791,16 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
spin_unlock_irqrestore(&h->lock, flags);
if (next->operation == BTRFS_RBIO_READ_REBUILD)
- start_async_work(next, read_rebuild_work);
+ start_async_work(next, recover_rbio_work_locked);
else if (next->operation == BTRFS_RBIO_REBUILD_MISSING) {
steal_rbio(rbio, next);
- start_async_work(next, read_rebuild_work);
+ start_async_work(next, recover_rbio_work_locked);
} else if (next->operation == BTRFS_RBIO_WRITE) {
steal_rbio(rbio, next);
- start_async_work(next, rmw_work);
+ start_async_work(next, rmw_rbio_work_locked);
} else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
steal_rbio(rbio, next);
- start_async_work(next, scrub_parity_work);
+ start_async_work(next, scrub_rbio_work_locked);
}
goto done_nolock;
@@ -769,28 +815,6 @@ done_nolock:
remove_rbio_from_cache(rbio);
}
-static void __free_raid_bio(struct btrfs_raid_bio *rbio)
-{
- int i;
-
- if (!refcount_dec_and_test(&rbio->refs))
- return;
-
- WARN_ON(!list_empty(&rbio->stripe_cache));
- WARN_ON(!list_empty(&rbio->hash_list));
- WARN_ON(!bio_list_empty(&rbio->bio_list));
-
- for (i = 0; i < rbio->nr_pages; i++) {
- if (rbio->stripe_pages[i]) {
- __free_page(rbio->stripe_pages[i]);
- rbio->stripe_pages[i] = NULL;
- }
- }
-
- btrfs_put_bioc(rbio->bioc);
- kfree(rbio);
-}
-
static void rbio_endio_bio_list(struct bio *cur, blk_status_t err)
{
struct bio *next;
@@ -813,6 +837,11 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
struct bio *cur = bio_list_get(&rbio->bio_list);
struct bio *extra;
+ kfree(rbio->csum_buf);
+ bitmap_free(rbio->csum_bitmap);
+ rbio->csum_buf = NULL;
+ rbio->csum_bitmap = NULL;
+
/*
* Clear the data bitmap, as the rbio may be cached for later usage.
* do this before before unlock_stripe() so there will be no new bio
@@ -830,7 +859,7 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
*/
unlock_stripe(rbio);
extra = bio_list_get(&rbio->bio_list);
- __free_raid_bio(rbio);
+ free_raid_bio(rbio);
rbio_endio_bio_list(cur, err);
if (extra)
@@ -838,36 +867,7 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
}
/*
- * end io function used by finish_rmw. When we finally
- * get here, we've written a full stripe
- */
-static void raid_write_end_io(struct bio *bio)
-{
- struct btrfs_raid_bio *rbio = bio->bi_private;
- blk_status_t err = bio->bi_status;
- int max_errors;
-
- if (err)
- fail_bio_stripe(rbio, bio);
-
- bio_put(bio);
-
- if (!atomic_dec_and_test(&rbio->stripes_pending))
- return;
-
- err = BLK_STS_OK;
-
- /* OK, we have read all the stripes we need to. */
- max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ?
- 0 : rbio->bioc->max_errors;
- if (atomic_read(&rbio->error) > max_errors)
- err = BLK_STS_IOERR;
-
- rbio_orig_end_io(rbio, err);
-}
-
-/**
- * Get a sector pointer specified by its @stripe_nr and @sector_nr
+ * Get a sector pointer specified by its @stripe_nr and @sector_nr.
*
* @rbio: The raid bio
* @stripe_nr: Stripe number, valid range [0, real_stripe)
@@ -919,7 +919,6 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits;
const unsigned int num_sectors = stripe_nsectors * real_stripes;
struct btrfs_raid_bio *rbio;
- void *p;
/* PAGE_SIZE must also be aligned to sectorsize for subpage support */
ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize));
@@ -929,16 +928,27 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
*/
ASSERT(stripe_nsectors <= BITS_PER_LONG);
- rbio = kzalloc(sizeof(*rbio) +
- sizeof(*rbio->stripe_pages) * num_pages +
- sizeof(*rbio->bio_sectors) * num_sectors +
- sizeof(*rbio->stripe_sectors) * num_sectors +
- sizeof(*rbio->finish_pointers) * real_stripes,
- GFP_NOFS);
+ rbio = kzalloc(sizeof(*rbio), GFP_NOFS);
if (!rbio)
return ERR_PTR(-ENOMEM);
+ rbio->stripe_pages = kcalloc(num_pages, sizeof(struct page *),
+ GFP_NOFS);
+ rbio->bio_sectors = kcalloc(num_sectors, sizeof(struct sector_ptr),
+ GFP_NOFS);
+ rbio->stripe_sectors = kcalloc(num_sectors, sizeof(struct sector_ptr),
+ GFP_NOFS);
+ rbio->finish_pointers = kcalloc(real_stripes, sizeof(void *), GFP_NOFS);
+ rbio->error_bitmap = bitmap_zalloc(num_sectors, GFP_NOFS);
+
+ if (!rbio->stripe_pages || !rbio->bio_sectors || !rbio->stripe_sectors ||
+ !rbio->finish_pointers || !rbio->error_bitmap) {
+ free_raid_bio_pointers(rbio);
+ kfree(rbio);
+ return ERR_PTR(-ENOMEM);
+ }
bio_list_init(&rbio->bio_list);
+ init_waitqueue_head(&rbio->io_wait);
INIT_LIST_HEAD(&rbio->plug_list);
spin_lock_init(&rbio->bio_list_lock);
INIT_LIST_HEAD(&rbio->stripe_cache);
@@ -950,27 +960,9 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
rbio->real_stripes = real_stripes;
rbio->stripe_npages = stripe_npages;
rbio->stripe_nsectors = stripe_nsectors;
- rbio->faila = -1;
- rbio->failb = -1;
refcount_set(&rbio->refs, 1);
- atomic_set(&rbio->error, 0);
atomic_set(&rbio->stripes_pending, 0);
- /*
- * The stripe_pages, bio_sectors, etc arrays point to the extra memory
- * we allocated past the end of the rbio.
- */
- p = rbio + 1;
-#define CONSUME_ALLOC(ptr, count) do { \
- ptr = p; \
- p = (unsigned char *)p + sizeof(*(ptr)) * (count); \
- } while (0)
- CONSUME_ALLOC(rbio->stripe_pages, num_pages);
- CONSUME_ALLOC(rbio->bio_sectors, num_sectors);
- CONSUME_ALLOC(rbio->stripe_sectors, num_sectors);
- CONSUME_ALLOC(rbio->finish_pointers, real_stripes);
-#undef CONSUME_ALLOC
-
ASSERT(btrfs_nr_parity_stripes(bioc->map_type));
rbio->nr_data = real_stripes - btrfs_nr_parity_stripes(bioc->map_type);
@@ -1006,6 +998,45 @@ static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
}
/*
+ * Return the total number of errors found in the vertical stripe of @sector_nr.
+ *
+ * @faila and @failb will also be updated to the first and second stripe
+ * number of the errors.
+ */
+static int get_rbio_veritical_errors(struct btrfs_raid_bio *rbio, int sector_nr,
+ int *faila, int *failb)
+{
+ int stripe_nr;
+ int found_errors = 0;
+
+ if (faila || failb) {
+ /*
+ * Both @faila and @failb should be valid pointers if any of
+ * them is specified.
+ */
+ ASSERT(faila && failb);
+ *faila = -1;
+ *failb = -1;
+ }
+
+ for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) {
+ int total_sector_nr = stripe_nr * rbio->stripe_nsectors + sector_nr;
+
+ if (test_bit(total_sector_nr, rbio->error_bitmap)) {
+ found_errors++;
+ if (faila) {
+ /* Update faila and failb. */
+ if (*faila < 0)
+ *faila = stripe_nr;
+ else if (*failb < 0)
+ *failb = stripe_nr;
+ }
+ }
+ }
+ return found_errors;
+}
+
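As a reading aid (the stripe geometry below is a made-up example, not from the patch), the error bitmap that get_rbio_veritical_errors() walks is laid out like this:

	/*
	 * bit index = stripe_nr * stripe_nsectors + sector_nr
	 *
	 * Example rbio with three stripes (two data + P), stripe_nsectors = 4:
	 *
	 *   stripe 0 (data): bits 0..3
	 *   stripe 1 (data): bits 4..7
	 *   stripe 2 (P):    bits 8..11
	 *
	 * The vertical stripe for sector_nr = 2 is bits 2, 6 and 10, and
	 * found_errors is how many of those bits are set.
	 */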
+/*
* Add a single sector @sector into our list of bios for IO.
*
* Return 0 if everything went well.
@@ -1038,8 +1069,19 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio,
disk_start = stripe->physical + sector_nr * sectorsize;
/* if the device is missing, just fail this stripe */
- if (!stripe->dev->bdev)
- return fail_rbio_index(rbio, stripe_nr);
+ if (!stripe->dev->bdev) {
+ int found_errors;
+
+ set_bit(stripe_nr * rbio->stripe_nsectors + sector_nr,
+ rbio->error_bitmap);
+
+ /* Check if we have reached tolerance early. */
+ found_errors = get_rbio_veritical_errors(rbio, sector_nr,
+ NULL, NULL);
+ if (found_errors > rbio->bioc->max_errors)
+ return -EIO;
+ return 0;
+ }
/* see if we can add this page onto our existing bio */
if (last) {
@@ -1071,23 +1113,6 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio,
return 0;
}
-/*
- * while we're doing the read/modify/write cycle, we could
- * have errors in reading pages off the disk. This checks
- * for errors and if we're not able to read the page it'll
- * trigger parity reconstruction. The rmw will be finished
- * after we've reconstructed the failed stripes
- */
-static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
-{
- if (rbio->faila >= 0 || rbio->failb >= 0) {
- BUG_ON(rbio->faila == rbio->real_stripes - 1);
- __raid56_parity_recover(rbio);
- } else {
- finish_rmw(rbio);
- }
-}
-
static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio)
{
const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
@@ -1158,109 +1183,71 @@ not_found:
trace_info->stripe_nr = -1;
}
-/*
- * this is called from one of two situations. We either
- * have a full stripe from the higher layers, or we've read all
- * the missing bits off disk.
- *
- * This will calculate the parity and then send down any
- * changed blocks.
- */
-static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
+/* Generate PQ for one vertical stripe. */
+static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr)
{
- struct btrfs_io_context *bioc = rbio->bioc;
- const u32 sectorsize = bioc->fs_info->sectorsize;
void **pointers = rbio->finish_pointers;
- int nr_data = rbio->nr_data;
+ const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
+ struct sector_ptr *sector;
+ int stripe;
+ const bool has_qstripe = rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6;
+
+ /* First collect one sector from each data stripe */
+ for (stripe = 0; stripe < rbio->nr_data; stripe++) {
+ sector = sector_in_rbio(rbio, stripe, sectornr, 0);
+ pointers[stripe] = kmap_local_page(sector->page) +
+ sector->pgoff;
+ }
+
+ /* Then add the parity stripe */
+ sector = rbio_pstripe_sector(rbio, sectornr);
+ sector->uptodate = 1;
+ pointers[stripe++] = kmap_local_page(sector->page) + sector->pgoff;
+
+ if (has_qstripe) {
+ /*
+ * RAID6, add the qstripe and call the library function
+ * to fill in our p/q
+ */
+ sector = rbio_qstripe_sector(rbio, sectornr);
+ sector->uptodate = 1;
+ pointers[stripe++] = kmap_local_page(sector->page) +
+ sector->pgoff;
+
+ raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
+ pointers);
+ } else {
+ /* raid5 */
+ memcpy(pointers[rbio->nr_data], pointers[0], sectorsize);
+ run_xor(pointers + 1, rbio->nr_data - 1, sectorsize);
+ }
+ for (stripe = stripe - 1; stripe >= 0; stripe--)
+ kunmap_local(pointers[stripe]);
+}
+
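The RAID5 branch above (memcpy of the first data sector into P followed by run_xor()) amounts to XOR-ing every data sector into P. A minimal stand-alone sketch of that computation; the function name and signature are illustrative only:

	static void example_xor_parity(void **pointers, int nr_data, u32 sectorsize)
	{
		/* The P sector sits right after the data sectors. */
		u8 *p = pointers[nr_data];
		u32 off;
		int d;

		for (off = 0; off < sectorsize; off++) {
			u8 val = 0;

			for (d = 0; d < nr_data; d++)
				val ^= ((u8 *)pointers[d])[off];
			p[off] = val;
		}
	}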
+static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio,
+ struct bio_list *bio_list)
+{
+ struct bio *bio;
/* The total sector number inside the full stripe. */
int total_sector_nr;
- int stripe;
- /* Sector number inside a stripe. */
int sectornr;
- bool has_qstripe;
- struct bio_list bio_list;
- struct bio *bio;
+ int stripe;
int ret;
- bio_list_init(&bio_list);
-
- if (rbio->real_stripes - rbio->nr_data == 1)
- has_qstripe = false;
- else if (rbio->real_stripes - rbio->nr_data == 2)
- has_qstripe = true;
- else
- BUG();
+ ASSERT(bio_list_size(bio_list) == 0);
/* We should have at least one data sector. */
ASSERT(bitmap_weight(&rbio->dbitmap, rbio->stripe_nsectors));
- /* at this point we either have a full stripe,
- * or we've read the full stripe from the drive.
- * recalculate the parity and write the new results.
- *
- * We're not allowed to add any new bios to the
- * bio list here, anyone else that wants to
- * change this stripe needs to do their own rmw.
- */
- spin_lock_irq(&rbio->bio_list_lock);
- set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
- spin_unlock_irq(&rbio->bio_list_lock);
-
- atomic_set(&rbio->error, 0);
-
/*
- * now that we've set rmw_locked, run through the
- * bio list one last time and map the page pointers
- *
- * We don't cache full rbios because we're assuming
- * the higher layers are unlikely to use this area of
- * the disk again soon. If they do use it again,
- * hopefully they will send another full bio.
+ * Reset errors, as we may have errors inherited from degraded
+ * write.
*/
- index_rbio_pages(rbio);
- if (!rbio_is_full(rbio))
- cache_rbio_pages(rbio);
- else
- clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
-
- for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
- struct sector_ptr *sector;
-
- /* First collect one sector from each data stripe */
- for (stripe = 0; stripe < nr_data; stripe++) {
- sector = sector_in_rbio(rbio, stripe, sectornr, 0);
- pointers[stripe] = kmap_local_page(sector->page) +
- sector->pgoff;
- }
-
- /* Then add the parity stripe */
- sector = rbio_pstripe_sector(rbio, sectornr);
- sector->uptodate = 1;
- pointers[stripe++] = kmap_local_page(sector->page) + sector->pgoff;
-
- if (has_qstripe) {
- /*
- * RAID6, add the qstripe and call the library function
- * to fill in our p/q
- */
- sector = rbio_qstripe_sector(rbio, sectornr);
- sector->uptodate = 1;
- pointers[stripe++] = kmap_local_page(sector->page) +
- sector->pgoff;
-
- raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
- pointers);
- } else {
- /* raid5 */
- memcpy(pointers[nr_data], pointers[0], sectorsize);
- run_xor(pointers + 1, nr_data - 1, sectorsize);
- }
- for (stripe = stripe - 1; stripe >= 0; stripe--)
- kunmap_local(pointers[stripe]);
- }
+ bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);
/*
- * Start writing. Make bios for everything from the higher layers (the
+ * Start assembly. Make bios for everything from the higher layers (the
* bio_list in our rbio) and our P/Q. Ignore everything else.
*/
for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
@@ -1282,15 +1269,16 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
sector = rbio_stripe_sector(rbio, stripe, sectornr);
}
- ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
+ ret = rbio_add_io_sector(rbio, bio_list, sector, stripe,
sectornr, REQ_OP_WRITE);
if (ret)
- goto cleanup;
+ goto error;
}
- if (likely(!bioc->num_tgtdevs))
- goto write_data;
+ if (likely(!rbio->bioc->num_tgtdevs))
+ return 0;
+ /* Make a copy for the replace target device. */
for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
total_sector_nr++) {
struct sector_ptr *sector;
@@ -1298,7 +1286,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
stripe = total_sector_nr / rbio->stripe_nsectors;
sectornr = total_sector_nr % rbio->stripe_nsectors;
- if (!bioc->tgtdev_map[stripe]) {
+ if (!rbio->bioc->tgtdev_map[stripe]) {
/*
* We can skip the whole stripe completely, note
* total_sector_nr will be increased by one anyway.
@@ -1320,125 +1308,52 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
sector = rbio_stripe_sector(rbio, stripe, sectornr);
}
- ret = rbio_add_io_sector(rbio, &bio_list, sector,
+ ret = rbio_add_io_sector(rbio, bio_list, sector,
rbio->bioc->tgtdev_map[stripe],
sectornr, REQ_OP_WRITE);
if (ret)
- goto cleanup;
- }
-
-write_data:
- atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list));
- BUG_ON(atomic_read(&rbio->stripes_pending) == 0);
-
- while ((bio = bio_list_pop(&bio_list))) {
- bio->bi_end_io = raid_write_end_io;
-
- if (trace_raid56_write_stripe_enabled()) {
- struct raid56_bio_trace_info trace_info = { 0 };
-
- bio_get_trace_info(rbio, bio, &trace_info);
- trace_raid56_write_stripe(rbio, bio, &trace_info);
- }
- submit_bio(bio);
+ goto error;
}
- return;
-cleanup:
- rbio_orig_end_io(rbio, BLK_STS_IOERR);
-
- while ((bio = bio_list_pop(&bio_list)))
+ return 0;
+error:
+ while ((bio = bio_list_pop(bio_list)))
bio_put(bio);
+ return -EIO;
}
-/*
- * helper to find the stripe number for a given bio. Used to figure out which
- * stripe has failed. This expects the bio to correspond to a physical disk,
- * so it looks up based on physical sector numbers.
- */
-static int find_bio_stripe(struct btrfs_raid_bio *rbio,
- struct bio *bio)
-{
- u64 physical = bio->bi_iter.bi_sector;
- int i;
- struct btrfs_io_stripe *stripe;
-
- physical <<= 9;
-
- for (i = 0; i < rbio->bioc->num_stripes; i++) {
- stripe = &rbio->bioc->stripes[i];
- if (in_range(physical, stripe->physical, BTRFS_STRIPE_LEN) &&
- stripe->dev->bdev && bio->bi_bdev == stripe->dev->bdev) {
- return i;
- }
- }
- return -1;
-}
-
-/*
- * helper to find the stripe number for a given
- * bio (before mapping). Used to figure out which stripe has
- * failed. This looks up based on logical block numbers.
- */
-static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
- struct bio *bio)
-{
- u64 logical = bio->bi_iter.bi_sector << 9;
- int i;
-
- for (i = 0; i < rbio->nr_data; i++) {
- u64 stripe_start = rbio->bioc->raid_map[i];
-
- if (in_range(logical, stripe_start, BTRFS_STRIPE_LEN))
- return i;
- }
- return -1;
-}
-
-/*
- * returns -EIO if we had too many failures
- */
-static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
+static void set_rbio_range_error(struct btrfs_raid_bio *rbio, struct bio *bio)
{
- unsigned long flags;
- int ret = 0;
+ struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
+ u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
+ rbio->bioc->raid_map[0];
+ int total_nr_sector = offset >> fs_info->sectorsize_bits;
- spin_lock_irqsave(&rbio->bio_list_lock, flags);
+ ASSERT(total_nr_sector < rbio->nr_data * rbio->stripe_nsectors);
- /* we already know this stripe is bad, move on */
- if (rbio->faila == failed || rbio->failb == failed)
- goto out;
+ bitmap_set(rbio->error_bitmap, total_nr_sector,
+ bio->bi_iter.bi_size >> fs_info->sectorsize_bits);
- if (rbio->faila == -1) {
- /* first failure on this rbio */
- rbio->faila = failed;
- atomic_inc(&rbio->error);
- } else if (rbio->failb == -1) {
- /* second failure on this rbio */
- rbio->failb = failed;
- atomic_inc(&rbio->error);
- } else {
- ret = -EIO;
+ /*
+ * Special handling for raid56_alloc_missing_rbio() used by
+ * scrub/replace. Unlike the call path in raid56_parity_recover(), they
+ * pass an empty bio here. Thus we have to find out the missing device
+ * and mark the stripe error instead.
+ */
+ if (bio->bi_iter.bi_size == 0) {
+ bool found_missing = false;
+ int stripe_nr;
+
+ for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) {
+ if (!rbio->bioc->stripes[stripe_nr].dev->bdev) {
+ found_missing = true;
+ bitmap_set(rbio->error_bitmap,
+ stripe_nr * rbio->stripe_nsectors,
+ rbio->stripe_nsectors);
+ }
+ }
+ ASSERT(found_missing);
}
-out:
- spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
-
- return ret;
-}
-
-/*
- * helper to fail a stripe based on a physical disk
- * bio.
- */
-static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
- struct bio *bio)
-{
- int failed = find_bio_stripe(rbio, bio);
-
- if (failed < 0)
- return -EIO;
-
- return fail_rbio_index(rbio, failed);
}
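A worked example of the offset arithmetic in set_rbio_range_error() above, using made-up numbers:

	/*
	 * Assume the full stripe starts at logical 1M (raid_map[0]), the
	 * sectorsize is 4K, and the failed bio covers logical
	 * [1M + 8K, 1M + 20K):
	 *
	 *   offset  =  8K -> total_nr_sector = 2
	 *   bi_size = 12K -> 3 sectors
	 *
	 * so bits 2, 3 and 4 of the error bitmap get set (the bitmap is
	 * indexed by the sector offset from the start of the data stripes).
	 */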
/*
@@ -1486,191 +1401,163 @@ static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio)
}
}
-static void raid56_bio_end_io(struct bio *bio)
+static int get_bio_sector_nr(struct btrfs_raid_bio *rbio, struct bio *bio)
{
- struct btrfs_raid_bio *rbio = bio->bi_private;
-
- if (bio->bi_status)
- fail_bio_stripe(rbio, bio);
- else
- set_bio_pages_uptodate(rbio, bio);
+ struct bio_vec *bv = bio_first_bvec_all(bio);
+ int i;
- bio_put(bio);
+ for (i = 0; i < rbio->nr_sectors; i++) {
+ struct sector_ptr *sector;
- if (atomic_dec_and_test(&rbio->stripes_pending))
- queue_work(rbio->bioc->fs_info->endio_raid56_workers,
- &rbio->end_io_work);
+ sector = &rbio->stripe_sectors[i];
+ if (sector->page == bv->bv_page && sector->pgoff == bv->bv_offset)
+ break;
+ sector = &rbio->bio_sectors[i];
+ if (sector->page == bv->bv_page && sector->pgoff == bv->bv_offset)
+ break;
+ }
+ ASSERT(i < rbio->nr_sectors);
+ return i;
}
-/*
- * End io handler for the read phase of the RMW cycle. All the bios here are
- * physical stripe bios we've read from the disk so we can recalculate the
- * parity of the stripe.
- *
- * This will usually kick off finish_rmw once all the bios are read in, but it
- * may trigger parity reconstruction if we had any errors along the way
- */
-static void raid56_rmw_end_io_work(struct work_struct *work)
+static void rbio_update_error_bitmap(struct btrfs_raid_bio *rbio, struct bio *bio)
{
- struct btrfs_raid_bio *rbio =
- container_of(work, struct btrfs_raid_bio, end_io_work);
+ int total_sector_nr = get_bio_sector_nr(rbio, bio);
+ u32 bio_size = 0;
+ struct bio_vec *bvec;
+ struct bvec_iter_all iter_all;
- if (atomic_read(&rbio->error) > rbio->bioc->max_errors) {
- rbio_orig_end_io(rbio, BLK_STS_IOERR);
- return;
- }
+ bio_for_each_segment_all(bvec, bio, iter_all)
+ bio_size += bvec->bv_len;
- /*
- * This will normally call finish_rmw to start our write but if there
- * are any failed stripes we'll reconstruct from parity first.
- */
- validate_rbio_for_rmw(rbio);
+ bitmap_set(rbio->error_bitmap, total_sector_nr,
+ bio_size >> rbio->bioc->fs_info->sectorsize_bits);
}
-/*
- * the stripe must be locked by the caller. It will
- * unlock after all the writes are done
- */
-static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
+/* Verify the data sectors at read time. */
+static void verify_bio_data_sectors(struct btrfs_raid_bio *rbio,
+ struct bio *bio)
{
- int bios_to_read = 0;
- struct bio_list bio_list;
- const int nr_data_sectors = rbio->stripe_nsectors * rbio->nr_data;
- int ret;
- int total_sector_nr;
- struct bio *bio;
+ struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
+ int total_sector_nr = get_bio_sector_nr(rbio, bio);
+ struct bio_vec *bvec;
+ struct bvec_iter_all iter_all;
- bio_list_init(&bio_list);
+ /* No data csum for the whole stripe, no need to verify. */
+ if (!rbio->csum_bitmap || !rbio->csum_buf)
+ return;
- ret = alloc_rbio_pages(rbio);
- if (ret)
- goto cleanup;
+ /* P/Q stripes, they have no data csum to verify against. */
+ if (total_sector_nr >= rbio->nr_data * rbio->stripe_nsectors)
+ return;
- index_rbio_pages(rbio);
+ bio_for_each_segment_all(bvec, bio, iter_all) {
+ int bv_offset;
+
+ for (bv_offset = bvec->bv_offset;
+ bv_offset < bvec->bv_offset + bvec->bv_len;
+ bv_offset += fs_info->sectorsize, total_sector_nr++) {
+ u8 csum_buf[BTRFS_CSUM_SIZE];
+ u8 *expected_csum = rbio->csum_buf +
+ total_sector_nr * fs_info->csum_size;
+ int ret;
- atomic_set(&rbio->error, 0);
- /* Build a list of bios to read all the missing data sectors. */
- for (total_sector_nr = 0; total_sector_nr < nr_data_sectors;
- total_sector_nr++) {
- struct sector_ptr *sector;
- int stripe = total_sector_nr / rbio->stripe_nsectors;
- int sectornr = total_sector_nr % rbio->stripe_nsectors;
+ /* No csum for this sector, skip to the next sector. */
+ if (!test_bit(total_sector_nr, rbio->csum_bitmap))
+ continue;
- /*
- * We want to find all the sectors missing from the rbio and
- * read them from the disk. If sector_in_rbio() finds a page
- * in the bio list we don't need to read it off the stripe.
- */
- sector = sector_in_rbio(rbio, stripe, sectornr, 1);
- if (sector)
- continue;
+ ret = btrfs_check_sector_csum(fs_info, bvec->bv_page,
+ bv_offset, csum_buf, expected_csum);
+ if (ret < 0)
+ set_bit(total_sector_nr, rbio->error_bitmap);
+ }
+ }
+}
- sector = rbio_stripe_sector(rbio, stripe, sectornr);
- /*
- * The bio cache may have handed us an uptodate page. If so,
- * use it.
- */
- if (sector->uptodate)
- continue;
+static void raid_wait_read_end_io(struct bio *bio)
+{
+ struct btrfs_raid_bio *rbio = bio->bi_private;
- ret = rbio_add_io_sector(rbio, &bio_list, sector,
- stripe, sectornr, REQ_OP_READ);
- if (ret)
- goto cleanup;
+ if (bio->bi_status) {
+ rbio_update_error_bitmap(rbio, bio);
+ } else {
+ set_bio_pages_uptodate(rbio, bio);
+ verify_bio_data_sectors(rbio, bio);
}
- bios_to_read = bio_list_size(&bio_list);
- if (!bios_to_read) {
- /*
- * this can happen if others have merged with
- * us, it means there is nothing left to read.
- * But if there are missing devices it may not be
- * safe to do the full stripe write yet.
- */
- goto finish;
- }
+ bio_put(bio);
+ if (atomic_dec_and_test(&rbio->stripes_pending))
+ wake_up(&rbio->io_wait);
+}
- /*
- * The bioc may be freed once we submit the last bio. Make sure not to
- * touch it after that.
- */
- atomic_set(&rbio->stripes_pending, bios_to_read);
- INIT_WORK(&rbio->end_io_work, raid56_rmw_end_io_work);
- while ((bio = bio_list_pop(&bio_list))) {
- bio->bi_end_io = raid56_bio_end_io;
+static void submit_read_bios(struct btrfs_raid_bio *rbio,
+ struct bio_list *bio_list)
+{
+ struct bio *bio;
+
+ atomic_set(&rbio->stripes_pending, bio_list_size(bio_list));
+ while ((bio = bio_list_pop(bio_list))) {
+ bio->bi_end_io = raid_wait_read_end_io;
- if (trace_raid56_read_partial_enabled()) {
+ if (trace_raid56_scrub_read_recover_enabled()) {
struct raid56_bio_trace_info trace_info = { 0 };
bio_get_trace_info(rbio, bio, &trace_info);
- trace_raid56_read_partial(rbio, bio, &trace_info);
+ trace_raid56_scrub_read_recover(rbio, bio, &trace_info);
}
submit_bio(bio);
}
- /* the actual write will happen once the reads are done */
- return 0;
+}
-cleanup:
- rbio_orig_end_io(rbio, BLK_STS_IOERR);
+static int rmw_assemble_read_bios(struct btrfs_raid_bio *rbio,
+ struct bio_list *bio_list)
+{
+ struct bio *bio;
+ int total_sector_nr;
+ int ret = 0;
- while ((bio = bio_list_pop(&bio_list)))
- bio_put(bio);
+ ASSERT(bio_list_size(bio_list) == 0);
- return -EIO;
+ /*
+ * Build a list of bios to read all sectors (including data and P/Q).
+ *
+ * This behavior is to compensate for the later csum verification and
+ * recovery.
+ */
+ for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
+ total_sector_nr++) {
+ struct sector_ptr *sector;
+ int stripe = total_sector_nr / rbio->stripe_nsectors;
+ int sectornr = total_sector_nr % rbio->stripe_nsectors;
-finish:
- validate_rbio_for_rmw(rbio);
+ sector = rbio_stripe_sector(rbio, stripe, sectornr);
+ ret = rbio_add_io_sector(rbio, bio_list, sector,
+ stripe, sectornr, REQ_OP_READ);
+ if (ret)
+ goto cleanup;
+ }
return 0;
+
+cleanup:
+ while ((bio = bio_list_pop(bio_list)))
+ bio_put(bio);
+ return ret;
}
-/*
- * if the upper layers pass in a full stripe, we thank them by only allocating
- * enough pages to hold the parity, and sending it all down quickly.
- */
-static int full_stripe_write(struct btrfs_raid_bio *rbio)
+static int alloc_rbio_data_pages(struct btrfs_raid_bio *rbio)
{
+ const int data_pages = rbio->nr_data * rbio->stripe_npages;
int ret;
- ret = alloc_rbio_parity_pages(rbio);
- if (ret)
+ ret = btrfs_alloc_page_array(data_pages, rbio->stripe_pages);
+ if (ret < 0)
return ret;
- ret = lock_stripe_add(rbio);
- if (ret == 0)
- finish_rmw(rbio);
- return 0;
-}
-
-/*
- * partial stripe writes get handed over to async helpers.
- * We're really hoping to merge a few more writes into this
- * rbio before calculating new parity
- */
-static int partial_stripe_write(struct btrfs_raid_bio *rbio)
-{
- int ret;
-
- ret = lock_stripe_add(rbio);
- if (ret == 0)
- start_async_work(rbio, rmw_work);
+ index_stripe_sectors(rbio);
return 0;
}
/*
- * sometimes while we were reading from the drive to
- * recalculate parity, enough new bios come into create
- * a full stripe. So we do a check here to see if we can
- * go directly to finish_rmw
- */
-static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
-{
- /* head off into rmw land if we don't have a full stripe */
- if (!rbio_is_full(rbio))
- return partial_stripe_write(rbio);
- return full_stripe_write(rbio);
-}
-
-/*
* We use plugging call backs to collect full stripes.
* Any time we get a partial stripe write while plugged
* we collect it into a list. When the unplug comes down,
@@ -1704,71 +1591,39 @@ static int plug_cmp(void *priv, const struct list_head *a,
return 0;
}
-static void run_plug(struct btrfs_plug_cb *plug)
+static void raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
{
+ struct btrfs_plug_cb *plug = container_of(cb, struct btrfs_plug_cb, cb);
struct btrfs_raid_bio *cur;
struct btrfs_raid_bio *last = NULL;
- /*
- * sort our plug list then try to merge
- * everything we can in hopes of creating full
- * stripes.
- */
list_sort(NULL, &plug->rbio_list, plug_cmp);
+
while (!list_empty(&plug->rbio_list)) {
cur = list_entry(plug->rbio_list.next,
struct btrfs_raid_bio, plug_list);
list_del_init(&cur->plug_list);
if (rbio_is_full(cur)) {
- int ret;
-
- /* we have a full stripe, send it down */
- ret = full_stripe_write(cur);
- BUG_ON(ret);
+ /* We have a full stripe, queue it down. */
+ start_async_work(cur, rmw_rbio_work);
continue;
}
if (last) {
if (rbio_can_merge(last, cur)) {
merge_rbio(last, cur);
- __free_raid_bio(cur);
+ free_raid_bio(cur);
continue;
-
}
- __raid56_parity_write(last);
+ start_async_work(last, rmw_rbio_work);
}
last = cur;
}
- if (last) {
- __raid56_parity_write(last);
- }
+ if (last)
+ start_async_work(last, rmw_rbio_work);
kfree(plug);
}
-/*
- * if the unplug comes from schedule, we have to push the
- * work off to a helper thread
- */
-static void unplug_work(struct work_struct *work)
-{
- struct btrfs_plug_cb *plug;
- plug = container_of(work, struct btrfs_plug_cb, work);
- run_plug(plug);
-}
-
-static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
-{
- struct btrfs_plug_cb *plug;
- plug = container_of(cb, struct btrfs_plug_cb, cb);
-
- if (from_schedule) {
- INIT_WORK(&plug->work, unplug_work);
- queue_work(plug->info->rmw_workers, &plug->work);
- return;
- }
- run_plug(plug);
-}
-
/* Add the original bio into rbio->bio_list, and update rbio::dbitmap. */
static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio)
{
@@ -1816,19 +1671,13 @@ void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc)
rbio_add_bio(rbio, bio);
/*
- * don't plug on full rbios, just get them out the door
+ * Don't plug on full rbios, just get them out the door
* as quickly as we can
*/
- if (rbio_is_full(rbio)) {
- ret = full_stripe_write(rbio);
- if (ret) {
- __free_raid_bio(rbio);
- goto fail;
- }
- return;
- }
+ if (rbio_is_full(rbio))
+ goto queue_rbio;
- cb = blk_check_plugged(btrfs_raid_unplug, fs_info, sizeof(*plug));
+ cb = blk_check_plugged(raid_unplug, fs_info, sizeof(*plug));
if (cb) {
plug = container_of(cb, struct btrfs_plug_cb, cb);
if (!plug->info) {
@@ -1836,13 +1685,14 @@ void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc)
INIT_LIST_HEAD(&plug->rbio_list);
}
list_add_tail(&rbio->plug_list, &plug->rbio_list);
- } else {
- ret = __raid56_parity_write(rbio);
- if (ret) {
- __free_raid_bio(rbio);
- goto fail;
- }
+ return;
}
+queue_rbio:
+ /*
+ * Either we don't have any existing plug, or we're doing a full stripe;
+ * either way we can queue the rmw work now.
+ */
+ start_async_work(rbio, rmw_rbio_work);
return;
@@ -1851,268 +1701,254 @@ fail:
bio_endio(bio);
}
-/*
- * all parity reconstruction happens here. We've read in everything
- * we can find from the drives and this does the heavy lifting of
- * sorting the good from the bad.
- */
-static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
+static int verify_one_sector(struct btrfs_raid_bio *rbio,
+ int stripe_nr, int sector_nr)
{
- const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
- int sectornr, stripe;
- void **pointers;
- void **unmap_array;
- int faila = -1, failb = -1;
- blk_status_t err;
- int i;
+ struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
+ struct sector_ptr *sector;
+ u8 csum_buf[BTRFS_CSUM_SIZE];
+ u8 *csum_expected;
+ int ret;
- /*
- * This array stores the pointer for each sector, thus it has the extra
- * pgoff value added from each sector
- */
- pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
- if (!pointers) {
- err = BLK_STS_RESOURCE;
- goto cleanup_io;
- }
+ if (!rbio->csum_bitmap || !rbio->csum_buf)
+ return 0;
+ /* No way to verify P/Q as they are not covered by data csum. */
+ if (stripe_nr >= rbio->nr_data)
+ return 0;
/*
- * Store copy of pointers that does not get reordered during
- * reconstruction so that kunmap_local works.
+ * If we're rebuilding a read, we have to use pages from the
+ * bio list if possible.
*/
- unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
- if (!unmap_array) {
- err = BLK_STS_RESOURCE;
- goto cleanup_pointers;
+ if ((rbio->operation == BTRFS_RBIO_READ_REBUILD ||
+ rbio->operation == BTRFS_RBIO_REBUILD_MISSING)) {
+ sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0);
+ } else {
+ sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr);
}
- faila = rbio->faila;
- failb = rbio->failb;
+ ASSERT(sector->page);
- if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
- rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
- spin_lock_irq(&rbio->bio_list_lock);
- set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
- spin_unlock_irq(&rbio->bio_list_lock);
- }
+ csum_expected = rbio->csum_buf +
+ (stripe_nr * rbio->stripe_nsectors + sector_nr) *
+ fs_info->csum_size;
+ ret = btrfs_check_sector_csum(fs_info, sector->page, sector->pgoff,
+ csum_buf, csum_expected);
+ return ret;
+}
- index_rbio_pages(rbio);
+/*
+ * Recover a vertical stripe specified by @sector_nr.
+ * @*pointers are the pre-allocated pointers by the caller, so we don't
+ * need to allocate/free the pointers again and again.
+ */
+static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr,
+ void **pointers, void **unmap_array)
+{
+ struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
+ struct sector_ptr *sector;
+ const u32 sectorsize = fs_info->sectorsize;
+ int found_errors;
+ int faila;
+ int failb;
+ int stripe_nr;
+ int ret = 0;
- for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
- struct sector_ptr *sector;
+ /*
+ * Now we just use bitmap to mark the horizontal stripes in
+ * which we have data when doing parity scrub.
+ */
+ if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
+ !test_bit(sector_nr, &rbio->dbitmap))
+ return 0;
- /*
- * Now we just use bitmap to mark the horizontal stripes in
- * which we have data when doing parity scrub.
- */
- if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
- !test_bit(sectornr, &rbio->dbitmap))
- continue;
+ found_errors = get_rbio_veritical_errors(rbio, sector_nr, &faila,
+ &failb);
+ /*
+ * No errors in the vertical stripe, skip it. Can happen for recovery
+ * where only part of a stripe failed the csum check.
+ */
+ if (!found_errors)
+ return 0;
+ if (found_errors > rbio->bioc->max_errors)
+ return -EIO;
+
+ /*
+ * Setup our array of pointers with sectors from each stripe
+ *
+ * NOTE: store a duplicate array of pointers to preserve the
+ * pointer order.
+ */
+ for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) {
/*
- * Setup our array of pointers with sectors from each stripe
- *
- * NOTE: store a duplicate array of pointers to preserve the
- * pointer order
+ * If we're rebuilding a read, we have to use pages from the
+ * bio list if possible.
*/
- for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
- /*
- * If we're rebuilding a read, we have to use
- * pages from the bio list
- */
- if ((rbio->operation == BTRFS_RBIO_READ_REBUILD ||
- rbio->operation == BTRFS_RBIO_REBUILD_MISSING) &&
- (stripe == faila || stripe == failb)) {
- sector = sector_in_rbio(rbio, stripe, sectornr, 0);
- } else {
- sector = rbio_stripe_sector(rbio, stripe, sectornr);
- }
- ASSERT(sector->page);
- pointers[stripe] = kmap_local_page(sector->page) +
- sector->pgoff;
- unmap_array[stripe] = pointers[stripe];
+ if ((rbio->operation == BTRFS_RBIO_READ_REBUILD ||
+ rbio->operation == BTRFS_RBIO_REBUILD_MISSING)) {
+ sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0);
+ } else {
+ sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr);
}
+ ASSERT(sector->page);
+ pointers[stripe_nr] = kmap_local_page(sector->page) +
+ sector->pgoff;
+ unmap_array[stripe_nr] = pointers[stripe_nr];
+ }
- /* All raid6 handling here */
- if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) {
- /* Single failure, rebuild from parity raid5 style */
- if (failb < 0) {
- if (faila == rbio->nr_data) {
- /*
- * Just the P stripe has failed, without
- * a bad data or Q stripe.
- * TODO, we should redo the xor here.
- */
- err = BLK_STS_IOERR;
- goto cleanup;
- }
+ /* All raid6 handling here */
+ if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) {
+ /* Single failure, rebuild from parity raid5 style */
+ if (failb < 0) {
+ if (faila == rbio->nr_data)
/*
- * a single failure in raid6 is rebuilt
- * in the pstripe code below
+ * Just the P stripe has failed, without
+ * a bad data or Q stripe.
+ * We have nothing to do, just skip the
+ * recovery for this stripe.
*/
- goto pstripe;
- }
-
- /* make sure our ps and qs are in order */
- if (faila > failb)
- swap(faila, failb);
-
- /* if the q stripe is failed, do a pstripe reconstruction
- * from the xors.
- * If both the q stripe and the P stripe are failed, we're
- * here due to a crc mismatch and we can't give them the
- * data they want
+ goto cleanup;
+ /*
+ * a single failure in raid6 is rebuilt
+ * in the pstripe code below
*/
- if (rbio->bioc->raid_map[failb] == RAID6_Q_STRIPE) {
- if (rbio->bioc->raid_map[faila] ==
- RAID5_P_STRIPE) {
- err = BLK_STS_IOERR;
- goto cleanup;
- }
+ goto pstripe;
+ }
+
+ /*
+ * If the q stripe is failed, do a pstripe reconstruction from
+ * the xors.
+ * If both the q stripe and the P stripe are failed, we're
+ * here due to a crc mismatch and we can't give them the
+ * data they want.
+ */
+ if (rbio->bioc->raid_map[failb] == RAID6_Q_STRIPE) {
+ if (rbio->bioc->raid_map[faila] ==
+ RAID5_P_STRIPE)
/*
- * otherwise we have one bad data stripe and
- * a good P stripe. raid5!
+ * Only P and Q are corrupted.
+ * We only care about data stripes recovery,
+ * can skip this vertical stripe.
*/
- goto pstripe;
- }
+ goto cleanup;
+ /*
+ * Otherwise we have one bad data stripe and
+ * a good P stripe. raid5!
+ */
+ goto pstripe;
+ }
- if (rbio->bioc->raid_map[failb] == RAID5_P_STRIPE) {
- raid6_datap_recov(rbio->real_stripes,
- sectorsize, faila, pointers);
- } else {
- raid6_2data_recov(rbio->real_stripes,
- sectorsize, faila, failb,
- pointers);
- }
+ if (rbio->bioc->raid_map[failb] == RAID5_P_STRIPE) {
+ raid6_datap_recov(rbio->real_stripes, sectorsize,
+ faila, pointers);
} else {
- void *p;
+ raid6_2data_recov(rbio->real_stripes, sectorsize,
+ faila, failb, pointers);
+ }
+ } else {
+ void *p;
- /* rebuild from P stripe here (raid5 or raid6) */
- BUG_ON(failb != -1);
+ /* Rebuild from P stripe here (raid5 or raid6). */
+ ASSERT(failb == -1);
pstripe:
- /* Copy parity block into failed block to start with */
- memcpy(pointers[faila], pointers[rbio->nr_data], sectorsize);
+ /* Copy parity block into failed block to start with */
+ memcpy(pointers[faila], pointers[rbio->nr_data], sectorsize);
- /* rearrange the pointer array */
- p = pointers[faila];
- for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
- pointers[stripe] = pointers[stripe + 1];
- pointers[rbio->nr_data - 1] = p;
+ /* Rearrange the pointer array */
+ p = pointers[faila];
+ for (stripe_nr = faila; stripe_nr < rbio->nr_data - 1;
+ stripe_nr++)
+ pointers[stripe_nr] = pointers[stripe_nr + 1];
+ pointers[rbio->nr_data - 1] = p;
- /* xor in the rest */
- run_xor(pointers, rbio->nr_data - 1, sectorsize);
- }
- /* if we're doing this rebuild as part of an rmw, go through
- * and set all of our private rbio pages in the
- * failed stripes as uptodate. This way finish_rmw will
- * know they can be trusted. If this was a read reconstruction,
- * other endio functions will fiddle the uptodate bits
- */
- if (rbio->operation == BTRFS_RBIO_WRITE) {
- for (i = 0; i < rbio->stripe_nsectors; i++) {
- if (faila != -1) {
- sector = rbio_stripe_sector(rbio, faila, i);
- sector->uptodate = 1;
- }
- if (failb != -1) {
- sector = rbio_stripe_sector(rbio, failb, i);
- sector->uptodate = 1;
- }
- }
- }
- for (stripe = rbio->real_stripes - 1; stripe >= 0; stripe--)
- kunmap_local(unmap_array[stripe]);
+ /* Xor in the rest */
+ run_xor(pointers, rbio->nr_data - 1, sectorsize);
+
+ }
+
+ /*
+ * No matter if this is a RMW or recovery, we should have all
+ * failed sectors repaired in the vertical stripe, thus they are now
+ * uptodate.
+ * Especially if we determine to cache the rbio, we need to
+ * have at least all data sectors uptodate.
+ *
+ * If possible, also check if the repaired sector matches its data
+ * checksum.
+ */
+ if (faila >= 0) {
+ ret = verify_one_sector(rbio, faila, sector_nr);
+ if (ret < 0)
+ goto cleanup;
+
+ sector = rbio_stripe_sector(rbio, faila, sector_nr);
+ sector->uptodate = 1;
+ }
+ if (failb >= 0) {
+ ret = verify_one_sector(rbio, failb, sector_nr);
+ if (ret < 0)
+ goto cleanup;
+
+ sector = rbio_stripe_sector(rbio, failb, sector_nr);
+ sector->uptodate = 1;
}
- err = BLK_STS_OK;
cleanup:
- kfree(unmap_array);
-cleanup_pointers:
- kfree(pointers);
+ for (stripe_nr = rbio->real_stripes - 1; stripe_nr >= 0; stripe_nr--)
+ kunmap_local(unmap_array[stripe_nr]);
+ return ret;
+}
+
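To summarize the case analysis in recover_vertical() above (a reading aid, not part of the patch):

	/*
	 * RAID6 vertical stripe, after faila/failb have been filled in
	 * increasing stripe order:
	 *
	 *   one data stripe bad      -> XOR rebuild from P (the pstripe: path)
	 *   P bad, nothing else      -> nothing to rebuild, skip
	 *   one data stripe + Q bad  -> XOR rebuild from P
	 *   P + Q bad                -> nothing to rebuild, skip
	 *   one data stripe + P bad  -> raid6_datap_recov()
	 *   two data stripes bad     -> raid6_2data_recov()
	 *
	 * Anything above max_errors was already rejected, and on RAID5 only
	 * the single-failure XOR path is reachable.
	 */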
+static int recover_sectors(struct btrfs_raid_bio *rbio)
+{
+ void **pointers = NULL;
+ void **unmap_array = NULL;
+ int sectornr;
+ int ret = 0;
-cleanup_io:
/*
- * Similar to READ_REBUILD, REBUILD_MISSING at this point also has a
- * valid rbio which is consistent with ondisk content, thus such a
- * valid rbio can be cached to avoid further disk reads.
+ * @pointers array stores the pointer for each sector.
+ *
+ * @unmap_array stores copy of pointers that does not get reordered
+ * during reconstruction so that kunmap_local works.
*/
+ pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
+ unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
+ if (!pointers || !unmap_array) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
- /*
- * - In case of two failures, where rbio->failb != -1:
- *
- * Do not cache this rbio since the above read reconstruction
- * (raid6_datap_recov() or raid6_2data_recov()) may have
- * changed some content of stripes which are not identical to
- * on-disk content any more, otherwise, a later write/recover
- * may steal stripe_pages from this rbio and end up with
- * corruptions or rebuild failures.
- *
- * - In case of single failure, where rbio->failb == -1:
- *
- * Cache this rbio iff the above read reconstruction is
- * executed without problems.
- */
- if (err == BLK_STS_OK && rbio->failb < 0)
- cache_rbio_pages(rbio);
- else
- clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
+ spin_lock_irq(&rbio->bio_list_lock);
+ set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
+ spin_unlock_irq(&rbio->bio_list_lock);
+ }
- rbio_orig_end_io(rbio, err);
- } else if (err == BLK_STS_OK) {
- rbio->faila = -1;
- rbio->failb = -1;
+ index_rbio_pages(rbio);
- if (rbio->operation == BTRFS_RBIO_WRITE)
- finish_rmw(rbio);
- else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB)
- finish_parity_scrub(rbio, 0);
- else
- BUG();
- } else {
- rbio_orig_end_io(rbio, err);
+ for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
+ ret = recover_vertical(rbio, sectornr, pointers, unmap_array);
+ if (ret < 0)
+ break;
}
-}
-
-/*
- * This is called only for stripes we've read from disk to reconstruct the
- * parity.
- */
-static void raid_recover_end_io_work(struct work_struct *work)
-{
- struct btrfs_raid_bio *rbio =
- container_of(work, struct btrfs_raid_bio, end_io_work);
- if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
- rbio_orig_end_io(rbio, BLK_STS_IOERR);
- else
- __raid_recover_end_io(rbio);
+out:
+ kfree(pointers);
+ kfree(unmap_array);
+ return ret;
}
-/*
- * reads everything we need off the disk to reconstruct
- * the parity. endio handlers trigger final reconstruction
- * when the IO is done.
- *
- * This is used both for reads from the higher layers and for
- * parity construction required to finish a rmw cycle.
- */
-static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
+static int recover_assemble_read_bios(struct btrfs_raid_bio *rbio,
+ struct bio_list *bio_list)
{
- int bios_to_read = 0;
- struct bio_list bio_list;
- int ret;
- int total_sector_nr;
struct bio *bio;
+ int total_sector_nr;
+ int ret = 0;
- bio_list_init(&bio_list);
-
- ret = alloc_rbio_pages(rbio);
- if (ret)
- goto cleanup;
-
- atomic_set(&rbio->error, 0);
-
+ ASSERT(bio_list_size(bio_list) == 0);
/*
* Read everything that hasn't failed. However this time we will
* not trust any cached sector.
@@ -2127,64 +1963,139 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
int sectornr = total_sector_nr % rbio->stripe_nsectors;
struct sector_ptr *sector;
- if (rbio->faila == stripe || rbio->failb == stripe) {
- atomic_inc(&rbio->error);
- /* Skip the current stripe. */
- ASSERT(sectornr == 0);
- total_sector_nr += rbio->stripe_nsectors - 1;
+ /*
+ * Skip the range which has error. It can be a range which is
+ * marked error (for csum mismatch), or it can be a missing
+ * device.
+ */
+ if (!rbio->bioc->stripes[stripe].dev->bdev ||
+ test_bit(total_sector_nr, rbio->error_bitmap)) {
+ /*
+ * Also set the error bit for missing device, which
+ * may not yet have its error bit set.
+ */
+ set_bit(total_sector_nr, rbio->error_bitmap);
continue;
}
+
sector = rbio_stripe_sector(rbio, stripe, sectornr);
- ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
+ ret = rbio_add_io_sector(rbio, bio_list, sector, stripe,
sectornr, REQ_OP_READ);
if (ret < 0)
- goto cleanup;
+ goto error;
}
+ return 0;
+error:
+ while ((bio = bio_list_pop(bio_list)))
+ bio_put(bio);
- bios_to_read = bio_list_size(&bio_list);
- if (!bios_to_read) {
- /*
- * we might have no bios to read just because the pages
- * were up to date, or we might have no bios to read because
- * the devices were gone.
- */
- if (atomic_read(&rbio->error) <= rbio->bioc->max_errors) {
- __raid_recover_end_io(rbio);
- return 0;
- } else {
- goto cleanup;
- }
- }
+ return -EIO;
+}
+
+static int recover_rbio(struct btrfs_raid_bio *rbio)
+{
+ struct bio_list bio_list;
+ struct bio *bio;
+ int ret;
/*
- * The bioc may be freed once we submit the last bio. Make sure not to
- * touch it after that.
+ * Either we're doing recovery for a read failure or a degraded write;
+ * the caller should have set the error bitmap correctly.
*/
- atomic_set(&rbio->stripes_pending, bios_to_read);
- INIT_WORK(&rbio->end_io_work, raid_recover_end_io_work);
- while ((bio = bio_list_pop(&bio_list))) {
- bio->bi_end_io = raid56_bio_end_io;
+ ASSERT(bitmap_weight(rbio->error_bitmap, rbio->nr_sectors));
+ bio_list_init(&bio_list);
- if (trace_raid56_scrub_read_recover_enabled()) {
- struct raid56_bio_trace_info trace_info = { 0 };
+ /* For recovery, we need to read all sectors including P/Q. */
+ ret = alloc_rbio_pages(rbio);
+ if (ret < 0)
+ goto out;
- bio_get_trace_info(rbio, bio, &trace_info);
- trace_raid56_scrub_read_recover(rbio, bio, &trace_info);
- }
- submit_bio(bio);
- }
+ index_rbio_pages(rbio);
- return 0;
+ ret = recover_assemble_read_bios(rbio, &bio_list);
+ if (ret < 0)
+ goto out;
-cleanup:
- if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
- rbio->operation == BTRFS_RBIO_REBUILD_MISSING)
- rbio_orig_end_io(rbio, BLK_STS_IOERR);
+ submit_read_bios(rbio, &bio_list);
+ wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
+
+ ret = recover_sectors(rbio);
+out:
while ((bio = bio_list_pop(&bio_list)))
bio_put(bio);
- return -EIO;
+ return ret;
+}
+
+static void recover_rbio_work(struct work_struct *work)
+{
+ struct btrfs_raid_bio *rbio;
+ int ret;
+
+ rbio = container_of(work, struct btrfs_raid_bio, work);
+
+ ret = lock_stripe_add(rbio);
+ if (ret == 0) {
+ ret = recover_rbio(rbio);
+ rbio_orig_end_io(rbio, errno_to_blk_status(ret));
+ }
+}
+
+static void recover_rbio_work_locked(struct work_struct *work)
+{
+ struct btrfs_raid_bio *rbio;
+ int ret;
+
+ rbio = container_of(work, struct btrfs_raid_bio, work);
+
+ ret = recover_rbio(rbio);
+ rbio_orig_end_io(rbio, errno_to_blk_status(ret));
+}
+
+static void set_rbio_raid6_extra_error(struct btrfs_raid_bio *rbio, int mirror_num)
+{
+ bool found = false;
+ int sector_nr;
+
+ /*
+ * This is for RAID6 extra recovery tries, thus the mirror number should
+ * be larger than 2.
+ * Mirror 1 means read from data stripes. Mirror 2 means rebuild using
+ * RAID5 methods.
+ */
+ ASSERT(mirror_num > 2);
+ for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
+ int found_errors;
+ int faila;
+ int failb;
+
+ found_errors = get_rbio_veritical_errors(rbio, sector_nr,
+ &faila, &failb);
+ /* This vertical stripe doesn't have errors. */
+ if (!found_errors)
+ continue;
+
+ /*
+ * If we found errors, there should be only one error marked
+ * by previous set_rbio_range_error().
+ */
+ ASSERT(found_errors == 1);
+ found = true;
+
+ /* Now select another stripe to mark as error. */
+ failb = rbio->real_stripes - (mirror_num - 1);
+ if (failb <= faila)
+ failb--;
+
+ /* Set the extra bit in error bitmap. */
+ if (failb >= 0)
+ set_bit(failb * rbio->stripe_nsectors + sector_nr,
+ rbio->error_bitmap);
+ }
+
+ /* We should have found at least one vertical stripe with errors. */
+ ASSERT(found);
}
/*
@@ -2202,68 +2113,284 @@ void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
rbio = alloc_rbio(fs_info, bioc);
if (IS_ERR(rbio)) {
bio->bi_status = errno_to_blk_status(PTR_ERR(rbio));
- goto out_end_bio;
+ bio_endio(bio);
+ return;
}
rbio->operation = BTRFS_RBIO_READ_REBUILD;
rbio_add_bio(rbio, bio);
- rbio->faila = find_logical_bio_stripe(rbio, bio);
- if (rbio->faila == -1) {
- btrfs_warn(fs_info,
-"%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bioc has map_type %llu)",
- __func__, bio->bi_iter.bi_sector << 9,
- (u64)bio->bi_iter.bi_size, bioc->map_type);
- __free_raid_bio(rbio);
- bio->bi_status = BLK_STS_IOERR;
- goto out_end_bio;
- }
+ set_rbio_range_error(rbio, bio);
/*
* Loop retry:
* for 'mirror == 2', reconstruct from all other stripes.
* for 'mirror_num > 2', select a stripe to fail on every retry.
*/
- if (mirror_num > 2) {
+ if (mirror_num > 2)
+ set_rbio_raid6_extra_error(rbio, mirror_num);
+
+ start_async_work(rbio, recover_rbio_work);
+}
+
+static void fill_data_csums(struct btrfs_raid_bio *rbio)
+{
+ struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
+ struct btrfs_root *csum_root = btrfs_csum_root(fs_info,
+ rbio->bioc->raid_map[0]);
+ const u64 start = rbio->bioc->raid_map[0];
+ const u32 len = (rbio->nr_data * rbio->stripe_nsectors) <<
+ fs_info->sectorsize_bits;
+ int ret;
+
+ /* The rbio should not have its csum buffer initialized. */
+ ASSERT(!rbio->csum_buf && !rbio->csum_bitmap);
+
+ /*
+ * Skip the csum search if:
+ *
+ * - The rbio doesn't belong to data block groups
+ * Then we are doing IO for tree blocks, no need to search csums.
+ *
+ * - The rbio belongs to mixed block groups
+ * This is to avoid deadlock, as we're already holding the full
+ * stripe lock, if we trigger a metadata read, and it needs to do
+ * raid56 recovery, we will deadlock.
+ */
+ if (!(rbio->bioc->map_type & BTRFS_BLOCK_GROUP_DATA) ||
+ rbio->bioc->map_type & BTRFS_BLOCK_GROUP_METADATA)
+ return;
+
+ rbio->csum_buf = kzalloc(rbio->nr_data * rbio->stripe_nsectors *
+ fs_info->csum_size, GFP_NOFS);
+ rbio->csum_bitmap = bitmap_zalloc(rbio->nr_data * rbio->stripe_nsectors,
+ GFP_NOFS);
+ if (!rbio->csum_buf || !rbio->csum_bitmap) {
+ ret = -ENOMEM;
+ goto error;
+ }
+
+ ret = btrfs_lookup_csums_bitmap(csum_root, start, start + len - 1,
+ rbio->csum_buf, rbio->csum_bitmap);
+ if (ret < 0)
+ goto error;
+ if (bitmap_empty(rbio->csum_bitmap, len >> fs_info->sectorsize_bits))
+ goto no_csum;
+ return;
+
+error:
+ /*
+ * We failed to allocate memory or to look up the csums, but it's not
+ * fatal and we can still continue. However, it's better to warn users
+ * that RMW is no longer safe for this particular sub-stripe write.
+ */
+ btrfs_warn_rl(fs_info,
+"sub-stripe write for full stripe %llu is not safe, failed to get csum: %d",
+ rbio->bioc->raid_map[0], ret);
+no_csum:
+ kfree(rbio->csum_buf);
+ bitmap_free(rbio->csum_bitmap);
+ rbio->csum_buf = NULL;
+ rbio->csum_bitmap = NULL;
+}
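For readers unfamiliar with the csum_buf/csum_bitmap layout filled in above, the user-space sketch below (hypothetical names, assuming a 4-byte crc32c csum) shows how a data sector number would index into the two structures during verification: the bitmap records whether a csum exists for the sector, and the buffer holds the expected csum at sector_nr * csum_size.

#include <stdio.h>
#include <string.h>

#define CSUM_SIZE 4		/* crc32c; btrfs also supports larger csums */
#define NR_DATA_SECTORS 8	/* nr_data * stripe_nsectors, for illustration */

static const unsigned char *expected_csum(const unsigned char *csum_buf,
					  const unsigned char *csum_bitmap,
					  int sector_nr)
{
	/* No csum recorded, e.g. a hole or a nodatasum file: skip the check. */
	if (!(csum_bitmap[sector_nr / 8] & (1 << (sector_nr % 8))))
		return NULL;
	return csum_buf + sector_nr * CSUM_SIZE;
}

int main(void)
{
	unsigned char csum_buf[NR_DATA_SECTORS * CSUM_SIZE] = { 0 };
	unsigned char csum_bitmap[NR_DATA_SECTORS / 8 + 1] = { 0 };

	/* Pretend sector 2 has an expected csum of de:ad:be:ef. */
	memcpy(csum_buf + 2 * CSUM_SIZE, "\xde\xad\xbe\xef", CSUM_SIZE);
	csum_bitmap[0] |= 1 << 2;

	for (int i = 0; i < 4; i++) {
		const unsigned char *csum = expected_csum(csum_buf, csum_bitmap, i);

		printf("sector %d: %s\n", i, csum ? "verify against csum" : "no csum");
	}
	return 0;
}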
+
+static int rmw_read_wait_recover(struct btrfs_raid_bio *rbio)
+{
+ struct bio_list bio_list;
+ struct bio *bio;
+ int ret;
+
+ bio_list_init(&bio_list);
+
+ /*
+ * Fill the data csums we need for data verification. We need to fill
+ * the csum_bitmap/csum_buf first, as our endio function will try to
+ * verify the data sectors.
+ */
+ fill_data_csums(rbio);
+
+ ret = rmw_assemble_read_bios(rbio, &bio_list);
+ if (ret < 0)
+ goto out;
+
+ submit_read_bios(rbio, &bio_list);
+ wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
+
+ /*
+ * We may or may not have any corrupted sectors (including missing dev
+ * and csum mismatch); just let recover_sectors() handle them all.
+ */
+ ret = recover_sectors(rbio);
+ return ret;
+out:
+ while ((bio = bio_list_pop(&bio_list)))
+ bio_put(bio);
+
+ return ret;
+}
+
+static void raid_wait_write_end_io(struct bio *bio)
+{
+ struct btrfs_raid_bio *rbio = bio->bi_private;
+ blk_status_t err = bio->bi_status;
+
+ if (err)
+ rbio_update_error_bitmap(rbio, bio);
+ bio_put(bio);
+ if (atomic_dec_and_test(&rbio->stripes_pending))
+ wake_up(&rbio->io_wait);
+}
+
+static void submit_write_bios(struct btrfs_raid_bio *rbio,
+ struct bio_list *bio_list)
+{
+ struct bio *bio;
+
+ atomic_set(&rbio->stripes_pending, bio_list_size(bio_list));
+ while ((bio = bio_list_pop(bio_list))) {
+ bio->bi_end_io = raid_wait_write_end_io;
+
+ if (trace_raid56_write_stripe_enabled()) {
+ struct raid56_bio_trace_info trace_info = { 0 };
+
+ bio_get_trace_info(rbio, bio, &trace_info);
+ trace_raid56_write_stripe(rbio, bio, &trace_info);
+ }
+ submit_bio(bio);
+ }
+}
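The submit-then-wait pattern used here (stripes_pending set to the number of bios, each end_io decrementing it, and the last completion waking the waiter) can be modelled outside the kernel. The sketch below uses C11 atomics and pthreads as a rough analogy only; it is not the kernel mechanism and all names are hypothetical.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int pending;
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t done = PTHREAD_COND_INITIALIZER;

static void *fake_write_end_io(void *arg)
{
	/* Per-bio completion work would go here. */
	if (atomic_fetch_sub(&pending, 1) == 1) {	/* last bio finished */
		pthread_mutex_lock(&lock);
		pthread_cond_signal(&done);
		pthread_mutex_unlock(&lock);
	}
	return NULL;
}

int main(void)
{
	pthread_t threads[4];

	atomic_store(&pending, 4);		/* bio_list_size() equivalent */
	for (int i = 0; i < 4; i++)
		pthread_create(&threads[i], NULL, fake_write_end_io, NULL);

	pthread_mutex_lock(&lock);
	while (atomic_load(&pending) != 0)	/* wait_event() equivalent */
		pthread_cond_wait(&done, &lock);
	pthread_mutex_unlock(&lock);

	for (int i = 0; i < 4; i++)
		pthread_join(threads[i], NULL);
	printf("all writes completed\n");
	return 0;
}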
+
+/*
+ * Determine if we need to read any sector from the disk.
+ * Should only be used in the RMW path, to skip a cached rbio.
+ */
+static bool need_read_stripe_sectors(struct btrfs_raid_bio *rbio)
+{
+ int i;
+
+ for (i = 0; i < rbio->nr_data * rbio->stripe_nsectors; i++) {
+ struct sector_ptr *sector = &rbio->stripe_sectors[i];
+
/*
- * 'mirror == 3' is to fail the p stripe and
- * reconstruct from the q stripe. 'mirror > 3' is to
- * fail a data stripe and reconstruct from p+q stripe.
+ * We have a sector which doesn't have a page nor is uptodate,
+ * thus this rbio can not be a cached one, as a cached one must
+ * have all its data sectors present and uptodate.
*/
- rbio->failb = rbio->real_stripes - (mirror_num - 1);
- ASSERT(rbio->failb > 0);
- if (rbio->failb <= rbio->faila)
- rbio->failb--;
+ if (!sector->page || !sector->uptodate)
+ return true;
}
+ return false;
+}
- if (lock_stripe_add(rbio))
- return;
+static int rmw_rbio(struct btrfs_raid_bio *rbio)
+{
+ struct bio_list bio_list;
+ int sectornr;
+ int ret = 0;
/*
- * This adds our rbio to the list of rbios that will be handled after
- * the current lock owner is done.
+ * Allocate the pages for parity first, as P/Q pages will always be
+ * needed for both full-stripe and sub-stripe writes.
*/
- __raid56_parity_recover(rbio);
- return;
+ ret = alloc_rbio_parity_pages(rbio);
+ if (ret < 0)
+ return ret;
-out_end_bio:
- bio_endio(bio);
+ /*
+ * Either this is a full stripe write, or we have every data sector
+ * already cached, so we can go to the write path immediately.
+ */
+ if (rbio_is_full(rbio) || !need_read_stripe_sectors(rbio))
+ goto write;
+
+ /*
+ * Now we're doing a sub-stripe write, and we also need all data
+ * stripes to do the full RMW.
+ */
+ ret = alloc_rbio_data_pages(rbio);
+ if (ret < 0)
+ return ret;
+
+ index_rbio_pages(rbio);
+
+ ret = rmw_read_wait_recover(rbio);
+ if (ret < 0)
+ return ret;
+
+write:
+ /*
+ * At this stage we're not allowed to add any new bios to the
+ * bio list any more; anyone else that wants to change this stripe
+ * needs to do their own rmw.
+ */
+ spin_lock_irq(&rbio->bio_list_lock);
+ set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
+ spin_unlock_irq(&rbio->bio_list_lock);
+
+ bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);
+
+ index_rbio_pages(rbio);
+
+ /*
+ * We don't cache full rbios because we're assuming
+ * the higher layers are unlikely to use this area of
+ * the disk again soon. If they do use it again,
+ * hopefully they will send another full bio.
+ */
+ if (!rbio_is_full(rbio))
+ cache_rbio_pages(rbio);
+ else
+ clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
+
+ for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++)
+ generate_pq_vertical(rbio, sectornr);
+
+ bio_list_init(&bio_list);
+ ret = rmw_assemble_write_bios(rbio, &bio_list);
+ if (ret < 0)
+ return ret;
+
+ /* We should have at least one bio assembled. */
+ ASSERT(bio_list_size(&bio_list));
+ submit_write_bios(rbio, &bio_list);
+ wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
+
+ /* We may have more errors than our tolerance during the writes. */
+ for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
+ int found_errors;
+
+ found_errors = get_rbio_veritical_errors(rbio, sectornr, NULL, NULL);
+ if (found_errors > rbio->bioc->max_errors) {
+ ret = -EIO;
+ break;
+ }
+ }
+ return ret;
}
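generate_pq_vertical() is not shown in this hunk; as background, the P half of the parity is a plain XOR across the data sectors of one vertical stripe, while Q requires RAID6 Galois-field math from the kernel raid6 library. Below is a tiny user-space sketch of the P computation only (hypothetical names, toy sector size), not the kernel implementation.

#include <stdio.h>
#include <string.h>

#define SECTORSIZE 16	/* tiny sector for illustration; btrfs uses 4K+ */
#define NR_DATA 3

/* P is the XOR of all data sectors at the same vertical offset. */
static void generate_p_vertical(unsigned char data[NR_DATA][SECTORSIZE],
				unsigned char p[SECTORSIZE])
{
	memset(p, 0, SECTORSIZE);
	for (int stripe = 0; stripe < NR_DATA; stripe++)
		for (int i = 0; i < SECTORSIZE; i++)
			p[i] ^= data[stripe][i];
}

int main(void)
{
	unsigned char data[NR_DATA][SECTORSIZE] = {
		"data stripe 0..", "data stripe 1..", "data stripe 2..",
	};
	unsigned char p[SECTORSIZE];

	generate_p_vertical(data, p);

	/* Losing any single data sector can be undone by XOR-ing P with the
	 * surviving sectors, which is what RAID5-style recovery relies on. */
	for (int i = 0; i < SECTORSIZE; i++)
		printf("%02x", p[i]);
	printf("\n");
	return 0;
}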
-static void rmw_work(struct work_struct *work)
+static void rmw_rbio_work(struct work_struct *work)
{
struct btrfs_raid_bio *rbio;
+ int ret;
rbio = container_of(work, struct btrfs_raid_bio, work);
- raid56_rmw_stripe(rbio);
+
+ ret = lock_stripe_add(rbio);
+ if (ret == 0) {
+ ret = rmw_rbio(rbio);
+ rbio_orig_end_io(rbio, errno_to_blk_status(ret));
+ }
}
-static void read_rebuild_work(struct work_struct *work)
+static void rmw_rbio_work_locked(struct work_struct *work)
{
struct btrfs_raid_bio *rbio;
+ int ret;
rbio = container_of(work, struct btrfs_raid_bio, work);
- __raid56_parity_recover(rbio);
+
+ ret = rmw_rbio(rbio);
+ rbio_orig_end_io(rbio, errno_to_blk_status(ret));
}
/*
@@ -2358,8 +2485,7 @@ static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
return 0;
}
-static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
- int need_check)
+static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check)
{
struct btrfs_io_context *bioc = rbio->bioc;
const u32 sectorsize = bioc->fs_info->sectorsize;
@@ -2402,7 +2528,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
p_sector.page = alloc_page(GFP_NOFS);
if (!p_sector.page)
- goto cleanup;
+ return -ENOMEM;
p_sector.pgoff = 0;
p_sector.uptodate = 1;
@@ -2412,14 +2538,14 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
if (!q_sector.page) {
__free_page(p_sector.page);
p_sector.page = NULL;
- goto cleanup;
+ return -ENOMEM;
}
q_sector.pgoff = 0;
q_sector.uptodate = 1;
pointers[rbio->real_stripes - 1] = kmap_local_page(q_sector.page);
}
- atomic_set(&rbio->error, 0);
+ bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);
/* Map the parity stripe just once */
pointers[nr_data] = kmap_local_page(p_sector.page);
@@ -2499,33 +2625,13 @@ writeback:
}
submit_write:
- nr_data = bio_list_size(&bio_list);
- if (!nr_data) {
- /* Every parity is right */
- rbio_orig_end_io(rbio, BLK_STS_OK);
- return;
- }
-
- atomic_set(&rbio->stripes_pending, nr_data);
-
- while ((bio = bio_list_pop(&bio_list))) {
- bio->bi_end_io = raid_write_end_io;
-
- if (trace_raid56_scrub_write_stripe_enabled()) {
- struct raid56_bio_trace_info trace_info = { 0 };
-
- bio_get_trace_info(rbio, bio, &trace_info);
- trace_raid56_scrub_write_stripe(rbio, bio, &trace_info);
- }
- submit_bio(bio);
- }
- return;
+ submit_write_bios(rbio, &bio_list);
+ return 0;
cleanup:
- rbio_orig_end_io(rbio, BLK_STS_IOERR);
-
while ((bio = bio_list_pop(&bio_list)))
bio_put(bio);
+ return ret;
}
static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
@@ -2535,102 +2641,99 @@ static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
return 0;
}
-/*
- * While we're doing the parity check and repair, we could have errors
- * in reading pages off the disk. This checks for errors and if we're
- * not able to read the page it'll trigger parity reconstruction. The
- * parity scrub will be finished after we've reconstructed the failed
- * stripes
- */
-static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
+static int recover_scrub_rbio(struct btrfs_raid_bio *rbio)
{
- if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
- goto cleanup;
+ void **pointers = NULL;
+ void **unmap_array = NULL;
+ int sector_nr;
+ int ret = 0;
+
+ /*
+ * @pointers array stores the pointer for each sector.
+ *
+ * @unmap_array stores a copy of the pointers that does not get reordered
+ * during reconstruction so that kunmap_local() works.
+ */
+ pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
+ unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
+ if (!pointers || !unmap_array) {
+ ret = -ENOMEM;
+ goto out;
+ }
- if (rbio->faila >= 0 || rbio->failb >= 0) {
+ for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
int dfail = 0, failp = -1;
+ int faila;
+ int failb;
+ int found_errors;
+
+ found_errors = get_rbio_veritical_errors(rbio, sector_nr,
+ &faila, &failb);
+ if (found_errors > rbio->bioc->max_errors) {
+ ret = -EIO;
+ goto out;
+ }
+ if (found_errors == 0)
+ continue;
- if (is_data_stripe(rbio, rbio->faila))
- dfail++;
- else if (is_parity_stripe(rbio->faila))
- failp = rbio->faila;
+ /* We should have at least one error here. */
+ ASSERT(faila >= 0 || failb >= 0);
- if (is_data_stripe(rbio, rbio->failb))
+ if (is_data_stripe(rbio, faila))
dfail++;
- else if (is_parity_stripe(rbio->failb))
- failp = rbio->failb;
+ else if (is_parity_stripe(faila))
+ failp = faila;
+ if (is_data_stripe(rbio, failb))
+ dfail++;
+ else if (is_parity_stripe(failb))
+ failp = failb;
/*
- * Because we can not use a scrubbing parity to repair
- * the data, so the capability of the repair is declined.
- * (In the case of RAID5, we can not repair anything)
+ * Because we can not use a scrubbing parity to repair the
+ * data, the repair capability is reduced. (In the case of
+ * RAID5, we can not repair anything.)
*/
- if (dfail > rbio->bioc->max_errors - 1)
- goto cleanup;
-
+ if (dfail > rbio->bioc->max_errors - 1) {
+ ret = -EIO;
+ goto out;
+ }
/*
- * If all data is good, only parity is correctly, just
- * repair the parity.
+ * If all data is good and only the parity is bad, just repair
+ * the parity, no need to recover data stripes.
*/
- if (dfail == 0) {
- finish_parity_scrub(rbio, 0);
- return;
- }
+ if (dfail == 0)
+ continue;
/*
* Here means we got one corrupted data stripe and one
- * corrupted parity on RAID6, if the corrupted parity
- * is scrubbing parity, luckily, use the other one to repair
- * the data, or we can not repair the data stripe.
+ * corrupted parity on RAID6. If the corrupted parity is the
+ * scrubbing parity, luckily we can use the other one to repair
+ * the data; otherwise we can not repair the data stripe.
*/
- if (failp != rbio->scrubp)
- goto cleanup;
+ if (failp != rbio->scrubp) {
+ ret = -EIO;
+ goto out;
+ }
- __raid_recover_end_io(rbio);
- } else {
- finish_parity_scrub(rbio, 1);
+ ret = recover_vertical(rbio, sector_nr, pointers, unmap_array);
+ if (ret < 0)
+ goto out;
}
- return;
-
-cleanup:
- rbio_orig_end_io(rbio, BLK_STS_IOERR);
-}
-
-/*
- * end io for the read phase of the rmw cycle. All the bios here are physical
- * stripe bios we've read from the disk so we can recalculate the parity of the
- * stripe.
- *
- * This will usually kick off finish_rmw once all the bios are read in, but it
- * may trigger parity reconstruction if we had any errors along the way
- */
-static void raid56_parity_scrub_end_io_work(struct work_struct *work)
-{
- struct btrfs_raid_bio *rbio =
- container_of(work, struct btrfs_raid_bio, end_io_work);
-
- /*
- * This will normally call finish_rmw to start our write, but if there
- * are any failed stripes we'll reconstruct from parity first
- */
- validate_rbio_for_parity_scrub(rbio);
+out:
+ kfree(pointers);
+ kfree(unmap_array);
+ return ret;
}
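The repairability rules in the loop above are easy to lose in the diff. The user-space sketch below (hypothetical names, mirroring only the checks visible in this hunk) condenses them into a single decision function.

#include <stdio.h>

enum scrub_action { SCRUB_REPAIR_PARITY, SCRUB_RECOVER_DATA, SCRUB_FAIL };

/*
 * Model of the per-vertical-stripe checks above: max_errors is 1 for RAID5
 * and 2 for RAID6, scrubp is the parity stripe being scrubbed (which cannot
 * be trusted as a repair source), failp is the failed parity stripe or -1,
 * and dfail is the number of failed data stripes.
 */
static enum scrub_action scrub_decide(int found_errors, int max_errors,
				      int dfail, int failp, int scrubp)
{
	if (found_errors > max_errors)
		return SCRUB_FAIL;
	/* One tolerated error is already "spent" on the scrubbing parity. */
	if (dfail > max_errors - 1)
		return SCRUB_FAIL;
	/* All data is good, only the parity needs to be rewritten. */
	if (dfail == 0)
		return SCRUB_REPAIR_PARITY;
	/* A bad data stripe is only recoverable if the bad parity is the
	 * scrubbing parity itself, so the other parity can rebuild it. */
	if (failp != scrubp)
		return SCRUB_FAIL;
	return SCRUB_RECOVER_DATA;
}

int main(void)
{
	/* RAID6: one bad data stripe plus the scrubbing parity -> recover. */
	printf("%d\n", scrub_decide(2, 2, 1, 5, 5));
	/* RAID5: any bad data stripe during scrub is unrepairable. */
	printf("%d\n", scrub_decide(1, 1, 1, -1, 4));
	return 0;
}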
-static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
+static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio,
+ struct bio_list *bio_list)
{
- int bios_to_read = 0;
- struct bio_list bio_list;
- int ret;
- int total_sector_nr;
struct bio *bio;
+ int total_sector_nr;
+ int ret = 0;
- bio_list_init(&bio_list);
-
- ret = alloc_rbio_essential_pages(rbio);
- if (ret)
- goto cleanup;
+ ASSERT(bio_list_size(bio_list) == 0);
- atomic_set(&rbio->error, 0);
/* Build a list of bios to read all the missing parts. */
for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
total_sector_nr++) {
@@ -2659,67 +2762,84 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
if (sector->uptodate)
continue;
- ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
+ ret = rbio_add_io_sector(rbio, bio_list, sector, stripe,
sectornr, REQ_OP_READ);
if (ret)
- goto cleanup;
+ goto error;
}
+ return 0;
+error:
+ while ((bio = bio_list_pop(bio_list)))
+ bio_put(bio);
+ return ret;
+}
- bios_to_read = bio_list_size(&bio_list);
- if (!bios_to_read) {
- /*
- * this can happen if others have merged with
- * us, it means there is nothing left to read.
- * But if there are missing devices it may not be
- * safe to do the full stripe write yet.
- */
- goto finish;
- }
+static int scrub_rbio(struct btrfs_raid_bio *rbio)
+{
+ bool need_check = false;
+ struct bio_list bio_list;
+ int sector_nr;
+ int ret;
+ struct bio *bio;
- /*
- * The bioc may be freed once we submit the last bio. Make sure not to
- * touch it after that.
- */
- atomic_set(&rbio->stripes_pending, bios_to_read);
- INIT_WORK(&rbio->end_io_work, raid56_parity_scrub_end_io_work);
- while ((bio = bio_list_pop(&bio_list))) {
- bio->bi_end_io = raid56_bio_end_io;
+ bio_list_init(&bio_list);
- if (trace_raid56_scrub_read_enabled()) {
- struct raid56_bio_trace_info trace_info = { 0 };
+ ret = alloc_rbio_essential_pages(rbio);
+ if (ret)
+ goto cleanup;
- bio_get_trace_info(rbio, bio, &trace_info);
- trace_raid56_scrub_read(rbio, bio, &trace_info);
+ bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);
+
+ ret = scrub_assemble_read_bios(rbio, &bio_list);
+ if (ret < 0)
+ goto cleanup;
+
+ submit_read_bios(rbio, &bio_list);
+ wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
+
+ /* We may have some failures, recover the failed sectors first. */
+ ret = recover_scrub_rbio(rbio);
+ if (ret < 0)
+ goto cleanup;
+
+ /*
+ * We have every sector properly prepared. Can finish the scrub
+ * and writeback the good content.
+ */
+ ret = finish_parity_scrub(rbio, need_check);
+ wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
+ for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
+ int found_errors;
+
+ found_errors = get_rbio_veritical_errors(rbio, sector_nr, NULL, NULL);
+ if (found_errors > rbio->bioc->max_errors) {
+ ret = -EIO;
+ break;
}
- submit_bio(bio);
}
- /* the actual write will happen once the reads are done */
- return;
+ return ret;
cleanup:
- rbio_orig_end_io(rbio, BLK_STS_IOERR);
-
while ((bio = bio_list_pop(&bio_list)))
bio_put(bio);
- return;
-
-finish:
- validate_rbio_for_parity_scrub(rbio);
+ return ret;
}
-static void scrub_parity_work(struct work_struct *work)
+static void scrub_rbio_work_locked(struct work_struct *work)
{
struct btrfs_raid_bio *rbio;
+ int ret;
rbio = container_of(work, struct btrfs_raid_bio, work);
- raid56_parity_scrub_stripe(rbio);
+ ret = scrub_rbio(rbio);
+ rbio_orig_end_io(rbio, errno_to_blk_status(ret));
}
void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
{
if (!lock_stripe_add(rbio))
- start_async_work(rbio, scrub_parity_work);
+ start_async_work(rbio, scrub_rbio_work_locked);
}
/* The following code is used for dev replace of a missing RAID 5/6 device. */
@@ -2742,20 +2862,12 @@ raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc)
*/
ASSERT(!bio->bi_iter.bi_size);
- rbio->faila = find_logical_bio_stripe(rbio, bio);
- if (rbio->faila == -1) {
- btrfs_warn_rl(fs_info,
- "can not determine the failed stripe number for full stripe %llu",
- bioc->raid_map[0]);
- __free_raid_bio(rbio);
- return NULL;
- }
+ set_rbio_range_error(rbio, bio);
return rbio;
}
void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio)
{
- if (!lock_stripe_add(rbio))
- start_async_work(rbio, read_rebuild_work);
+ start_async_work(rbio, recover_rbio_work);
}