diff options
Diffstat (limited to 'fs/btrfs/extent_io.c')
-rw-r--r-- | fs/btrfs/extent_io.c | 1156 |
1 files changed, 803 insertions, 353 deletions
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index fcf77e1ded40..8d904dd7ea9f 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -19,6 +19,7 @@ #include "btrfs_inode.h" #include "volumes.h" #include "check-integrity.h" +#include "locking.h" static struct kmem_cache *extent_state_cache; static struct kmem_cache *extent_buffer_cache; @@ -53,6 +54,13 @@ struct extent_page_data { unsigned int sync_io:1; }; +static noinline void flush_write_bio(void *data); +static inline struct btrfs_fs_info * +tree_fs_info(struct extent_io_tree *tree) +{ + return btrfs_sb(tree->mapping->host->i_sb); +} + int __init extent_io_init(void) { extent_state_cache = kmem_cache_create("extent_state", @@ -136,6 +144,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask) #endif atomic_set(&state->refs, 1); init_waitqueue_head(&state->wq); + trace_alloc_extent_state(state, mask, _RET_IP_); return state; } @@ -153,6 +162,7 @@ void free_extent_state(struct extent_state *state) list_del(&state->leak_list); spin_unlock_irqrestore(&leak_lock, flags); #endif + trace_free_extent_state(state, _RET_IP_); kmem_cache_free(extent_state_cache, state); } } @@ -439,6 +449,13 @@ alloc_extent_state_atomic(struct extent_state *prealloc) return prealloc; } +void extent_io_tree_panic(struct extent_io_tree *tree, int err) +{ + btrfs_panic(tree_fs_info(tree), err, "Locking error: " + "Extent tree was modified by another " + "thread while locked."); +} + /* * clear some bits on a range in the tree. This may require splitting * or inserting elements in the tree, so the gfp mask is used to @@ -449,8 +466,7 @@ alloc_extent_state_atomic(struct extent_state *prealloc) * * the range [start, end] is inclusive. * - * This takes the tree lock, and returns < 0 on error, > 0 if any of the - * bits were already set, or zero if none of the bits were already set. + * This takes the tree lock, and returns 0 on success and < 0 on error. */ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits, int wake, int delete, @@ -464,7 +480,6 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, struct rb_node *node; u64 last_end; int err; - int set = 0; int clear = 0; if (delete) @@ -513,6 +528,15 @@ hit_next: WARN_ON(state->end < start); last_end = state->end; + if (state->end < end && !need_resched()) + next_node = rb_next(&state->rb_node); + else + next_node = NULL; + + /* the state doesn't have the wanted bits, go ahead */ + if (!(state->state & bits)) + goto next; + /* * | ---- desired range ---- | * | state | or @@ -533,12 +557,14 @@ hit_next: prealloc = alloc_extent_state_atomic(prealloc); BUG_ON(!prealloc); err = split_state(tree, state, prealloc, start); - BUG_ON(err == -EEXIST); + if (err) + extent_io_tree_panic(tree, err); + prealloc = NULL; if (err) goto out; if (state->end <= end) { - set |= clear_state_bit(tree, state, &bits, wake); + clear_state_bit(tree, state, &bits, wake); if (last_end == (u64)-1) goto out; start = last_end + 1; @@ -555,30 +581,27 @@ hit_next: prealloc = alloc_extent_state_atomic(prealloc); BUG_ON(!prealloc); err = split_state(tree, state, prealloc, end + 1); - BUG_ON(err == -EEXIST); + if (err) + extent_io_tree_panic(tree, err); + if (wake) wake_up(&state->wq); - set |= clear_state_bit(tree, prealloc, &bits, wake); + clear_state_bit(tree, prealloc, &bits, wake); prealloc = NULL; goto out; } - if (state->end < end && prealloc && !need_resched()) - next_node = rb_next(&state->rb_node); - else - next_node = NULL; - - set |= clear_state_bit(tree, state, &bits, wake); + clear_state_bit(tree, state, &bits, wake); +next: if (last_end == (u64)-1) goto out; start = last_end + 1; if (start <= end && next_node) { state = rb_entry(next_node, struct extent_state, rb_node); - if (state->start == start) - goto hit_next; + goto hit_next; } goto search_again; @@ -587,7 +610,7 @@ out: if (prealloc) free_extent_state(prealloc); - return set; + return 0; search_again: if (start > end) @@ -598,8 +621,8 @@ search_again: goto again; } -static int wait_on_state(struct extent_io_tree *tree, - struct extent_state *state) +static void wait_on_state(struct extent_io_tree *tree, + struct extent_state *state) __releases(tree->lock) __acquires(tree->lock) { @@ -609,7 +632,6 @@ static int wait_on_state(struct extent_io_tree *tree, schedule(); spin_lock(&tree->lock); finish_wait(&state->wq, &wait); - return 0; } /* @@ -617,7 +639,7 @@ static int wait_on_state(struct extent_io_tree *tree, * The range [start, end] is inclusive. * The tree lock is taken by this function */ -int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits) +void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits) { struct extent_state *state; struct rb_node *node; @@ -654,7 +676,6 @@ again: } out: spin_unlock(&tree->lock); - return 0; } static void set_state_bits(struct extent_io_tree *tree, @@ -702,9 +723,10 @@ static void uncache_state(struct extent_state **cached_ptr) * [start, end] is inclusive This takes the tree lock. */ -int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - int bits, int exclusive_bits, u64 *failed_start, - struct extent_state **cached_state, gfp_t mask) +static int __must_check +__set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + int bits, int exclusive_bits, u64 *failed_start, + struct extent_state **cached_state, gfp_t mask) { struct extent_state *state; struct extent_state *prealloc = NULL; @@ -738,8 +760,10 @@ again: prealloc = alloc_extent_state_atomic(prealloc); BUG_ON(!prealloc); err = insert_state(tree, prealloc, start, end, &bits); + if (err) + extent_io_tree_panic(tree, err); + prealloc = NULL; - BUG_ON(err == -EEXIST); goto out; } state = rb_entry(node, struct extent_state, rb_node); @@ -805,7 +829,9 @@ hit_next: prealloc = alloc_extent_state_atomic(prealloc); BUG_ON(!prealloc); err = split_state(tree, state, prealloc, start); - BUG_ON(err == -EEXIST); + if (err) + extent_io_tree_panic(tree, err); + prealloc = NULL; if (err) goto out; @@ -842,12 +868,9 @@ hit_next: */ err = insert_state(tree, prealloc, start, this_end, &bits); - BUG_ON(err == -EEXIST); - if (err) { - free_extent_state(prealloc); - prealloc = NULL; - goto out; - } + if (err) + extent_io_tree_panic(tree, err); + cache_state(prealloc, cached_state); prealloc = NULL; start = this_end + 1; @@ -869,7 +892,8 @@ hit_next: prealloc = alloc_extent_state_atomic(prealloc); BUG_ON(!prealloc); err = split_state(tree, state, prealloc, end + 1); - BUG_ON(err == -EEXIST); + if (err) + extent_io_tree_panic(tree, err); set_state_bits(tree, prealloc, &bits); cache_state(prealloc, cached_state); @@ -896,6 +920,15 @@ search_again: goto again; } +int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits, + u64 *failed_start, struct extent_state **cached_state, + gfp_t mask) +{ + return __set_extent_bit(tree, start, end, bits, 0, failed_start, + cached_state, mask); +} + + /** * convert_extent - convert all bits in a given range from one bit to another * @tree: the io tree to search @@ -942,7 +975,8 @@ again: } err = insert_state(tree, prealloc, start, end, &bits); prealloc = NULL; - BUG_ON(err == -EEXIST); + if (err) + extent_io_tree_panic(tree, err); goto out; } state = rb_entry(node, struct extent_state, rb_node); @@ -961,8 +995,6 @@ hit_next: set_state_bits(tree, state, &bits); clear_state_bit(tree, state, &clear_bits, 0); - - merge_state(tree, state); if (last_end == (u64)-1) goto out; @@ -1000,14 +1032,14 @@ hit_next: goto out; } err = split_state(tree, state, prealloc, start); - BUG_ON(err == -EEXIST); + if (err) + extent_io_tree_panic(tree, err); prealloc = NULL; if (err) goto out; if (state->end <= end) { set_state_bits(tree, state, &bits); clear_state_bit(tree, state, &clear_bits, 0); - merge_state(tree, state); if (last_end == (u64)-1) goto out; start = last_end + 1; @@ -1040,12 +1072,8 @@ hit_next: */ err = insert_state(tree, prealloc, start, this_end, &bits); - BUG_ON(err == -EEXIST); - if (err) { - free_extent_state(prealloc); - prealloc = NULL; - goto out; - } + if (err) + extent_io_tree_panic(tree, err); prealloc = NULL; start = this_end + 1; goto search_again; @@ -1064,12 +1092,11 @@ hit_next: } err = split_state(tree, state, prealloc, end + 1); - BUG_ON(err == -EEXIST); + if (err) + extent_io_tree_panic(tree, err); set_state_bits(tree, prealloc, &bits); clear_state_bit(tree, prealloc, &clear_bits, 0); - - merge_state(tree, prealloc); prealloc = NULL; goto out; } @@ -1096,14 +1123,14 @@ search_again: int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) { - return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL, + return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL, NULL, mask); } int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, int bits, gfp_t mask) { - return set_extent_bit(tree, start, end, bits, 0, NULL, + return set_extent_bit(tree, start, end, bits, NULL, NULL, mask); } @@ -1118,7 +1145,7 @@ int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, { return set_extent_bit(tree, start, end, EXTENT_DELALLOC | EXTENT_UPTODATE, - 0, NULL, cached_state, mask); + NULL, cached_state, mask); } int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, @@ -1132,7 +1159,7 @@ int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) { - return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL, + return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, NULL, mask); } @@ -1140,7 +1167,7 @@ int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state, gfp_t mask) { return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, - NULL, cached_state, mask); + cached_state, mask); } static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, @@ -1156,42 +1183,40 @@ static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, * us if waiting is desired. */ int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - int bits, struct extent_state **cached_state, gfp_t mask) + int bits, struct extent_state **cached_state) { int err; u64 failed_start; while (1) { - err = set_extent_bit(tree, start, end, EXTENT_LOCKED | bits, - EXTENT_LOCKED, &failed_start, - cached_state, mask); - if (err == -EEXIST && (mask & __GFP_WAIT)) { + err = __set_extent_bit(tree, start, end, EXTENT_LOCKED | bits, + EXTENT_LOCKED, &failed_start, + cached_state, GFP_NOFS); + if (err == -EEXIST) { wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED); start = failed_start; - } else { + } else break; - } WARN_ON(start > end); } return err; } -int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) +int lock_extent(struct extent_io_tree *tree, u64 start, u64 end) { - return lock_extent_bits(tree, start, end, 0, NULL, mask); + return lock_extent_bits(tree, start, end, 0, NULL); } -int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, - gfp_t mask) +int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end) { int err; u64 failed_start; - err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED, - &failed_start, NULL, mask); + err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED, + &failed_start, NULL, GFP_NOFS); if (err == -EEXIST) { if (failed_start > start) clear_extent_bit(tree, start, failed_start - 1, - EXTENT_LOCKED, 1, 0, NULL, mask); + EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS); return 0; } return 1; @@ -1204,10 +1229,10 @@ int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end, mask); } -int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) +int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end) { return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL, - mask); + GFP_NOFS); } /* @@ -1221,7 +1246,7 @@ static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) while (index <= end_index) { page = find_get_page(tree->mapping, index); - BUG_ON(!page); + BUG_ON(!page); /* Pages should be in the extent_io_tree */ set_page_writeback(page); page_cache_release(page); index++; @@ -1344,9 +1369,9 @@ out: return found; } -static noinline int __unlock_for_delalloc(struct inode *inode, - struct page *locked_page, - u64 start, u64 end) +static noinline void __unlock_for_delalloc(struct inode *inode, + struct page *locked_page, + u64 start, u64 end) { int ret; struct page *pages[16]; @@ -1356,7 +1381,7 @@ static noinline int __unlock_for_delalloc(struct inode *inode, int i; if (index == locked_page->index && end_index == index) - return 0; + return; while (nr_pages > 0) { ret = find_get_pages_contig(inode->i_mapping, index, @@ -1371,7 +1396,6 @@ static noinline int __unlock_for_delalloc(struct inode *inode, index += ret; cond_resched(); } - return 0; } static noinline int lock_delalloc_pages(struct inode *inode, @@ -1501,11 +1525,10 @@ again: goto out_failed; } } - BUG_ON(ret); + BUG_ON(ret); /* Only valid values are 0 and -EAGAIN */ /* step three, lock the state bits for the whole range */ - lock_extent_bits(tree, delalloc_start, delalloc_end, - 0, &cached_state, GFP_NOFS); + lock_extent_bits(tree, delalloc_start, delalloc_end, 0, &cached_state); /* then test to make sure it is all still delalloc */ ret = test_range_bit(tree, delalloc_start, delalloc_end, @@ -1762,39 +1785,34 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, * helper function to set a given page up to date if all the * extents in the tree for that page are up to date */ -static int check_page_uptodate(struct extent_io_tree *tree, - struct page *page) +static void check_page_uptodate(struct extent_io_tree *tree, struct page *page) { u64 start = (u64)page->index << PAGE_CACHE_SHIFT; u64 end = start + PAGE_CACHE_SIZE - 1; if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL)) SetPageUptodate(page); - return 0; } /* * helper function to unlock a page if all the extents in the tree * for that page are unlocked */ -static int check_page_locked(struct extent_io_tree *tree, - struct page *page) +static void check_page_locked(struct extent_io_tree *tree, struct page *page) { u64 start = (u64)page->index << PAGE_CACHE_SHIFT; u64 end = start + PAGE_CACHE_SIZE - 1; if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) unlock_page(page); - return 0; } /* * helper function to end page writeback if all the extents * in the tree for that page are done with writeback */ -static int check_page_writeback(struct extent_io_tree *tree, - struct page *page) +static void check_page_writeback(struct extent_io_tree *tree, + struct page *page) { end_page_writeback(page); - return 0; } /* @@ -1913,6 +1931,26 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, return 0; } +int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb, + int mirror_num) +{ + struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; + u64 start = eb->start; + unsigned long i, num_pages = num_extent_pages(eb->start, eb->len); + int ret; + + for (i = 0; i < num_pages; i++) { + struct page *p = extent_buffer_page(eb, i); + ret = repair_io_failure(map_tree, start, PAGE_CACHE_SIZE, + start, p, mirror_num); + if (ret) + break; + start += PAGE_CACHE_SIZE; + } + + return ret; +} + /* * each time an IO finishes, we do a fast check in the IO failure tree * to see if we need to process or clean up an io_failure_record @@ -2154,13 +2192,46 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page, "this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode, failrec->this_mirror, num_copies, failrec->in_validation); - tree->ops->submit_bio_hook(inode, read_mode, bio, failrec->this_mirror, - failrec->bio_flags, 0); - return 0; + ret = tree->ops->submit_bio_hook(inode, read_mode, bio, + failrec->this_mirror, + failrec->bio_flags, 0); + return ret; } /* lots and lots of room for performance fixes in the end_bio funcs */ +int end_extent_writepage(struct page *page, int err, u64 start, u64 end) +{ + int uptodate = (err == 0); + struct extent_io_tree *tree; + int ret; + + tree = &BTRFS_I(page->mapping->host)->io_tree; + + if (tree->ops && tree->ops->writepage_end_io_hook) { + ret = tree->ops->writepage_end_io_hook(page, start, + end, NULL, uptodate); + if (ret) + uptodate = 0; + } + + if (!uptodate && tree->ops && + tree->ops->writepage_io_failed_hook) { + ret = tree->ops->writepage_io_failed_hook(NULL, page, + start, end, NULL); + /* Writeback already completed */ + if (ret == 0) + return 1; + } + + if (!uptodate) { + clear_extent_uptodate(tree, start, end, NULL, GFP_NOFS); + ClearPageUptodate(page); + SetPageError(page); + } + return 0; +} + /* * after a writepage IO is done, we need to: * clear the uptodate bits on error @@ -2172,13 +2243,11 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page, */ static void end_bio_extent_writepage(struct bio *bio, int err) { - int uptodate = err == 0; struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; struct extent_io_tree *tree; u64 start; u64 end; int whole_page; - int ret; do { struct page *page = bvec->bv_page; @@ -2195,28 +2264,9 @@ static void end_bio_extent_writepage(struct bio *bio, int err) if (--bvec >= bio->bi_io_vec) prefetchw(&bvec->bv_page->flags); - if (tree->ops && tree->ops->writepage_end_io_hook) { - ret = tree->ops->writepage_end_io_hook(page, start, - end, NULL, uptodate); - if (ret) - uptodate = 0; - } - - if (!uptodate && tree->ops && - tree->ops->writepage_io_failed_hook) { - ret = tree->ops->writepage_io_failed_hook(bio, page, - start, end, NULL); - if (ret == 0) { - uptodate = (err == 0); - continue; - } - } - if (!uptodate) { - clear_extent_uptodate(tree, start, end, NULL, GFP_NOFS); - ClearPageUptodate(page); - SetPageError(page); - } + if (end_extent_writepage(page, err, start, end)) + continue; if (whole_page) end_page_writeback(page); @@ -2247,6 +2297,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err) u64 start; u64 end; int whole_page; + int failed_mirror; int ret; if (err) @@ -2293,9 +2344,16 @@ static void end_bio_extent_readpage(struct bio *bio, int err) else clean_io_failure(start, page); } - if (!uptodate) { - int failed_mirror; + + if (!uptodate) failed_mirror = (int)(unsigned long)bio->bi_bdev; + + if (!uptodate && tree->ops && tree->ops->readpage_io_failed_hook) { + ret = tree->ops->readpage_io_failed_hook(page, failed_mirror); + if (!ret && !err && + test_bit(BIO_UPTODATE, &bio->bi_flags)) + uptodate = 1; + } else if (!uptodate) { /* * The generic bio_readpage_error handles errors the * following way: If possible, new read requests are @@ -2309,7 +2367,6 @@ static void end_bio_extent_readpage(struct bio *bio, int err) ret = bio_readpage_error(bio, page, start, end, failed_mirror, NULL); if (ret == 0) { -error_handled: uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); if (err) @@ -2317,16 +2374,9 @@ error_handled: uncache_state(&cached); continue; } - if (tree->ops && tree->ops->readpage_io_failed_hook) { - ret = tree->ops->readpage_io_failed_hook( - bio, page, start, end, - failed_mirror, state); - if (ret == 0) - goto error_handled; - } } - if (uptodate) { + if (uptodate && tree->track_uptodate) { set_extent_uptodate(tree, start, end, &cached, GFP_ATOMIC); } @@ -2375,8 +2425,12 @@ btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, return bio; } -static int submit_one_bio(int rw, struct bio *bio, int mirror_num, - unsigned long bio_flags) +/* + * Since writes are async, they will only return -ENOMEM. + * Reads can return the full range of I/O error conditions. + */ +static int __must_check submit_one_bio(int rw, struct bio *bio, + int mirror_num, unsigned long bio_flags) { int ret = 0; struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; @@ -2402,6 +2456,19 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num, return ret; } +static int merge_bio(struct extent_io_tree *tree, struct page *page, + unsigned long offset, size_t size, struct bio *bio, + unsigned long bio_flags) +{ + int ret = 0; + if (tree->ops && tree->ops->merge_bio_hook) + ret = tree->ops->merge_bio_hook(page, offset, size, bio, + bio_flags); + BUG_ON(ret < 0); + return ret; + +} + static int submit_extent_page(int rw, struct extent_io_tree *tree, struct page *page, sector_t sector, size_t size, unsigned long offset, @@ -2430,12 +2497,12 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, sector; if (prev_bio_flags != bio_flags || !contig || - (tree->ops && tree->ops->merge_bio_hook && - tree->ops->merge_bio_hook(page, offset, page_size, bio, - bio_flags)) || + merge_bio(tree, page, offset, page_size, bio, bio_flags) || bio_add_page(bio, page, page_size, offset) < page_size) { ret = submit_one_bio(rw, bio, mirror_num, prev_bio_flags); + if (ret < 0) + return ret; bio = NULL; } else { return 0; @@ -2462,25 +2529,31 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, return ret; } -void set_page_extent_mapped(struct page *page) +void attach_extent_buffer_page(struct extent_buffer *eb, struct page *page) { if (!PagePrivate(page)) { SetPagePrivate(page); page_cache_get(page); - set_page_private(page, EXTENT_PAGE_PRIVATE); + set_page_private(page, (unsigned long)eb); + } else { + WARN_ON(page->private != (unsigned long)eb); } } -static void set_page_extent_head(struct page *page, unsigned long len) +void set_page_extent_mapped(struct page *page) { - WARN_ON(!PagePrivate(page)); - set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2); + if (!PagePrivate(page)) { + SetPagePrivate(page); + page_cache_get(page); + set_page_private(page, EXTENT_PAGE_PRIVATE); + } } /* * basic readpage implementation. Locked extent state structs are inserted * into the tree that are removed when the IO is done (by the end_io * handlers) + * XXX JDM: This needs looking at to ensure proper page locking */ static int __extent_read_full_page(struct extent_io_tree *tree, struct page *page, @@ -2520,11 +2593,11 @@ static int __extent_read_full_page(struct extent_io_tree *tree, end = page_end; while (1) { - lock_extent(tree, start, end, GFP_NOFS); + lock_extent(tree, start, end); ordered = btrfs_lookup_ordered_extent(inode, start); if (!ordered) break; - unlock_extent(tree, start, end, GFP_NOFS); + unlock_extent(tree, start, end); btrfs_start_ordered_extent(inode, ordered, 1); btrfs_put_ordered_extent(ordered); } @@ -2535,10 +2608,10 @@ static int __extent_read_full_page(struct extent_io_tree *tree, if (zero_offset) { iosize = PAGE_CACHE_SIZE - zero_offset; - userpage = kmap_atomic(page, KM_USER0); + userpage = kmap_atomic(page); memset(userpage + zero_offset, 0, iosize); flush_dcache_page(page); - kunmap_atomic(userpage, KM_USER0); + kunmap_atomic(userpage); } } while (cur <= end) { @@ -2547,10 +2620,10 @@ static int __extent_read_full_page(struct extent_io_tree *tree, struct extent_state *cached = NULL; iosize = PAGE_CACHE_SIZE - pg_offset; - userpage = kmap_atomic(page, KM_USER0); + userpage = kmap_atomic(page); memset(userpage + pg_offset, 0, iosize); flush_dcache_page(page); - kunmap_atomic(userpage, KM_USER0); + kunmap_atomic(userpage); set_extent_uptodate(tree, cur, cur + iosize - 1, &cached, GFP_NOFS); unlock_extent_cached(tree, cur, cur + iosize - 1, @@ -2561,7 +2634,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree, end - cur + 1, 0); if (IS_ERR_OR_NULL(em)) { SetPageError(page); - unlock_extent(tree, cur, end, GFP_NOFS); + unlock_extent(tree, cur, end); break; } extent_offset = cur - em->start; @@ -2596,10 +2669,10 @@ static int __extent_read_full_page(struct extent_io_tree *tree, char *userpage; struct extent_state *cached = NULL; - userpage = kmap_atomic(page, KM_USER0); + userpage = kmap_atomic(page); memset(userpage + pg_offset, 0, iosize); flush_dcache_page(page); - kunmap_atomic(userpage, KM_USER0); + kunmap_atomic(userpage); set_extent_uptodate(tree, cur, cur + iosize - 1, &cached, GFP_NOFS); @@ -2613,7 +2686,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree, if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1, NULL)) { check_page_uptodate(tree, page); - unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); + unlock_extent(tree, cur, cur + iosize - 1); cur = cur + iosize; pg_offset += iosize; continue; @@ -2623,7 +2696,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree, */ if (block_start == EXTENT_MAP_INLINE) { SetPageError(page); - unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); + unlock_extent(tree, cur, cur + iosize - 1); cur = cur + iosize; pg_offset += iosize; continue; @@ -2643,6 +2716,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree, end_bio_extent_readpage, mirror_num, *bio_flags, this_bio_flag); + BUG_ON(ret == -ENOMEM); nr++; *bio_flags = this_bio_flag; } @@ -2745,10 +2819,10 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, if (page->index == end_index) { char *userpage; - userpage = kmap_atomic(page, KM_USER0); + userpage = kmap_atomic(page); memset(userpage + pg_offset, 0, PAGE_CACHE_SIZE - pg_offset); - kunmap_atomic(userpage, KM_USER0); + kunmap_atomic(userpage); flush_dcache_page(page); } pg_offset = 0; @@ -2779,9 +2853,16 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, delalloc_start = delalloc_end + 1; continue; } - tree->ops->fill_delalloc(inode, page, delalloc_start, - delalloc_end, &page_started, - &nr_written); + ret = tree->ops->fill_delalloc(inode, page, + delalloc_start, + delalloc_end, + &page_started, + &nr_written); + /* File system has been set read-only */ + if (ret) { + SetPageError(page); + goto done; + } /* * delalloc_end is already one less than the total * length, so we don't subtract one from @@ -2818,8 +2899,12 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, if (tree->ops && tree->ops->writepage_start_hook) { ret = tree->ops->writepage_start_hook(page, start, page_end); - if (ret == -EAGAIN) { - redirty_page_for_writepage(wbc, page); + if (ret) { + /* Fixup worker will requeue */ + if (ret == -EBUSY) + wbc->pages_skipped++; + else + redirty_page_for_writepage(wbc, page); update_nr_written(page, wbc, nr_written); unlock_page(page); ret = 0; @@ -2950,6 +3035,275 @@ done_unlocked: return 0; } +static int eb_wait(void *word) +{ + io_schedule(); + return 0; +} + +static void wait_on_extent_buffer_writeback(struct extent_buffer *eb) +{ + wait_on_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK, eb_wait, + TASK_UNINTERRUPTIBLE); +} + +static int lock_extent_buffer_for_io(struct extent_buffer *eb, + struct btrfs_fs_info *fs_info, + struct extent_page_data *epd) +{ + unsigned long i, num_pages; + int flush = 0; + int ret = 0; + + if (!btrfs_try_tree_write_lock(eb)) { + flush = 1; + flush_write_bio(epd); + btrfs_tree_lock(eb); + } + + if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) { + btrfs_tree_unlock(eb); + if (!epd->sync_io) + return 0; + if (!flush) { + flush_write_bio(epd); + flush = 1; + } + while (1) { + wait_on_extent_buffer_writeback(eb); + btrfs_tree_lock(eb); + if (!test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) + break; + btrfs_tree_unlock(eb); + } + } + + if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { + set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); + btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); + spin_lock(&fs_info->delalloc_lock); + if (fs_info->dirty_metadata_bytes >= eb->len) + fs_info->dirty_metadata_bytes -= eb->len; + else + WARN_ON(1); + spin_unlock(&fs_info->delalloc_lock); + ret = 1; + } + + btrfs_tree_unlock(eb); + + if (!ret) + return ret; + + num_pages = num_extent_pages(eb->start, eb->len); + for (i = 0; i < num_pages; i++) { + struct page *p = extent_buffer_page(eb, i); + + if (!trylock_page(p)) { + if (!flush) { + flush_write_bio(epd); + flush = 1; + } + lock_page(p); + } + } + + return ret; +} + +static void end_extent_buffer_writeback(struct extent_buffer *eb) +{ + clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); + smp_mb__after_clear_bit(); + wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK); +} + +static void end_bio_extent_buffer_writepage(struct bio *bio, int err) +{ + int uptodate = err == 0; + struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct extent_buffer *eb; + int done; + + do { + struct page *page = bvec->bv_page; + + bvec--; + eb = (struct extent_buffer *)page->private; + BUG_ON(!eb); + done = atomic_dec_and_test(&eb->io_pages); + + if (!uptodate || test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) { + set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); + ClearPageUptodate(page); + SetPageError(page); + } + + end_page_writeback(page); + + if (!done) + continue; + + end_extent_buffer_writeback(eb); + } while (bvec >= bio->bi_io_vec); + + bio_put(bio); + +} + +static int write_one_eb(struct extent_buffer *eb, + struct btrfs_fs_info *fs_info, + struct writeback_control *wbc, + struct extent_page_data *epd) +{ + struct block_device *bdev = fs_info->fs_devices->latest_bdev; + u64 offset = eb->start; + unsigned long i, num_pages; + int rw = (epd->sync_io ? WRITE_SYNC : WRITE); + int ret; + + clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags); + num_pages = num_extent_pages(eb->start, eb->len); + atomic_set(&eb->io_pages, num_pages); + for (i = 0; i < num_pages; i++) { + struct page *p = extent_buffer_page(eb, i); + + clear_page_dirty_for_io(p); + set_page_writeback(p); + ret = submit_extent_page(rw, eb->tree, p, offset >> 9, + PAGE_CACHE_SIZE, 0, bdev, &epd->bio, + -1, end_bio_extent_buffer_writepage, + 0, 0, 0); + if (ret) { + set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); + SetPageError(p); + if (atomic_sub_and_test(num_pages - i, &eb->io_pages)) + end_extent_buffer_writeback(eb); + ret = -EIO; + break; + } + offset += PAGE_CACHE_SIZE; + update_nr_written(p, wbc, 1); + unlock_page(p); + } + + if (unlikely(ret)) { + for (; i < num_pages; i++) { + struct page *p = extent_buffer_page(eb, i); + unlock_page(p); + } + } + + return ret; +} + +int btree_write_cache_pages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree; + struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info; + struct extent_buffer *eb, *prev_eb = NULL; + struct extent_page_data epd = { + .bio = NULL, + .tree = tree, + .extent_locked = 0, + .sync_io = wbc->sync_mode == WB_SYNC_ALL, + }; + int ret = 0; + int done = 0; + int nr_to_write_done = 0; + struct pagevec pvec; + int nr_pages; + pgoff_t index; + pgoff_t end; /* Inclusive */ + int scanned = 0; + int tag; + + pagevec_init(&pvec, 0); + if (wbc->range_cyclic) { + index = mapping->writeback_index; /* Start from prev offset */ + end = -1; + } else { + index = wbc->range_start >> PAGE_CACHE_SHIFT; + end = wbc->range_end >> PAGE_CACHE_SHIFT; + scanned = 1; + } + if (wbc->sync_mode == WB_SYNC_ALL) + tag = PAGECACHE_TAG_TOWRITE; + else + tag = PAGECACHE_TAG_DIRTY; +retry: + if (wbc->sync_mode == WB_SYNC_ALL) + tag_pages_for_writeback(mapping, index, end); + while (!done && !nr_to_write_done && (index <= end) && + (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { + unsigned i; + + scanned = 1; + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + if (!PagePrivate(page)) + continue; + + if (!wbc->range_cyclic && page->index > end) { + done = 1; + break; + } + + eb = (struct extent_buffer *)page->private; + if (!eb) { + WARN_ON(1); + continue; + } + + if (eb == prev_eb) + continue; + + if (!atomic_inc_not_zero(&eb->refs)) { + WARN_ON(1); + continue; + } + + prev_eb = eb; + ret = lock_extent_buffer_for_io(eb, fs_info, &epd); + if (!ret) { + free_extent_buffer(eb); + continue; + } + + ret = write_one_eb(eb, fs_info, wbc, &epd); + if (ret) { + done = 1; + free_extent_buffer(eb); + break; + } + free_extent_buffer(eb); + + /* + * the filesystem may choose to bump up nr_to_write. + * We have to make sure to honor the new nr_to_write + * at any time + */ + nr_to_write_done = wbc->nr_to_write <= 0; + } + pagevec_release(&pvec); + cond_resched(); + } + if (!scanned && !done) { + /* + * We hit the last page and there is more work to be done: wrap + * back to the start of the file + */ + scanned = 1; + index = 0; + goto retry; + } + flush_write_bio(&epd); + return ret; +} + /** * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. * @mapping: address space structure to write @@ -3081,10 +3435,14 @@ retry: static void flush_epd_write_bio(struct extent_page_data *epd) { if (epd->bio) { + int rw = WRITE; + int ret; + if (epd->sync_io) - submit_one_bio(WRITE_SYNC, epd->bio, 0, 0); - else - submit_one_bio(WRITE, epd->bio, 0, 0); + rw = WRITE_SYNC; + + ret = submit_one_bio(rw, epd->bio, 0, 0); + BUG_ON(ret < 0); /* -ENOMEM */ epd->bio = NULL; } } @@ -3201,7 +3559,7 @@ int extent_readpages(struct extent_io_tree *tree, } BUG_ON(!list_empty(pages)); if (bio) - submit_one_bio(READ, bio, 0, bio_flags); + return submit_one_bio(READ, bio, 0, bio_flags); return 0; } @@ -3222,7 +3580,7 @@ int extent_invalidatepage(struct extent_io_tree *tree, if (start > end) return 0; - lock_extent_bits(tree, start, end, 0, &cached_state, GFP_NOFS); + lock_extent_bits(tree, start, end, 0, &cached_state); wait_on_page_writeback(page); clear_extent_bit(tree, start, end, EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | @@ -3289,7 +3647,7 @@ int try_release_extent_mapping(struct extent_map_tree *map, len = end - start + 1; write_lock(&map->lock); em = lookup_extent_mapping(map, start, len); - if (IS_ERR_OR_NULL(em)) { + if (!em) { write_unlock(&map->lock); break; } @@ -3436,7 +3794,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, } lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0, - &cached_state, GFP_NOFS); + &cached_state); em = get_extent_skip_holes(inode, start, last_for_get_extent, get_extent); @@ -3530,26 +3888,7 @@ out: inline struct page *extent_buffer_page(struct extent_buffer *eb, unsigned long i) { - struct page *p; - struct address_space *mapping; - - if (i == 0) - return eb->first_page; - i += eb->start >> PAGE_CACHE_SHIFT; - mapping = eb->first_page->mapping; - if (!mapping) - return NULL; - - /* - * extent_buffer_page is only called after pinning the page - * by increasing the reference count. So we know the page must - * be in the radix tree. - */ - rcu_read_lock(); - p = radix_tree_lookup(&mapping->page_tree, i); - rcu_read_unlock(); - - return p; + return eb->pages[i]; } inline unsigned long num_extent_pages(u64 start, u64 len) @@ -3558,6 +3897,19 @@ inline unsigned long num_extent_pages(u64 start, u64 len) (start >> PAGE_CACHE_SHIFT); } +static void __free_extent_buffer(struct extent_buffer *eb) +{ +#if LEAK_DEBUG + unsigned long flags; + spin_lock_irqsave(&leak_lock, flags); + list_del(&eb->leak_list); + spin_unlock_irqrestore(&leak_lock, flags); +#endif + if (eb->pages && eb->pages != eb->inline_pages) + kfree(eb->pages); + kmem_cache_free(extent_buffer_cache, eb); +} + static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, u64 start, unsigned long len, @@ -3573,6 +3925,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, return NULL; eb->start = start; eb->len = len; + eb->tree = tree; rwlock_init(&eb->lock); atomic_set(&eb->write_locks, 0); atomic_set(&eb->read_locks, 0); @@ -3589,20 +3942,32 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, list_add(&eb->leak_list, &buffers); spin_unlock_irqrestore(&leak_lock, flags); #endif + spin_lock_init(&eb->refs_lock); atomic_set(&eb->refs, 1); + atomic_set(&eb->io_pages, 0); + + if (len > MAX_INLINE_EXTENT_BUFFER_SIZE) { + struct page **pages; + int num_pages = (len + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; + pages = kzalloc(num_pages, mask); + if (!pages) { + __free_extent_buffer(eb); + return NULL; + } + eb->pages = pages; + } else { + eb->pages = eb->inline_pages; + } return eb; } -static void __free_extent_buffer(struct extent_buffer *eb) +static int extent_buffer_under_io(struct extent_buffer *eb) { -#if LEAK_DEBUG - unsigned long flags; - spin_lock_irqsave(&leak_lock, flags); - list_del(&eb->leak_list); - spin_unlock_irqrestore(&leak_lock, flags); -#endif - kmem_cache_free(extent_buffer_cache, eb); + return (atomic_read(&eb->io_pages) || + test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) || + test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); } /* @@ -3614,8 +3979,7 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb, unsigned long index; struct page *page; - if (!eb->first_page) - return; + BUG_ON(extent_buffer_under_io(eb)); index = num_extent_pages(eb->start, eb->len); if (start_idx >= index) @@ -3624,8 +3988,34 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb, do { index--; page = extent_buffer_page(eb, index); - if (page) + if (page) { + spin_lock(&page->mapping->private_lock); + /* + * We do this since we'll remove the pages after we've + * removed the eb from the radix tree, so we could race + * and have this page now attached to the new eb. So + * only clear page_private if it's still connected to + * this eb. + */ + if (PagePrivate(page) && + page->private == (unsigned long)eb) { + BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); + BUG_ON(PageDirty(page)); + BUG_ON(PageWriteback(page)); + /* + * We need to make sure we haven't be attached + * to a new eb. + */ + ClearPagePrivate(page); + set_page_private(page, 0); + /* One for the page private */ + page_cache_release(page); + } + spin_unlock(&page->mapping->private_lock); + + /* One for when we alloced the page */ page_cache_release(page); + } } while (index != start_idx); } @@ -3638,9 +4028,50 @@ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) __free_extent_buffer(eb); } +static void check_buffer_tree_ref(struct extent_buffer *eb) +{ + /* the ref bit is tricky. We have to make sure it is set + * if we have the buffer dirty. Otherwise the + * code to free a buffer can end up dropping a dirty + * page + * + * Once the ref bit is set, it won't go away while the + * buffer is dirty or in writeback, and it also won't + * go away while we have the reference count on the + * eb bumped. + * + * We can't just set the ref bit without bumping the + * ref on the eb because free_extent_buffer might + * see the ref bit and try to clear it. If this happens + * free_extent_buffer might end up dropping our original + * ref by mistake and freeing the page before we are able + * to add one more ref. + * + * So bump the ref count first, then set the bit. If someone + * beat us to it, drop the ref we added. + */ + if (!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) { + atomic_inc(&eb->refs); + if (test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) + atomic_dec(&eb->refs); + } +} + +static void mark_extent_buffer_accessed(struct extent_buffer *eb) +{ + unsigned long num_pages, i; + + check_buffer_tree_ref(eb); + + num_pages = num_extent_pages(eb->start, eb->len); + for (i = 0; i < num_pages; i++) { + struct page *p = extent_buffer_page(eb, i); + mark_page_accessed(p); + } +} + struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, - u64 start, unsigned long len, - struct page *page0) + u64 start, unsigned long len) { unsigned long num_pages = num_extent_pages(start, len); unsigned long i; @@ -3656,7 +4087,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); if (eb && atomic_inc_not_zero(&eb->refs)) { rcu_read_unlock(); - mark_page_accessed(eb->first_page); + mark_extent_buffer_accessed(eb); return eb; } rcu_read_unlock(); @@ -3665,32 +4096,43 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, if (!eb) return NULL; - if (page0) { - eb->first_page = page0; - i = 1; - index++; - page_cache_get(page0); - mark_page_accessed(page0); - set_page_extent_mapped(page0); - set_page_extent_head(page0, len); - uptodate = PageUptodate(page0); - } else { - i = 0; - } - for (; i < num_pages; i++, index++) { + for (i = 0; i < num_pages; i++, index++) { p = find_or_create_page(mapping, index, GFP_NOFS); if (!p) { WARN_ON(1); goto free_eb; } - set_page_extent_mapped(p); - mark_page_accessed(p); - if (i == 0) { - eb->first_page = p; - set_page_extent_head(p, len); - } else { - set_page_private(p, EXTENT_PAGE_PRIVATE); + + spin_lock(&mapping->private_lock); + if (PagePrivate(p)) { + /* + * We could have already allocated an eb for this page + * and attached one so lets see if we can get a ref on + * the existing eb, and if we can we know it's good and + * we can just return that one, else we know we can just + * overwrite page->private. + */ + exists = (struct extent_buffer *)p->private; + if (atomic_inc_not_zero(&exists->refs)) { + spin_unlock(&mapping->private_lock); + unlock_page(p); + mark_extent_buffer_accessed(exists); + goto free_eb; + } + + /* + * Do this so attach doesn't complain and we need to + * drop the ref the old guy had. + */ + ClearPagePrivate(p); + WARN_ON(PageDirty(p)); + page_cache_release(p); } + attach_extent_buffer_page(eb, p); + spin_unlock(&mapping->private_lock); + WARN_ON(PageDirty(p)); + mark_page_accessed(p); + eb->pages[i] = p; if (!PageUptodate(p)) uptodate = 0; @@ -3698,12 +4140,10 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, * see below about how we avoid a nasty race with release page * and why we unlock later */ - if (i != 0) - unlock_page(p); } if (uptodate) set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); - +again: ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); if (ret) goto free_eb; @@ -3713,14 +4153,21 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, if (ret == -EEXIST) { exists = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); - /* add one reference for the caller */ - atomic_inc(&exists->refs); + if (!atomic_inc_not_zero(&exists->refs)) { + spin_unlock(&tree->buffer_lock); + radix_tree_preload_end(); + exists = NULL; + goto again; + } spin_unlock(&tree->buffer_lock); radix_tree_preload_end(); + mark_extent_buffer_accessed(exists); goto free_eb; } /* add one reference for the tree */ - atomic_inc(&eb->refs); + spin_lock(&eb->refs_lock); + check_buffer_tree_ref(eb); + spin_unlock(&eb->refs_lock); spin_unlock(&tree->buffer_lock); radix_tree_preload_end(); @@ -3733,15 +4180,20 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, * after the extent buffer is in the radix tree so * it doesn't get lost */ - set_page_extent_mapped(eb->first_page); - set_page_extent_head(eb->first_page, eb->len); - if (!page0) - unlock_page(eb->first_page); + SetPageChecked(eb->pages[0]); + for (i = 1; i < num_pages; i++) { + p = extent_buffer_page(eb, i); + ClearPageChecked(p); + unlock_page(p); + } + unlock_page(eb->pages[0]); return eb; free_eb: - if (eb->first_page && !page0) - unlock_page(eb->first_page); + for (i = 0; i < num_pages; i++) { + if (eb->pages[i]) + unlock_page(eb->pages[i]); + } if (!atomic_dec_and_test(&eb->refs)) return exists; @@ -3758,7 +4210,7 @@ struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); if (eb && atomic_inc_not_zero(&eb->refs)) { rcu_read_unlock(); - mark_page_accessed(eb->first_page); + mark_extent_buffer_accessed(eb); return eb; } rcu_read_unlock(); @@ -3766,19 +4218,71 @@ struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, return NULL; } +static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head) +{ + struct extent_buffer *eb = + container_of(head, struct extent_buffer, rcu_head); + + __free_extent_buffer(eb); +} + +/* Expects to have eb->eb_lock already held */ +static void release_extent_buffer(struct extent_buffer *eb, gfp_t mask) +{ + WARN_ON(atomic_read(&eb->refs) == 0); + if (atomic_dec_and_test(&eb->refs)) { + struct extent_io_tree *tree = eb->tree; + + spin_unlock(&eb->refs_lock); + + spin_lock(&tree->buffer_lock); + radix_tree_delete(&tree->buffer, + eb->start >> PAGE_CACHE_SHIFT); + spin_unlock(&tree->buffer_lock); + + /* Should be safe to release our pages at this point */ + btrfs_release_extent_buffer_page(eb, 0); + + call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu); + return; + } + spin_unlock(&eb->refs_lock); +} + void free_extent_buffer(struct extent_buffer *eb) { if (!eb) return; - if (!atomic_dec_and_test(&eb->refs)) + spin_lock(&eb->refs_lock); + if (atomic_read(&eb->refs) == 2 && + test_bit(EXTENT_BUFFER_STALE, &eb->bflags) && + !extent_buffer_under_io(eb) && + test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) + atomic_dec(&eb->refs); + + /* + * I know this is terrible, but it's temporary until we stop tracking + * the uptodate bits and such for the extent buffers. + */ + release_extent_buffer(eb, GFP_ATOMIC); +} + +void free_extent_buffer_stale(struct extent_buffer *eb) +{ + if (!eb) return; - WARN_ON(1); + spin_lock(&eb->refs_lock); + set_bit(EXTENT_BUFFER_STALE, &eb->bflags); + + if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) && + test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) + atomic_dec(&eb->refs); + release_extent_buffer(eb, GFP_NOFS); } -int clear_extent_buffer_dirty(struct extent_io_tree *tree, - struct extent_buffer *eb) +void clear_extent_buffer_dirty(struct extent_buffer *eb) { unsigned long i; unsigned long num_pages; @@ -3794,10 +4298,6 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree, lock_page(page); WARN_ON(!PagePrivate(page)); - set_page_extent_mapped(page); - if (i == 0) - set_page_extent_head(page, eb->len); - clear_page_dirty_for_io(page); spin_lock_irq(&page->mapping->tree_lock); if (!PageDirty(page)) { @@ -3809,24 +4309,29 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree, ClearPageError(page); unlock_page(page); } - return 0; + WARN_ON(atomic_read(&eb->refs) == 0); } -int set_extent_buffer_dirty(struct extent_io_tree *tree, - struct extent_buffer *eb) +int set_extent_buffer_dirty(struct extent_buffer *eb) { unsigned long i; unsigned long num_pages; int was_dirty = 0; + check_buffer_tree_ref(eb); + was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); + num_pages = num_extent_pages(eb->start, eb->len); + WARN_ON(atomic_read(&eb->refs) == 0); + WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)); + for (i = 0; i < num_pages; i++) - __set_page_dirty_nobuffers(extent_buffer_page(eb, i)); + set_page_dirty(extent_buffer_page(eb, i)); return was_dirty; } -static int __eb_straddles_pages(u64 start, u64 len) +static int range_straddles_pages(u64 start, u64 len) { if (len < PAGE_CACHE_SIZE) return 1; @@ -3837,26 +4342,14 @@ static int __eb_straddles_pages(u64 start, u64 len) return 0; } -static int eb_straddles_pages(struct extent_buffer *eb) -{ - return __eb_straddles_pages(eb->start, eb->len); -} - -int clear_extent_buffer_uptodate(struct extent_io_tree *tree, - struct extent_buffer *eb, - struct extent_state **cached_state) +int clear_extent_buffer_uptodate(struct extent_buffer *eb) { unsigned long i; struct page *page; unsigned long num_pages; - num_pages = num_extent_pages(eb->start, eb->len); clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); - - if (eb_straddles_pages(eb)) { - clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, - cached_state, GFP_NOFS); - } + num_pages = num_extent_pages(eb->start, eb->len); for (i = 0; i < num_pages; i++) { page = extent_buffer_page(eb, i); if (page) @@ -3865,27 +4358,16 @@ int clear_extent_buffer_uptodate(struct extent_io_tree *tree, return 0; } -int set_extent_buffer_uptodate(struct extent_io_tree *tree, - struct extent_buffer *eb) +int set_extent_buffer_uptodate(struct extent_buffer *eb) { unsigned long i; struct page *page; unsigned long num_pages; + set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); num_pages = num_extent_pages(eb->start, eb->len); - - if (eb_straddles_pages(eb)) { - set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, - NULL, GFP_NOFS); - } for (i = 0; i < num_pages; i++) { page = extent_buffer_page(eb, i); - if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) || - ((i == num_pages - 1) && - ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) { - check_page_uptodate(tree, page); - continue; - } SetPageUptodate(page); } return 0; @@ -3900,7 +4382,7 @@ int extent_range_uptodate(struct extent_io_tree *tree, int uptodate; unsigned long index; - if (__eb_straddles_pages(start, end - start + 1)) { + if (range_straddles_pages(start, end - start + 1)) { ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL); if (ret) @@ -3922,35 +4404,9 @@ int extent_range_uptodate(struct extent_io_tree *tree, return pg_uptodate; } -int extent_buffer_uptodate(struct extent_io_tree *tree, - struct extent_buffer *eb, - struct extent_state *cached_state) +int extent_buffer_uptodate(struct extent_buffer *eb) { - int ret = 0; - unsigned long num_pages; - unsigned long i; - struct page *page; - int pg_uptodate = 1; - - if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) - return 1; - - if (eb_straddles_pages(eb)) { - ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1, - EXTENT_UPTODATE, 1, cached_state); - if (ret) - return ret; - } - - num_pages = num_extent_pages(eb->start, eb->len); - for (i = 0; i < num_pages; i++) { - page = extent_buffer_page(eb, i); - if (!PageUptodate(page)) { - pg_uptodate = 0; - break; - } - } - return pg_uptodate; + return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); } int read_extent_buffer_pages(struct extent_io_tree *tree, @@ -3964,21 +4420,14 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, int ret = 0; int locked_pages = 0; int all_uptodate = 1; - int inc_all_pages = 0; unsigned long num_pages; + unsigned long num_reads = 0; struct bio *bio = NULL; unsigned long bio_flags = 0; if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) return 0; - if (eb_straddles_pages(eb)) { - if (test_range_bit(tree, eb->start, eb->start + eb->len - 1, - EXTENT_UPTODATE, 1, NULL)) { - return 0; - } - } - if (start) { WARN_ON(start < eb->start); start_i = (start >> PAGE_CACHE_SHIFT) - @@ -3997,8 +4446,10 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, lock_page(page); } locked_pages++; - if (!PageUptodate(page)) + if (!PageUptodate(page)) { + num_reads++; all_uptodate = 0; + } } if (all_uptodate) { if (start_i == 0) @@ -4006,20 +4457,12 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, goto unlock_exit; } + clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags); + eb->failed_mirror = 0; + atomic_set(&eb->io_pages, num_reads); for (i = start_i; i < num_pages; i++) { page = extent_buffer_page(eb, i); - - WARN_ON(!PagePrivate(page)); - - set_page_extent_mapped(page); - if (i == 0) - set_page_extent_head(page, eb->len); - - if (inc_all_pages) - page_cache_get(page); if (!PageUptodate(page)) { - if (start_i == 0) - inc_all_pages = 1; ClearPageError(page); err = __extent_read_full_page(tree, page, get_extent, &bio, @@ -4031,8 +4474,11 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, } } - if (bio) - submit_one_bio(READ, bio, mirror_num, bio_flags); + if (bio) { + err = submit_one_bio(READ, bio, mirror_num, bio_flags); + if (err) + return err; + } if (ret || wait != WAIT_COMPLETE) return ret; @@ -4044,8 +4490,6 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, ret = -EIO; } - if (!ret) - set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); return ret; unlock_exit: @@ -4287,15 +4731,20 @@ static void copy_pages(struct page *dst_page, struct page *src_page, { char *dst_kaddr = page_address(dst_page); char *src_kaddr; + int must_memmove = 0; if (dst_page != src_page) { src_kaddr = page_address(src_page); } else { src_kaddr = dst_kaddr; - BUG_ON(areas_overlap(src_off, dst_off, len)); + if (areas_overlap(src_off, dst_off, len)) + must_memmove = 1; } - memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); + if (must_memmove) + memmove(dst_kaddr + dst_off, src_kaddr + src_off, len); + else + memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); } void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, @@ -4365,7 +4814,7 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, "len %lu len %lu\n", dst_offset, len, dst->len); BUG_ON(1); } - if (!areas_overlap(src_offset, dst_offset, len)) { + if (dst_offset < src_offset) { memcpy_extent_buffer(dst, dst_offset, src_offset, len); return; } @@ -4391,47 +4840,48 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, } } -static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head) +int try_release_extent_buffer(struct page *page, gfp_t mask) { - struct extent_buffer *eb = - container_of(head, struct extent_buffer, rcu_head); - - btrfs_release_extent_buffer(eb); -} - -int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page) -{ - u64 start = page_offset(page); struct extent_buffer *eb; - int ret = 1; - spin_lock(&tree->buffer_lock); - eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); - if (!eb) { - spin_unlock(&tree->buffer_lock); - return ret; + /* + * We need to make sure noboody is attaching this page to an eb right + * now. + */ + spin_lock(&page->mapping->private_lock); + if (!PagePrivate(page)) { + spin_unlock(&page->mapping->private_lock); + return 1; } - if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { - ret = 0; - goto out; - } + eb = (struct extent_buffer *)page->private; + BUG_ON(!eb); /* - * set @eb->refs to 0 if it is already 1, and then release the @eb. - * Or go back. + * This is a little awful but should be ok, we need to make sure that + * the eb doesn't disappear out from under us while we're looking at + * this page. */ - if (atomic_cmpxchg(&eb->refs, 1, 0) != 1) { - ret = 0; - goto out; + spin_lock(&eb->refs_lock); + if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) { + spin_unlock(&eb->refs_lock); + spin_unlock(&page->mapping->private_lock); + return 0; } + spin_unlock(&page->mapping->private_lock); - radix_tree_delete(&tree->buffer, start >> PAGE_CACHE_SHIFT); -out: - spin_unlock(&tree->buffer_lock); + if ((mask & GFP_NOFS) == GFP_NOFS) + mask = GFP_NOFS; - /* at this point we can safely release the extent buffer */ - if (atomic_read(&eb->refs) == 0) - call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu); - return ret; + /* + * If tree ref isn't set then we know the ref on this eb is a real ref, + * so just return, this page will likely be freed soon anyway. + */ + if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) { + spin_unlock(&eb->refs_lock); + return 0; + } + release_extent_buffer(eb, mask); + + return 1; } |