From 0ca1f7ceb1991099ed5273885ebcf4323948c72e Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Sun, 16 May 2010 10:48:47 -0400 Subject: Btrfs: Update metadata reservation for delayed allocation Introduce metadata reservation context for delayed allocation and update various related functions. This patch also introduces EXTENT_FIRST_DELALLOC control bit for set/clear_extent_bit. It tells set/clear_bit_hook whether they are processing the first extent_state with EXTENT_DELALLOC bit set. This change is important if set/clear_extent_bit involves multiple extent_state. Signed-off-by: Yan Zheng Signed-off-by: Chris Mason --- fs/btrfs/extent_io.c | 63 +++++++++++++++++++++++++--------------------------- 1 file changed, 30 insertions(+), 33 deletions(-) (limited to 'fs/btrfs/extent_io.c') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d2d03684fab2..1a57c17d4029 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -335,21 +335,18 @@ static int merge_state(struct extent_io_tree *tree, } static int set_state_cb(struct extent_io_tree *tree, - struct extent_state *state, - unsigned long bits) + struct extent_state *state, int *bits) { if (tree->ops && tree->ops->set_bit_hook) { return tree->ops->set_bit_hook(tree->mapping->host, - state->start, state->end, - state->state, bits); + state, bits); } return 0; } static void clear_state_cb(struct extent_io_tree *tree, - struct extent_state *state, - unsigned long bits) + struct extent_state *state, int *bits) { if (tree->ops && tree->ops->clear_bit_hook) tree->ops->clear_bit_hook(tree->mapping->host, state, bits); @@ -367,9 +364,10 @@ static void clear_state_cb(struct extent_io_tree *tree, */ static int insert_state(struct extent_io_tree *tree, struct extent_state *state, u64 start, u64 end, - int bits) + int *bits) { struct rb_node *node; + int bits_to_set = *bits & ~EXTENT_CTLBITS; int ret; if (end < start) { @@ -384,9 +382,9 @@ static int insert_state(struct extent_io_tree *tree, if (ret) return ret; - 
if (bits & EXTENT_DIRTY) + if (bits_to_set & EXTENT_DIRTY) tree->dirty_bytes += end - start + 1; - state->state |= bits; + state->state |= bits_to_set; node = tree_insert(&tree->state, end, &state->rb_node); if (node) { struct extent_state *found; @@ -456,13 +454,13 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig, * struct is freed and removed from the tree */ static int clear_state_bit(struct extent_io_tree *tree, - struct extent_state *state, int bits, int wake, - int delete) + struct extent_state *state, + int *bits, int wake) { - int bits_to_clear = bits & ~EXTENT_DO_ACCOUNTING; + int bits_to_clear = *bits & ~EXTENT_CTLBITS; int ret = state->state & bits_to_clear; - if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { + if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { u64 range = state->end - state->start + 1; WARN_ON(range > tree->dirty_bytes); tree->dirty_bytes -= range; @@ -471,9 +469,8 @@ static int clear_state_bit(struct extent_io_tree *tree, state->state &= ~bits_to_clear; if (wake) wake_up(&state->wq); - if (delete || state->state == 0) { + if (state->state == 0) { if (state->tree) { - clear_state_cb(tree, state, state->state); rb_erase(&state->rb_node, &tree->state); state->tree = NULL; free_extent_state(state); @@ -514,6 +511,10 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int set = 0; int clear = 0; + if (delete) + bits |= ~EXTENT_CTLBITS; + bits |= EXTENT_FIRST_DELALLOC; + if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY)) clear = 1; again: @@ -580,8 +581,7 @@ hit_next: if (err) goto out; if (state->end <= end) { - set |= clear_state_bit(tree, state, bits, wake, - delete); + set |= clear_state_bit(tree, state, &bits, wake); if (last_end == (u64)-1) goto out; start = last_end + 1; @@ -602,7 +602,7 @@ hit_next: if (wake) wake_up(&state->wq); - set |= clear_state_bit(tree, prealloc, bits, wake, delete); + set |= clear_state_bit(tree, prealloc, &bits, wake); prealloc = 
NULL; goto out; @@ -613,7 +613,7 @@ hit_next: else next_node = NULL; - set |= clear_state_bit(tree, state, bits, wake, delete); + set |= clear_state_bit(tree, state, &bits, wake); if (last_end == (u64)-1) goto out; start = last_end + 1; @@ -706,19 +706,19 @@ out: static int set_state_bits(struct extent_io_tree *tree, struct extent_state *state, - int bits) + int *bits) { int ret; + int bits_to_set = *bits & ~EXTENT_CTLBITS; ret = set_state_cb(tree, state, bits); if (ret) return ret; - - if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { + if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { u64 range = state->end - state->start + 1; tree->dirty_bytes += range; } - state->state |= bits; + state->state |= bits_to_set; return 0; } @@ -757,6 +757,7 @@ static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u64 last_start; u64 last_end; + bits |= EXTENT_FIRST_DELALLOC; again: if (!prealloc && (mask & __GFP_WAIT)) { prealloc = alloc_extent_state(mask); @@ -778,7 +779,7 @@ again: */ node = tree_search(tree, start); if (!node) { - err = insert_state(tree, prealloc, start, end, bits); + err = insert_state(tree, prealloc, start, end, &bits); prealloc = NULL; BUG_ON(err == -EEXIST); goto out; @@ -802,7 +803,7 @@ hit_next: goto out; } - err = set_state_bits(tree, state, bits); + err = set_state_bits(tree, state, &bits); if (err) goto out; @@ -852,7 +853,7 @@ hit_next: if (err) goto out; if (state->end <= end) { - err = set_state_bits(tree, state, bits); + err = set_state_bits(tree, state, &bits); if (err) goto out; cache_state(state, cached_state); @@ -877,7 +878,7 @@ hit_next: else this_end = last_start - 1; err = insert_state(tree, prealloc, start, this_end, - bits); + &bits); BUG_ON(err == -EEXIST); if (err) { prealloc = NULL; @@ -903,7 +904,7 @@ hit_next: err = split_state(tree, state, prealloc, end + 1); BUG_ON(err == -EEXIST); - err = set_state_bits(tree, prealloc, bits); + err = set_state_bits(tree, prealloc, &bits); if 
(err) { prealloc = NULL; goto out; @@ -966,8 +967,7 @@ int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, { return clear_extent_bit(tree, start, end, EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING, 0, 0, - NULL, mask); + EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask); } int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, @@ -1435,9 +1435,6 @@ int extent_clear_unlock_delalloc(struct inode *inode, if (op & EXTENT_CLEAR_DELALLOC) clear_bits |= EXTENT_DELALLOC; - if (op & EXTENT_CLEAR_ACCOUNTING) - clear_bits |= EXTENT_DO_ACCOUNTING; - clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS); if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK | -- cgit v1.2.3 From 11c65dccf70be9ace5dbd3906778e1a099b1fee1 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Sun, 23 May 2010 11:07:21 -0400 Subject: Btrfs: do aio_write instead of write In order for AIO to work, we need to implement aio_write. This patch converts our btrfs_file_write to btrfs_aio_write. I've tested this with xfstests and nothing broke, and the AIO stuff magically started working. 
Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent_io.c | 11 +++- fs/btrfs/file.c | 176 +++++++++++++++++++++++++++------------------------ 2 files changed, 104 insertions(+), 83 deletions(-) (limited to 'fs/btrfs/extent_io.c') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 1a57c17d4029..a53aca338c7f 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2017,6 +2017,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree, sector_t sector; struct extent_map *em; struct block_device *bdev; + struct btrfs_ordered_extent *ordered; int ret; int nr = 0; size_t page_offset = 0; @@ -2028,7 +2029,15 @@ static int __extent_read_full_page(struct extent_io_tree *tree, set_page_extent_mapped(page); end = page_end; - lock_extent(tree, start, end, GFP_NOFS); + while (1) { + lock_extent(tree, start, end, GFP_NOFS); + ordered = btrfs_lookup_ordered_extent(inode, start); + if (!ordered) + break; + unlock_extent(tree, start, end, GFP_NOFS); + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + } if (page->index == last_byte >> PAGE_CACHE_SHIFT) { char *userpage; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index a28810abfb98..233aea2e5ef2 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -46,32 +46,42 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, int write_bytes, struct page **prepared_pages, - const char __user *buf) + struct iov_iter *i) { - long page_fault = 0; - int i; + size_t copied; + int pg = 0; int offset = pos & (PAGE_CACHE_SIZE - 1); - for (i = 0; i < num_pages && write_bytes > 0; i++, offset = 0) { + while (write_bytes > 0) { size_t count = min_t(size_t, PAGE_CACHE_SIZE - offset, write_bytes); - struct page *page = prepared_pages[i]; - fault_in_pages_readable(buf, count); + struct page *page = prepared_pages[pg]; +again: + if (unlikely(iov_iter_fault_in_readable(i, count))) + return -EFAULT; /* Copy data from userspace to the current page 
*/ - kmap(page); - page_fault = __copy_from_user(page_address(page) + offset, - buf, count); + copied = iov_iter_copy_from_user(page, i, offset, count); + /* Flush processor's dcache for this page */ flush_dcache_page(page); - kunmap(page); - buf += count; - write_bytes -= count; + iov_iter_advance(i, copied); + write_bytes -= copied; - if (page_fault) - break; + if (unlikely(copied == 0)) { + count = min_t(size_t, PAGE_CACHE_SIZE - offset, + iov_iter_single_seg_count(i)); + goto again; + } + + if (unlikely(copied < PAGE_CACHE_SIZE - offset)) { + offset += copied; + } else { + pg++; + offset = 0; + } } - return page_fault ? -EFAULT : 0; + return 0; } /* @@ -822,60 +832,24 @@ again: return 0; } -/* Copied from read-write.c */ -static void wait_on_retry_sync_kiocb(struct kiocb *iocb) -{ - set_current_state(TASK_UNINTERRUPTIBLE); - if (!kiocbIsKicked(iocb)) - schedule(); - else - kiocbClearKicked(iocb); - __set_current_state(TASK_RUNNING); -} - -/* - * Just a copy of what do_sync_write does. 
- */ -static ssize_t __btrfs_direct_write(struct file *file, const char __user *buf, - size_t count, loff_t pos, loff_t *ppos) +static ssize_t btrfs_file_aio_write(struct kiocb *iocb, + const struct iovec *iov, + unsigned long nr_segs, loff_t pos) { - struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = count }; - unsigned long nr_segs = 1; - struct kiocb kiocb; - ssize_t ret; - - init_sync_kiocb(&kiocb, file); - kiocb.ki_pos = pos; - kiocb.ki_left = count; - kiocb.ki_nbytes = count; - - while (1) { - ret = generic_file_direct_write(&kiocb, &iov, &nr_segs, pos, - ppos, count, count); - if (ret != -EIOCBRETRY) - break; - wait_on_retry_sync_kiocb(&kiocb); - } - - if (ret == -EIOCBQUEUED) - ret = wait_on_sync_kiocb(&kiocb); - *ppos = kiocb.ki_pos; - return ret; -} - -static ssize_t btrfs_file_write(struct file *file, const char __user *buf, - size_t count, loff_t *ppos) -{ - loff_t pos; + struct file *file = iocb->ki_filp; + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct page *pinned[2]; + struct page **pages = NULL; + struct iov_iter i; + loff_t *ppos = &iocb->ki_pos; loff_t start_pos; ssize_t num_written = 0; ssize_t err = 0; + size_t count; + size_t ocount; int ret = 0; - struct inode *inode = fdentry(file)->d_inode; - struct btrfs_root *root = BTRFS_I(inode)->root; - struct page **pages = NULL; int nrptrs; - struct page *pinned[2]; unsigned long first_index; unsigned long last_index; int will_write; @@ -887,13 +861,17 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, pinned[0] = NULL; pinned[1] = NULL; - pos = *ppos; start_pos = pos; vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); mutex_lock(&inode->i_mutex); + err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ); + if (err) + goto out; + count = ocount; + current->backing_dev_info = inode->i_mapping->backing_dev_info; err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); if (err) @@ 
-910,14 +888,48 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, BTRFS_I(inode)->sequence++; if (unlikely(file->f_flags & O_DIRECT)) { - num_written = __btrfs_direct_write(file, buf, count, pos, - ppos); - pos += num_written; - count -= num_written; + ret = btrfs_delalloc_reserve_space(inode, count); + if (ret) + goto out; - /* We've written everything we wanted to, exit */ - if (num_written < 0 || !count) + num_written = generic_file_direct_write(iocb, iov, &nr_segs, + pos, ppos, count, + ocount); + + /* + * the generic O_DIRECT will update in-memory i_size after the + * DIOs are done. But our endio handlers that update the on + * disk i_size never update past the in memory i_size. So we + * need one more update here to catch any additions to the + * file + */ + if (inode->i_size != BTRFS_I(inode)->disk_i_size) { + btrfs_ordered_update_i_size(inode, inode->i_size, NULL); + mark_inode_dirty(inode); + } + + if (num_written < 0) { + if (num_written != -EIOCBQUEUED) { + /* + * aio land will take care of releasing the + * delalloc + */ + btrfs_delalloc_release_space(inode, count); + } + ret = num_written; + num_written = 0; goto out; + } else if (num_written == count) { + /* pick up pos changes done by the generic code */ + pos = *ppos; + goto out; + } + + /* + * the buffered IO will reserve bytes for the rest of the + * range, don't double count them here + */ + btrfs_delalloc_release_space(inode, count - num_written); /* * We are going to do buffered for the rest of the range, so we @@ -925,18 +937,20 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, * done. 
*/ buffered = 1; - buf += num_written; + pos += num_written; } - nrptrs = min((count + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE, - PAGE_CACHE_SIZE / (sizeof(struct page *))); + iov_iter_init(&i, iov, nr_segs, count, num_written); + nrptrs = min((iov_iter_count(&i) + PAGE_CACHE_SIZE - 1) / + PAGE_CACHE_SIZE, PAGE_CACHE_SIZE / + (sizeof(struct page *))); pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); /* generic_write_checks can change our pos */ start_pos = pos; first_index = pos >> PAGE_CACHE_SHIFT; - last_index = (pos + count) >> PAGE_CACHE_SHIFT; + last_index = (pos + iov_iter_count(&i)) >> PAGE_CACHE_SHIFT; /* * there are lots of better ways to do this, but this code @@ -953,7 +967,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, unlock_page(pinned[0]); } } - if ((pos + count) & (PAGE_CACHE_SIZE - 1)) { + if ((pos + iov_iter_count(&i)) & (PAGE_CACHE_SIZE - 1)) { pinned[1] = grab_cache_page(inode->i_mapping, last_index); if (!PageUptodate(pinned[1])) { ret = btrfs_readpage(NULL, pinned[1]); @@ -964,10 +978,10 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, } } - while (count > 0) { + while (iov_iter_count(&i) > 0) { size_t offset = pos & (PAGE_CACHE_SIZE - 1); - size_t write_bytes = min(count, nrptrs * - (size_t)PAGE_CACHE_SIZE - + size_t write_bytes = min(iov_iter_count(&i), + nrptrs * (size_t)PAGE_CACHE_SIZE - offset); size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; @@ -988,7 +1002,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, } ret = btrfs_copy_from_user(pos, num_pages, - write_bytes, pages, buf); + write_bytes, pages, &i); if (ret == 0) { dirty_and_release_pages(NULL, root, file, pages, num_pages, pos, write_bytes); @@ -1012,8 +1026,6 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, btrfs_throttle(root); } - buf += write_bytes; - count -= write_bytes; pos += write_bytes; num_written += write_bytes; 
@@ -1206,7 +1218,7 @@ const struct file_operations btrfs_file_operations = { .read = do_sync_read, .aio_read = generic_file_aio_read, .splice_read = generic_file_splice_read, - .write = btrfs_file_write, + .aio_write = btrfs_file_aio_write, .mmap = btrfs_file_mmap, .open = generic_file_open, .release = btrfs_release_file, -- cgit v1.2.3 From eaf25d933e64c2bf3c79b83e8820404f36fdfc52 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 25 May 2010 09:48:28 -0400 Subject: Btrfs: use async helpers for DIO write checksumming The async helper threads offload crc work onto all the CPUs, and make streaming writes much faster. This changes the O_DIRECT write code to use them. The only small complication was that we need to pass in the logical offset in the file for each bio, because we can't find it in the bio's pages. Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 23 ++++++++++++++++++----- fs/btrfs/disk-io.h | 2 +- fs/btrfs/extent_io.c | 2 +- fs/btrfs/extent_io.h | 2 +- fs/btrfs/inode.c | 40 +++++++++++++++++++++++++++++++--------- 5 files changed, 52 insertions(+), 17 deletions(-) (limited to 'fs/btrfs/extent_io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a8772b5a9cb5..f3b287c22caf 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -74,6 +74,11 @@ struct async_submit_bio { int rw; int mirror_num; unsigned long bio_flags; + /* + * bio_offset is optional, can be used if the pages in the bio + * can't tell us where in the file the bio should go + */ + u64 bio_offset; struct btrfs_work work; }; @@ -534,7 +539,8 @@ static void run_one_async_start(struct btrfs_work *work) async = container_of(work, struct async_submit_bio, work); fs_info = BTRFS_I(async->inode)->root->fs_info; async->submit_bio_start(async->inode, async->rw, async->bio, - async->mirror_num, async->bio_flags); + async->mirror_num, async->bio_flags, + async->bio_offset); } static void run_one_async_done(struct btrfs_work *work) @@ -556,7 +562,8 @@ static void 
run_one_async_done(struct btrfs_work *work) wake_up(&fs_info->async_submit_wait); async->submit_bio_done(async->inode, async->rw, async->bio, - async->mirror_num, async->bio_flags); + async->mirror_num, async->bio_flags, + async->bio_offset); } static void run_one_async_free(struct btrfs_work *work) @@ -570,6 +577,7 @@ static void run_one_async_free(struct btrfs_work *work) int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, int rw, struct bio *bio, int mirror_num, unsigned long bio_flags, + u64 bio_offset, extent_submit_bio_hook_t *submit_bio_start, extent_submit_bio_hook_t *submit_bio_done) { @@ -592,6 +600,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, async->work.flags = 0; async->bio_flags = bio_flags; + async->bio_offset = bio_offset; atomic_inc(&fs_info->nr_async_submits); @@ -627,7 +636,8 @@ static int btree_csum_one_bio(struct bio *bio) static int __btree_submit_bio_start(struct inode *inode, int rw, struct bio *bio, int mirror_num, - unsigned long bio_flags) + unsigned long bio_flags, + u64 bio_offset) { /* * when we're called for a write, we're already in the async @@ -638,7 +648,8 @@ static int __btree_submit_bio_start(struct inode *inode, int rw, } static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio, - int mirror_num, unsigned long bio_flags) + int mirror_num, unsigned long bio_flags, + u64 bio_offset) { /* * when we're called for a write, we're already in the async @@ -648,7 +659,8 @@ static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio, } static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, - int mirror_num, unsigned long bio_flags) + int mirror_num, unsigned long bio_flags, + u64 bio_offset) { int ret; @@ -671,6 +683,7 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, */ return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, inode, rw, bio, mirror_num, 0, + bio_offset, 
__btree_submit_bio_start, __btree_submit_bio_done); } diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 2c064eba6f09..88e825a0bf21 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -87,7 +87,7 @@ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, int metadata); int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, int rw, struct bio *bio, int mirror_num, - unsigned long bio_flags, + unsigned long bio_flags, u64 bio_offset, extent_submit_bio_hook_t *submit_bio_start, extent_submit_bio_hook_t *submit_bio_done); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index a53aca338c7f..15392af21bfb 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1913,7 +1913,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num, if (tree->ops && tree->ops->submit_bio_hook) tree->ops->submit_bio_hook(page->mapping->host, rw, bio, - mirror_num, bio_flags); + mirror_num, bio_flags, start); else submit_bio(rw, bio); if (bio_flagged(bio, BIO_EOPNOTSUPP)) diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 86f10dc791d9..86c7b341d070 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -49,7 +49,7 @@ struct extent_state; typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw, struct bio *bio, int mirror_num, - unsigned long bio_flags); + unsigned long bio_flags, u64 bio_offset); struct extent_io_ops { int (*fill_delalloc)(struct inode *inode, struct page *locked_page, u64 start, u64 end, int *page_started, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1695440a59a4..13a4aa222861 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1385,7 +1385,8 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset, */ static int __btrfs_submit_bio_start(struct inode *inode, int rw, struct bio *bio, int mirror_num, - unsigned long bio_flags) + unsigned long bio_flags, + u64 bio_offset) { struct btrfs_root *root = BTRFS_I(inode)->root; int ret = 0; @@ 
-1404,7 +1405,8 @@ static int __btrfs_submit_bio_start(struct inode *inode, int rw, * are inserted into the btree */ static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio, - int mirror_num, unsigned long bio_flags) + int mirror_num, unsigned long bio_flags, + u64 bio_offset) { struct btrfs_root *root = BTRFS_I(inode)->root; return btrfs_map_bio(root, rw, bio, mirror_num, 1); @@ -1415,7 +1417,8 @@ static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio, * on write, or reading the csums from the tree before a read */ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, - int mirror_num, unsigned long bio_flags) + int mirror_num, unsigned long bio_flags, + u64 bio_offset) { struct btrfs_root *root = BTRFS_I(inode)->root; int ret = 0; @@ -1440,7 +1443,8 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, /* we're doing a write, do the async checksumming */ return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, inode, rw, bio, mirror_num, - bio_flags, __btrfs_submit_bio_start, + bio_flags, bio_offset, + __btrfs_submit_bio_start, __btrfs_submit_bio_done); } @@ -1844,7 +1848,7 @@ static int btrfs_io_failed_hook(struct bio *failed_bio, BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio, failrec->last_mirror, - failrec->bio_flags); + failrec->bio_flags, 0); return 0; } @@ -5484,6 +5488,17 @@ out_done: dio_end_io(bio, err); } +static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw, + struct bio *bio, int mirror_num, + unsigned long bio_flags, u64 offset) +{ + int ret; + struct btrfs_root *root = BTRFS_I(inode)->root; + ret = btrfs_csum_one_bio(root, inode, bio, offset, 1); + BUG_ON(ret); + return 0; +} + static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode, loff_t file_offset) { @@ -5535,13 +5550,20 @@ static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode, if (ret) goto out_err; - if (write && 
!skip_sum) - btrfs_csum_one_bio(root, inode, bio, dip->logical_offset, 1); - else if (!skip_sum) + if (write && !skip_sum) { + ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, + inode, rw, bio, 0, 0, + dip->logical_offset, + __btrfs_submit_bio_start_direct_io, + __btrfs_submit_bio_done); + if (ret) + goto out_err; + return; + } else if (!skip_sum) btrfs_lookup_bio_sums_dio(root, inode, bio, dip->logical_offset, dip->csums); - ret = btrfs_map_bio(root, rw, bio, 0, 0); + ret = btrfs_map_bio(root, rw, bio, 0, 1); if (ret) goto out_err; return; -- cgit v1.2.3 From 4845e44ffdb26be9b25610664228e8ecaf949a0d Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 25 May 2010 20:56:50 -0400 Subject: Btrfs: rework O_DIRECT enospc handling This changes O_DIRECT write code to mark extents as delalloc while it is processing them. Yan Zheng has reworked the enospc accounting based on tracking delalloc extents and this makes it much easier to track enospc in the O_DIRECT code. There are a few special cases with the O_DIRECT code though: it only sets the EXTENT_DELALLOC bits, instead of doing EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE, because we don't want to mess with clearing the dirty and uptodate bits when things go wrong. This is important because there are no pages in the page cache, so any extent state structs that we put in the tree won't get freed by releasepage. We have to clear them ourselves as the DIO ends. With this commit, we reserve space in btrfs_file_aio_write, and then as each btrfs_direct_IO call progresses it sets EXTENT_DELALLOC on the range. btrfs_get_blocks_direct is responsible for clearing the delalloc at the same time it drops the extent lock. 
Signed-off-by: Chris Mason --- fs/btrfs/extent_io.c | 9 ++++----- fs/btrfs/extent_io.h | 4 ++++ fs/btrfs/file.c | 14 -------------- fs/btrfs/inode.c | 52 +++++++++++++++++++++++++++++++++++++++++----------- 4 files changed, 49 insertions(+), 30 deletions(-) (limited to 'fs/btrfs/extent_io.c') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 15392af21bfb..a4080c21ec55 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -135,7 +135,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask) return state; } -static void free_extent_state(struct extent_state *state) +void free_extent_state(struct extent_state *state) { if (!state) return; @@ -745,10 +745,9 @@ static void cache_state(struct extent_state *state, * [start, end] is inclusive This takes the tree lock. */ -static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - int bits, int exclusive_bits, u64 *failed_start, - struct extent_state **cached_state, - gfp_t mask) +int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + int bits, int exclusive_bits, u64 *failed_start, + struct extent_state **cached_state, gfp_t mask) { struct extent_state *state; struct extent_state *prealloc = NULL; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 86c7b341d070..5691c7b590da 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -178,6 +178,7 @@ u64 count_range_bits(struct extent_io_tree *tree, u64 *start, u64 search_end, u64 max_bytes, unsigned long bits); +void free_extent_state(struct extent_state *state); int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits, int filled, struct extent_state *cached_state); int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, @@ -187,6 +188,9 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, int bits, gfp_t mask); +int set_extent_bit(struct extent_io_tree *tree, 
u64 start, u64 end, + int bits, int exclusive_bits, u64 *failed_start, + struct extent_state **cached_state, gfp_t mask); int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 233aea2e5ef2..54556cae4497 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -909,13 +909,6 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, } if (num_written < 0) { - if (num_written != -EIOCBQUEUED) { - /* - * aio land will take care of releasing the - * delalloc - */ - btrfs_delalloc_release_space(inode, count); - } ret = num_written; num_written = 0; goto out; @@ -924,13 +917,6 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, pos = *ppos; goto out; } - - /* - * the buffered IO will reserve bytes for the rest of the - * range, don't double count them here - */ - btrfs_delalloc_release_space(inode, count - num_written); - /* * We are going to do buffered for the rest of the range, so we * need to make sure to invalidate the buffered pages when we're diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 13a4aa222861..00aefbdcc2df 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5327,8 +5327,9 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, return PTR_ERR(em); len = min(len, em->block_len); } - unlock_extent(&BTRFS_I(inode)->io_tree, start, start + len - 1, - GFP_NOFS); + clear_extent_bit(&BTRFS_I(inode)->io_tree, start, start + len - 1, + EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DIRTY, 1, + 0, NULL, GFP_NOFS); map: bh_result->b_blocknr = (em->block_start + (start - em->start)) >> inode->i_blkbits; @@ -5596,14 +5597,18 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; struct btrfs_ordered_extent *ordered; + struct extent_state *cached_state = NULL; u64 lockstart, lockend; ssize_t ret; + int 
writing = rw & WRITE; + int write_bits = 0; lockstart = offset; lockend = offset + iov_length(iov, nr_segs) - 1; + while (1) { - lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, - GFP_NOFS); + lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, + 0, &cached_state, GFP_NOFS); /* * We're concerned with the entire range that we're going to be * doing DIO to, so we need to make sure theres no ordered @@ -5613,29 +5618,54 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, lockend - lockstart + 1); if (!ordered) break; - unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, - GFP_NOFS); + unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, + &cached_state, GFP_NOFS); btrfs_start_ordered_extent(inode, ordered, 1); btrfs_put_ordered_extent(ordered); cond_resched(); } + /* + * we don't use btrfs_set_extent_delalloc because we don't want + * the dirty or uptodate bits + */ + if (writing) { + write_bits = EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING; + ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, + EXTENT_DELALLOC, 0, NULL, &cached_state, + GFP_NOFS); + if (ret) { + clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, + lockend, EXTENT_LOCKED | write_bits, + 1, 0, &cached_state, GFP_NOFS); + goto out; + } + } + + free_extent_state(cached_state); + cached_state = NULL; + ret = __blockdev_direct_IO(rw, iocb, inode, NULL, iov, offset, nr_segs, btrfs_get_blocks_direct, NULL, btrfs_submit_direct, 0); if (ret < 0 && ret != -EIOCBQUEUED) { - unlock_extent(&BTRFS_I(inode)->io_tree, offset, - offset + iov_length(iov, nr_segs) - 1, GFP_NOFS); + clear_extent_bit(&BTRFS_I(inode)->io_tree, offset, + offset + iov_length(iov, nr_segs) - 1, + EXTENT_LOCKED | write_bits, 1, 0, + &cached_state, GFP_NOFS); } else if (ret >= 0 && ret < iov_length(iov, nr_segs)) { /* * We're falling back to buffered, unlock the section we didn't * do IO on. 
*/ - unlock_extent(&BTRFS_I(inode)->io_tree, offset + ret, - offset + iov_length(iov, nr_segs) - 1, GFP_NOFS); + clear_extent_bit(&BTRFS_I(inode)->io_tree, offset + ret, + offset + iov_length(iov, nr_segs) - 1, + EXTENT_LOCKED | write_bits, 1, 0, + &cached_state, GFP_NOFS); } - +out: + free_extent_state(cached_state); return ret; } -- cgit v1.2.3