diff options
Diffstat (limited to 'fs/btrfs/inode.c')
| -rw-r--r-- | fs/btrfs/inode.c | 915 |
1 files changed, 542 insertions, 373 deletions
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 4439fbb4ff45..2aaba58b4856 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -66,6 +66,13 @@ struct btrfs_iget_args { struct btrfs_root *root; }; +struct btrfs_dio_data { + u64 outstanding_extents; + u64 reserve; + u64 unsubmitted_oe_range_start; + u64 unsubmitted_oe_range_end; +}; + static const struct inode_operations btrfs_dir_inode_operations; static const struct inode_operations btrfs_symlink_inode_operations; static const struct inode_operations btrfs_dir_ro_inode_operations; @@ -74,17 +81,16 @@ static const struct inode_operations btrfs_file_inode_operations; static const struct address_space_operations btrfs_aops; static const struct address_space_operations btrfs_symlink_aops; static const struct file_operations btrfs_dir_file_operations; -static struct extent_io_ops btrfs_extent_io_ops; +static const struct extent_io_ops btrfs_extent_io_ops; static struct kmem_cache *btrfs_inode_cachep; -static struct kmem_cache *btrfs_delalloc_work_cachep; struct kmem_cache *btrfs_trans_handle_cachep; struct kmem_cache *btrfs_transaction_cachep; struct kmem_cache *btrfs_path_cachep; struct kmem_cache *btrfs_free_space_cachep; #define S_SHIFT 12 -static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = { +static const unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = { [S_IFREG >> S_SHIFT] = BTRFS_FT_REG_FILE, [S_IFDIR >> S_SHIFT] = BTRFS_FT_DIR, [S_IFCHR >> S_SHIFT] = BTRFS_FT_CHRDEV, @@ -188,7 +194,7 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, while (compressed_size > 0) { cpage = compressed_pages[i]; cur_size = min_t(unsigned long, compressed_size, - PAGE_CACHE_SIZE); + PAGE_SIZE); kaddr = kmap_atomic(cpage); write_extent_buffer(leaf, kaddr, ptr, cur_size); @@ -202,13 +208,13 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, compress_type); } else { page = find_get_page(inode->i_mapping, - start >> PAGE_CACHE_SHIFT); + start >> PAGE_SHIFT); 
btrfs_set_file_extent_compression(leaf, ei, 0); kaddr = kmap_atomic(page); - offset = start & (PAGE_CACHE_SIZE - 1); + offset = start & (PAGE_SIZE - 1); write_extent_buffer(leaf, kaddr + offset, ptr, size); kunmap_atomic(kaddr); - page_cache_release(page); + put_page(page); } btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); @@ -257,7 +263,7 @@ static noinline int cow_file_range_inline(struct btrfs_root *root, data_len = compressed_size; if (start > 0 || - actual_end > PAGE_CACHE_SIZE || + actual_end > root->sectorsize || data_len > BTRFS_MAX_INLINE_DATA_SIZE(root) || (!compressed_size && (actual_end & (root->sectorsize - 1)) == 0) || @@ -316,7 +322,7 @@ out: * And at reserve time, it's always aligned to page size, so * just free one page here. */ - btrfs_qgroup_free_data(inode, 0, PAGE_CACHE_SIZE); + btrfs_qgroup_free_data(inode, 0, PAGE_SIZE); btrfs_free_path(path); btrfs_end_transaction(trans, root); return ret; @@ -414,23 +420,23 @@ static noinline void compress_file_range(struct inode *inode, unsigned long nr_pages_ret = 0; unsigned long total_compressed = 0; unsigned long total_in = 0; - unsigned long max_compressed = 128 * 1024; - unsigned long max_uncompressed = 128 * 1024; + unsigned long max_compressed = SZ_128K; + unsigned long max_uncompressed = SZ_128K; int i; int will_compress; int compress_type = root->fs_info->compress_type; int redirty = 0; /* if this is a small write inside eof, kick off a defrag */ - if ((end - start + 1) < 16 * 1024 && + if ((end - start + 1) < SZ_16K && (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) btrfs_add_inode_defrag(NULL, inode); actual_end = min_t(u64, isize, end + 1); again: will_compress = 0; - nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1; - nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE); + nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1; + nr_pages = min_t(unsigned long, nr_pages, SZ_128K / PAGE_SIZE); /* * we don't want to send crud past the end of i_size 
through @@ -508,7 +514,7 @@ again: if (!ret) { unsigned long offset = total_compressed & - (PAGE_CACHE_SIZE - 1); + (PAGE_SIZE - 1); struct page *page = pages[nr_pages_ret - 1]; char *kaddr; @@ -518,7 +524,7 @@ again: if (offset) { kaddr = kmap_atomic(page); memset(kaddr + offset, 0, - PAGE_CACHE_SIZE - offset); + PAGE_SIZE - offset); kunmap_atomic(kaddr); } will_compress = 1; @@ -574,7 +580,7 @@ cont: * one last check to make sure the compression is really a * win, compare the page count read with the blocks on disk */ - total_in = ALIGN(total_in, PAGE_CACHE_SIZE); + total_in = ALIGN(total_in, PAGE_SIZE); if (total_compressed >= total_in) { will_compress = 0; } else { @@ -588,7 +594,7 @@ cont: */ for (i = 0; i < nr_pages_ret; i++) { WARN_ON(pages[i]->mapping); - page_cache_release(pages[i]); + put_page(pages[i]); } kfree(pages); pages = NULL; @@ -644,7 +650,7 @@ cleanup_and_bail_uncompressed: free_pages_out: for (i = 0; i < nr_pages_ret; i++) { WARN_ON(pages[i]->mapping); - page_cache_release(pages[i]); + put_page(pages[i]); } kfree(pages); } @@ -658,7 +664,7 @@ static void free_async_extent_pages(struct async_extent *async_extent) for (i = 0; i < async_extent->nr_pages; i++) { WARN_ON(async_extent->pages[i]->mapping); - page_cache_release(async_extent->pages[i]); + put_page(async_extent->pages[i]); } kfree(async_extent->pages); async_extent->nr_pages = 0; @@ -944,7 +950,7 @@ static noinline int cow_file_range(struct inode *inode, disk_num_bytes = num_bytes; /* if this is a small write inside eof, kick off defrag */ - if (num_bytes < 64 * 1024 && + if (num_bytes < SZ_64K && (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) btrfs_add_inode_defrag(NULL, inode); @@ -960,7 +966,7 @@ static noinline int cow_file_range(struct inode *inode, PAGE_END_WRITEBACK); *nr_written = *nr_written + - (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE; + (end - start + PAGE_SIZE) / PAGE_SIZE; *page_started = 1; goto out; } else if (ret < 0) { @@ -1100,14 +1106,14 @@ static 
noinline void async_cow_submit(struct btrfs_work *work) async_cow = container_of(work, struct async_cow, work); root = async_cow->root; - nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >> - PAGE_CACHE_SHIFT; + nr_pages = (async_cow->end - async_cow->start + PAGE_SIZE) >> + PAGE_SHIFT; /* * atomic_sub_return implies a barrier for waitqueue_active */ if (atomic_sub_return(nr_pages, &root->fs_info->async_delalloc_pages) < - 5 * 1024 * 1024 && + 5 * SZ_1M && waitqueue_active(&root->fs_info->async_submit_wait)) wake_up(&root->fs_info->async_submit_wait); @@ -1132,7 +1138,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, struct btrfs_root *root = BTRFS_I(inode)->root; unsigned long nr_pages; u64 cur_end; - int limit = 10 * 1024 * 1024; + int limit = 10 * SZ_1M; clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS); @@ -1148,7 +1154,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, !btrfs_test_opt(root, FORCE_COMPRESS)) cur_end = end; else - cur_end = min(end, start + 512 * 1024 - 1); + cur_end = min(end, start + SZ_512K - 1); async_cow->end = cur_end; INIT_LIST_HEAD(&async_cow->extents); @@ -1158,8 +1164,8 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, async_cow_start, async_cow_submit, async_cow_free); - nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >> - PAGE_CACHE_SHIFT; + nr_pages = (cur_end - start + PAGE_SIZE) >> + PAGE_SHIFT; atomic_add(nr_pages, &root->fs_info->async_delalloc_pages); btrfs_queue_work(root->fs_info->delalloc_workers, @@ -1304,8 +1310,14 @@ next_slot: num_bytes = 0; btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - if (found_key.objectid > ino || - found_key.type > BTRFS_EXTENT_DATA_KEY || + if (found_key.objectid > ino) + break; + if (WARN_ON_ONCE(found_key.objectid < ino) || + found_key.type < BTRFS_EXTENT_DATA_KEY) { + path->slots[0]++; + goto next_slot; + } + if (found_key.type > 
BTRFS_EXTENT_DATA_KEY || found_key.offset > end) break; @@ -1948,7 +1960,7 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans, int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, struct extent_state **cached_state) { - WARN_ON((end & (PAGE_CACHE_SIZE - 1)) == 0); + WARN_ON((end & (PAGE_SIZE - 1)) == 0); return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end, cached_state, GFP_NOFS); } @@ -1981,16 +1993,17 @@ again: inode = page->mapping->host; page_start = page_offset(page); - page_end = page_offset(page) + PAGE_CACHE_SIZE - 1; + page_end = page_offset(page) + PAGE_SIZE - 1; - lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 0, + lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, &cached_state); /* already ordered? We're done */ if (PagePrivate2(page)) goto out; - ordered = btrfs_lookup_ordered_extent(inode, page_start); + ordered = btrfs_lookup_ordered_range(inode, page_start, + PAGE_SIZE); if (ordered) { unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end, &cached_state, GFP_NOFS); @@ -2001,7 +2014,7 @@ again: } ret = btrfs_delalloc_reserve_space(inode, page_start, - PAGE_CACHE_SIZE); + PAGE_SIZE); if (ret) { mapping_set_error(page->mapping, ret); end_extent_writepage(page, ret, page_start, page_end); @@ -2017,7 +2030,7 @@ out: &cached_state, GFP_NOFS); out_page: unlock_page(page); - page_cache_release(page); + put_page(page); kfree(fixup); } @@ -2050,7 +2063,7 @@ static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end) return -EAGAIN; SetPageChecked(page); - page_cache_get(page); + get_page(page); btrfs_init_work(&fixup->work, btrfs_fixup_helper, btrfs_writepage_fixup_worker, NULL, NULL); fixup->page = page; @@ -2476,7 +2489,7 @@ static noinline int relink_extent_backref(struct btrfs_path *path, lock_start = backref->file_pos; lock_end = backref->file_pos + backref->num_bytes - 1; lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, lock_end, - 
0, &cached); + &cached); ordered = btrfs_lookup_first_ordered_extent(inode, lock_end); if (ordered) { @@ -2868,7 +2881,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) lock_extent_bits(io_tree, ordered_extent->file_offset, ordered_extent->file_offset + ordered_extent->len - 1, - 0, &cached_state); + &cached_state); ret = test_range_bit(io_tree, ordered_extent->file_offset, ordered_extent->file_offset + ordered_extent->len - 1, @@ -3100,56 +3113,46 @@ static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio, start, (size_t)(end - start + 1)); } -struct delayed_iput { - struct list_head list; - struct inode *inode; -}; - -/* JDM: If this is fs-wide, why can't we add a pointer to - * btrfs_inode instead and avoid the allocation? */ void btrfs_add_delayed_iput(struct inode *inode) { struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; - struct delayed_iput *delayed; + struct btrfs_inode *binode = BTRFS_I(inode); if (atomic_add_unless(&inode->i_count, -1, 1)) return; - delayed = kmalloc(sizeof(*delayed), GFP_NOFS | __GFP_NOFAIL); - delayed->inode = inode; - spin_lock(&fs_info->delayed_iput_lock); - list_add_tail(&delayed->list, &fs_info->delayed_iputs); + if (binode->delayed_iput_count == 0) { + ASSERT(list_empty(&binode->delayed_iput)); + list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs); + } else { + binode->delayed_iput_count++; + } spin_unlock(&fs_info->delayed_iput_lock); } void btrfs_run_delayed_iputs(struct btrfs_root *root) { - LIST_HEAD(list); struct btrfs_fs_info *fs_info = root->fs_info; - struct delayed_iput *delayed; - int empty; - - spin_lock(&fs_info->delayed_iput_lock); - empty = list_empty(&fs_info->delayed_iputs); - spin_unlock(&fs_info->delayed_iput_lock); - if (empty) - return; - - down_read(&fs_info->delayed_iput_sem); spin_lock(&fs_info->delayed_iput_lock); - list_splice_init(&fs_info->delayed_iputs, &list); - spin_unlock(&fs_info->delayed_iput_lock); - - while (!list_empty(&list)) 
{ - delayed = list_entry(list.next, struct delayed_iput, list); - list_del(&delayed->list); - iput(delayed->inode); - kfree(delayed); + while (!list_empty(&fs_info->delayed_iputs)) { + struct btrfs_inode *inode; + + inode = list_first_entry(&fs_info->delayed_iputs, + struct btrfs_inode, delayed_iput); + if (inode->delayed_iput_count) { + inode->delayed_iput_count--; + list_move_tail(&inode->delayed_iput, + &fs_info->delayed_iputs); + } else { + list_del_init(&inode->delayed_iput); + } + spin_unlock(&fs_info->delayed_iput_lock); + iput(&inode->vfs_inode); + spin_lock(&fs_info->delayed_iput_lock); } - - up_read(&root->fs_info->delayed_iput_sem); + spin_unlock(&fs_info->delayed_iput_lock); } /* @@ -3345,7 +3348,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) ret = -ENOMEM; goto out; } - path->reada = -1; + path->reada = READA_BACK; key.objectid = BTRFS_ORPHAN_OBJECTID; key.type = BTRFS_ORPHAN_ITEM_KEY; @@ -3544,10 +3547,10 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf, int scanned = 0; if (!xattr_access) { - xattr_access = btrfs_name_hash(POSIX_ACL_XATTR_ACCESS, - strlen(POSIX_ACL_XATTR_ACCESS)); - xattr_default = btrfs_name_hash(POSIX_ACL_XATTR_DEFAULT, - strlen(POSIX_ACL_XATTR_DEFAULT)); + xattr_access = btrfs_name_hash(XATTR_NAME_POSIX_ACL_ACCESS, + strlen(XATTR_NAME_POSIX_ACL_ACCESS)); + xattr_default = btrfs_name_hash(XATTR_NAME_POSIX_ACL_DEFAULT, + strlen(XATTR_NAME_POSIX_ACL_DEFAULT)); } slot++; @@ -3768,6 +3771,7 @@ cache_acl: break; case S_IFLNK: inode->i_op = &btrfs_symlink_inode_operations; + inode_nohighmem(inode); inode->i_mapping->a_ops = &btrfs_symlink_aops; break; default: @@ -4010,7 +4014,8 @@ err: btrfs_i_size_write(dir, dir->i_size - name_len * 2); inode_inc_iversion(inode); inode_inc_iversion(dir); - inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME; + inode->i_ctime = dir->i_mtime = + dir->i_ctime = current_fs_time(inode->i_sb); ret = btrfs_update_inode(trans, root, dir); out: return ret; @@ -4040,9 
+4045,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans, */ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir) { - struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(dir)->root; - int ret; /* * 1 for the possible orphan item @@ -4051,27 +4054,7 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir) * 1 for the inode ref * 1 for the inode */ - trans = btrfs_start_transaction(root, 5); - if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC) - return trans; - - if (PTR_ERR(trans) == -ENOSPC) { - u64 num_bytes = btrfs_calc_trans_metadata_size(root, 5); - - trans = btrfs_start_transaction(root, 0); - if (IS_ERR(trans)) - return trans; - ret = btrfs_cond_migrate_bytes(root->fs_info, - &root->fs_info->trans_block_rsv, - num_bytes, 5); - if (ret) { - btrfs_end_transaction(trans, root); - return ERR_PTR(ret); - } - trans->block_rsv = &root->fs_info->trans_block_rsv; - trans->bytes_reserved = num_bytes; - } - return trans; + return btrfs_start_transaction_fallback_global_rsv(root, 5, 5); } static int btrfs_unlink(struct inode *dir, struct dentry *dentry) @@ -4175,7 +4158,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, btrfs_i_size_write(dir, dir->i_size - name_len * 2); inode_inc_iversion(dir); - dir->i_mtime = dir->i_ctime = CURRENT_TIME; + dir->i_mtime = dir->i_ctime = current_fs_time(dir->i_sb); ret = btrfs_update_inode_fallback(trans, root, dir); if (ret) btrfs_abort_transaction(trans, root, ret); @@ -4230,11 +4213,20 @@ static int truncate_space_check(struct btrfs_trans_handle *trans, { int ret; + /* + * This is only used to apply pressure to the enospc system, we don't + * intend to use this reservation at all. 
+ */ bytes_deleted = btrfs_csum_bytes_to_leaves(root, bytes_deleted); + bytes_deleted *= root->nodesize; ret = btrfs_block_rsv_add(root, &root->fs_info->trans_block_rsv, bytes_deleted, BTRFS_RESERVE_NO_FLUSH); - if (!ret) + if (!ret) { + trace_btrfs_space_reservation(root->fs_info, "transaction", + trans->transid, + bytes_deleted, 1); trans->bytes_reserved += bytes_deleted; + } return ret; } @@ -4255,7 +4247,7 @@ static int truncate_inline_extent(struct inode *inode, if (btrfs_file_extent_compression(leaf, fi) != BTRFS_COMPRESS_NONE) { loff_t offset = new_size; - loff_t page_end = ALIGN(offset, PAGE_CACHE_SIZE); + loff_t page_end = ALIGN(offset, PAGE_SIZE); /* * Zero out the remaining of the last page of our inline extent, @@ -4267,7 +4259,8 @@ static int truncate_inline_extent(struct inode *inode, * read the extent item from disk (data not in the page cache). */ btrfs_release_path(path); - return btrfs_truncate_page(inode, offset, page_end - offset, 0); + return btrfs_truncate_block(inode, offset, page_end - offset, + 0); } btrfs_set_file_extent_ram_bytes(leaf, fi, size); @@ -4333,7 +4326,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->reada = -1; + path->reada = READA_BACK; /* * We want to drop from the next block forward in case this new size is @@ -4364,7 +4357,7 @@ search_again: * up a huge file in a single leaf. 
Most of the time that * bytes_deleted is > 0, it will be huge by the time we get here */ - if (be_nice && bytes_deleted > 32 * 1024 * 1024) { + if (be_nice && bytes_deleted > SZ_32M) { if (btrfs_should_end_transaction(trans, root)) { err = -EAGAIN; goto error; @@ -4607,7 +4600,7 @@ error: btrfs_free_path(path); - if (be_nice && bytes_deleted > 32 * 1024 * 1024) { + if (be_nice && bytes_deleted > SZ_32M) { unsigned long updates = trans->delayed_ref_updates; if (updates) { trans->delayed_ref_updates = 0; @@ -4620,17 +4613,17 @@ error: } /* - * btrfs_truncate_page - read, zero a chunk and write a page + * btrfs_truncate_block - read, zero a chunk and write a block * @inode - inode that we're zeroing * @from - the offset to start zeroing * @len - the length to zero, 0 to zero the entire range respective to the * offset * @front - zero up to the offset instead of from the offset on * - * This will find the page for the "from" offset and cow the page and zero the + * This will find the block for the "from" offset and cow the block and zero the * part we want to zero. This is used with truncate and hole punching. 
*/ -int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len, +int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len, int front) { struct address_space *mapping = inode->i_mapping; @@ -4640,19 +4633,20 @@ int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len, struct extent_state *cached_state = NULL; char *kaddr; u32 blocksize = root->sectorsize; - pgoff_t index = from >> PAGE_CACHE_SHIFT; - unsigned offset = from & (PAGE_CACHE_SIZE-1); + pgoff_t index = from >> PAGE_SHIFT; + unsigned offset = from & (blocksize - 1); struct page *page; gfp_t mask = btrfs_alloc_write_mask(mapping); int ret = 0; - u64 page_start; - u64 page_end; + u64 block_start; + u64 block_end; if ((offset & (blocksize - 1)) == 0 && (!len || ((len & (blocksize - 1)) == 0))) goto out; + ret = btrfs_delalloc_reserve_space(inode, - round_down(from, PAGE_CACHE_SIZE), PAGE_CACHE_SIZE); + round_down(from, blocksize), blocksize); if (ret) goto out; @@ -4660,21 +4654,21 @@ again: page = find_or_create_page(mapping, index, mask); if (!page) { btrfs_delalloc_release_space(inode, - round_down(from, PAGE_CACHE_SIZE), - PAGE_CACHE_SIZE); + round_down(from, blocksize), + blocksize); ret = -ENOMEM; goto out; } - page_start = page_offset(page); - page_end = page_start + PAGE_CACHE_SIZE - 1; + block_start = round_down(from, blocksize); + block_end = block_start + blocksize - 1; if (!PageUptodate(page)) { ret = btrfs_readpage(NULL, page); lock_page(page); if (page->mapping != mapping) { unlock_page(page); - page_cache_release(page); + put_page(page); goto again; } if (!PageUptodate(page)) { @@ -4684,55 +4678,57 @@ again: } wait_on_page_writeback(page); - lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state); + lock_extent_bits(io_tree, block_start, block_end, &cached_state); set_page_extent_mapped(page); - ordered = btrfs_lookup_ordered_extent(inode, page_start); + ordered = btrfs_lookup_ordered_extent(inode, block_start); if (ordered) { - 
unlock_extent_cached(io_tree, page_start, page_end, + unlock_extent_cached(io_tree, block_start, block_end, &cached_state, GFP_NOFS); unlock_page(page); - page_cache_release(page); + put_page(page); btrfs_start_ordered_extent(inode, ordered, 1); btrfs_put_ordered_extent(ordered); goto again; } - clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end, + clear_extent_bit(&BTRFS_I(inode)->io_tree, block_start, block_end, EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, &cached_state, GFP_NOFS); - ret = btrfs_set_extent_delalloc(inode, page_start, page_end, + ret = btrfs_set_extent_delalloc(inode, block_start, block_end, &cached_state); if (ret) { - unlock_extent_cached(io_tree, page_start, page_end, + unlock_extent_cached(io_tree, block_start, block_end, &cached_state, GFP_NOFS); goto out_unlock; } - if (offset != PAGE_CACHE_SIZE) { + if (offset != blocksize) { if (!len) - len = PAGE_CACHE_SIZE - offset; + len = blocksize - offset; kaddr = kmap(page); if (front) - memset(kaddr, 0, offset); + memset(kaddr + (block_start - page_offset(page)), + 0, offset); else - memset(kaddr + offset, 0, len); + memset(kaddr + (block_start - page_offset(page)) + offset, + 0, len); flush_dcache_page(page); kunmap(page); } ClearPageChecked(page); set_page_dirty(page); - unlock_extent_cached(io_tree, page_start, page_end, &cached_state, + unlock_extent_cached(io_tree, block_start, block_end, &cached_state, GFP_NOFS); out_unlock: if (ret) - btrfs_delalloc_release_space(inode, page_start, - PAGE_CACHE_SIZE); + btrfs_delalloc_release_space(inode, block_start, + blocksize); unlock_page(page); - page_cache_release(page); + put_page(page); out: return ret; } @@ -4801,11 +4797,11 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) int err = 0; /* - * If our size started in the middle of a page we need to zero out the - * rest of the page before we expand the i_size, otherwise we could + * If our size started in the middle of a block we 
need to zero out the + * rest of the block before we expand the i_size, otherwise we could * expose stale data. */ - err = btrfs_truncate_page(inode, oldsize, 0, 0); + err = btrfs_truncate_block(inode, oldsize, 0, 0); if (err) return err; @@ -4815,7 +4811,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) while (1) { struct btrfs_ordered_extent *ordered; - lock_extent_bits(io_tree, hole_start, block_end - 1, 0, + lock_extent_bits(io_tree, hole_start, block_end - 1, &cached_state); ordered = btrfs_lookup_ordered_range(inode, hole_start, block_end - hole_start); @@ -4891,26 +4887,6 @@ next: return err; } -static int wait_snapshoting_atomic_t(atomic_t *a) -{ - schedule(); - return 0; -} - -static void wait_for_snapshot_creation(struct btrfs_root *root) -{ - while (true) { - int ret; - - ret = btrfs_start_write_no_snapshoting(root); - if (ret) - break; - wait_on_atomic_t(&root->will_be_snapshoted, - wait_snapshoting_atomic_t, - TASK_UNINTERRUPTIBLE); - } -} - static int btrfs_setsize(struct inode *inode, struct iattr *attr) { struct btrfs_root *root = BTRFS_I(inode)->root; @@ -4934,7 +4910,6 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) } if (newsize > oldsize) { - truncate_pagecache(inode, newsize); /* * Don't do an expanding truncate while snapshoting is ongoing. * This is to ensure the snapshot captures a fully consistent @@ -4942,7 +4917,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) * truncation, it must capture all writes that happened before * this truncation. 
*/ - wait_for_snapshot_creation(root); + btrfs_wait_for_snapshot_creation(root); ret = btrfs_cont_expand(inode, oldsize, newsize); if (ret) { btrfs_end_write_no_snapshoting(root); @@ -4957,6 +4932,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) i_size_write(inode, newsize); btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL); + pagecache_isize_extended(inode, oldsize, newsize); ret = btrfs_update_inode(trans, root, inode); btrfs_end_write_no_snapshoting(root); btrfs_end_transaction(trans, root); @@ -5127,7 +5103,7 @@ static void evict_inode_truncate_pages(struct inode *inode) end = state->end; spin_unlock(&io_tree->lock); - lock_extent_bits(io_tree, start, end, 0, &cached_state); + lock_extent_bits(io_tree, start, end, &cached_state); /* * If still has DELALLOC flag, the extent didn't reach disk, @@ -5320,7 +5296,6 @@ void btrfs_evict_inode(struct inode *inode) no_delete: btrfs_remove_delayed_node(inode); clear_inode(inode); - return; } /* @@ -5628,7 +5603,7 @@ static struct inode *new_simple_dir(struct super_block *s, inode->i_op = &btrfs_dir_ro_inode_operations; inode->i_fop = &simple_dir_operations; inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; - inode->i_mtime = CURRENT_TIME; + inode->i_mtime = current_fs_time(inode->i_sb); inode->i_atime = inode->i_mtime; inode->i_ctime = inode->i_mtime; BTRFS_I(inode)->i_otime = inode->i_mtime; @@ -5757,6 +5732,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) char *name_ptr; int name_len; int is_curr = 0; /* ctx->pos points to the current index? 
*/ + bool emitted; /* FIXME, use a real flag for deciding about the key type */ if (root->fs_info->tree_root == root) @@ -5769,7 +5745,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) if (!path) return -ENOMEM; - path->reada = 1; + path->reada = READA_FORWARD; if (key_type == BTRFS_DIR_INDEX_KEY) { INIT_LIST_HEAD(&ins_list); @@ -5785,6 +5761,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) if (ret < 0) goto err; + emitted = false; while (1) { leaf = path->nodes[0]; slot = path->slots[0]; @@ -5828,7 +5805,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) if (name_len <= sizeof(tmp_name)) { name_ptr = tmp_name; } else { - name_ptr = kmalloc(name_len, GFP_NOFS); + name_ptr = kmalloc(name_len, GFP_KERNEL); if (!name_ptr) { ret = -ENOMEM; goto err; @@ -5864,6 +5841,7 @@ skip: if (over) goto nopos; + emitted = true; di_len = btrfs_dir_name_len(leaf, di) + btrfs_dir_data_len(leaf, di) + sizeof(*di); di_cur += di_len; @@ -5876,11 +5854,20 @@ next: if (key_type == BTRFS_DIR_INDEX_KEY) { if (is_curr) ctx->pos++; - ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list); + ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list, &emitted); if (ret) goto nopos; } + /* + * If we haven't emitted any dir entry, we must not touch ctx->pos as + * it was set to the termination value in the previous call. We assume + * that "." and ".." were emitted if we reach this point and set the + * termination value as well for an empty directory. + */ + if (ctx->pos > 2 && !emitted) + goto nopos; + /* Reached end of directory/root. Bump pos past the last item. 
*/ ctx->pos++; @@ -6200,7 +6187,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, inode_init_owner(inode, dir, mode); inode_set_bytes(inode, 0); - inode->i_mtime = CURRENT_TIME; + inode->i_mtime = current_fs_time(inode->i_sb); inode->i_atime = inode->i_mtime; inode->i_ctime = inode->i_mtime; BTRFS_I(inode)->i_otime = inode->i_mtime; @@ -6313,7 +6300,8 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, btrfs_i_size_write(parent_inode, parent_inode->i_size + name_len * 2); inode_inc_iversion(parent_inode); - parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME; + parent_inode->i_mtime = parent_inode->i_ctime = + current_fs_time(parent_inode->i_sb); ret = btrfs_update_inode(trans, root, parent_inode); if (ret) btrfs_abort_transaction(trans, root, ret); @@ -6360,9 +6348,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, u64 objectid; u64 index = 0; - if (!new_valid_dev(rdev)) - return -EINVAL; - /* * 2 for inode item and ref * 2 for dir items @@ -6500,7 +6485,7 @@ out_unlock_inode: static int btrfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { - struct btrfs_trans_handle *trans; + struct btrfs_trans_handle *trans = NULL; struct btrfs_root *root = BTRFS_I(dir)->root; struct inode *inode = d_inode(old_dentry); u64 index; @@ -6526,6 +6511,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, trans = btrfs_start_transaction(root, 5); if (IS_ERR(trans)) { err = PTR_ERR(trans); + trans = NULL; goto fail; } @@ -6533,7 +6519,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, BTRFS_I(inode)->dir_index = 0ULL; inc_nlink(inode); inode_inc_iversion(inode); - inode->i_ctime = CURRENT_TIME; + inode->i_ctime = current_fs_time(inode->i_sb); ihold(inode); set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); @@ -6559,9 +6545,10 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, btrfs_log_new_name(trans, inode, NULL, 
parent); } - btrfs_end_transaction(trans, root); btrfs_balance_delayed_items(root); fail: + if (trans) + btrfs_end_transaction(trans, root); if (drop_inode) { inode_dec_link_count(inode); iput(inode); @@ -6706,7 +6693,7 @@ static int merge_extent_mapping(struct extent_map_tree *em_tree, } static noinline int uncompress_inline(struct btrfs_path *path, - struct inode *inode, struct page *page, + struct page *page, size_t pg_offset, u64 extent_offset, struct btrfs_file_extent_item *item) { @@ -6730,7 +6717,7 @@ static noinline int uncompress_inline(struct btrfs_path *path, read_extent_buffer(leaf, tmp, ptr, inline_size); - max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size); + max_size = min_t(unsigned long, PAGE_SIZE, max_size); ret = btrfs_decompress(compress_type, tmp, page, extent_offset, inline_size, max_size); kfree(tmp); @@ -6803,7 +6790,7 @@ again: * Chances are we'll be called again, so go ahead and do * readahead */ - path->reada = 1; + path->reada = READA_FORWARD; } ret = btrfs_lookup_file_extent(trans, root, path, @@ -6892,8 +6879,8 @@ next: size = btrfs_file_extent_inline_len(leaf, path->slots[0], item); extent_offset = page_offset(page) + pg_offset - extent_start; - copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset, - size - extent_offset); + copy_size = min_t(u64, PAGE_SIZE - pg_offset, + size - extent_offset); em->start = extent_start + extent_offset; em->len = ALIGN(copy_size, root->sectorsize); em->orig_block_len = em->len; @@ -6902,8 +6889,7 @@ next: if (create == 0 && !PageUptodate(page)) { if (btrfs_file_extent_compression(leaf, item) != BTRFS_COMPRESS_NONE) { - ret = uncompress_inline(path, inode, page, - pg_offset, + ret = uncompress_inline(path, page, pg_offset, extent_offset, item); if (ret) { err = ret; @@ -6913,9 +6899,9 @@ next: map = kmap(page); read_extent_buffer(leaf, map + pg_offset, ptr, copy_size); - if (pg_offset + copy_size < PAGE_CACHE_SIZE) { + if (pg_offset + copy_size < PAGE_SIZE) { memset(map + pg_offset + copy_size, 0, 
- PAGE_CACHE_SIZE - pg_offset - + PAGE_SIZE - pg_offset - copy_size); } kunmap(page); @@ -7158,21 +7144,41 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode, if (ret) return ERR_PTR(ret); - em = create_pinned_em(inode, start, ins.offset, start, ins.objectid, - ins.offset, ins.offset, ins.offset, 0); - if (IS_ERR(em)) { - btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); - return em; - } - + /* + * Create the ordered extent before the extent map. This is to avoid + * races with the fast fsync path that would lead to it logging file + * extent items that point to disk extents that were not yet written to. + * The fast fsync path collects ordered extents into a local list and + * then collects all the new extent maps, so we must create the ordered + * extent first and make sure the fast fsync path collects any new + * ordered extents after collecting new extent maps as well. + * The fsync path simply can not rely on inode_dio_wait() because it + * causes deadlock with AIO. + */ ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid, ins.offset, ins.offset, 0); if (ret) { btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); - free_extent_map(em); return ERR_PTR(ret); } + em = create_pinned_em(inode, start, ins.offset, start, ins.objectid, + ins.offset, ins.offset, ins.offset, 0); + if (IS_ERR(em)) { + struct btrfs_ordered_extent *oe; + + btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); + oe = btrfs_lookup_ordered_extent(inode, start); + ASSERT(oe); + if (WARN_ON(!oe)) + return em; + set_bit(BTRFS_ORDERED_IOERR, &oe->flags); + set_bit(BTRFS_ORDERED_IO_DONE, &oe->flags); + btrfs_remove_ordered_extent(inode, oe); + /* Once for our lookup and once for the ordered extents tree. 
*/ + btrfs_put_ordered_extent(oe); + btrfs_put_ordered_extent(oe); + } return em; } @@ -7330,12 +7336,12 @@ bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end) int start_idx; int end_idx; - start_idx = start >> PAGE_CACHE_SHIFT; + start_idx = start >> PAGE_SHIFT; /* * end is the last byte in the last page. end == start is legal */ - end_idx = end >> PAGE_CACHE_SHIFT; + end_idx = end >> PAGE_SHIFT; rcu_read_lock(); @@ -7376,7 +7382,7 @@ bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end) * include/linux/pagemap.h for details. */ if (unlikely(page != *pagep)) { - page_cache_release(page); + put_page(page); page = NULL; } } @@ -7384,7 +7390,7 @@ bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end) if (page) { if (page->index <= end_idx) found = true; - page_cache_release(page); + put_page(page); } rcu_read_unlock(); @@ -7399,7 +7405,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, while (1) { lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, - 0, cached_state); + cached_state); /* * We're concerned with the entire range that we're going to be * doing DIO to, so we need to make sure theres no ordered @@ -7424,30 +7430,47 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, cached_state, GFP_NOFS); if (ordered) { - btrfs_start_ordered_extent(inode, ordered, 1); + /* + * If we are doing a DIO read and the ordered extent we + * found is for a buffered write, we can not wait for it + * to complete and retry, because if we do so we can + * deadlock with concurrent buffered writes on page + * locks. 
This happens only if our DIO read covers more + * than one extent map, if at this point it has already + * created an ordered extent for a previous extent map + * and locked its range in the inode's io tree, and a + * concurrent write against that previous extent map's + * range and this range started (we unlock the ranges + * in the io tree only when the bios complete and + * buffered writes always lock pages before attempting + * to lock range in the io tree). + */ + if (writing || + test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) + btrfs_start_ordered_extent(inode, ordered, 1); + else + ret = -ENOTBLK; btrfs_put_ordered_extent(ordered); } else { - /* Screw you mmap */ - ret = btrfs_fdatawrite_range(inode, lockstart, lockend); - if (ret) - break; - ret = filemap_fdatawait_range(inode->i_mapping, - lockstart, - lockend); - if (ret) - break; - /* - * If we found a page that couldn't be invalidated just - * fall back to buffered. + * We could trigger writeback for this range (and wait + * for it to complete) and then invalidate the pages for + * this range (through invalidate_inode_pages2_range()), + * but that can lead us to a deadlock with a concurrent + * call to readpages() (a buffered read or a defrag call + * triggered a readahead) on a page lock due to an + * ordered dio extent we created before but did not have + * yet a corresponding bio submitted (whence it can not + * complete), which makes readpages() wait for that + * ordered extent to complete while holding a lock on + * that page. 
*/ - ret = invalidate_inode_pages2_range(inode->i_mapping, - lockstart >> PAGE_CACHE_SHIFT, - lockend >> PAGE_CACHE_SHIFT); - if (ret) - break; + ret = -ENOTBLK; } + if (ret) + break; + cond_resched(); } @@ -7501,10 +7524,27 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start, return em; } -struct btrfs_dio_data { - u64 outstanding_extents; - u64 reserve; -}; +static void adjust_dio_outstanding_extents(struct inode *inode, + struct btrfs_dio_data *dio_data, + const u64 len) +{ + unsigned num_extents; + + num_extents = (unsigned) div64_u64(len + BTRFS_MAX_EXTENT_SIZE - 1, + BTRFS_MAX_EXTENT_SIZE); + /* + * If we have an outstanding_extents count still set then we're + * within our reservation, otherwise we need to adjust our inode + * counter appropriately. + */ + if (dio_data->outstanding_extents) { + dio_data->outstanding_extents -= num_extents; + } else { + spin_lock(&BTRFS_I(inode)->lock); + BTRFS_I(inode)->outstanding_extents += num_extents; + spin_unlock(&BTRFS_I(inode)->lock); + } +} static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) @@ -7541,8 +7581,11 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, * If this errors out it's because we couldn't invalidate pagecache for * this range and we need to fallback to buffered. */ - if (lock_extent_direct(inode, lockstart, lockend, &cached_state, create)) - return -ENOTBLK; + if (lock_extent_direct(inode, lockstart, lockend, &cached_state, + create)) { + ret = -ENOTBLK; + goto err; + } em = btrfs_get_extent(inode, NULL, 0, start, len, 0); if (IS_ERR(em)) { @@ -7660,22 +7703,11 @@ unlock: if (start + len > i_size_read(inode)) i_size_write(inode, start + len); - /* - * If we have an outstanding_extents count still set then we're - * within our reservation, otherwise we need to adjust our inode - * counter appropriately. 
- */ - if (dio_data->outstanding_extents) { - (dio_data->outstanding_extents)--; - } else { - spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->outstanding_extents++; - spin_unlock(&BTRFS_I(inode)->lock); - } - + adjust_dio_outstanding_extents(inode, dio_data, len); btrfs_free_reserved_data_space(inode, start, len); WARN_ON(dio_data->reserve < len); dio_data->reserve -= len; + dio_data->unsubmitted_oe_range_end = start + len; current->journal_info = dio_data; } @@ -7699,8 +7731,17 @@ unlock: unlock_err: clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, unlock_bits, 1, 0, &cached_state, GFP_NOFS); +err: if (dio_data) current->journal_info = dio_data; + /* + * Compensate the delalloc release we do in btrfs_direct_IO() when we + * write less data than expected, so that we don't underflow our inode's + * outstanding extents counter. + */ + if (create && dio_data) + adjust_dio_outstanding_extents(inode, dio_data, len); + return ret; } @@ -7760,9 +7801,9 @@ static int btrfs_check_dio_repairable(struct inode *inode, } static int dio_read_error(struct inode *inode, struct bio *failed_bio, - struct page *page, u64 start, u64 end, - int failed_mirror, bio_end_io_t *repair_endio, - void *repair_arg) + struct page *page, unsigned int pgoff, + u64 start, u64 end, int failed_mirror, + bio_end_io_t *repair_endio, void *repair_arg) { struct io_failure_record *failrec; struct bio *bio; @@ -7783,7 +7824,9 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio, return -EIO; } - if (failed_bio->bi_vcnt > 1) + if ((failed_bio->bi_vcnt > 1) + || (failed_bio->bi_io_vec->bv_len + > BTRFS_I(inode)->root->sectorsize)) read_mode = READ_SYNC | REQ_FAILFAST_DEV; else read_mode = READ_SYNC; @@ -7791,7 +7834,7 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio, isector = start - btrfs_io_bio(failed_bio)->logical; isector >>= inode->i_sb->s_blocksize_bits; bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page, - 0, isector, 
repair_endio, repair_arg); + pgoff, isector, repair_endio, repair_arg); if (!bio) { free_io_failure(inode, failrec); return -EIO; @@ -7821,12 +7864,17 @@ struct btrfs_retry_complete { static void btrfs_retry_endio_nocsum(struct bio *bio) { struct btrfs_retry_complete *done = bio->bi_private; + struct inode *inode; struct bio_vec *bvec; int i; if (bio->bi_error) goto end; + ASSERT(bio->bi_vcnt == 1); + inode = bio->bi_io_vec->bv_page->mapping->host; + ASSERT(bio->bi_io_vec->bv_len == BTRFS_I(inode)->root->sectorsize); + done->uptodate = 1; bio_for_each_segment_all(bvec, bio, i) clean_io_failure(done->inode, done->start, bvec->bv_page, 0); @@ -7838,25 +7886,35 @@ end: static int __btrfs_correct_data_nocsum(struct inode *inode, struct btrfs_io_bio *io_bio) { + struct btrfs_fs_info *fs_info; struct bio_vec *bvec; struct btrfs_retry_complete done; u64 start; + unsigned int pgoff; + u32 sectorsize; + int nr_sectors; int i; int ret; + fs_info = BTRFS_I(inode)->root->fs_info; + sectorsize = BTRFS_I(inode)->root->sectorsize; + start = io_bio->logical; done.inode = inode; bio_for_each_segment_all(bvec, &io_bio->bio, i) { -try_again: + nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len); + pgoff = bvec->bv_offset; + +next_block_or_try_again: done.uptodate = 0; done.start = start; init_completion(&done.done); - ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, start, - start + bvec->bv_len - 1, - io_bio->mirror_num, - btrfs_retry_endio_nocsum, &done); + ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, + pgoff, start, start + sectorsize - 1, + io_bio->mirror_num, + btrfs_retry_endio_nocsum, &done); if (ret) return ret; @@ -7864,10 +7922,15 @@ try_again: if (!done.uptodate) { /* We might have another mirror, so try again */ - goto try_again; + goto next_block_or_try_again; } - start += bvec->bv_len; + start += sectorsize; + + if (nr_sectors--) { + pgoff += sectorsize; + goto next_block_or_try_again; + } } return 0; @@ -7877,7 +7940,9 @@ static void 
btrfs_retry_endio(struct bio *bio) { struct btrfs_retry_complete *done = bio->bi_private; struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); + struct inode *inode; struct bio_vec *bvec; + u64 start; int uptodate; int ret; int i; @@ -7886,13 +7951,20 @@ static void btrfs_retry_endio(struct bio *bio) goto end; uptodate = 1; + + start = done->start; + + ASSERT(bio->bi_vcnt == 1); + inode = bio->bi_io_vec->bv_page->mapping->host; + ASSERT(bio->bi_io_vec->bv_len == BTRFS_I(inode)->root->sectorsize); + bio_for_each_segment_all(bvec, bio, i) { ret = __readpage_endio_check(done->inode, io_bio, i, - bvec->bv_page, 0, - done->start, bvec->bv_len); + bvec->bv_page, bvec->bv_offset, + done->start, bvec->bv_len); if (!ret) clean_io_failure(done->inode, done->start, - bvec->bv_page, 0); + bvec->bv_page, bvec->bv_offset); else uptodate = 0; } @@ -7906,20 +7978,34 @@ end: static int __btrfs_subio_endio_read(struct inode *inode, struct btrfs_io_bio *io_bio, int err) { + struct btrfs_fs_info *fs_info; struct bio_vec *bvec; struct btrfs_retry_complete done; u64 start; u64 offset = 0; + u32 sectorsize; + int nr_sectors; + unsigned int pgoff; + int csum_pos; int i; int ret; + fs_info = BTRFS_I(inode)->root->fs_info; + sectorsize = BTRFS_I(inode)->root->sectorsize; + err = 0; start = io_bio->logical; done.inode = inode; bio_for_each_segment_all(bvec, &io_bio->bio, i) { - ret = __readpage_endio_check(inode, io_bio, i, bvec->bv_page, - 0, start, bvec->bv_len); + nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len); + + pgoff = bvec->bv_offset; +next_block: + csum_pos = BTRFS_BYTES_TO_BLKS(fs_info, offset); + ret = __readpage_endio_check(inode, io_bio, csum_pos, + bvec->bv_page, pgoff, start, + sectorsize); if (likely(!ret)) goto next; try_again: @@ -7927,10 +8013,10 @@ try_again: done.start = start; init_completion(&done.done); - ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, start, - start + bvec->bv_len - 1, - io_bio->mirror_num, - btrfs_retry_endio, &done); + ret = 
dio_read_error(inode, &io_bio->bio, bvec->bv_page, + pgoff, start, start + sectorsize - 1, + io_bio->mirror_num, + btrfs_retry_endio, &done); if (ret) { err = ret; goto next; @@ -7943,8 +8029,15 @@ try_again: goto try_again; } next: - offset += bvec->bv_len; - start += bvec->bv_len; + offset += sectorsize; + start += sectorsize; + + ASSERT(nr_sectors); + + if (--nr_sectors) { + pgoff += sectorsize; + goto next_block; + } } return err; @@ -7982,6 +8075,7 @@ static void btrfs_endio_direct_read(struct bio *bio) kfree(dip); + dio_bio->bi_error = bio->bi_error; dio_end_io(dio_bio, bio->bi_error); if (io_bio->end_io) @@ -7989,22 +8083,22 @@ static void btrfs_endio_direct_read(struct bio *bio) bio_put(bio); } -static void btrfs_endio_direct_write(struct bio *bio) +static void btrfs_endio_direct_write_update_ordered(struct inode *inode, + const u64 offset, + const u64 bytes, + const int uptodate) { - struct btrfs_dio_private *dip = bio->bi_private; - struct inode *inode = dip->inode; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_ordered_extent *ordered = NULL; - u64 ordered_offset = dip->logical_offset; - u64 ordered_bytes = dip->bytes; - struct bio *dio_bio; + u64 ordered_offset = offset; + u64 ordered_bytes = bytes; int ret; again: ret = btrfs_dec_test_first_ordered_pending(inode, &ordered, &ordered_offset, ordered_bytes, - !bio->bi_error); + uptodate); if (!ret) goto out_test; @@ -8017,16 +8111,26 @@ out_test: * our bio might span multiple ordered extents. 
If we haven't * completed the accounting for the whole dio, go back and try again */ - if (ordered_offset < dip->logical_offset + dip->bytes) { - ordered_bytes = dip->logical_offset + dip->bytes - - ordered_offset; + if (ordered_offset < offset + bytes) { + ordered_bytes = offset + bytes - ordered_offset; ordered = NULL; goto again; } - dio_bio = dip->dio_bio; +} + +static void btrfs_endio_direct_write(struct bio *bio) +{ + struct btrfs_dio_private *dip = bio->bi_private; + struct bio *dio_bio = dip->dio_bio; + + btrfs_endio_direct_write_update_ordered(dip->inode, + dip->logical_offset, + dip->bytes, + !bio->bi_error); kfree(dip); + dio_bio->bi_error = bio->bi_error; dio_end_io(dio_bio, bio->bi_error); bio_put(bio); } @@ -8187,9 +8291,11 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, u64 file_offset = dip->logical_offset; u64 submit_len = 0; u64 map_length; - int nr_pages = 0; - int ret; + u32 blocksize = root->sectorsize; int async_submit = 0; + int nr_sectors; + int ret; + int i; map_length = orig_bio->bi_iter.bi_size; ret = btrfs_map_block(root->fs_info, rw, start_sector << 9, @@ -8219,9 +8325,12 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, atomic_inc(&dip->pending_bios); while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) { - if (map_length < submit_len + bvec->bv_len || - bio_add_page(bio, bvec->bv_page, bvec->bv_len, - bvec->bv_offset) < bvec->bv_len) { + nr_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info, bvec->bv_len); + i = 0; +next_block: + if (unlikely(map_length < submit_len + blocksize || + bio_add_page(bio, bvec->bv_page, blocksize, + bvec->bv_offset + (i * blocksize)) < blocksize)) { /* * inc the count before we submit the bio so * we know the end IO handler won't happen before @@ -8242,7 +8351,6 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, file_offset += submit_len; submit_len = 0; - nr_pages = 0; bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, 
start_sector, GFP_NOFS); @@ -8260,9 +8368,14 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, bio_put(bio); goto out_err; } + + goto next_block; } else { - submit_len += bvec->bv_len; - nr_pages++; + submit_len += blocksize; + if (--nr_sectors) { + i++; + goto next_block; + } bvec++; } } @@ -8331,6 +8444,21 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio, dip->subio_endio = btrfs_subio_endio_read; } + /* + * Reset the range for unsubmitted ordered extents (to a 0 length range) + * even if we fail to submit a bio, because in such case we do the + * corresponding error handling below and it must not be done a second + * time by btrfs_direct_IO(). + */ + if (write) { + struct btrfs_dio_data *dio_data = current->journal_info; + + dio_data->unsubmitted_oe_range_end = dip->logical_offset + + dip->bytes; + dio_data->unsubmitted_oe_range_start = + dio_data->unsubmitted_oe_range_end; + } + ret = btrfs_submit_direct_hook(rw, dip, skip_sum); if (!ret) return; @@ -8359,24 +8487,15 @@ free_ordered: dip = NULL; io_bio = NULL; } else { - if (write) { - struct btrfs_ordered_extent *ordered; - - ordered = btrfs_lookup_ordered_extent(inode, - file_offset); - set_bit(BTRFS_ORDERED_IOERR, &ordered->flags); - /* - * Decrements our ref on the ordered extent and removes - * the ordered extent from the inode's ordered tree, - * doing all the proper resource cleanup such as for the - * reserved space and waking up any waiters for this - * ordered extent (through btrfs_remove_ordered_extent). 
- */ - btrfs_finish_ordered_io(ordered); - } else { + if (write) + btrfs_endio_direct_write_update_ordered(inode, + file_offset, + dio_bio->bi_iter.bi_size, + 0); + else unlock_extent(&BTRFS_I(inode)->io_tree, file_offset, file_offset + dio_bio->bi_iter.bi_size - 1); - } + dio_bio->bi_error = -EIO; /* * Releases and cleans up our dio_bio, no need to bio_put() @@ -8460,7 +8579,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, * not unlock the i_mutex at this case. */ if (offset + count <= inode->i_size) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); relock = true; } ret = btrfs_delalloc_reserve_space(inode, offset, count); @@ -8476,6 +8595,8 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, * originally calculated. Abuse current->journal_info for this. */ dio_data.reserve = round_up(count, root->sectorsize); + dio_data.unsubmitted_oe_range_start = (u64)offset; + dio_data.unsubmitted_oe_range_end = (u64)offset; current->journal_info = &dio_data; } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK, &BTRFS_I(inode)->runtime_flags)) { @@ -8494,6 +8615,19 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, if (dio_data.reserve) btrfs_delalloc_release_space(inode, offset, dio_data.reserve); + /* + * On error we might have left some ordered extents + * without submitting corresponding bios for them, so + * clean them up to avoid other tasks getting them + * and waiting for them to complete forever. 
+ */ + if (dio_data.unsubmitted_oe_range_start < + dio_data.unsubmitted_oe_range_end) + btrfs_endio_direct_write_update_ordered(inode, + dio_data.unsubmitted_oe_range_start, + dio_data.unsubmitted_oe_range_end - + dio_data.unsubmitted_oe_range_start, + 0); } else if (ret >= 0 && (size_t)ret < count) btrfs_delalloc_release_space(inode, offset, count - (size_t)ret); @@ -8502,7 +8636,7 @@ out: if (wakeup) inode_dio_end(inode); if (relock) - mutex_lock(&inode->i_mutex); + inode_lock(inode); return ret; } @@ -8531,15 +8665,28 @@ int btrfs_readpage(struct file *file, struct page *page) static int btrfs_writepage(struct page *page, struct writeback_control *wbc) { struct extent_io_tree *tree; - + struct inode *inode = page->mapping->host; + int ret; if (current->flags & PF_MEMALLOC) { redirty_page_for_writepage(wbc, page); unlock_page(page); return 0; } + + /* + * If we are under memory pressure we will call this directly from the + * VM, we need to make sure we have the inode referenced for the ordered + * extent. If not just return like we didn't do anything. 
+ */ + if (!igrab(inode)) { + redirty_page_for_writepage(wbc, page); + return AOP_WRITEPAGE_ACTIVATE; + } tree = &BTRFS_I(page->mapping->host)->io_tree; - return extent_write_full_page(tree, page, btrfs_get_extent, wbc); + ret = extent_write_full_page(tree, page, btrfs_get_extent, wbc); + btrfs_add_delayed_iput(inode); + return ret; } static int btrfs_writepages(struct address_space *mapping, @@ -8572,7 +8719,7 @@ static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags) if (ret == 1) { ClearPagePrivate(page); set_page_private(page, 0); - page_cache_release(page); + put_page(page); } return ret; } @@ -8592,7 +8739,9 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; u64 page_start = page_offset(page); - u64 page_end = page_start + PAGE_CACHE_SIZE - 1; + u64 page_end = page_start + PAGE_SIZE - 1; + u64 start; + u64 end; int inode_evicting = inode->i_state & I_FREEING; /* @@ -8611,15 +8760,19 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, } if (!inode_evicting) - lock_extent_bits(tree, page_start, page_end, 0, &cached_state); - ordered = btrfs_lookup_ordered_extent(inode, page_start); + lock_extent_bits(tree, page_start, page_end, &cached_state); +again: + start = page_start; + ordered = btrfs_lookup_ordered_range(inode, start, + page_end - start + 1); if (ordered) { + end = min(page_end, ordered->file_offset + ordered->len - 1); /* * IO on this page will never be started, so we need * to account for any ordered extents now */ if (!inode_evicting) - clear_extent_bit(tree, page_start, page_end, + clear_extent_bit(tree, start, end, EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 0, &cached_state, @@ -8636,22 +8789,26 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, spin_lock_irq(&tree->lock); set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags); - new_len = 
page_start - ordered->file_offset; + new_len = start - ordered->file_offset; if (new_len < ordered->truncated_len) ordered->truncated_len = new_len; spin_unlock_irq(&tree->lock); if (btrfs_dec_test_ordered_pending(inode, &ordered, - page_start, - PAGE_CACHE_SIZE, 1)) + start, + end - start + 1, 1)) btrfs_finish_ordered_io(ordered); } btrfs_put_ordered_extent(ordered); if (!inode_evicting) { cached_state = NULL; - lock_extent_bits(tree, page_start, page_end, 0, + lock_extent_bits(tree, start, end, &cached_state); } + + start = end + 1; + if (start < page_end) + goto again; } /* @@ -8665,7 +8822,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, * 2) Not written to disk * This means the reserved space should be freed here. */ - btrfs_qgroup_free_data(inode, page_start, PAGE_CACHE_SIZE); + btrfs_qgroup_free_data(inode, page_start, PAGE_SIZE); if (!inode_evicting) { clear_extent_bit(tree, page_start, page_end, EXTENT_LOCKED | EXTENT_DIRTY | @@ -8680,7 +8837,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, if (PagePrivate(page)) { ClearPagePrivate(page); set_page_private(page, 0); - page_cache_release(page); + put_page(page); } } @@ -8712,15 +8869,28 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) loff_t size; int ret; int reserved = 0; + u64 reserved_space; u64 page_start; u64 page_end; + u64 end; + + reserved_space = PAGE_SIZE; sb_start_pagefault(inode->i_sb); page_start = page_offset(page); - page_end = page_start + PAGE_CACHE_SIZE - 1; + page_end = page_start + PAGE_SIZE - 1; + end = page_end; + /* + * Reserving delalloc space after obtaining the page lock can lead to + * deadlock. 
For example, if a dirty page is locked by this function + * and the call to btrfs_delalloc_reserve_space() ends up triggering + * dirty page write out, then the btrfs_writepage() function could + * end up waiting indefinitely to get a lock on the page currently + * being processed by btrfs_page_mkwrite() function. + */ ret = btrfs_delalloc_reserve_space(inode, page_start, - PAGE_CACHE_SIZE); + reserved_space); if (!ret) { ret = file_update_time(vma->vm_file); reserved = 1; @@ -8747,14 +8917,14 @@ again: } wait_on_page_writeback(page); - lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state); + lock_extent_bits(io_tree, page_start, page_end, &cached_state); set_page_extent_mapped(page); /* * we can't set the delalloc bits if there are pending ordered * extents. Drop our locks and wait for them to finish */ - ordered = btrfs_lookup_ordered_extent(inode, page_start); + ordered = btrfs_lookup_ordered_range(inode, page_start, page_end); if (ordered) { unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS); @@ -8764,6 +8934,18 @@ again: goto again; } + if (page->index == ((size - 1) >> PAGE_SHIFT)) { + reserved_space = round_up(size - page_start, root->sectorsize); + if (reserved_space < PAGE_SIZE) { + end = page_start + reserved_space - 1; + spin_lock(&BTRFS_I(inode)->lock); + BTRFS_I(inode)->outstanding_extents++; + spin_unlock(&BTRFS_I(inode)->lock); + btrfs_delalloc_release_space(inode, page_start, + PAGE_SIZE - reserved_space); + } + } + /* * XXX - page_mkwrite gets called every time the page is dirtied, even * if it was already dirty, so for space accounting reasons we need to @@ -8771,12 +8953,12 @@ again: * is probably a better way to do this, but for now keep consistent with * prepare_pages in the normal write path. 
*/ - clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end, + clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end, EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, &cached_state, GFP_NOFS); - ret = btrfs_set_extent_delalloc(inode, page_start, page_end, + ret = btrfs_set_extent_delalloc(inode, page_start, end, &cached_state); if (ret) { unlock_extent_cached(io_tree, page_start, page_end, @@ -8787,14 +8969,14 @@ again: ret = 0; /* page is wholly or partially inside EOF */ - if (page_start + PAGE_CACHE_SIZE > size) - zero_start = size & ~PAGE_CACHE_MASK; + if (page_start + PAGE_SIZE > size) + zero_start = size & ~PAGE_MASK; else - zero_start = PAGE_CACHE_SIZE; + zero_start = PAGE_SIZE; - if (zero_start != PAGE_CACHE_SIZE) { + if (zero_start != PAGE_SIZE) { kaddr = kmap(page); - memset(kaddr + zero_start, 0, PAGE_CACHE_SIZE - zero_start); + memset(kaddr + zero_start, 0, PAGE_SIZE - zero_start); flush_dcache_page(page); kunmap(page); } @@ -8815,7 +8997,7 @@ out_unlock: } unlock_page(page); out: - btrfs_delalloc_release_space(inode, page_start, PAGE_CACHE_SIZE); + btrfs_delalloc_release_space(inode, page_start, reserved_space); out_noreserve: sb_end_pagefault(inode->i_sb); return ret; @@ -9021,6 +9203,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->dir_index = 0; ei->last_unlink_trans = 0; ei->last_log_commit = 0; + ei->delayed_iput_count = 0; spin_lock_init(&ei->lock); ei->outstanding_extents = 0; @@ -9045,6 +9228,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) mutex_init(&ei->delalloc_mutex); btrfs_ordered_inode_tree_init(&ei->ordered_tree); INIT_LIST_HEAD(&ei->delalloc_inodes); + INIT_LIST_HEAD(&ei->delayed_iput); RB_CLEAR_NODE(&ei->rb_node); return inode; @@ -9139,25 +9323,19 @@ void btrfs_destroy_cachep(void) * destroy cache. 
*/ rcu_barrier(); - if (btrfs_inode_cachep) - kmem_cache_destroy(btrfs_inode_cachep); - if (btrfs_trans_handle_cachep) - kmem_cache_destroy(btrfs_trans_handle_cachep); - if (btrfs_transaction_cachep) - kmem_cache_destroy(btrfs_transaction_cachep); - if (btrfs_path_cachep) - kmem_cache_destroy(btrfs_path_cachep); - if (btrfs_free_space_cachep) - kmem_cache_destroy(btrfs_free_space_cachep); - if (btrfs_delalloc_work_cachep) - kmem_cache_destroy(btrfs_delalloc_work_cachep); + kmem_cache_destroy(btrfs_inode_cachep); + kmem_cache_destroy(btrfs_trans_handle_cachep); + kmem_cache_destroy(btrfs_transaction_cachep); + kmem_cache_destroy(btrfs_path_cachep); + kmem_cache_destroy(btrfs_free_space_cachep); } int btrfs_init_cachep(void) { btrfs_inode_cachep = kmem_cache_create("btrfs_inode", sizeof(struct btrfs_inode), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once); + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT, + init_once); if (!btrfs_inode_cachep) goto fail; @@ -9185,13 +9363,6 @@ int btrfs_init_cachep(void) if (!btrfs_free_space_cachep) goto fail; - btrfs_delalloc_work_cachep = kmem_cache_create("btrfs_delalloc_work", - sizeof(struct btrfs_delalloc_work), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, - NULL); - if (!btrfs_delalloc_work_cachep) - goto fail; - return 0; fail: btrfs_destroy_cachep(); @@ -9207,7 +9378,6 @@ static int btrfs_getattr(struct vfsmount *mnt, generic_fillattr(inode, stat); stat->dev = BTRFS_I(inode)->root->anon_dev; - stat->blksize = PAGE_CACHE_SIZE; spin_lock(&BTRFS_I(inode)->lock); delalloc_bytes = BTRFS_I(inode)->delalloc_bytes; @@ -9225,7 +9395,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct btrfs_root *dest = BTRFS_I(new_dir)->root; struct inode *new_inode = d_inode(new_dentry); struct inode *old_inode = d_inode(old_dentry); - struct timespec ctime = CURRENT_TIME; u64 index = 0; u64 root_objectid; int ret; @@ -9322,9 +9491,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry 
*old_dentry, inode_inc_iversion(old_dir); inode_inc_iversion(new_dir); inode_inc_iversion(old_inode); - old_dir->i_ctime = old_dir->i_mtime = ctime; - new_dir->i_ctime = new_dir->i_mtime = ctime; - old_inode->i_ctime = ctime; + old_dir->i_ctime = old_dir->i_mtime = + new_dir->i_ctime = new_dir->i_mtime = + old_inode->i_ctime = current_fs_time(old_dir->i_sb); if (old_dentry->d_parent != new_dentry->d_parent) btrfs_record_unlink_dir(trans, old_dir, old_inode, 1); @@ -9349,7 +9518,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (new_inode) { inode_inc_iversion(new_inode); - new_inode->i_ctime = CURRENT_TIME; + new_inode->i_ctime = current_fs_time(new_inode->i_sb); if (unlikely(btrfs_ino(new_inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { root_objectid = BTRFS_I(new_inode)->location.objectid; @@ -9415,14 +9584,10 @@ static void btrfs_run_delalloc_work(struct btrfs_work *work) delalloc_work = container_of(work, struct btrfs_delalloc_work, work); inode = delalloc_work->inode; - if (delalloc_work->wait) { - btrfs_wait_ordered_range(inode, 0, (u64)-1); - } else { + filemap_flush(inode->i_mapping); + if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, + &BTRFS_I(inode)->runtime_flags)) filemap_flush(inode->i_mapping); - if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, - &BTRFS_I(inode)->runtime_flags)) - filemap_flush(inode->i_mapping); - } if (delalloc_work->delay_iput) btrfs_add_delayed_iput(inode); @@ -9432,18 +9597,17 @@ static void btrfs_run_delalloc_work(struct btrfs_work *work) } struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode, - int wait, int delay_iput) + int delay_iput) { struct btrfs_delalloc_work *work; - work = kmem_cache_zalloc(btrfs_delalloc_work_cachep, GFP_NOFS); + work = kmalloc(sizeof(*work), GFP_NOFS); if (!work) return NULL; init_completion(&work->completion); INIT_LIST_HEAD(&work->list); work->inode = inode; - work->wait = wait; work->delay_iput = delay_iput; WARN_ON_ONCE(!inode); btrfs_init_work(&work->work, 
btrfs_flush_delalloc_helper, @@ -9455,7 +9619,7 @@ struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode, void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work) { wait_for_completion(&work->completion); - kmem_cache_free(btrfs_delalloc_work_cachep, work); + kfree(work); } /* @@ -9491,7 +9655,7 @@ static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput, } spin_unlock(&root->delalloc_lock); - work = btrfs_alloc_delalloc_work(inode, 0, delay_iput); + work = btrfs_alloc_delalloc_work(inode, delay_iput); if (!work) { if (delay_iput) btrfs_add_delayed_iput(inode); @@ -9633,9 +9797,11 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, /* * 2 items for inode item and ref * 2 items for dir items + * 1 item for updating parent inode item + * 1 item for the inline extent item * 1 item for xattr if selinux is on */ - trans = btrfs_start_transaction(root, 5); + trans = btrfs_start_transaction(root, 7); if (IS_ERR(trans)) return PTR_ERR(trans); @@ -9666,10 +9832,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, if (err) goto out_unlock_inode; - err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); - if (err) - goto out_unlock_inode; - path = btrfs_alloc_path(); if (!path) { err = -ENOMEM; @@ -9702,10 +9864,18 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, btrfs_free_path(path); inode->i_op = &btrfs_symlink_inode_operations; + inode_nohighmem(inode); inode->i_mapping->a_ops = &btrfs_symlink_aops; inode_set_bytes(inode, name_len); btrfs_i_size_write(inode, name_len); err = btrfs_update_inode(trans, root, inode); + /* + * Last step, add directory indexes for our symlink inode. This is the + * last step to avoid extra cleanup of these indexes if an error happens + * elsewhere above. 
+ */ + if (!err) + err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); if (err) { drop_inode = 1; goto out_unlock_inode; @@ -9756,7 +9926,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, } } - cur_bytes = min(num_bytes, 256ULL * 1024 * 1024); + cur_bytes = min_t(u64, num_bytes, SZ_256M); cur_bytes = max(cur_bytes, min_size); /* * If we are severely fragmented we could end up with really @@ -9826,7 +9996,7 @@ next: *alloc_hint = ins.objectid + ins.offset; inode_inc_iversion(inode); - inode->i_ctime = CURRENT_TIME; + inode->i_ctime = current_fs_time(inode->i_sb); BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC; if (!(mode & FALLOC_FL_KEEP_SIZE) && (actual_len > inode->i_size) && @@ -9991,7 +10161,7 @@ static const struct inode_operations btrfs_dir_inode_operations = { .setattr = btrfs_setattr, .mknod = btrfs_mknod, .setxattr = btrfs_setxattr, - .getxattr = btrfs_getxattr, + .getxattr = generic_getxattr, .listxattr = btrfs_listxattr, .removexattr = btrfs_removexattr, .permission = btrfs_permission, @@ -10020,7 +10190,7 @@ static const struct file_operations btrfs_dir_file_operations = { .fsync = btrfs_sync_file, }; -static struct extent_io_ops btrfs_extent_io_ops = { +static const struct extent_io_ops btrfs_extent_io_ops = { .fill_delalloc = run_delalloc_range, .submit_bio_hook = btrfs_submit_bio_hook, .merge_bio_hook = btrfs_merge_bio_hook, @@ -10068,7 +10238,7 @@ static const struct inode_operations btrfs_file_inode_operations = { .getattr = btrfs_getattr, .setattr = btrfs_setattr, .setxattr = btrfs_setxattr, - .getxattr = btrfs_getxattr, + .getxattr = generic_getxattr, .listxattr = btrfs_listxattr, .removexattr = btrfs_removexattr, .permission = btrfs_permission, @@ -10082,7 +10252,7 @@ static const struct inode_operations btrfs_special_inode_operations = { .setattr = btrfs_setattr, .permission = btrfs_permission, .setxattr = btrfs_setxattr, - .getxattr = btrfs_getxattr, + .getxattr = generic_getxattr, .listxattr = btrfs_listxattr, 
.removexattr = btrfs_removexattr, .get_acl = btrfs_get_acl, @@ -10091,13 +10261,12 @@ static const struct inode_operations btrfs_special_inode_operations = { }; static const struct inode_operations btrfs_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = page_follow_link_light, - .put_link = page_put_link, + .get_link = page_get_link, .getattr = btrfs_getattr, .setattr = btrfs_setattr, .permission = btrfs_permission, .setxattr = btrfs_setxattr, - .getxattr = btrfs_getxattr, + .getxattr = generic_getxattr, .listxattr = btrfs_listxattr, .removexattr = btrfs_removexattr, .update_time = btrfs_update_time, |