Diffstat (limited to 'fs')
281 files changed, 7013 insertions, 6923 deletions
diff --git a/fs/Kconfig b/fs/Kconfig index 83eab52fb3f6..b0e42b6a96b9 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -39,6 +39,7 @@ config FS_DAX depends on MMU depends on !(ARM || MIPS || SPARC) select FS_IOMAP + select DAX help Direct Access (DAX) can be used on memory-backed block devices. If the block device supports DAX and the filesystem supports DAX, diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c index 3062cceb5c2a..782d4d05a53b 100644 --- a/fs/afs/cmservice.c +++ b/fs/afs/cmservice.c @@ -350,7 +350,7 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call) { struct sockaddr_rxrpc srx; struct afs_server *server; - struct uuid_v1 *r; + struct afs_uuid *r; unsigned loop; __be32 *b; int ret; @@ -380,7 +380,7 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call) } _debug("unmarshall UUID"); - call->request = kmalloc(sizeof(struct uuid_v1), GFP_KERNEL); + call->request = kmalloc(sizeof(struct afs_uuid), GFP_KERNEL); if (!call->request) return -ENOMEM; @@ -453,7 +453,7 @@ static int afs_deliver_cb_probe(struct afs_call *call) static void SRXAFSCB_ProbeUuid(struct work_struct *work) { struct afs_call *call = container_of(work, struct afs_call, work); - struct uuid_v1 *r = call->request; + struct afs_uuid *r = call->request; struct { __be32 match; @@ -476,7 +476,7 @@ static void SRXAFSCB_ProbeUuid(struct work_struct *work) */ static int afs_deliver_cb_probe_uuid(struct afs_call *call) { - struct uuid_v1 *r; + struct afs_uuid *r; unsigned loop; __be32 *b; int ret; @@ -502,15 +502,15 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call) } _debug("unmarshall UUID"); - call->request = kmalloc(sizeof(struct uuid_v1), GFP_KERNEL); + call->request = kmalloc(sizeof(struct afs_uuid), GFP_KERNEL); if (!call->request) return -ENOMEM; b = call->buffer; r = call->request; - r->time_low = b[0]; - r->time_mid = htons(ntohl(b[1])); - r->time_hi_and_version = htons(ntohl(b[2])); + r->time_low = ntohl(b[0]); + r->time_mid = ntohl(b[1]); + r->time_hi_and_version = ntohl(b[2]); r->clock_seq_hi_and_reserved = ntohl(b[3]); r->clock_seq_low = ntohl(b[4]); diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 393672997cc2..4e2556606623 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -410,6 +410,15 @@ struct afs_interface { unsigned mtu; /* MTU of interface */ }; +struct afs_uuid { + __be32 time_low; /* low part of timestamp */ + __be16 time_mid; /* mid part of timestamp */ + __be16 time_hi_and_version; /* high part of timestamp and version */ + __u8 clock_seq_hi_and_reserved; /* clock seq hi and variant */ + __u8 clock_seq_low; /* clock seq low */ + __u8 node[6]; /* spatially unique node ID (MAC addr) */ +}; + /*****************************************************************************/ /* * cache.c @@ -544,7 +553,7 @@ extern int afs_drop_inode(struct inode *); * main.c */ extern struct workqueue_struct *afs_wq; -extern struct uuid_v1 afs_uuid; +extern struct afs_uuid afs_uuid; /* * misc.c diff --git a/fs/afs/main.c b/fs/afs/main.c index 51d7d17bca57..9944770849da 100644 --- a/fs/afs/main.c +++ b/fs/afs/main.c @@ -31,7 +31,7 @@ static char *rootcell; module_param(rootcell, charp, 0); MODULE_PARM_DESC(rootcell, "root AFS cell name and VL server IP addr list"); -struct uuid_v1 afs_uuid; +struct afs_uuid afs_uuid; struct workqueue_struct *afs_wq; /* diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index d5990eb160bd..02781e78ffb6 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -341,6 +341,7 @@ int afs_make_call(struct in_addr *addr, struct afs_call 
*call, gfp_t gfp, struct msghdr msg; struct kvec iov[1]; size_t offset; + s64 tx_total_len; u32 abort_code; int ret; @@ -364,9 +365,20 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp, srx.transport.sin.sin_port = call->port; memcpy(&srx.transport.sin.sin_addr, addr, 4); + /* Work out the length we're going to transmit. This is awkward for + * calls such as FS.StoreData where there's an extra injection of data + * after the initial fixed part. + */ + tx_total_len = call->request_size; + if (call->send_pages) { + tx_total_len += call->last_to - call->first_offset; + tx_total_len += (call->last - call->first) * PAGE_SIZE; + } + /* create a call */ rxcall = rxrpc_kernel_begin_call(afs_socket, &srx, call->key, - (unsigned long) call, gfp, + (unsigned long)call, + tx_total_len, gfp, (async ? afs_wake_up_async_call : afs_wake_up_call_waiter)); @@ -738,6 +750,8 @@ void afs_send_empty_reply(struct afs_call *call) _enter(""); + rxrpc_kernel_set_tx_length(afs_socket, call->rxcall, 0); + msg.msg_name = NULL; msg.msg_namelen = 0; iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, NULL, 0, 0); @@ -772,6 +786,8 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len) _enter(""); + rxrpc_kernel_set_tx_length(afs_socket, call->rxcall, len); + iov[0].iov_base = (void *) buf; iov[0].iov_len = len; msg.msg_name = NULL; @@ -1541,7 +1541,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, ssize_t ret; /* enforce forwards compatibility on users */ - if (unlikely(iocb->aio_reserved1 || iocb->aio_reserved2)) { + if (unlikely(iocb->aio_reserved2)) { pr_debug("EINVAL: reserve field set\n"); return -EINVAL; } @@ -1568,6 +1568,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, req->common.ki_pos = iocb->aio_offset; req->common.ki_complete = aio_complete; req->common.ki_flags = iocb_flags(req->common.ki_filp); + req->common.ki_hint = file_write_hint(file); if (iocb->aio_flags & IOCB_FLAG_RESFD) { /* @@ -1586,6 +1587,18 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, req->common.ki_flags |= IOCB_EVENTFD; } + ret = kiocb_set_rw_flags(&req->common, iocb->aio_rw_flags); + if (unlikely(ret)) { + pr_debug("EINVAL: aio_rw_flags\n"); + goto out_put_req; + } + + if ((req->common.ki_flags & IOCB_NOWAIT) && + !(req->common.ki_flags & IOCB_DIRECT)) { + ret = -EOPNOTSUPP; + goto out_put_req; + } + ret = put_user(KIOCB_KEY, &user_iocb->aio_key); if (unlikely(ret)) { pr_debug("EFAULT: aio_key\n"); diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index beef981aa54f..974f5346458a 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -83,7 +83,7 @@ struct autofs_info { struct autofs_wait_queue { wait_queue_head_t queue; struct autofs_wait_queue *next; - autofs_wqt_t wait_queue_token; + autofs_wqt_t wait_queue_entry_token; /* We use the following to see what we are waiting for */ struct qstr name; u32 dev; diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index 734cbf8d9676..dd9f1bebb5a3 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -344,7 +344,7 @@ static int autofs_dev_ioctl_fail(struct file *fp, int status; token = (autofs_wqt_t) param->fail.token; - status = param->fail.status ? param->fail.status : -ENOENT; + status = param->fail.status < 0 ? 
param->fail.status : -ENOENT; return autofs4_wait_release(sbi, token, status); } diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 24a58bf9ca72..7071895b0678 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -104,7 +104,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, size_t pktsz; pr_debug("wait id = 0x%08lx, name = %.*s, type=%d\n", - (unsigned long) wq->wait_queue_token, + (unsigned long) wq->wait_queue_entry_token, wq->name.len, wq->name.name, type); memset(&pkt, 0, sizeof(pkt)); /* For security reasons */ @@ -120,7 +120,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, pktsz = sizeof(*mp); - mp->wait_queue_token = wq->wait_queue_token; + mp->wait_queue_entry_token = wq->wait_queue_entry_token; mp->len = wq->name.len; memcpy(mp->name, wq->name.name, wq->name.len); mp->name[wq->name.len] = '\0'; @@ -133,7 +133,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, pktsz = sizeof(*ep); - ep->wait_queue_token = wq->wait_queue_token; + ep->wait_queue_entry_token = wq->wait_queue_entry_token; ep->len = wq->name.len; memcpy(ep->name, wq->name.name, wq->name.len); ep->name[wq->name.len] = '\0'; @@ -153,7 +153,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, pktsz = sizeof(*packet); - packet->wait_queue_token = wq->wait_queue_token; + packet->wait_queue_entry_token = wq->wait_queue_entry_token; packet->len = wq->name.len; memcpy(packet->name, wq->name.name, wq->name.len); packet->name[wq->name.len] = '\0'; @@ -428,7 +428,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, return -ENOMEM; } - wq->wait_queue_token = autofs4_next_wait_queue; + wq->wait_queue_entry_token = autofs4_next_wait_queue; if (++autofs4_next_wait_queue == 0) autofs4_next_wait_queue = 1; wq->next = sbi->queues; @@ -461,7 +461,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, } pr_debug("new wait id = 0x%08lx, name = %.*s, nfy=%d\n", - (unsigned long) wq->wait_queue_token, wq->name.len, + (unsigned long) wq->wait_queue_entry_token, wq->name.len, wq->name.name, notify); /* @@ -471,7 +471,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, } else { wq->wait_ctr++; pr_debug("existing wait id = 0x%08lx, name = %.*s, nfy=%d\n", - (unsigned long) wq->wait_queue_token, wq->name.len, + (unsigned long) wq->wait_queue_entry_token, wq->name.len, wq->name.name, notify); mutex_unlock(&sbi->wq_mutex); kfree(qstr.name); @@ -550,13 +550,13 @@ int autofs4_wait(struct autofs_sb_info *sbi, } -int autofs4_wait_release(struct autofs_sb_info *sbi, autofs_wqt_t wait_queue_token, int status) +int autofs4_wait_release(struct autofs_sb_info *sbi, autofs_wqt_t wait_queue_entry_token, int status) { struct autofs_wait_queue *wq, **wql; mutex_lock(&sbi->wq_mutex); for (wql = &sbi->queues; (wq = *wql) != NULL; wql = &wq->next) { - if (wq->wait_queue_token == wait_queue_token) + if (wq->wait_queue_entry_token == wait_queue_entry_token) break; } diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index f2deec0a62f0..25e312cb6071 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -1,7 +1,7 @@ /* * fs/bfs/inode.c * BFS superblock and inode operations. - * Copyright (C) 1999-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk> + * Copyright (C) 1999-2006 Tigran Aivazian <aivazian.tigran@gmail.com> * From fs/minix, Copyright (C) 1991, 1992 Linus Torvalds. * * Made endianness-clean by Andrew Stribblehill <ads@wompom.org>, 2005. 
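The fs/afs/cmservice.c hunks above replace the generic struct uuid_v1 with a private struct afs_uuid and unmarshal each big-endian 32-bit XDR word with a single ntohl(), dropping the old htons(ntohl(...)) double conversion. A minimal user-space sketch of the same unmarshalling pattern; the struct and function names here are illustrative, not the kernel's:

#include <arpa/inet.h>   /* ntohl() */
#include <stdint.h>

/* Host-order UUID mirroring the fields struct afs_uuid carries. */
struct example_uuid {
	uint32_t time_low;
	uint16_t time_mid;
	uint16_t time_hi_and_version;
	uint8_t  clock_seq_hi_and_reserved;
	uint8_t  clock_seq_low;
	uint8_t  node[6];
};

/* Each field arrives on the wire as its own big-endian 32-bit XDR word. */
static void example_unmarshal_uuid(struct example_uuid *u, const uint32_t *b)
{
	u->time_low                  = ntohl(b[0]);
	u->time_mid                  = ntohl(b[1]);
	u->time_hi_and_version       = ntohl(b[2]);
	u->clock_seq_hi_and_reserved = ntohl(b[3]);
	u->clock_seq_low             = ntohl(b[4]);
	for (int i = 0; i < 6; i++)
		u->node[i] = ntohl(b[5 + i]);
}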
@@ -19,7 +19,7 @@ #include <linux/uaccess.h> #include "bfs.h" -MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>"); +MODULE_AUTHOR("Tigran Aivazian <aivazian.tigran@gmail.com>"); MODULE_DESCRIPTION("SCO UnixWare BFS filesystem for Linux"); MODULE_LICENSE("GPL"); diff --git a/fs/block_dev.c b/fs/block_dev.c index 2a305c1a2d88..a7df151f8aba 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -225,6 +225,7 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, bio_init(&bio, vecs, nr_pages); bio.bi_bdev = bdev; bio.bi_iter.bi_sector = pos >> 9; + bio.bi_write_hint = iocb->ki_hint; bio.bi_private = current; bio.bi_end_io = blkdev_bio_end_io_simple; @@ -262,8 +263,11 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, if (vecs != inline_vecs) kfree(vecs); - if (unlikely(bio.bi_error)) - return bio.bi_error; + if (unlikely(bio.bi_status)) + ret = blk_status_to_errno(bio.bi_status); + + bio_uninit(&bio); + return ret; } @@ -288,16 +292,18 @@ static void blkdev_bio_end_io(struct bio *bio) bool should_dirty = dio->should_dirty; if (dio->multi_bio && !atomic_dec_and_test(&dio->ref)) { - if (bio->bi_error && !dio->bio.bi_error) - dio->bio.bi_error = bio->bi_error; + if (bio->bi_status && !dio->bio.bi_status) + dio->bio.bi_status = bio->bi_status; } else { if (!dio->is_sync) { struct kiocb *iocb = dio->iocb; - ssize_t ret = dio->bio.bi_error; + ssize_t ret; - if (likely(!ret)) { + if (likely(!dio->bio.bi_status)) { ret = dio->size; iocb->ki_pos += ret; + } else { + ret = blk_status_to_errno(dio->bio.bi_status); } dio->iocb->ki_complete(iocb, ret, 0); @@ -334,7 +340,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) bool is_read = (iov_iter_rw(iter) == READ), is_sync; loff_t pos = iocb->ki_pos; blk_qc_t qc = BLK_QC_T_NONE; - int ret; + int ret = 0; if ((pos | iov_iter_alignment(iter)) & (bdev_logical_block_size(bdev) - 1)) @@ -358,12 +364,13 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) for (;;) { bio->bi_bdev = bdev; bio->bi_iter.bi_sector = pos >> 9; + bio->bi_write_hint = iocb->ki_hint; bio->bi_private = dio; bio->bi_end_io = blkdev_bio_end_io; ret = bio_iov_iter_get_pages(bio, iter); if (unlikely(ret)) { - bio->bi_error = ret; + bio->bi_status = BLK_STS_IOERR; bio_endio(bio); break; } @@ -412,7 +419,8 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) } __set_current_state(TASK_RUNNING); - ret = dio->bio.bi_error; + if (!ret) + ret = blk_status_to_errno(dio->bio.bi_status); if (likely(!ret)) ret = dio->size; @@ -436,7 +444,7 @@ blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) static __init int blkdev_init(void) { - blkdev_dio_pool = bioset_create(4, offsetof(struct blkdev_dio, bio)); + blkdev_dio_pool = bioset_create(4, offsetof(struct blkdev_dio, bio), BIOSET_NEED_BVECS); if (!blkdev_dio_pool) return -ENOMEM; return 0; @@ -717,72 +725,6 @@ int bdev_write_page(struct block_device *bdev, sector_t sector, } EXPORT_SYMBOL_GPL(bdev_write_page); -int bdev_dax_pgoff(struct block_device *bdev, sector_t sector, size_t size, - pgoff_t *pgoff) -{ - phys_addr_t phys_off = (get_start_sect(bdev) + sector) * 512; - - if (pgoff) - *pgoff = PHYS_PFN(phys_off); - if (phys_off % PAGE_SIZE || size % PAGE_SIZE) - return -EINVAL; - return 0; -} -EXPORT_SYMBOL(bdev_dax_pgoff); - -/** - * bdev_dax_supported() - Check if the device supports dax for filesystem - * @sb: The superblock of the device - * @blocksize: The block size of the device - * - * This is a library function 
for filesystems to check if the block device - * can be mounted with dax option. - * - * Return: negative errno if unsupported, 0 if supported. - */ -int bdev_dax_supported(struct super_block *sb, int blocksize) -{ - struct block_device *bdev = sb->s_bdev; - struct dax_device *dax_dev; - pgoff_t pgoff; - int err, id; - void *kaddr; - pfn_t pfn; - long len; - - if (blocksize != PAGE_SIZE) { - vfs_msg(sb, KERN_ERR, "error: unsupported blocksize for dax"); - return -EINVAL; - } - - err = bdev_dax_pgoff(bdev, 0, PAGE_SIZE, &pgoff); - if (err) { - vfs_msg(sb, KERN_ERR, "error: unaligned partition for dax"); - return err; - } - - dax_dev = dax_get_by_host(bdev->bd_disk->disk_name); - if (!dax_dev) { - vfs_msg(sb, KERN_ERR, "error: device does not support dax"); - return -EOPNOTSUPP; - } - - id = dax_read_lock(); - len = dax_direct_access(dax_dev, pgoff, 1, &kaddr, &pfn); - dax_read_unlock(id); - - put_dax(dax_dev); - - if (len < 1) { - vfs_msg(sb, KERN_ERR, - "error: dax access failed (%ld)", len); - return len < 0 ? len : -EIO; - } - - return 0; -} -EXPORT_SYMBOL_GPL(bdev_dax_supported); - /* * pseudo-fs */ diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 247b8dfaf6e5..8d8370ddb6b2 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -78,12 +78,6 @@ static int __btrfs_set_acl(struct btrfs_trans_handle *trans, switch (type) { case ACL_TYPE_ACCESS: name = XATTR_NAME_POSIX_ACL_ACCESS; - if (acl) { - ret = posix_acl_update_mode(inode, &inode->i_mode, &acl); - if (ret) - return ret; - } - ret = 0; break; case ACL_TYPE_DEFAULT: if (!S_ISDIR(inode->i_mode)) @@ -119,6 +113,13 @@ out: int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) { + int ret; + + if (type == ACL_TYPE_ACCESS && acl) { + ret = posix_acl_update_mode(inode, &inode->i_mode, &acl); + if (ret) + return ret; + } return __btrfs_set_acl(NULL, inode, acl, type); } diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 7699e16784d3..f723c11bb763 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -16,7 +16,7 @@ * Boston, MA 021110-1307, USA. */ -#include <linux/vmalloc.h> +#include <linux/mm.h> #include <linux/rbtree.h> #include "ctree.h" #include "disk-io.h" @@ -26,6 +26,11 @@ #include "delayed-ref.h" #include "locking.h" +enum merge_mode { + MERGE_IDENTICAL_KEYS = 1, + MERGE_IDENTICAL_PARENTS, +}; + /* Just an arbitrary number so we can be sure this happened */ #define BACKREF_FOUND_SHARED 6 @@ -533,7 +538,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, * slot==nritems. In that case, go to the next leaf before we continue. 
*/ if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { - if (time_seq == (u64)-1) + if (time_seq == SEQ_LAST) ret = btrfs_next_leaf(root, path); else ret = btrfs_next_old_leaf(root, path, time_seq); @@ -577,7 +582,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, eie = NULL; } next: - if (time_seq == (u64)-1) + if (time_seq == SEQ_LAST) ret = btrfs_next_item(root, path); else ret = btrfs_next_old_item(root, path, time_seq); @@ -629,7 +634,7 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, if (path->search_commit_root) root_level = btrfs_header_level(root->commit_root); - else if (time_seq == (u64)-1) + else if (time_seq == SEQ_LAST) root_level = btrfs_header_level(root->node); else root_level = btrfs_old_root_level(root, time_seq); @@ -640,7 +645,7 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, } path->lowest_level = level; - if (time_seq == (u64)-1) + if (time_seq == SEQ_LAST) ret = btrfs_search_slot(NULL, root, &ref->key_for_search, path, 0, 0); else @@ -809,14 +814,12 @@ static int __add_missing_keys(struct btrfs_fs_info *fs_info, /* * merge backrefs and adjust counts accordingly * - * mode = 1: merge identical keys, if key is set - * FIXME: if we add more keys in __add_prelim_ref, we can merge more here. - * additionally, we could even add a key range for the blocks we - * looked into to merge even more (-> replace unresolved refs by those - * having a parent). - * mode = 2: merge identical parents + * FIXME: For MERGE_IDENTICAL_KEYS, if we add more keys in __add_prelim_ref + * then we can merge more here. Additionally, we could even add a key + * range for the blocks we looked into to merge even more (-> replace + * unresolved refs by those having a parent). */ -static void __merge_refs(struct list_head *head, int mode) +static void __merge_refs(struct list_head *head, enum merge_mode mode) { struct __prelim_ref *pos1; @@ -829,7 +832,7 @@ static void __merge_refs(struct list_head *head, int mode) if (!ref_for_same_block(ref1, ref2)) continue; - if (mode == 1) { + if (mode == MERGE_IDENTICAL_KEYS) { if (!ref1->parent && ref2->parent) swap(ref1, ref2); } else { @@ -1196,7 +1199,7 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info, * * NOTE: This can return values > 0 * - * If time_seq is set to (u64)-1, it will not search delayed_refs, and behave + * If time_seq is set to SEQ_LAST, it will not search delayed_refs, and behave * much like trans == NULL case, the difference only lies in it will not * commit root. * The special case is for qgroup to search roots in commit_transaction(). 
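The fs/btrfs/backref.c hunks replace the bare (u64)-1 sentinel with the named SEQ_LAST macro (defined in ctree.h later in this diff) and turn the numeric mode argument of __merge_refs() into enum merge_mode. A small stand-alone sketch of that refactoring pattern, with hypothetical names:

#include <stdint.h>
#include <stdio.h>

/* Named sentinel instead of a bare (u64)-1 sprinkled through callers. */
#define SEQ_LAST ((uint64_t)-1)

/* Self-documenting merge modes instead of the old "mode = 1 / mode = 2". */
enum merge_mode {
	MERGE_IDENTICAL_KEYS = 1,
	MERGE_IDENTICAL_PARENTS,
};

static void example_merge_refs(enum merge_mode mode, uint64_t time_seq)
{
	if (time_seq == SEQ_LAST)
		puts("no transaction context: skip delayed refs");

	if (mode == MERGE_IDENTICAL_KEYS)
		puts("merge refs sharing a key");
	else
		puts("merge refs sharing a parent");
}

int main(void)
{
	example_merge_refs(MERGE_IDENTICAL_KEYS, SEQ_LAST);
	return 0;
}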
@@ -1243,7 +1246,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, path->skip_locking = 1; } - if (time_seq == (u64)-1) + if (time_seq == SEQ_LAST) path->skip_locking = 1; /* @@ -1273,9 +1276,9 @@ again: #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS if (trans && likely(trans->type != __TRANS_DUMMY) && - time_seq != (u64)-1) { + time_seq != SEQ_LAST) { #else - if (trans && time_seq != (u64)-1) { + if (trans && time_seq != SEQ_LAST) { #endif /* * look if there are updates for this ref queued and lock the @@ -1286,7 +1289,7 @@ again: head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); if (head) { if (!mutex_trylock(&head->mutex)) { - atomic_inc(&head->node.refs); + refcount_inc(&head->node.refs); spin_unlock(&delayed_refs->lock); btrfs_release_path(path); @@ -1374,7 +1377,7 @@ again: if (ret) goto out; - __merge_refs(&prefs, 1); + __merge_refs(&prefs, MERGE_IDENTICAL_KEYS); ret = __resolve_indirect_refs(fs_info, path, time_seq, &prefs, extent_item_pos, total_refs, @@ -1382,7 +1385,7 @@ again: if (ret) goto out; - __merge_refs(&prefs, 2); + __merge_refs(&prefs, MERGE_IDENTICAL_PARENTS); while (!list_empty(&prefs)) { ref = list_first_entry(&prefs, struct __prelim_ref, list); @@ -2302,7 +2305,7 @@ struct btrfs_data_container *init_data_container(u32 total_bytes) size_t alloc_bytes; alloc_bytes = max_t(size_t, total_bytes, sizeof(*data)); - data = vmalloc(alloc_bytes); + data = kvmalloc(alloc_bytes, GFP_KERNEL); if (!data) return ERR_PTR(-ENOMEM); @@ -2336,9 +2339,9 @@ struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root, if (IS_ERR(fspath)) return (void *)fspath; - ifp = kmalloc(sizeof(*ifp), GFP_NOFS); + ifp = kmalloc(sizeof(*ifp), GFP_KERNEL); if (!ifp) { - vfree(fspath); + kvfree(fspath); return ERR_PTR(-ENOMEM); } @@ -2353,6 +2356,6 @@ void free_ipath(struct inode_fs_paths *ipath) { if (!ipath) return; - vfree(ipath->fspath); + kvfree(ipath->fspath); kfree(ipath); } diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 0c6baaba0651..d87ac27a5f2b 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -125,6 +125,13 @@ struct btrfs_inode { u64 delalloc_bytes; /* + * Total number of bytes pending delalloc that fall within a file + * range that is either a hole or beyond EOF (and no prealloc extent + * exists in the range). This is always <= delalloc_bytes. + */ + u64 new_delalloc_bytes; + + /* * total number of bytes pending defrag, used by stat to check whether * it needs COW. 
*/ @@ -303,7 +310,8 @@ struct btrfs_dio_private { * The original bio may be split to several sub-bios, this is * done during endio of sub-bios */ - int (*subio_endio)(struct inode *, struct btrfs_io_bio *, int); + blk_status_t (*subio_endio)(struct inode *, struct btrfs_io_bio *, + blk_status_t); }; /* diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index ab14c2e635ca..11d37c94ce05 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -94,7 +94,7 @@ #include <linux/mutex.h> #include <linux/genhd.h> #include <linux/blkdev.h> -#include <linux/vmalloc.h> +#include <linux/mm.h> #include <linux/string.h> #include "ctree.h" #include "disk-io.h" @@ -1638,12 +1638,7 @@ static int btrfsic_read_block(struct btrfsic_state *state, struct bio *bio; unsigned int j; - bio = btrfs_io_bio_alloc(GFP_NOFS, num_pages - i); - if (!bio) { - pr_info("btrfsic: bio_alloc() for %u pages failed!\n", - num_pages - i); - return -1; - } + bio = btrfs_io_bio_alloc(num_pages - i); bio->bi_bdev = block_ctx->dev->bdev; bio->bi_iter.bi_sector = dev_bytenr >> 9; bio_set_op_attrs(bio, REQ_OP_READ, 0); @@ -1668,14 +1663,8 @@ static int btrfsic_read_block(struct btrfsic_state *state, dev_bytenr += (j - i) * PAGE_SIZE; i = j; } - for (i = 0; i < num_pages; i++) { + for (i = 0; i < num_pages; i++) block_ctx->datav[i] = kmap(block_ctx->pagev[i]); - if (!block_ctx->datav[i]) { - pr_info("btrfsic: kmap() failed (dev %s)!\n", - block_ctx->dev->name); - return -1; - } - } return block_ctx->len; } @@ -2129,7 +2118,7 @@ static void btrfsic_bio_end_io(struct bio *bp) /* mutex is not held! This is not save if IO is not yet completed * on umount */ iodone_w_error = 0; - if (bp->bi_error) + if (bp->bi_status) iodone_w_error = 1; BUG_ON(NULL == block); @@ -2143,7 +2132,7 @@ static void btrfsic_bio_end_io(struct bio *bp) if ((dev_state->state->print_mask & BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) pr_info("bio_end_io(err=%d) for %c @%llu (%s/%llu/%d)\n", - bp->bi_error, + bp->bi_status, btrfsic_get_block_type(dev_state->state, block), block->logical_bytenr, dev_state->name, block->dev_bytenr, block->mirror_num); @@ -2822,44 +2811,47 @@ static void __btrfsic_submit_bio(struct bio *bio) dev_state = btrfsic_dev_state_lookup(bio->bi_bdev); if (NULL != dev_state && (bio_op(bio) == REQ_OP_WRITE) && bio_has_data(bio)) { - unsigned int i; + unsigned int i = 0; u64 dev_bytenr; u64 cur_bytenr; - struct bio_vec *bvec; + struct bio_vec bvec; + struct bvec_iter iter; int bio_is_patched; char **mapped_datav; + unsigned int segs = bio_segments(bio); dev_bytenr = 512 * bio->bi_iter.bi_sector; bio_is_patched = 0; if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) pr_info("submit_bio(rw=%d,0x%x, bi_vcnt=%u, bi_sector=%llu (bytenr %llu), bi_bdev=%p)\n", - bio_op(bio), bio->bi_opf, bio->bi_vcnt, + bio_op(bio), bio->bi_opf, segs, (unsigned long long)bio->bi_iter.bi_sector, dev_bytenr, bio->bi_bdev); - mapped_datav = kmalloc_array(bio->bi_vcnt, + mapped_datav = kmalloc_array(segs, sizeof(*mapped_datav), GFP_NOFS); if (!mapped_datav) goto leave; cur_bytenr = dev_bytenr; - bio_for_each_segment_all(bvec, bio, i) { - BUG_ON(bvec->bv_len != PAGE_SIZE); - mapped_datav[i] = kmap(bvec->bv_page); + bio_for_each_segment(bvec, bio, iter) { + BUG_ON(bvec.bv_len != PAGE_SIZE); + mapped_datav[i] = kmap(bvec.bv_page); + i++; if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE) pr_info("#%u: bytenr=%llu, len=%u, offset=%u\n", - i, cur_bytenr, bvec->bv_len, bvec->bv_offset); - cur_bytenr += bvec->bv_len; 
+ i, cur_bytenr, bvec.bv_len, bvec.bv_offset); + cur_bytenr += bvec.bv_len; } btrfsic_process_written_block(dev_state, dev_bytenr, - mapped_datav, bio->bi_vcnt, + mapped_datav, segs, bio, &bio_is_patched, NULL, bio->bi_opf); - bio_for_each_segment_all(bvec, bio, i) - kunmap(bvec->bv_page); + bio_for_each_segment(bvec, bio, iter) + kunmap(bvec.bv_page); kfree(mapped_datav); } else if (NULL != dev_state && (bio->bi_opf & REQ_PREFLUSH)) { if (dev_state->state->print_mask & @@ -2923,13 +2915,10 @@ int btrfsic_mount(struct btrfs_fs_info *fs_info, fs_info->sectorsize, PAGE_SIZE); return -1; } - state = kzalloc(sizeof(*state), GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); + state = kvzalloc(sizeof(*state), GFP_KERNEL); if (!state) { - state = vzalloc(sizeof(*state)); - if (!state) { - pr_info("btrfs check-integrity: vzalloc() failed!\n"); - return -1; - } + pr_info("btrfs check-integrity: allocation failed!\n"); + return -1; } if (!btrfsic_is_initialized) { diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index c7721a6aa3bb..2c0b7b57fcd5 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -32,6 +32,7 @@ #include <linux/writeback.h> #include <linux/bit_spinlock.h> #include <linux/slab.h> +#include <linux/sched/mm.h> #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -42,48 +43,7 @@ #include "extent_io.h" #include "extent_map.h" -struct compressed_bio { - /* number of bios pending for this compressed extent */ - atomic_t pending_bios; - - /* the pages with the compressed data on them */ - struct page **compressed_pages; - - /* inode that owns this data */ - struct inode *inode; - - /* starting offset in the inode for our pages */ - u64 start; - - /* number of bytes in the inode we're working on */ - unsigned long len; - - /* number of bytes on disk */ - unsigned long compressed_len; - - /* the compression algorithm for this bio */ - int compress_type; - - /* number of compressed pages in the array */ - unsigned long nr_pages; - - /* IO errors */ - int errors; - int mirror_num; - - /* for reads, this is the bio we are copying the data into */ - struct bio *orig_bio; - - /* - * the start of a variable length array of checksums only - * used by reads - */ - u32 sums; -}; - -static int btrfs_decompress_bio(int type, struct page **pages_in, - u64 disk_start, struct bio *orig_bio, - size_t srclen); +static int btrfs_decompress_bio(struct compressed_bio *cb); static inline int compressed_bio_size(struct btrfs_fs_info *fs_info, unsigned long disk_size) @@ -94,12 +54,6 @@ static inline int compressed_bio_size(struct btrfs_fs_info *fs_info, (DIV_ROUND_UP(disk_size, fs_info->sectorsize)) * csum_size; } -static struct bio *compressed_bio_alloc(struct block_device *bdev, - u64 first_byte, gfp_t gfp_flags) -{ - return btrfs_bio_alloc(bdev, first_byte >> 9, BIO_MAX_PAGES, gfp_flags); -} - static int check_compressed_csum(struct btrfs_inode *inode, struct compressed_bio *cb, u64 disk_start) @@ -155,13 +109,13 @@ static void end_compressed_bio_read(struct bio *bio) unsigned long index; int ret; - if (bio->bi_error) + if (bio->bi_status) cb->errors = 1; /* if there are more bios still pending for this compressed * extent, just exit */ - if (!atomic_dec_and_test(&cb->pending_bios)) + if (!refcount_dec_and_test(&cb->pending_bios)) goto out; inode = cb->inode; @@ -173,11 +127,8 @@ static void end_compressed_bio_read(struct bio *bio) /* ok, we're the last bio for this extent, lets start * the decompression. 
*/ - ret = btrfs_decompress_bio(cb->compress_type, - cb->compressed_pages, - cb->start, - cb->orig_bio, - cb->compressed_len); + ret = btrfs_decompress_bio(cb); + csum_failed: if (ret) cb->errors = 1; @@ -268,13 +219,13 @@ static void end_compressed_bio_write(struct bio *bio) struct page *page; unsigned long index; - if (bio->bi_error) + if (bio->bi_status) cb->errors = 1; /* if there are more bios still pending for this compressed * extent, just exit */ - if (!atomic_dec_and_test(&cb->pending_bios)) + if (!refcount_dec_and_test(&cb->pending_bios)) goto out; /* ok, we're the last bio for this extent, step one is to @@ -287,7 +238,7 @@ static void end_compressed_bio_write(struct bio *bio) cb->start, cb->start + cb->len - 1, NULL, - bio->bi_error ? 0 : 1); + bio->bi_status ? 0 : 1); cb->compressed_pages[0]->mapping = NULL; end_compressed_writeback(inode, cb); @@ -320,7 +271,7 @@ out: * This also checksums the file bytes and gets things ready for * the end io hooks. */ -int btrfs_submit_compressed_write(struct inode *inode, u64 start, +blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, unsigned long len, u64 disk_start, unsigned long compressed_len, struct page **compressed_pages, @@ -335,14 +286,14 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, struct page *page; u64 first_byte = disk_start; struct block_device *bdev; - int ret; + blk_status_t ret; int skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; WARN_ON(start & ((u64)PAGE_SIZE - 1)); cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS); if (!cb) - return -ENOMEM; - atomic_set(&cb->pending_bios, 0); + return BLK_STS_RESOURCE; + refcount_set(&cb->pending_bios, 0); cb->errors = 0; cb->inode = inode; cb->start = start; @@ -355,30 +306,26 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, bdev = fs_info->fs_devices->latest_bdev; - bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS); - if (!bio) { - kfree(cb); - return -ENOMEM; - } + bio = btrfs_bio_alloc(bdev, first_byte); bio_set_op_attrs(bio, REQ_OP_WRITE, 0); bio->bi_private = cb; bio->bi_end_io = end_compressed_bio_write; - atomic_inc(&cb->pending_bios); + refcount_set(&cb->pending_bios, 1); /* create and submit bios for the compressed pages */ bytes_left = compressed_len; for (pg_index = 0; pg_index < cb->nr_pages; pg_index++) { + int submit = 0; + page = compressed_pages[pg_index]; page->mapping = inode->i_mapping; if (bio->bi_iter.bi_size) - ret = io_tree->ops->merge_bio_hook(page, 0, + submit = io_tree->ops->merge_bio_hook(page, 0, PAGE_SIZE, bio, 0); - else - ret = 0; page->mapping = NULL; - if (ret || bio_add_page(bio, page, PAGE_SIZE, 0) < + if (submit || bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { bio_get(bio); @@ -388,7 +335,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, * we inc the count. 
Otherwise, the cb might get * freed before we're done setting it up */ - atomic_inc(&cb->pending_bios); + refcount_inc(&cb->pending_bios); ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA); BUG_ON(ret); /* -ENOMEM */ @@ -400,14 +347,13 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, ret = btrfs_map_bio(fs_info, bio, 0, 1); if (ret) { - bio->bi_error = ret; + bio->bi_status = ret; bio_endio(bio); } bio_put(bio); - bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS); - BUG_ON(!bio); + bio = btrfs_bio_alloc(bdev, first_byte); bio_set_op_attrs(bio, REQ_OP_WRITE, 0); bio->bi_private = cb; bio->bi_end_io = end_compressed_bio_write; @@ -434,7 +380,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, ret = btrfs_map_bio(fs_info, bio, 0, 1); if (ret) { - bio->bi_error = ret; + bio->bi_status = ret; bio_endio(bio); } @@ -569,7 +515,7 @@ next: * After the compressed pages are read, we copy the bytes into the * bio we were passed and then call the bio end_io calls */ -int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, +blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -586,7 +532,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, u64 em_len; u64 em_start; struct extent_map *em; - int ret = -ENOMEM; + blk_status_t ret = BLK_STS_RESOURCE; int faili = 0; u32 *sums; @@ -600,14 +546,14 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, PAGE_SIZE); read_unlock(&em_tree->lock); if (!em) - return -EIO; + return BLK_STS_IOERR; compressed_len = em->block_len; cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS); if (!cb) goto out; - atomic_set(&cb->pending_bios, 0); + refcount_set(&cb->pending_bios, 0); cb->errors = 0; cb->inode = inode; cb->mirror_num = mirror_num; @@ -638,7 +584,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, __GFP_HIGHMEM); if (!cb->compressed_pages[pg_index]) { faili = pg_index - 1; - ret = -ENOMEM; + ret = BLK_STS_RESOURCE; goto fail2; } } @@ -650,28 +596,26 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, /* include any pages we added in add_ra-bio_pages */ cb->len = bio->bi_iter.bi_size; - comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS); - if (!comp_bio) - goto fail2; + comp_bio = btrfs_bio_alloc(bdev, cur_disk_byte); bio_set_op_attrs (comp_bio, REQ_OP_READ, 0); comp_bio->bi_private = cb; comp_bio->bi_end_io = end_compressed_bio_read; - atomic_inc(&cb->pending_bios); + refcount_set(&cb->pending_bios, 1); for (pg_index = 0; pg_index < nr_pages; pg_index++) { + int submit = 0; + page = cb->compressed_pages[pg_index]; page->mapping = inode->i_mapping; page->index = em_start >> PAGE_SHIFT; if (comp_bio->bi_iter.bi_size) - ret = tree->ops->merge_bio_hook(page, 0, + submit = tree->ops->merge_bio_hook(page, 0, PAGE_SIZE, comp_bio, 0); - else - ret = 0; page->mapping = NULL; - if (ret || bio_add_page(comp_bio, page, PAGE_SIZE, 0) < + if (submit || bio_add_page(comp_bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { bio_get(comp_bio); @@ -685,7 +629,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, * we inc the count. 
Otherwise, the cb might get * freed before we're done setting it up */ - atomic_inc(&cb->pending_bios); + refcount_inc(&cb->pending_bios); if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { ret = btrfs_lookup_bio_sums(inode, comp_bio, @@ -697,15 +641,13 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, ret = btrfs_map_bio(fs_info, comp_bio, mirror_num, 0); if (ret) { - comp_bio->bi_error = ret; + comp_bio->bi_status = ret; bio_endio(comp_bio); } bio_put(comp_bio); - comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, - GFP_NOFS); - BUG_ON(!comp_bio); + comp_bio = btrfs_bio_alloc(bdev, cur_disk_byte); bio_set_op_attrs(comp_bio, REQ_OP_READ, 0); comp_bio->bi_private = cb; comp_bio->bi_end_io = end_compressed_bio_read; @@ -726,7 +668,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, ret = btrfs_map_bio(fs_info, comp_bio, mirror_num, 0); if (ret) { - comp_bio->bi_error = ret; + comp_bio->bi_status = ret; bio_endio(comp_bio); } @@ -801,6 +743,7 @@ static struct list_head *find_workspace(int type) struct list_head *workspace; int cpus = num_online_cpus(); int idx = type - 1; + unsigned nofs_flag; struct list_head *idle_ws = &btrfs_comp_ws[idx].idle_ws; spinlock_t *ws_lock = &btrfs_comp_ws[idx].ws_lock; @@ -830,7 +773,15 @@ again: atomic_inc(total_ws); spin_unlock(ws_lock); + /* + * Allocation helpers call vmalloc that can't use GFP_NOFS, so we have + * to turn it off here because we might get called from the restricted + * context of btrfs_compress_bio/btrfs_compress_pages + */ + nofs_flag = memalloc_nofs_save(); workspace = btrfs_compress_op[idx]->alloc_workspace(); + memalloc_nofs_restore(nofs_flag); + if (IS_ERR(workspace)) { atomic_dec(total_ws); wake_up(ws_wait); @@ -961,19 +912,16 @@ int btrfs_compress_pages(int type, struct address_space *mapping, * be contiguous. They all correspond to the range of bytes covered by * the compressed extent. 
*/ -static int btrfs_decompress_bio(int type, struct page **pages_in, - u64 disk_start, struct bio *orig_bio, - size_t srclen) +static int btrfs_decompress_bio(struct compressed_bio *cb) { struct list_head *workspace; int ret; + int type = cb->compress_type; workspace = find_workspace(type); - - ret = btrfs_compress_op[type-1]->decompress_bio(workspace, pages_in, - disk_start, orig_bio, - srclen); + ret = btrfs_compress_op[type - 1]->decompress_bio(workspace, cb); free_workspace(type, workspace); + return ret; } diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 39ec43ab8df1..87f6d3332163 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -34,6 +34,45 @@ /* Maximum size of data before compression */ #define BTRFS_MAX_UNCOMPRESSED (SZ_128K) +struct compressed_bio { + /* number of bios pending for this compressed extent */ + refcount_t pending_bios; + + /* the pages with the compressed data on them */ + struct page **compressed_pages; + + /* inode that owns this data */ + struct inode *inode; + + /* starting offset in the inode for our pages */ + u64 start; + + /* number of bytes in the inode we're working on */ + unsigned long len; + + /* number of bytes on disk */ + unsigned long compressed_len; + + /* the compression algorithm for this bio */ + int compress_type; + + /* number of compressed pages in the array */ + unsigned long nr_pages; + + /* IO errors */ + int errors; + int mirror_num; + + /* for reads, this is the bio we are copying the data into */ + struct bio *orig_bio; + + /* + * the start of a variable length array of checksums only + * used by reads + */ + u32 sums; +}; + void btrfs_init_compress(void); void btrfs_exit_compress(void); @@ -48,12 +87,12 @@ int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start, unsigned long total_out, u64 disk_start, struct bio *bio); -int btrfs_submit_compressed_write(struct inode *inode, u64 start, +blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, unsigned long len, u64 disk_start, unsigned long compressed_len, struct page **compressed_pages, unsigned long nr_pages); -int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, +blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags); enum btrfs_compression_type { @@ -78,10 +117,7 @@ struct btrfs_compress_op { unsigned long *total_out); int (*decompress_bio)(struct list_head *workspace, - struct page **pages_in, - u64 disk_start, - struct bio *orig_bio, - size_t srclen); + struct compressed_bio *cb); int (*decompress)(struct list_head *workspace, unsigned char *data_in, diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 1c3b6c54d5ee..3f4daa9d6e2c 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -19,7 +19,7 @@ #include <linux/sched.h> #include <linux/slab.h> #include <linux/rbtree.h> -#include <linux/vmalloc.h> +#include <linux/mm.h> #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -567,7 +567,7 @@ tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, static noinline int tree_mod_log_insert_move(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, int dst_slot, int src_slot, - int nr_items, gfp_t flags) + int nr_items) { struct tree_mod_elem *tm = NULL; struct tree_mod_elem **tm_list = NULL; @@ -578,11 +578,11 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info, if (!tree_mod_need_log(fs_info, eb)) return 0; - tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), flags); + tm_list = kcalloc(nr_items, 
sizeof(struct tree_mod_elem *), GFP_NOFS); if (!tm_list) return -ENOMEM; - tm = kzalloc(sizeof(*tm), flags); + tm = kzalloc(sizeof(*tm), GFP_NOFS); if (!tm) { ret = -ENOMEM; goto free_tms; @@ -596,7 +596,7 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info, for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) { tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot, - MOD_LOG_KEY_REMOVE_WHILE_MOVING, flags); + MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS); if (!tm_list[i]) { ret = -ENOMEM; goto free_tms; @@ -663,7 +663,7 @@ __tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, static noinline int tree_mod_log_insert_root(struct btrfs_fs_info *fs_info, struct extent_buffer *old_root, - struct extent_buffer *new_root, gfp_t flags, + struct extent_buffer *new_root, int log_removal) { struct tree_mod_elem *tm = NULL; @@ -678,14 +678,14 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info, if (log_removal && btrfs_header_level(old_root) > 0) { nritems = btrfs_header_nritems(old_root); tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), - flags); + GFP_NOFS); if (!tm_list) { ret = -ENOMEM; goto free_tms; } for (i = 0; i < nritems; i++) { tm_list[i] = alloc_tree_mod_elem(old_root, i, - MOD_LOG_KEY_REMOVE_WHILE_FREEING, flags); + MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS); if (!tm_list[i]) { ret = -ENOMEM; goto free_tms; @@ -693,7 +693,7 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info, } } - tm = kzalloc(sizeof(*tm), flags); + tm = kzalloc(sizeof(*tm), GFP_NOFS); if (!tm) { ret = -ENOMEM; goto free_tms; @@ -873,7 +873,7 @@ tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, { int ret; ret = tree_mod_log_insert_move(fs_info, dst, dst_offset, src_offset, - nr_items, GFP_NOFS); + nr_items); BUG_ON(ret < 0); } @@ -943,7 +943,7 @@ tree_mod_log_set_root_pointer(struct btrfs_root *root, { int ret; ret = tree_mod_log_insert_root(root->fs_info, root->node, - new_root_node, GFP_NOFS, log_removal); + new_root_node, log_removal); BUG_ON(ret < 0); } @@ -3667,14 +3667,14 @@ static noinline int __push_leaf_right(struct btrfs_fs_info *fs_info, /* make room in the right data area */ data_end = leaf_data_end(fs_info, right); memmove_extent_buffer(right, - btrfs_leaf_data(right) + data_end - push_space, - btrfs_leaf_data(right) + data_end, + BTRFS_LEAF_DATA_OFFSET + data_end - push_space, + BTRFS_LEAF_DATA_OFFSET + data_end, BTRFS_LEAF_DATA_SIZE(fs_info) - data_end); /* copy from the left data area */ - copy_extent_buffer(right, left, btrfs_leaf_data(right) + + copy_extent_buffer(right, left, BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) - push_space, - btrfs_leaf_data(left) + leaf_data_end(fs_info, left), + BTRFS_LEAF_DATA_OFFSET + leaf_data_end(fs_info, left), push_space); memmove_extent_buffer(right, btrfs_item_nr_offset(push_items), @@ -3888,9 +3888,9 @@ static noinline int __push_leaf_left(struct btrfs_fs_info *fs_info, push_space = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_offset_nr(right, push_items - 1); - copy_extent_buffer(left, right, btrfs_leaf_data(left) + + copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET + leaf_data_end(fs_info, left) - push_space, - btrfs_leaf_data(right) + + BTRFS_LEAF_DATA_OFFSET + btrfs_item_offset_nr(right, push_items - 1), push_space); old_left_nritems = btrfs_header_nritems(left); @@ -3917,9 +3917,9 @@ static noinline int __push_leaf_left(struct btrfs_fs_info *fs_info, if (push_items < right_nritems) { push_space = btrfs_item_offset_nr(right, push_items - 1) - leaf_data_end(fs_info, right); - 
memmove_extent_buffer(right, btrfs_leaf_data(right) + + memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) - push_space, - btrfs_leaf_data(right) + + BTRFS_LEAF_DATA_OFFSET + leaf_data_end(fs_info, right), push_space); memmove_extent_buffer(right, btrfs_item_nr_offset(0), @@ -4069,8 +4069,8 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans, nritems * sizeof(struct btrfs_item)); copy_extent_buffer(right, l, - btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(fs_info) - - data_copy_size, btrfs_leaf_data(l) + + BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) - + data_copy_size, BTRFS_LEAF_DATA_OFFSET + leaf_data_end(fs_info, l), data_copy_size); rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_end_nr(l, mid); @@ -4607,8 +4607,8 @@ void btrfs_truncate_item(struct btrfs_fs_info *fs_info, /* shift the data */ if (from_end) { - memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + - data_end + size_diff, btrfs_leaf_data(leaf) + + memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + + data_end + size_diff, BTRFS_LEAF_DATA_OFFSET + data_end, old_data_start + new_size - data_end); } else { struct btrfs_disk_key disk_key; @@ -4634,8 +4634,8 @@ void btrfs_truncate_item(struct btrfs_fs_info *fs_info, } } - memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + - data_end + size_diff, btrfs_leaf_data(leaf) + + memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + + data_end + size_diff, BTRFS_LEAF_DATA_OFFSET + data_end, old_data_start - data_end); offset = btrfs_disk_key_offset(&disk_key); @@ -4707,8 +4707,8 @@ void btrfs_extend_item(struct btrfs_fs_info *fs_info, struct btrfs_path *path, } /* shift the data */ - memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + - data_end - data_size, btrfs_leaf_data(leaf) + + memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + + data_end - data_size, BTRFS_LEAF_DATA_OFFSET + data_end, old_data - data_end); data_end = old_data; @@ -4790,8 +4790,8 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, (nritems - slot) * sizeof(struct btrfs_item)); /* shift the data */ - memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + - data_end - total_data, btrfs_leaf_data(leaf) + + memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + + data_end - total_data, BTRFS_LEAF_DATA_OFFSET + data_end, old_data - data_end); data_end = old_data; } @@ -4983,9 +4983,9 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (slot + nr != nritems) { int data_end = leaf_data_end(fs_info, leaf); - memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + + memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + data_end + dsize, - btrfs_leaf_data(leaf) + data_end, + BTRFS_LEAF_DATA_OFFSET + data_end, last_off - data_end); for (i = slot + nr; i < nritems; i++) { diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 3e21211e99c3..3f3eb7b17cac 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -39,6 +39,7 @@ #include <linux/security.h> #include <linux/sizes.h> #include <linux/dynamic_debug.h> +#include <linux/refcount.h> #include "extent_io.h" #include "extent_map.h" #include "async-thread.h" @@ -47,7 +48,6 @@ struct btrfs_trans_handle; struct btrfs_transaction; struct btrfs_pending_snapshot; extern struct kmem_cache *btrfs_trans_handle_cachep; -extern struct kmem_cache *btrfs_transaction_cachep; extern struct kmem_cache *btrfs_bit_radix_cachep; extern struct kmem_cache *btrfs_path_cachep; extern struct kmem_cache *btrfs_free_space_cachep; @@ -518,7 +518,7 @@ struct 
btrfs_caching_control { struct btrfs_work work; struct btrfs_block_group_cache *block_group; u64 progress; - atomic_t count; + refcount_t count; }; /* Once caching_thread() finds this much free space, it will wake up waiters. */ @@ -538,6 +538,14 @@ struct btrfs_io_ctl { unsigned check_crcs:1; }; +/* + * Tree to record all locked full stripes of a RAID5/6 block group + */ +struct btrfs_full_stripe_locks_tree { + struct rb_root root; + struct mutex lock; +}; + struct btrfs_block_group_cache { struct btrfs_key key; struct btrfs_block_group_item item; @@ -648,6 +656,9 @@ struct btrfs_block_group_cache { * Protected by free_space_lock. */ int needs_free_space; + + /* Record locked full stripes for RAID5/6 block group */ + struct btrfs_full_stripe_locks_tree full_stripe_locks_root; }; /* delayed seq elem */ @@ -658,6 +669,8 @@ struct seq_list { #define SEQ_LIST_INIT(name) { .list = LIST_HEAD_INIT((name).list), .seq = 0 } +#define SEQ_LAST ((u64)-1) + enum btrfs_orphan_cleanup_state { ORPHAN_CLEANUP_STARTED = 1, ORPHAN_CLEANUP_DONE = 2, @@ -702,6 +715,15 @@ struct btrfs_delayed_root; #define BTRFS_FS_BTREE_ERR 11 #define BTRFS_FS_LOG1_ERR 12 #define BTRFS_FS_LOG2_ERR 13 +#define BTRFS_FS_QUOTA_OVERRIDE 14 +/* Used to record internally whether fs has been frozen */ +#define BTRFS_FS_FROZEN 15 + +/* + * Indicate that a whole-filesystem exclusive operation is running + * (device replace, resize, device add/delete, balance) + */ +#define BTRFS_FS_EXCL_OP 14 struct btrfs_fs_info { u8 fsid[BTRFS_FSID_SIZE]; @@ -729,8 +751,7 @@ struct btrfs_fs_info { struct rb_root block_group_cache_tree; /* keep track of unallocated space */ - spinlock_t free_chunk_lock; - u64 free_chunk_space; + atomic64_t free_chunk_space; struct extent_io_tree freed_extents[2]; struct extent_io_tree *pinned_extents; @@ -778,17 +799,7 @@ struct btrfs_fs_info { * so it is also safe. */ u64 max_inline; - /* - * Protected by ->chunk_mutex and sb->s_umount. - * - * The reason that we use two lock to protect it is because only - * remount and mount operations can change it and these two operations - * are under sb->s_umount, but the read side (chunk allocation) can not - * acquire sb->s_umount or the deadlock would happen. So we use two - * locks to protect it. On the write side, we must acquire two locks, - * and on the read side, we just need acquire one of them. 
- */ - u64 alloc_start; + struct btrfs_transaction *running_transaction; wait_queue_head_t transaction_throttle; wait_queue_head_t transaction_wait; @@ -1066,8 +1077,6 @@ struct btrfs_fs_info { /* device replace state */ struct btrfs_dev_replace dev_replace; - atomic_t mutually_exclusive_operation_running; - struct percpu_counter bio_counter; wait_queue_head_t replace_wait; @@ -1090,9 +1099,6 @@ struct btrfs_fs_info { */ struct list_head pinned_chunks; - /* Used to record internally whether fs has been frozen */ - int fs_frozen; - /* Cached block sizes */ u32 nodesize; u32 sectorsize; @@ -1220,7 +1226,7 @@ struct btrfs_root { dev_t anon_dev; spinlock_t root_item_lock; - atomic_t refs; + refcount_t refs; struct mutex delalloc_mutex; spinlock_t delalloc_lock; @@ -1260,21 +1266,20 @@ struct btrfs_root { /* For qgroup metadata space reserve */ atomic64_t qgroup_meta_rsv; }; + static inline u32 btrfs_inode_sectorsize(const struct inode *inode) { return btrfs_sb(inode->i_sb)->sectorsize; } -static inline u32 __BTRFS_LEAF_DATA_SIZE(u32 blocksize) -{ - return blocksize - sizeof(struct btrfs_header); -} - static inline u32 BTRFS_LEAF_DATA_SIZE(const struct btrfs_fs_info *info) { - return __BTRFS_LEAF_DATA_SIZE(info->nodesize); + + return info->nodesize - sizeof(struct btrfs_header); } +#define BTRFS_LEAF_DATA_OFFSET offsetof(struct btrfs_leaf, items) + static inline u32 BTRFS_MAX_ITEM_SIZE(const struct btrfs_fs_info *info) { return BTRFS_LEAF_DATA_SIZE(info) - sizeof(struct btrfs_item); @@ -1536,8 +1541,27 @@ static inline void btrfs_set_##name(type *s, u##bits val) \ s->member = cpu_to_le##bits(val); \ } + +static inline u64 btrfs_device_total_bytes(struct extent_buffer *eb, + struct btrfs_dev_item *s) +{ + BUILD_BUG_ON(sizeof(u64) != + sizeof(((struct btrfs_dev_item *)0))->total_bytes); + return btrfs_get_64(eb, s, offsetof(struct btrfs_dev_item, + total_bytes)); +} +static inline void btrfs_set_device_total_bytes(struct extent_buffer *eb, + struct btrfs_dev_item *s, + u64 val) +{ + BUILD_BUG_ON(sizeof(u64) != + sizeof(((struct btrfs_dev_item *)0))->total_bytes); + WARN_ON(!IS_ALIGNED(val, eb->fs_info->sectorsize)); + btrfs_set_64(eb, s, offsetof(struct btrfs_dev_item, total_bytes), val); +} + + BTRFS_SETGET_FUNCS(device_type, struct btrfs_dev_item, type, 64); -BTRFS_SETGET_FUNCS(device_total_bytes, struct btrfs_dev_item, total_bytes, 64); BTRFS_SETGET_FUNCS(device_bytes_used, struct btrfs_dev_item, bytes_used, 64); BTRFS_SETGET_FUNCS(device_io_align, struct btrfs_dev_item, io_align, 32); BTRFS_SETGET_FUNCS(device_io_width, struct btrfs_dev_item, io_width, 32); @@ -2307,10 +2331,6 @@ static inline int btrfs_super_csum_size(struct btrfs_super_block *s) return btrfs_csum_sizes[t]; } -static inline unsigned long btrfs_leaf_data(struct extent_buffer *l) -{ - return offsetof(struct btrfs_leaf, items); -} /* * The leaf data grows from end-to-front in the node. @@ -2521,11 +2541,11 @@ BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right, /* helper function to cast into the data area of the leaf. 
*/ #define btrfs_item_ptr(leaf, slot, type) \ - ((type *)(btrfs_leaf_data(leaf) + \ + ((type *)(BTRFS_LEAF_DATA_OFFSET + \ btrfs_item_offset_nr(leaf, slot))) #define btrfs_item_ptr_offset(leaf, slot) \ - ((unsigned long)(btrfs_leaf_data(leaf) + \ + ((unsigned long)(BTRFS_LEAF_DATA_OFFSET + \ btrfs_item_offset_nr(leaf, slot))) static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info) @@ -2546,7 +2566,7 @@ u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes); static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_fs_info *fs_info, unsigned num_items) { - return fs_info->nodesize * BTRFS_MAX_LEVEL * 2 * num_items; + return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * 2 * num_items; } /* @@ -2556,7 +2576,7 @@ static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_fs_info *fs_info, static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_fs_info *fs_info, unsigned num_items) { - return fs_info->nodesize * BTRFS_MAX_LEVEL * num_items; + return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * num_items; } int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, @@ -2663,7 +2683,9 @@ void btrfs_get_block_group_trimming(struct btrfs_block_group_cache *cache); void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *cache); void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); -u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data); +u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info); +u64 btrfs_metadata_alloc_profile(struct btrfs_fs_info *fs_info); +u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info); void btrfs_clear_space_info_full(struct btrfs_fs_info *info); enum btrfs_reserve_flush_enum { @@ -2686,9 +2708,13 @@ enum btrfs_flush_state { COMMIT_TRANS = 6, }; -int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len); int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes); -void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len); +int btrfs_check_data_free_space(struct inode *inode, + struct extent_changeset **reserved, u64 start, u64 len); +void btrfs_free_reserved_data_space(struct inode *inode, + struct extent_changeset *reserved, u64 start, u64 len); +void btrfs_delalloc_release_space(struct inode *inode, + struct extent_changeset *reserved, u64 start, u64 len); void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, u64 len); void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, @@ -2705,8 +2731,8 @@ void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *rsv); int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes); void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes); -int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len); -void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len); +int btrfs_delalloc_reserve_space(struct inode *inode, + struct extent_changeset **reserved, u64 start, u64 len); void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type); struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info, unsigned short type); @@ -3014,12 +3040,14 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, const char *name, u16 name_len, int mod); int verify_dir_item(struct btrfs_fs_info *fs_info, - struct extent_buffer *leaf, + struct extent_buffer *leaf, int slot, 
struct btrfs_dir_item *dir_item); struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info, struct btrfs_path *path, const char *name, int name_len); +bool btrfs_is_name_len_valid(struct extent_buffer *leaf, int slot, + unsigned long start, u16 name_len); /* orphan.c */ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, @@ -3061,8 +3089,8 @@ int btrfs_find_name_in_ext_backref(struct btrfs_path *path, struct btrfs_dio_private; int btrfs_del_csums(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, u64 len); -int btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u32 *dst); -int btrfs_lookup_bio_sums_dio(struct inode *inode, struct bio *bio, +blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u32 *dst); +blk_status_t btrfs_lookup_bio_sums_dio(struct inode *inode, struct bio *bio, u64 logical_offset); int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -3077,7 +3105,7 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_ordered_sum *sums); -int btrfs_csum_one_bio(struct inode *inode, struct bio *bio, +blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio, u64 file_start, int contig); int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct list_head *list, int search_commit); @@ -3154,6 +3182,7 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, int btrfs_merge_bio_hook(struct page *page, unsigned long offset, size_t size, struct bio *bio, unsigned long bio_flags); +void btrfs_set_range_writeback(void *private_data, u64 start, u64 end); int btrfs_page_mkwrite(struct vm_fault *vmf); int btrfs_readpage(struct file *file, struct page *page); void btrfs_evict_inode(struct inode *inode); @@ -3646,6 +3675,12 @@ int btrfs_scrub_cancel_dev(struct btrfs_fs_info *info, struct btrfs_device *dev); int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid, struct btrfs_scrub_progress *progress); +static inline void btrfs_init_full_stripe_locks_tree( + struct btrfs_full_stripe_locks_tree *locks_root) +{ + locks_root->root = RB_ROOT; + mutex_init(&locks_root->lock); +} /* dev-replace.c */ void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info); @@ -3670,8 +3705,7 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root, struct btrfs_key *start, struct btrfs_key *end); int btrfs_reada_wait(void *handle); void btrfs_reada_detach(void *handle); -int btree_readahead_hook(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb, int err); +int btree_readahead_hook(struct extent_buffer *eb, int err); static inline int is_fstree(u64 rootid) { diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 1aff676f0e5b..8ae409b5a61d 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -52,7 +52,7 @@ static inline void btrfs_init_delayed_node( { delayed_node->root = root; delayed_node->inode_id = inode_id; - atomic_set(&delayed_node->refs, 0); + refcount_set(&delayed_node->refs, 0); delayed_node->ins_root = RB_ROOT; delayed_node->del_root = RB_ROOT; mutex_init(&delayed_node->mutex); @@ -81,7 +81,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node( node = READ_ONCE(btrfs_inode->delayed_node); if (node) { - atomic_inc(&node->refs); + refcount_inc(&node->refs); return node; } @@ -89,14 +89,14 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node( node = 
radix_tree_lookup(&root->delayed_nodes_tree, ino); if (node) { if (btrfs_inode->delayed_node) { - atomic_inc(&node->refs); /* can be accessed */ + refcount_inc(&node->refs); /* can be accessed */ BUG_ON(btrfs_inode->delayed_node != node); spin_unlock(&root->inode_lock); return node; } btrfs_inode->delayed_node = node; /* can be accessed and cached in the inode */ - atomic_add(2, &node->refs); + refcount_add(2, &node->refs); spin_unlock(&root->inode_lock); return node; } @@ -125,7 +125,7 @@ again: btrfs_init_delayed_node(node, root, ino); /* cached in the btrfs inode and can be accessed */ - atomic_add(2, &node->refs); + refcount_set(&node->refs, 2); ret = radix_tree_preload(GFP_NOFS); if (ret) { @@ -166,7 +166,7 @@ static void btrfs_queue_delayed_node(struct btrfs_delayed_root *root, } else { list_add_tail(&node->n_list, &root->node_list); list_add_tail(&node->p_list, &root->prepare_list); - atomic_inc(&node->refs); /* inserted into list */ + refcount_inc(&node->refs); /* inserted into list */ root->nodes++; set_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags); } @@ -180,7 +180,7 @@ static void btrfs_dequeue_delayed_node(struct btrfs_delayed_root *root, spin_lock(&root->lock); if (test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) { root->nodes--; - atomic_dec(&node->refs); /* not in the list */ + refcount_dec(&node->refs); /* not in the list */ list_del_init(&node->n_list); if (!list_empty(&node->p_list)) list_del_init(&node->p_list); @@ -201,7 +201,7 @@ static struct btrfs_delayed_node *btrfs_first_delayed_node( p = delayed_root->node_list.next; node = list_entry(p, struct btrfs_delayed_node, n_list); - atomic_inc(&node->refs); + refcount_inc(&node->refs); out: spin_unlock(&delayed_root->lock); @@ -228,7 +228,7 @@ static struct btrfs_delayed_node *btrfs_next_delayed_node( p = node->n_list.next; next = list_entry(p, struct btrfs_delayed_node, n_list); - atomic_inc(&next->refs); + refcount_inc(&next->refs); out: spin_unlock(&delayed_root->lock); @@ -253,11 +253,11 @@ static void __btrfs_release_delayed_node( btrfs_dequeue_delayed_node(delayed_root, delayed_node); mutex_unlock(&delayed_node->mutex); - if (atomic_dec_and_test(&delayed_node->refs)) { + if (refcount_dec_and_test(&delayed_node->refs)) { bool free = false; struct btrfs_root *root = delayed_node->root; spin_lock(&root->inode_lock); - if (atomic_read(&delayed_node->refs) == 0) { + if (refcount_read(&delayed_node->refs) == 0) { radix_tree_delete(&root->delayed_nodes_tree, delayed_node->inode_id); free = true; @@ -286,7 +286,7 @@ static struct btrfs_delayed_node *btrfs_first_prepared_delayed_node( p = delayed_root->prepare_list.next; list_del_init(p); node = list_entry(p, struct btrfs_delayed_node, p_list); - atomic_inc(&node->refs); + refcount_inc(&node->refs); out: spin_unlock(&delayed_root->lock); @@ -308,7 +308,7 @@ static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u32 data_len) item->ins_or_del = 0; item->bytes_reserved = 0; item->delayed_node = NULL; - atomic_set(&item->refs, 1); + refcount_set(&item->refs, 1); } return item; } @@ -483,7 +483,7 @@ static void btrfs_release_delayed_item(struct btrfs_delayed_item *item) { if (item) { __btrfs_remove_delayed_item(item); - if (atomic_dec_and_test(&item->refs)) + if (refcount_dec_and_test(&item->refs)) kfree(item); } } @@ -1600,14 +1600,14 @@ bool btrfs_readdir_get_delayed_items(struct inode *inode, mutex_lock(&delayed_node->mutex); item = __btrfs_first_delayed_insertion_item(delayed_node); while (item) { - atomic_inc(&item->refs); + refcount_inc(&item->refs); 
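The delayed-inode hunks around here replace atomic_t reference counts with refcount_t, which saturates and WARNs on overflow/underflow instead of silently wrapping. A minimal sketch of the same get/put pattern on a hypothetical object (the demo_node type and helpers below are illustrative only, not part of the patch):

#include <linux/refcount.h>
#include <linux/slab.h>

/* Hypothetical object, used only to illustrate the refcount_t pattern. */
struct demo_node {
        refcount_t refs;
};

static struct demo_node *demo_node_alloc(void)
{
        struct demo_node *node = kzalloc(sizeof(*node), GFP_KERNEL);

        if (!node)
                return NULL;
        /* an object starts life holding exactly one reference */
        refcount_set(&node->refs, 1);
        return node;
}

static void demo_node_get(struct demo_node *node)
{
        /* unlike atomic_inc(), this saturates and WARNs on overflow */
        refcount_inc(&node->refs);
}

static void demo_node_put(struct demo_node *node)
{
        /* free only when the last reference goes away */
        if (refcount_dec_and_test(&node->refs))
                kfree(node);
}

One consequence visible in the hunk above: a freshly initialized node now gets its two initial references via refcount_set(&node->refs, 2) rather than refcount_add(), because refcount_t treats an increment from zero as a use-after-free and warns about it.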
list_add_tail(&item->readdir_list, ins_list); item = __btrfs_next_delayed_item(item); } item = __btrfs_first_delayed_deletion_item(delayed_node); while (item) { - atomic_inc(&item->refs); + refcount_inc(&item->refs); list_add_tail(&item->readdir_list, del_list); item = __btrfs_next_delayed_item(item); } @@ -1621,7 +1621,7 @@ bool btrfs_readdir_get_delayed_items(struct inode *inode, * insert/delete delayed items in this period. So we also needn't * requeue or dequeue this delayed node. */ - atomic_dec(&delayed_node->refs); + refcount_dec(&delayed_node->refs); return true; } @@ -1634,13 +1634,13 @@ void btrfs_readdir_put_delayed_items(struct inode *inode, list_for_each_entry_safe(curr, next, ins_list, readdir_list) { list_del(&curr->readdir_list); - if (atomic_dec_and_test(&curr->refs)) + if (refcount_dec_and_test(&curr->refs)) kfree(curr); } list_for_each_entry_safe(curr, next, del_list, readdir_list) { list_del(&curr->readdir_list); - if (atomic_dec_and_test(&curr->refs)) + if (refcount_dec_and_test(&curr->refs)) kfree(curr); } @@ -1667,7 +1667,7 @@ int btrfs_should_delete_dir_index(struct list_head *del_list, list_del(&curr->readdir_list); ret = (curr->key.offset == index); - if (atomic_dec_and_test(&curr->refs)) + if (refcount_dec_and_test(&curr->refs)) kfree(curr); if (ret) @@ -1705,7 +1705,7 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, list_del(&curr->readdir_list); if (curr->key.offset < ctx->pos) { - if (atomic_dec_and_test(&curr->refs)) + if (refcount_dec_and_test(&curr->refs)) kfree(curr); continue; } @@ -1722,7 +1722,7 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, over = !dir_emit(ctx, name, name_len, location.objectid, d_type); - if (atomic_dec_and_test(&curr->refs)) + if (refcount_dec_and_test(&curr->refs)) kfree(curr); if (over) @@ -1963,7 +1963,7 @@ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root) inode_id = delayed_nodes[n - 1]->inode_id + 1; for (i = 0; i < n; i++) - atomic_inc(&delayed_nodes[i]->refs); + refcount_inc(&delayed_nodes[i]->refs); spin_unlock(&root->inode_lock); for (i = 0; i < n; i++) { diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index 40327cc3b99a..c4189d495934 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -26,7 +26,7 @@ #include <linux/list.h> #include <linux/wait.h> #include <linux/atomic.h> - +#include <linux/refcount.h> #include "ctree.h" /* types of the delayed item */ @@ -67,7 +67,7 @@ struct btrfs_delayed_node { struct rb_root del_root; struct mutex mutex; struct btrfs_inode_item inode_item; - atomic_t refs; + refcount_t refs; u64 index_cnt; unsigned long flags; int count; @@ -80,7 +80,7 @@ struct btrfs_delayed_item { struct list_head readdir_list; /* used for readdir items */ u64 bytes_reserved; struct btrfs_delayed_node *delayed_node; - atomic_t refs; + refcount_t refs; int ins_or_del; u32 data_len; char data[0]; diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 6eb80952efb3..93ffa898df6d 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -164,7 +164,7 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, if (mutex_trylock(&head->mutex)) return 0; - atomic_inc(&head->node.refs); + refcount_inc(&head->node.refs); spin_unlock(&delayed_refs->lock); mutex_lock(&head->mutex); @@ -470,7 +470,8 @@ add_tail: static noinline void update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_node *existing, - struct btrfs_delayed_ref_node *update) + struct btrfs_delayed_ref_node *update, + int 
*old_ref_mod_ret) { struct btrfs_delayed_ref_head *existing_ref; struct btrfs_delayed_ref_head *ref; @@ -523,6 +524,8 @@ update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs, * currently, for refs we just added we know we're a-ok. */ old_ref_mod = existing_ref->total_ref_mod; + if (old_ref_mod_ret) + *old_ref_mod_ret = old_ref_mod; existing->ref_mod += update->ref_mod; existing_ref->total_ref_mod += update->ref_mod; @@ -550,7 +553,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_node *ref, struct btrfs_qgroup_extent_record *qrecord, u64 bytenr, u64 num_bytes, u64 ref_root, u64 reserved, - int action, int is_data, int *qrecord_inserted_ret) + int action, int is_data, int *qrecord_inserted_ret, + int *old_ref_mod, int *new_ref_mod) { struct btrfs_delayed_ref_head *existing; struct btrfs_delayed_ref_head *head_ref = NULL; @@ -590,7 +594,7 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, delayed_refs = &trans->transaction->delayed_refs; /* first set the basic ref node struct up */ - atomic_set(&ref->refs, 1); + refcount_set(&ref->refs, 1); ref->bytenr = bytenr; ref->num_bytes = num_bytes; ref->ref_mod = count_mod; @@ -638,7 +642,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, if (existing) { WARN_ON(ref_root && reserved && existing->qgroup_ref_root && existing->qgroup_reserved); - update_existing_head_ref(delayed_refs, &existing->node, ref); + update_existing_head_ref(delayed_refs, &existing->node, ref, + old_ref_mod); /* * we've updated the existing ref, free the newly * allocated ref @@ -646,6 +651,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); head_ref = existing; } else { + if (old_ref_mod) + *old_ref_mod = 0; if (is_data && count_mod < 0) delayed_refs->pending_csums += num_bytes; delayed_refs->num_heads++; @@ -655,6 +662,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, } if (qrecord_inserted_ret) *qrecord_inserted_ret = qrecord_inserted; + if (new_ref_mod) + *new_ref_mod = head_ref->total_ref_mod; return head_ref; } @@ -682,7 +691,7 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info, delayed_refs = &trans->transaction->delayed_refs; /* first set the basic ref node struct up */ - atomic_set(&ref->refs, 1); + refcount_set(&ref->refs, 1); ref->bytenr = bytenr; ref->num_bytes = num_bytes; ref->ref_mod = 1; @@ -739,7 +748,7 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info, seq = atomic64_read(&fs_info->tree_mod_seq); /* first set the basic ref node struct up */ - atomic_set(&ref->refs, 1); + refcount_set(&ref->refs, 1); ref->bytenr = bytenr; ref->num_bytes = num_bytes; ref->ref_mod = 1; @@ -778,7 +787,8 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, int level, int action, - struct btrfs_delayed_extent_op *extent_op) + struct btrfs_delayed_extent_op *extent_op, + int *old_ref_mod, int *new_ref_mod) { struct btrfs_delayed_tree_ref *ref; struct btrfs_delayed_ref_head *head_ref; @@ -813,7 +823,8 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, */ head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record, bytenr, num_bytes, 0, 0, action, 0, - &qrecord_inserted); + &qrecord_inserted, old_ref_mod, + new_ref_mod); add_delayed_tree_ref(fs_info, trans, head_ref, &ref->node, bytenr, num_bytes, parent, ref_root, level, action); @@ -838,7 +849,8 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, struct 
btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, - u64 owner, u64 offset, u64 reserved, int action) + u64 owner, u64 offset, u64 reserved, int action, + int *old_ref_mod, int *new_ref_mod) { struct btrfs_delayed_data_ref *ref; struct btrfs_delayed_ref_head *head_ref; @@ -878,7 +890,8 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, */ head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record, bytenr, num_bytes, ref_root, reserved, - action, 1, &qrecord_inserted); + action, 1, &qrecord_inserted, + old_ref_mod, new_ref_mod); add_delayed_data_ref(fs_info, trans, head_ref, &ref->node, bytenr, num_bytes, parent, ref_root, owner, offset, @@ -909,7 +922,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, add_delayed_ref_head(fs_info, trans, &head_ref->node, NULL, bytenr, num_bytes, 0, 0, BTRFS_UPDATE_DELAYED_HEAD, - extent_op->is_data, NULL); + extent_op->is_data, NULL, NULL, NULL); spin_unlock(&delayed_refs->lock); return 0; diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 0e537f98f1a1..ce88e4ac5276 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -18,6 +18,8 @@ #ifndef __DELAYED_REF__ #define __DELAYED_REF__ +#include <linux/refcount.h> + /* these are the possible values of struct btrfs_delayed_ref_node->action */ #define BTRFS_ADD_DELAYED_REF 1 /* add one backref to the tree */ #define BTRFS_DROP_DELAYED_REF 2 /* delete one backref from the tree */ @@ -53,7 +55,7 @@ struct btrfs_delayed_ref_node { u64 seq; /* ref count on this data structure */ - atomic_t refs; + refcount_t refs; /* * how many refs is this entry adding or deleting. For @@ -220,8 +222,8 @@ btrfs_free_delayed_extent_op(struct btrfs_delayed_extent_op *op) static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) { - WARN_ON(atomic_read(&ref->refs) == 0); - if (atomic_dec_and_test(&ref->refs)) { + WARN_ON(refcount_read(&ref->refs) == 0); + if (refcount_dec_and_test(&ref->refs)) { WARN_ON(ref->in_tree); switch (ref->type) { case BTRFS_TREE_BLOCK_REF_KEY: @@ -245,12 +247,14 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, int level, int action, - struct btrfs_delayed_extent_op *extent_op); + struct btrfs_delayed_extent_op *extent_op, + int *old_ref_mod, int *new_ref_mod); int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, - u64 owner, u64 offset, u64 reserved, int action); + u64 owner, u64 offset, u64 reserved, int action, + int *old_ref_mod, int *new_ref_mod); int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index e653921f05d9..bee3edeea7a3 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -388,7 +388,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, if (ret) btrfs_err(fs_info, "kobj add dev failed %d", ret); - btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1); + btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); /* force writing the updated state information to disk */ trans = btrfs_start_transaction(root, 0); @@ -507,7 +507,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); return ret; } - btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1); + 
btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { @@ -546,8 +546,10 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, mutex_unlock(&fs_info->chunk_mutex); mutex_unlock(&fs_info->fs_devices->device_list_mutex); mutex_unlock(&uuid_mutex); + btrfs_rm_dev_replace_blocked(fs_info); if (tgt_device) btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); + btrfs_rm_dev_replace_unblocked(fs_info); mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); return scrub_ret; @@ -665,7 +667,7 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: srcdev = dev_replace->srcdev; - args->status.progress_1000 = div_u64(dev_replace->cursor_left, + args->status.progress_1000 = div64_u64(dev_replace->cursor_left, div_u64(btrfs_device_get_total_bytes(srcdev), 1000)); break; } @@ -784,8 +786,7 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info) } btrfs_dev_replace_unlock(dev_replace, 1); - WARN_ON(atomic_xchg( - &fs_info->mutually_exclusive_operation_running, 1)); + WARN_ON(test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)); task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl"); return PTR_ERR_OR_ZERO(task); } @@ -814,7 +815,7 @@ static int btrfs_dev_replace_kthread(void *data) (unsigned int)progress); } btrfs_dev_replace_continue_on_mount(fs_info); - atomic_set(&fs_info->mutually_exclusive_operation_running, 0); + clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); return 0; } diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 60a750678a82..41cb9196eaa8 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -395,8 +395,6 @@ struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info, leaf = path->nodes[0]; dir_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item); - if (verify_dir_item(fs_info, leaf, dir_item)) - return NULL; total_len = btrfs_item_size_nr(leaf, path->slots[0]); while (cur < total_len) { @@ -405,6 +403,8 @@ struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info, btrfs_dir_data_len(leaf, dir_item); name_ptr = (unsigned long)(dir_item + 1); + if (verify_dir_item(fs_info, leaf, path->slots[0], dir_item)) + return NULL; if (btrfs_dir_name_len(leaf, dir_item) == name_len && memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0) return dir_item; @@ -453,9 +453,11 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, int verify_dir_item(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf, + int slot, struct btrfs_dir_item *dir_item) { u16 namelen = BTRFS_NAME_LEN; + int ret; u8 type = btrfs_dir_type(leaf, dir_item); if (type >= BTRFS_FT_MAX) { @@ -468,10 +470,16 @@ int verify_dir_item(struct btrfs_fs_info *fs_info, if (btrfs_dir_name_len(leaf, dir_item) > namelen) { btrfs_crit(fs_info, "invalid dir item name len: %u", - (unsigned)btrfs_dir_data_len(leaf, dir_item)); + (unsigned)btrfs_dir_name_len(leaf, dir_item)); return 1; } + namelen = btrfs_dir_name_len(leaf, dir_item); + ret = btrfs_is_name_len_valid(leaf, slot, + (unsigned long)(dir_item + 1), namelen); + if (!ret) + return 1; + /* BTRFS_MAX_XATTR_SIZE is the same for all dir items */ if ((btrfs_dir_data_len(leaf, dir_item) + btrfs_dir_name_len(leaf, dir_item)) > @@ -484,3 +492,67 @@ int verify_dir_item(struct btrfs_fs_info *fs_info, return 0; } + +bool btrfs_is_name_len_valid(struct extent_buffer *leaf, int slot, + unsigned long 
start, u16 name_len) +{ + struct btrfs_fs_info *fs_info = leaf->fs_info; + struct btrfs_key key; + u32 read_start; + u32 read_end; + u32 item_start; + u32 item_end; + u32 size; + bool ret = true; + + ASSERT(start > BTRFS_LEAF_DATA_OFFSET); + + read_start = start - BTRFS_LEAF_DATA_OFFSET; + read_end = read_start + name_len; + item_start = btrfs_item_offset_nr(leaf, slot); + item_end = btrfs_item_end_nr(leaf, slot); + + btrfs_item_key_to_cpu(leaf, &key, slot); + + switch (key.type) { + case BTRFS_DIR_ITEM_KEY: + case BTRFS_XATTR_ITEM_KEY: + case BTRFS_DIR_INDEX_KEY: + size = sizeof(struct btrfs_dir_item); + break; + case BTRFS_INODE_REF_KEY: + size = sizeof(struct btrfs_inode_ref); + break; + case BTRFS_INODE_EXTREF_KEY: + size = sizeof(struct btrfs_inode_extref); + break; + case BTRFS_ROOT_REF_KEY: + case BTRFS_ROOT_BACKREF_KEY: + size = sizeof(struct btrfs_root_ref); + break; + default: + ret = false; + goto out; + } + + if (read_start < item_start) { + ret = false; + goto out; + } + if (read_end > item_end) { + ret = false; + goto out; + } + + /* there shall be item(s) before name */ + if (read_start - item_start < size) { + ret = false; + goto out; + } + +out: + if (!ret) + btrfs_crit(fs_info, "invalid dir item name len: %u", + (unsigned int)name_len); + return ret; +} diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index eb5c95569300..086dcbadce09 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -87,9 +87,8 @@ struct btrfs_end_io_wq { bio_end_io_t *end_io; void *private; struct btrfs_fs_info *info; - int error; + blk_status_t status; enum btrfs_wq_endio_type metadata; - struct list_head list; struct btrfs_work work; }; @@ -118,9 +117,9 @@ void btrfs_end_io_wq_exit(void) * just before they are sent down the IO stack. */ struct async_submit_bio { - struct inode *inode; + void *private_data; + struct btrfs_fs_info *fs_info; struct bio *bio; - struct list_head list; extent_submit_bio_hook_t *submit_bio_start; extent_submit_bio_hook_t *submit_bio_done; int mirror_num; @@ -131,7 +130,7 @@ struct async_submit_bio { */ u64 bio_offset; struct btrfs_work work; - int error; + blk_status_t status; }; /* @@ -762,7 +761,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, err: if (reads_done && test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) - btree_readahead_hook(fs_info, eb, ret); + btree_readahead_hook(eb, ret); if (ret) { /* @@ -787,7 +786,7 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror) eb->read_mirror = failed_mirror; atomic_dec(&eb->io_pages); if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) - btree_readahead_hook(eb->fs_info, eb, -EIO); + btree_readahead_hook(eb, -EIO); return -EIO; /* we fixed nothing */ } @@ -799,7 +798,7 @@ static void end_workqueue_bio(struct bio *bio) btrfs_work_func_t func; fs_info = end_io_wq->info; - end_io_wq->error = bio->bi_error; + end_io_wq->status = bio->bi_status; if (bio_op(bio) == REQ_OP_WRITE) { if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) { @@ -836,19 +835,19 @@ static void end_workqueue_bio(struct bio *bio) btrfs_queue_work(wq, &end_io_wq->work); } -int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, +blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, enum btrfs_wq_endio_type metadata) { struct btrfs_end_io_wq *end_io_wq; end_io_wq = kmem_cache_alloc(btrfs_end_io_wq_cache, GFP_NOFS); if (!end_io_wq) - return -ENOMEM; + return BLK_STS_RESOURCE; end_io_wq->private = bio->bi_private; end_io_wq->end_io = bio->bi_end_io; 
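The disk-io.c changes starting here move the bio submission and completion paths from plain int/-errno return values to the block layer's blk_status_t (bio->bi_status, BLK_STS_* codes). A minimal sketch of keeping the two error domains apart, assuming a hypothetical hook that receives a conventional -errno from some internal helper:

#include <linux/bio.h>

/* demo_submit_hook is hypothetical; 'ret' stands in for an internal -errno. */
static blk_status_t demo_submit_hook(struct bio *bio, int ret)
{
        blk_status_t status = errno_to_blk_status(ret);

        if (status) {
                /* hand the failure to the block layer in its own status space */
                bio->bi_status = status;
                bio_endio(bio);
        }
        return status;
}

In the same spirit, allocation failures that used to return -ENOMEM are reported as BLK_STS_RESOURCE, as btrfs_bio_wq_end_io() now does in the hunk above.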
end_io_wq->info = info; - end_io_wq->error = 0; + end_io_wq->status = 0; end_io_wq->bio = bio; end_io_wq->metadata = metadata; @@ -868,14 +867,14 @@ unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info) static void run_one_async_start(struct btrfs_work *work) { struct async_submit_bio *async; - int ret; + blk_status_t ret; async = container_of(work, struct async_submit_bio, work); - ret = async->submit_bio_start(async->inode, async->bio, + ret = async->submit_bio_start(async->private_data, async->bio, async->mirror_num, async->bio_flags, async->bio_offset); if (ret) - async->error = ret; + async->status = ret; } static void run_one_async_done(struct btrfs_work *work) @@ -885,7 +884,7 @@ static void run_one_async_done(struct btrfs_work *work) int limit; async = container_of(work, struct async_submit_bio, work); - fs_info = BTRFS_I(async->inode)->root->fs_info; + fs_info = async->fs_info; limit = btrfs_async_submit_limit(fs_info); limit = limit * 2 / 3; @@ -898,13 +897,13 @@ static void run_one_async_done(struct btrfs_work *work) wake_up(&fs_info->async_submit_wait); /* If an error occurred we just want to clean up the bio and move on */ - if (async->error) { - async->bio->bi_error = async->error; + if (async->status) { + async->bio->bi_status = async->status; bio_endio(async->bio); return; } - async->submit_bio_done(async->inode, async->bio, async->mirror_num, + async->submit_bio_done(async->private_data, async->bio, async->mirror_num, async->bio_flags, async->bio_offset); } @@ -916,20 +915,20 @@ static void run_one_async_free(struct btrfs_work *work) kfree(async); } -int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, - struct bio *bio, int mirror_num, - unsigned long bio_flags, - u64 bio_offset, - extent_submit_bio_hook_t *submit_bio_start, - extent_submit_bio_hook_t *submit_bio_done) +blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, + int mirror_num, unsigned long bio_flags, + u64 bio_offset, void *private_data, + extent_submit_bio_hook_t *submit_bio_start, + extent_submit_bio_hook_t *submit_bio_done) { struct async_submit_bio *async; async = kmalloc(sizeof(*async), GFP_NOFS); if (!async) - return -ENOMEM; + return BLK_STS_RESOURCE; - async->inode = inode; + async->private_data = private_data; + async->fs_info = fs_info; async->bio = bio; async->mirror_num = mirror_num; async->submit_bio_start = submit_bio_start; @@ -941,7 +940,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, async->bio_flags = bio_flags; async->bio_offset = bio_offset; - async->error = 0; + async->status = 0; atomic_inc(&fs_info->nr_async_submits); @@ -959,7 +958,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, return 0; } -static int btree_csum_one_bio(struct bio *bio) +static blk_status_t btree_csum_one_bio(struct bio *bio) { struct bio_vec *bvec; struct btrfs_root *root; @@ -972,12 +971,12 @@ static int btree_csum_one_bio(struct bio *bio) break; } - return ret; + return errno_to_blk_status(ret); } -static int __btree_submit_bio_start(struct inode *inode, struct bio *bio, - int mirror_num, unsigned long bio_flags, - u64 bio_offset) +static blk_status_t __btree_submit_bio_start(void *private_data, struct bio *bio, + int mirror_num, unsigned long bio_flags, + u64 bio_offset) { /* * when we're called for a write, we're already in the async @@ -986,11 +985,12 @@ static int __btree_submit_bio_start(struct inode *inode, struct bio *bio, return btree_csum_one_bio(bio); } -static int 
__btree_submit_bio_done(struct inode *inode, struct bio *bio, - int mirror_num, unsigned long bio_flags, - u64 bio_offset) +static blk_status_t __btree_submit_bio_done(void *private_data, struct bio *bio, + int mirror_num, unsigned long bio_flags, + u64 bio_offset) { - int ret; + struct inode *inode = private_data; + blk_status_t ret; /* * when we're called for a write, we're already in the async @@ -998,7 +998,7 @@ static int __btree_submit_bio_done(struct inode *inode, struct bio *bio, */ ret = btrfs_map_bio(btrfs_sb(inode->i_sb), bio, mirror_num, 1); if (ret) { - bio->bi_error = ret; + bio->bi_status = ret; bio_endio(bio); } return ret; @@ -1015,13 +1015,14 @@ static int check_async_write(unsigned long bio_flags) return 1; } -static int btree_submit_bio_hook(struct inode *inode, struct bio *bio, - int mirror_num, unsigned long bio_flags, - u64 bio_offset) +static blk_status_t btree_submit_bio_hook(void *private_data, struct bio *bio, + int mirror_num, unsigned long bio_flags, + u64 bio_offset) { + struct inode *inode = private_data; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); int async = check_async_write(bio_flags); - int ret; + blk_status_t ret; if (bio_op(bio) != REQ_OP_WRITE) { /* @@ -1043,8 +1044,8 @@ static int btree_submit_bio_hook(struct inode *inode, struct bio *bio, * kthread helpers are used to submit writes so that * checksumming can happen in parallel across all CPUs */ - ret = btrfs_wq_submit_bio(fs_info, inode, bio, mirror_num, 0, - bio_offset, + ret = btrfs_wq_submit_bio(fs_info, bio, mirror_num, 0, + bio_offset, private_data, __btree_submit_bio_start, __btree_submit_bio_done); } @@ -1054,7 +1055,7 @@ static int btree_submit_bio_hook(struct inode *inode, struct bio *bio, return 0; out_w_error: - bio->bi_error = ret; + bio->bi_status = ret; bio_endio(bio); return ret; } @@ -1222,10 +1223,10 @@ int btrfs_write_tree_block(struct extent_buffer *buf) buf->start + buf->len - 1); } -int btrfs_wait_tree_block_writeback(struct extent_buffer *buf) +void btrfs_wait_tree_block_writeback(struct extent_buffer *buf) { - return filemap_fdatawait_range(buf->pages[0]->mapping, - buf->start, buf->start + buf->len - 1); + filemap_fdatawait_range(buf->pages[0]->mapping, + buf->start, buf->start + buf->len - 1); } struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, @@ -1340,15 +1341,14 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, atomic_set(&root->log_writers, 0); atomic_set(&root->log_batch, 0); atomic_set(&root->orphan_inodes, 0); - atomic_set(&root->refs, 1); + refcount_set(&root->refs, 1); atomic_set(&root->will_be_snapshoted, 0); atomic64_set(&root->qgroup_meta_rsv, 0); root->log_transid = 0; root->log_transid_committed = -1; root->last_log_commit = 0; if (!dummy) - extent_io_tree_init(&root->dirty_log_pages, - fs_info->btree_inode->i_mapping); + extent_io_tree_init(&root->dirty_log_pages, NULL); memset(&root->root_key, 0, sizeof(root->root_key)); memset(&root->root_item, 0, sizeof(root->root_item)); @@ -1820,7 +1820,7 @@ static void end_workqueue_fn(struct btrfs_work *work) end_io_wq = container_of(work, struct btrfs_end_io_wq, work); bio = end_io_wq->bio; - bio->bi_error = end_io_wq->error; + bio->bi_status = end_io_wq->status; bio->bi_private = end_io_wq->private; bio->bi_end_io = end_io_wq->end_io; kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq); @@ -2309,7 +2309,7 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info) inode->i_mapping->a_ops = &btree_aops; 
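Several extent_io callbacks in this area (submit_bio_hook, __btree_submit_bio_done, btrfs_wq_submit_bio) stop taking a struct inode * and instead carry an opaque void *private_data that the owner's callback casts back to its real type; extent_io_tree_init() likewise records that opaque pointer. A minimal sketch of the pattern using hypothetical demo_* names, not the btrfs types themselves:

/* Hypothetical callback type mirroring the opaque-context style used here. */
typedef int (demo_submit_hook_t)(void *private_data, int mirror_num);

struct demo_io_tree {
        void *private_data;             /* owner context, e.g. an inode */
        demo_submit_hook_t *submit_hook;
};

/* The generic layer never interprets private_data ... */
static int demo_submit(struct demo_io_tree *tree, int mirror_num)
{
        return tree->submit_hook(tree->private_data, mirror_num);
}

/* ... only the owner's callback casts it back to the concrete type. */
struct demo_inode {
        unsigned long ino;
};

static int demo_inode_submit_hook(void *private_data, int mirror_num)
{
        struct demo_inode *inode = private_data;

        (void)mirror_num;
        /* the concrete type is only known here, not in the generic layer */
        return inode->ino != 0;
}

The btree and data-inode hooks in these hunks then recover the inode with a plain cast, as in "struct inode *inode = private_data;".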
RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); - extent_io_tree_init(&BTRFS_I(inode)->io_tree, inode->i_mapping); + extent_io_tree_init(&BTRFS_I(inode)->io_tree, inode); BTRFS_I(inode)->io_tree.track_uptodate = 0; extent_map_tree_init(&BTRFS_I(inode)->extent_tree); @@ -2626,7 +2626,6 @@ int open_ctree(struct super_block *sb, spin_lock_init(&fs_info->fs_roots_radix_lock); spin_lock_init(&fs_info->delayed_iput_lock); spin_lock_init(&fs_info->defrag_inodes_lock); - spin_lock_init(&fs_info->free_chunk_lock); spin_lock_init(&fs_info->tree_mod_seq_lock); spin_lock_init(&fs_info->super_lock); spin_lock_init(&fs_info->qgroup_op_lock); @@ -2662,12 +2661,11 @@ int open_ctree(struct super_block *sb, atomic_set(&fs_info->qgroup_op_seq, 0); atomic_set(&fs_info->reada_works_cnt, 0); atomic64_set(&fs_info->tree_mod_seq, 0); - fs_info->fs_frozen = 0; fs_info->sb = sb; fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE; fs_info->metadata_ratio = 0; fs_info->defrag_inodes = RB_ROOT; - fs_info->free_chunk_space = 0; + atomic64_set(&fs_info->free_chunk_space, 0); fs_info->tree_mod_log = RB_ROOT; fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL; fs_info->avg_delayed_ref_runtime = NSEC_PER_SEC >> 6; /* div by 64 */ @@ -2704,10 +2702,8 @@ int open_ctree(struct super_block *sb, fs_info->block_group_cache_tree = RB_ROOT; fs_info->first_logical_byte = (u64)-1; - extent_io_tree_init(&fs_info->freed_extents[0], - fs_info->btree_inode->i_mapping); - extent_io_tree_init(&fs_info->freed_extents[1], - fs_info->btree_inode->i_mapping); + extent_io_tree_init(&fs_info->freed_extents[0], NULL); + extent_io_tree_init(&fs_info->freed_extents[1], NULL); fs_info->pinned_extents = &fs_info->freed_extents[0]; set_bit(BTRFS_FS_BARRIER, &fs_info->flags); @@ -3467,10 +3463,12 @@ static int write_dev_supers(struct btrfs_device *device, * we fua the first super. The others we allow * to go down lazy. */ - if (i == 0) - ret = btrfsic_submit_bh(REQ_OP_WRITE, REQ_FUA, bh); - else + if (i == 0) { + ret = btrfsic_submit_bh(REQ_OP_WRITE, + REQ_SYNC | REQ_FUA, bh); + } else { ret = btrfsic_submit_bh(REQ_OP_WRITE, REQ_SYNC, bh); + } if (ret) errors++; } @@ -3483,64 +3481,61 @@ static int write_dev_supers(struct btrfs_device *device, */ static void btrfs_end_empty_barrier(struct bio *bio) { - if (bio->bi_private) - complete(bio->bi_private); - bio_put(bio); + complete(bio->bi_private); } /* - * trigger flushes for one the devices. If you pass wait == 0, the flushes are - * sent down. With wait == 1, it waits for the previous flush. - * - * any device where the flush fails with eopnotsupp are flagged as not-barrier - * capable + * Submit a flush request to the device if it supports it. Error handling is + * done in the waiting counterpart. 
*/ -static int write_dev_flush(struct btrfs_device *device, int wait) +static void write_dev_flush(struct btrfs_device *device) { - struct bio *bio; - int ret = 0; + struct request_queue *q = bdev_get_queue(device->bdev); + struct bio *bio = device->flush_bio; - if (device->nobarriers) - return 0; + if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags)) + return; - if (wait) { - bio = device->flush_bio; - if (!bio) - return 0; + bio_reset(bio); + bio->bi_end_io = btrfs_end_empty_barrier; + bio->bi_bdev = device->bdev; + bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH; + init_completion(&device->flush_wait); + bio->bi_private = &device->flush_wait; - wait_for_completion(&device->flush_wait); + submit_bio(bio); + device->flush_bio_sent = 1; +} - if (bio->bi_error) { - ret = bio->bi_error; - btrfs_dev_stat_inc_and_print(device, - BTRFS_DEV_STAT_FLUSH_ERRS); - } +/* + * If the flush bio has been submitted by write_dev_flush, wait for it. + */ +static blk_status_t wait_dev_flush(struct btrfs_device *device) +{ + struct bio *bio = device->flush_bio; - /* drop the reference from the wait == 0 run */ - bio_put(bio); - device->flush_bio = NULL; + if (!device->flush_bio_sent) + return 0; - return ret; - } + device->flush_bio_sent = 0; + wait_for_completion_io(&device->flush_wait); - /* - * one reference for us, and we leave it for the - * caller - */ - device->flush_bio = NULL; - bio = btrfs_io_bio_alloc(GFP_NOFS, 0); - if (!bio) - return -ENOMEM; + return bio->bi_status; +} - bio->bi_end_io = btrfs_end_empty_barrier; - bio->bi_bdev = device->bdev; - bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; - init_completion(&device->flush_wait); - bio->bi_private = &device->flush_wait; - device->flush_bio = bio; +static int check_barrier_error(struct btrfs_fs_devices *fsdevs) +{ + int dev_flush_error = 0; + struct btrfs_device *dev; + + list_for_each_entry_rcu(dev, &fsdevs->devices, dev_list) { + if (!dev->bdev || dev->last_flush_error) + dev_flush_error++; + } - bio_get(bio); - btrfsic_submit_bio(bio); + if (dev_flush_error > + fsdevs->fs_info->num_tolerated_disk_barrier_failures) + return -EIO; return 0; } @@ -3553,25 +3548,21 @@ static int barrier_all_devices(struct btrfs_fs_info *info) { struct list_head *head; struct btrfs_device *dev; - int errors_send = 0; int errors_wait = 0; - int ret; + blk_status_t ret; /* send down all the barriers */ head = &info->fs_devices->devices; list_for_each_entry_rcu(dev, head, dev_list) { if (dev->missing) continue; - if (!dev->bdev) { - errors_send++; + if (!dev->bdev) continue; - } if (!dev->in_fs_metadata || !dev->writeable) continue; - ret = write_dev_flush(dev, 0); - if (ret) - errors_send++; + write_dev_flush(dev); + dev->last_flush_error = 0; } /* wait for all the barriers */ @@ -3585,13 +3576,23 @@ static int barrier_all_devices(struct btrfs_fs_info *info) if (!dev->in_fs_metadata || !dev->writeable) continue; - ret = write_dev_flush(dev, 1); - if (ret) + ret = wait_dev_flush(dev); + if (ret) { + dev->last_flush_error = ret; + btrfs_dev_stat_inc_and_print(dev, + BTRFS_DEV_STAT_FLUSH_ERRS); errors_wait++; + } + } + + if (errors_wait) { + /* + * At some point we need the status of all disks + * to arrive at the volume status. So error checking + * is being pushed to a separate loop. 
+ */ + return check_barrier_error(info->fs_devices); } - if (errors_send > info->num_tolerated_disk_barrier_failures || - errors_wait > info->num_tolerated_disk_barrier_failures) - return -EIO; return 0; } @@ -4321,7 +4322,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, head = rb_entry(node, struct btrfs_delayed_ref_head, href_node); if (!mutex_trylock(&head->mutex)) { - atomic_inc(&head->node.refs); + refcount_inc(&head->node.refs); spin_unlock(&delayed_refs->lock); mutex_lock(&head->mutex); @@ -4575,11 +4576,6 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, cur_trans->state =TRANS_STATE_COMPLETED; wake_up(&cur_trans->commit_wait); - - /* - memset(cur_trans, 0, sizeof(*cur_trans)); - kmem_cache_free(btrfs_transaction_cachep, cur_trans); - */ } static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info) @@ -4593,7 +4589,7 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info) t = list_first_entry(&fs_info->trans_list, struct btrfs_transaction, list); if (t->state >= TRANS_STATE_COMMIT_START) { - atomic_inc(&t->use_count); + refcount_inc(&t->use_count); spin_unlock(&fs_info->trans_lock); btrfs_wait_for_commit(fs_info, t->transid); btrfs_put_transaction(t); @@ -4635,6 +4631,12 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info) return 0; } +static struct btrfs_fs_info *btree_fs_info(void *private_data) +{ + struct inode *inode = private_data; + return btrfs_sb(inode->i_sb); +} + static const struct extent_io_ops btree_extent_io_ops = { /* mandatory callbacks */ .submit_bio_hook = btree_submit_bio_hook, @@ -4642,6 +4644,8 @@ static const struct extent_io_ops btree_extent_io_ops = { /* note we're sharing with inode.c for the merge bio hook */ .merge_bio_hook = btrfs_merge_bio_hook, .readpage_io_failed_hook = btree_io_failed_hook, + .set_range_writeback = btrfs_set_range_writeback, + .tree_fs_info = btree_fs_info, /* optional callbacks */ }; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 2e0ec29bfd69..0a634d3ffc16 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -101,14 +101,14 @@ struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info); */ static inline struct btrfs_root *btrfs_grab_fs_root(struct btrfs_root *root) { - if (atomic_inc_not_zero(&root->refs)) + if (refcount_inc_not_zero(&root->refs)) return root; return NULL; } static inline void btrfs_put_fs_root(struct btrfs_root *root) { - if (atomic_dec_and_test(&root->refs)) + if (refcount_dec_and_test(&root->refs)) kfree(root); } @@ -118,16 +118,16 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid); u32 btrfs_csum_data(const char *data, u32 seed, size_t len); void btrfs_csum_final(u32 crc, u8 *result); -int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, +blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, enum btrfs_wq_endio_type metadata); -int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, - struct bio *bio, int mirror_num, - unsigned long bio_flags, u64 bio_offset, +blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, + int mirror_num, unsigned long bio_flags, + u64 bio_offset, void *private_data, extent_submit_bio_hook_t *submit_bio_start, extent_submit_bio_hook_t *submit_bio_done); unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info); int btrfs_write_tree_block(struct extent_buffer *buf); -int 
btrfs_wait_tree_block_writeback(struct extent_buffer *buf); +void btrfs_wait_tree_block_writeback(struct extent_buffer *buf); int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_add_log_tree(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 87144c9f9593..fa66980726c9 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -282,6 +282,11 @@ static int btrfs_get_name(struct dentry *parent, char *name, name_len = btrfs_inode_ref_name_len(leaf, iref); } + ret = btrfs_is_name_len_valid(leaf, path->slots[0], name_ptr, name_len); + if (!ret) { + btrfs_free_path(path); + return -EIO; + } read_extent_buffer(leaf, name, name_ptr, name_len); btrfs_free_path(path); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index be5477676cc8..375f8c728d91 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -97,10 +97,11 @@ static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache, u64 num_bytes, int delalloc); static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes); -static int __reserve_metadata_bytes(struct btrfs_root *root, +static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, u64 orig_bytes, - enum btrfs_reserve_flush_enum flush); + enum btrfs_reserve_flush_enum flush, + bool system_chunk); static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, u64 num_bytes); @@ -131,6 +132,16 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache) if (atomic_dec_and_test(&cache->count)) { WARN_ON(cache->pinned > 0); WARN_ON(cache->reserved > 0); + + /* + * If not empty, someone is still holding mutex of + * full_stripe_lock, which can only be released by caller. + * And it will definitely cause use-after-free when caller + * tries to release full stripe lock. + * + * No better way to resolve, but only to warn. 
+ */ + WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root)); kfree(cache->free_space_ctl); kfree(cache); } @@ -316,14 +327,14 @@ get_caching_control(struct btrfs_block_group_cache *cache) } ctl = cache->caching_ctl; - atomic_inc(&ctl->count); + refcount_inc(&ctl->count); spin_unlock(&cache->lock); return ctl; } static void put_caching_control(struct btrfs_caching_control *ctl) { - if (atomic_dec_and_test(&ctl->count)) + if (refcount_dec_and_test(&ctl->count)) kfree(ctl); } @@ -599,7 +610,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, init_waitqueue_head(&caching_ctl->wait); caching_ctl->block_group = cache; caching_ctl->progress = cache->key.objectid; - atomic_set(&caching_ctl->count, 1); + refcount_set(&caching_ctl->count, 1); btrfs_init_work(&caching_ctl->work, btrfs_cache_helper, caching_thread, NULL, NULL); @@ -620,7 +631,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, struct btrfs_caching_control *ctl; ctl = cache->caching_ctl; - atomic_inc(&ctl->count); + refcount_inc(&ctl->count); prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE); spin_unlock(&cache->lock); @@ -707,7 +718,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, } down_write(&fs_info->commit_root_sem); - atomic_inc(&caching_ctl->count); + refcount_inc(&caching_ctl->count); list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); up_write(&fs_info->commit_root_sem); @@ -756,6 +767,26 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info, return NULL; } +static void add_pinned_bytes(struct btrfs_fs_info *fs_info, s64 num_bytes, + u64 owner, u64 root_objectid) +{ + struct btrfs_space_info *space_info; + u64 flags; + + if (owner < BTRFS_FIRST_FREE_OBJECTID) { + if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID) + flags = BTRFS_BLOCK_GROUP_SYSTEM; + else + flags = BTRFS_BLOCK_GROUP_METADATA; + } else { + flags = BTRFS_BLOCK_GROUP_DATA; + } + + space_info = __find_space_info(fs_info, flags); + ASSERT(space_info); + percpu_counter_add(&space_info->total_bytes_pinned, num_bytes); +} + /* * after adding space to the filesystem, we need to clear the full flags * on all the space infos. 
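add_pinned_bytes() above funnels pinned-extent accounting into the per-space_info total_bytes_pinned percpu counter, using signed deltas so that unpinning is just a negative add; later hunks consult it with percpu_counter_compare() to decide whether committing a transaction would free enough pinned space. A minimal sketch of that counter pattern in isolation (demo code, not from the patch):

#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/percpu_counter.h>
#include <linux/sizes.h>

static int demo_pinned_accounting(void)
{
        struct percpu_counter pinned;
        int ret;

        ret = percpu_counter_init(&pinned, 0, GFP_KERNEL);
        if (ret)
                return ret;

        percpu_counter_add(&pinned, SZ_1M);     /* pin 1MiB */
        percpu_counter_add(&pinned, -SZ_1M);    /* negative delta == unpin */

        /* approximate compare: >= 0 means the counter is at least 'rhs' */
        if (percpu_counter_compare(&pinned, 0) < 0)
                ret = -EINVAL;

        percpu_counter_destroy(&pinned);
        return ret;
}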
@@ -892,7 +923,7 @@ search_again: head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); if (head) { if (!mutex_trylock(&head->mutex)) { - atomic_inc(&head->node.refs); + refcount_inc(&head->node.refs); spin_unlock(&delayed_refs->lock); btrfs_release_path(path); @@ -2082,6 +2113,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, u64 owner, u64 offset) { + int old_ref_mod, new_ref_mod; int ret; BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID && @@ -2089,15 +2121,21 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, if (owner < BTRFS_FIRST_FREE_OBJECTID) { ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, - num_bytes, - parent, root_objectid, (int)owner, - BTRFS_ADD_DELAYED_REF, NULL); + num_bytes, parent, + root_objectid, (int)owner, + BTRFS_ADD_DELAYED_REF, NULL, + &old_ref_mod, &new_ref_mod); } else { ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, - num_bytes, parent, root_objectid, - owner, offset, 0, - BTRFS_ADD_DELAYED_REF); + num_bytes, parent, + root_objectid, owner, offset, + 0, BTRFS_ADD_DELAYED_REF, + &old_ref_mod, &new_ref_mod); } + + if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0) + add_pinned_bytes(fs_info, -num_bytes, owner, root_objectid); + return ret; } @@ -2401,6 +2439,16 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, head = btrfs_delayed_node_to_head(node); trace_run_delayed_ref_head(fs_info, node, head, node->action); + if (head->total_ref_mod < 0) { + struct btrfs_block_group_cache *cache; + + cache = btrfs_lookup_block_group(fs_info, node->bytenr); + ASSERT(cache); + percpu_counter_add(&cache->space_info->total_bytes_pinned, + -node->num_bytes); + btrfs_put_block_group(cache); + } + if (insert_reserved) { btrfs_pin_extent(fs_info, node->bytenr, node->num_bytes, 1); @@ -2980,7 +3028,7 @@ again: struct btrfs_delayed_ref_node *ref; ref = &head->node; - atomic_inc(&ref->refs); + refcount_inc(&ref->refs); spin_unlock(&delayed_refs->lock); /* @@ -3003,7 +3051,6 @@ again: goto again; } out: - assert_qgroups_uptodate(trans); trans->can_flush_pending_bgs = can_flush_pending_bgs; return 0; } @@ -3057,7 +3104,7 @@ static noinline int check_delayed_ref(struct btrfs_root *root, } if (!mutex_trylock(&head->mutex)) { - atomic_inc(&head->node.refs); + refcount_inc(&head->node.refs); spin_unlock(&delayed_refs->lock); btrfs_release_path(path); @@ -3355,6 +3402,7 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group, struct btrfs_fs_info *fs_info = block_group->fs_info; struct btrfs_root *root = fs_info->tree_root; struct inode *inode = NULL; + struct extent_changeset *data_reserved = NULL; u64 alloc_hint = 0; int dcs = BTRFS_DC_ERROR; u64 num_pages = 0; @@ -3443,7 +3491,8 @@ again: /* * don't bother trying to write stuff out _if_ * a) we're not cached, - * b) we're with nospace_cache mount option. + * b) we're with nospace_cache mount option, + * c) we're with v2 space_cache (FREE_SPACE_TREE). 
*/ dcs = BTRFS_DC_WRITTEN; spin_unlock(&block_group->lock); @@ -3473,7 +3522,7 @@ again: num_pages *= 16; num_pages *= PAGE_SIZE; - ret = btrfs_check_data_free_space(inode, 0, num_pages); + ret = btrfs_check_data_free_space(inode, &data_reserved, 0, num_pages); if (ret) goto out_put; @@ -3504,6 +3553,7 @@ out: block_group->disk_cache_state = dcs; spin_unlock(&block_group->lock); + extent_changeset_free(data_reserved); return ret; } @@ -3914,87 +3964,83 @@ static const char *alloc_name(u64 flags) }; } -static int update_space_info(struct btrfs_fs_info *info, u64 flags, - u64 total_bytes, u64 bytes_used, - u64 bytes_readonly, - struct btrfs_space_info **space_info) +static int create_space_info(struct btrfs_fs_info *info, u64 flags, + struct btrfs_space_info **new) { - struct btrfs_space_info *found; + + struct btrfs_space_info *space_info; int i; - int factor; int ret; - if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10)) - factor = 2; - else - factor = 1; - - found = __find_space_info(info, flags); - if (found) { - spin_lock(&found->lock); - found->total_bytes += total_bytes; - found->disk_total += total_bytes * factor; - found->bytes_used += bytes_used; - found->disk_used += bytes_used * factor; - found->bytes_readonly += bytes_readonly; - if (total_bytes > 0) - found->full = 0; - space_info_add_new_bytes(info, found, total_bytes - - bytes_used - bytes_readonly); - spin_unlock(&found->lock); - *space_info = found; - return 0; - } - found = kzalloc(sizeof(*found), GFP_NOFS); - if (!found) + space_info = kzalloc(sizeof(*space_info), GFP_NOFS); + if (!space_info) return -ENOMEM; - ret = percpu_counter_init(&found->total_bytes_pinned, 0, GFP_KERNEL); + ret = percpu_counter_init(&space_info->total_bytes_pinned, 0, + GFP_KERNEL); if (ret) { - kfree(found); + kfree(space_info); return ret; } for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) - INIT_LIST_HEAD(&found->block_groups[i]); - init_rwsem(&found->groups_sem); - spin_lock_init(&found->lock); - found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; - found->total_bytes = total_bytes; - found->disk_total = total_bytes * factor; - found->bytes_used = bytes_used; - found->disk_used = bytes_used * factor; - found->bytes_pinned = 0; - found->bytes_reserved = 0; - found->bytes_readonly = bytes_readonly; - found->bytes_may_use = 0; - found->full = 0; - found->max_extent_size = 0; - found->force_alloc = CHUNK_ALLOC_NO_FORCE; - found->chunk_alloc = 0; - found->flush = 0; - init_waitqueue_head(&found->wait); - INIT_LIST_HEAD(&found->ro_bgs); - INIT_LIST_HEAD(&found->tickets); - INIT_LIST_HEAD(&found->priority_tickets); - - ret = kobject_init_and_add(&found->kobj, &space_info_ktype, + INIT_LIST_HEAD(&space_info->block_groups[i]); + init_rwsem(&space_info->groups_sem); + spin_lock_init(&space_info->lock); + space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; + space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; + init_waitqueue_head(&space_info->wait); + INIT_LIST_HEAD(&space_info->ro_bgs); + INIT_LIST_HEAD(&space_info->tickets); + INIT_LIST_HEAD(&space_info->priority_tickets); + + ret = kobject_init_and_add(&space_info->kobj, &space_info_ktype, info->space_info_kobj, "%s", - alloc_name(found->flags)); + alloc_name(space_info->flags)); if (ret) { - kfree(found); + percpu_counter_destroy(&space_info->total_bytes_pinned); + kfree(space_info); return ret; } - *space_info = found; - list_add_rcu(&found->list, &info->space_info); + *new = space_info; + list_add_rcu(&space_info->list, &info->space_info); if (flags & 
BTRFS_BLOCK_GROUP_DATA) - info->data_sinfo = found; + info->data_sinfo = space_info; return ret; } +static void update_space_info(struct btrfs_fs_info *info, u64 flags, + u64 total_bytes, u64 bytes_used, + u64 bytes_readonly, + struct btrfs_space_info **space_info) +{ + struct btrfs_space_info *found; + int factor; + + if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10)) + factor = 2; + else + factor = 1; + + found = __find_space_info(info, flags); + ASSERT(found); + spin_lock(&found->lock); + found->total_bytes += total_bytes; + found->disk_total += total_bytes * factor; + found->bytes_used += bytes_used; + found->disk_used += bytes_used * factor; + found->bytes_readonly += bytes_readonly; + if (total_bytes > 0) + found->full = 0; + space_info_add_new_bytes(info, found, total_bytes - + bytes_used - bytes_readonly); + spin_unlock(&found->lock); + *space_info = found; +} + static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) { u64 extra_flags = chunk_to_extended(flags) & @@ -4110,7 +4156,7 @@ static u64 get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags) return btrfs_reduce_alloc_profile(fs_info, flags); } -u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) +static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data) { struct btrfs_fs_info *fs_info = root->fs_info; u64 flags; @@ -4127,6 +4173,21 @@ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) return ret; } +u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info) +{ + return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_DATA); +} + +u64 btrfs_metadata_alloc_profile(struct btrfs_fs_info *fs_info) +{ + return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_METADATA); +} + +u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info) +{ + return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); +} + static u64 btrfs_space_info_used(struct btrfs_space_info *s_info, bool may_use_included) { @@ -4176,7 +4237,7 @@ again: data_sinfo->force_alloc = CHUNK_ALLOC_FORCE; spin_unlock(&data_sinfo->lock); alloc: - alloc_target = btrfs_get_alloc_profile(root, 1); + alloc_target = btrfs_data_alloc_profile(fs_info); /* * It is ugly that we don't call nolock join * transaction for the free space inode case here. @@ -4227,7 +4288,7 @@ commit_trans: if (need_commit > 0) { btrfs_start_delalloc_roots(fs_info, 0, -1); - btrfs_wait_ordered_roots(fs_info, -1, 0, + btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); } @@ -4267,12 +4328,8 @@ commit_trans: return ret; } -/* - * New check_data_free_space() with ability for precious data reservation - * Will replace old btrfs_check_data_free_space(), but for patch split, - * add a new function first and then replace it. - */ -int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len) +int btrfs_check_data_free_space(struct inode *inode, + struct extent_changeset **reserved, u64 start, u64 len) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); int ret; @@ -4287,9 +4344,11 @@ int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len) return ret; /* Use new btrfs_qgroup_reserve_data to reserve precious data space. 
*/ - ret = btrfs_qgroup_reserve_data(inode, start, len); - if (ret) + ret = btrfs_qgroup_reserve_data(inode, reserved, start, len); + if (ret < 0) btrfs_free_reserved_data_space_noquota(inode, start, len); + else + ret = 0; return ret; } @@ -4330,7 +4389,8 @@ void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, * This one will handle the per-inode data rsv map for accurate reserved * space framework. */ -void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len) +void btrfs_free_reserved_data_space(struct inode *inode, + struct extent_changeset *reserved, u64 start, u64 len) { struct btrfs_root *root = BTRFS_I(inode)->root; @@ -4340,7 +4400,7 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len) start = round_down(start, root->fs_info->sectorsize); btrfs_free_reserved_data_space_noquota(inode, start, len); - btrfs_qgroup_free_data(inode, start, len); + btrfs_qgroup_free_data(inode, reserved, start, len); } static void force_metadata_allocation(struct btrfs_fs_info *info) @@ -4452,9 +4512,8 @@ void check_system_chunk(struct btrfs_trans_handle *trans, } if (left < thresh) { - u64 flags; + u64 flags = btrfs_system_alloc_profile(fs_info); - flags = btrfs_get_alloc_profile(fs_info->chunk_root, 0); /* * Ignore failure to create system chunk. We might end up not * needing it, as we might not need to COW all nodes/leafs from @@ -4495,10 +4554,10 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, space_info = __find_space_info(fs_info, flags); if (!space_info) { - ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info); - BUG_ON(ret); /* -ENOMEM */ + ret = create_space_info(fs_info, flags, &space_info); + if (ret) + return ret; } - BUG_ON(!space_info); /* Logic error */ again: spin_lock(&space_info->lock); @@ -4603,11 +4662,11 @@ out: return ret; } -static int can_overcommit(struct btrfs_root *root, +static int can_overcommit(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, u64 bytes, - enum btrfs_reserve_flush_enum flush) + enum btrfs_reserve_flush_enum flush, + bool system_chunk) { - struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; u64 profile; u64 space_size; @@ -4618,7 +4677,11 @@ static int can_overcommit(struct btrfs_root *root, if (space_info->flags & BTRFS_BLOCK_GROUP_DATA) return 0; - profile = btrfs_get_alloc_profile(root, 0); + if (system_chunk) + profile = btrfs_system_alloc_profile(fs_info); + else + profile = btrfs_metadata_alloc_profile(fs_info); + used = btrfs_space_info_used(space_info, false); /* @@ -4635,9 +4698,7 @@ static int can_overcommit(struct btrfs_root *root, used += space_info->bytes_may_use; - spin_lock(&fs_info->free_chunk_lock); - avail = fs_info->free_chunk_space; - spin_unlock(&fs_info->free_chunk_lock); + avail = atomic64_read(&fs_info->free_chunk_space); /* * If we have dup, raid1 or raid10 then only half of the free @@ -4687,14 +4748,14 @@ static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info, } } -static inline int calc_reclaim_items_nr(struct btrfs_fs_info *fs_info, +static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info, u64 to_reclaim) { u64 bytes; - int nr; + u64 nr; bytes = btrfs_calc_trans_metadata_size(fs_info, 1); - nr = (int)div64_u64(to_reclaim, bytes); + nr = div64_u64(to_reclaim, bytes); if (!nr) nr = 1; return nr; @@ -4705,24 +4766,23 @@ static inline int calc_reclaim_items_nr(struct btrfs_fs_info *fs_info, /* * shrink metadata reservation for delalloc 
*/ -static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, - bool wait_ordered) +static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim, + u64 orig, bool wait_ordered) { - struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_block_rsv *block_rsv; struct btrfs_space_info *space_info; struct btrfs_trans_handle *trans; u64 delalloc_bytes; u64 max_reclaim; + u64 items; long time_left; unsigned long nr_pages; int loops; - int items; enum btrfs_reserve_flush_enum flush; /* Calc the number of the pages we need flush for space reservation */ items = calc_reclaim_items_nr(fs_info, to_reclaim); - to_reclaim = (u64)items * EXTENT_SIZE_PER_ITEM; + to_reclaim = items * EXTENT_SIZE_PER_ITEM; trans = (struct btrfs_trans_handle *)current->journal_info; block_rsv = &fs_info->delalloc_block_rsv; @@ -4765,7 +4825,7 @@ skip_async: else flush = BTRFS_RESERVE_NO_FLUSH; spin_lock(&space_info->lock); - if (can_overcommit(root, space_info, orig, flush)) { + if (can_overcommit(fs_info, space_info, orig, flush, false)) { spin_unlock(&space_info->lock); break; } @@ -4827,14 +4887,14 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info, spin_lock(&delayed_rsv->lock); if (percpu_counter_compare(&space_info->total_bytes_pinned, - bytes - delayed_rsv->size) >= 0) { + bytes - delayed_rsv->size) < 0) { spin_unlock(&delayed_rsv->lock); return -ENOSPC; } spin_unlock(&delayed_rsv->lock); commit: - trans = btrfs_join_transaction(fs_info->fs_root); + trans = btrfs_join_transaction(fs_info->extent_root); if (IS_ERR(trans)) return -ENOSPC; @@ -4852,7 +4912,7 @@ static int flush_space(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, u64 num_bytes, u64 orig_bytes, int state) { - struct btrfs_root *root = fs_info->fs_root; + struct btrfs_root *root = fs_info->extent_root; struct btrfs_trans_handle *trans; int nr; int ret = 0; @@ -4875,7 +4935,7 @@ static int flush_space(struct btrfs_fs_info *fs_info, break; case FLUSH_DELALLOC: case FLUSH_DELALLOC_WAIT: - shrink_delalloc(root, num_bytes * 2, orig_bytes, + shrink_delalloc(fs_info, num_bytes * 2, orig_bytes, state == FLUSH_DELALLOC_WAIT); break; case ALLOC_CHUNK: @@ -4885,7 +4945,7 @@ static int flush_space(struct btrfs_fs_info *fs_info, break; } ret = do_chunk_alloc(trans, fs_info, - btrfs_get_alloc_profile(root, 0), + btrfs_metadata_alloc_profile(fs_info), CHUNK_ALLOC_NO_FORCE); btrfs_end_transaction(trans); if (ret > 0 || ret == -ENOSPC) @@ -4906,8 +4966,9 @@ static int flush_space(struct btrfs_fs_info *fs_info, } static inline u64 -btrfs_calc_reclaim_metadata_size(struct btrfs_root *root, - struct btrfs_space_info *space_info) +btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + bool system_chunk) { struct reserve_ticket *ticket; u64 used; @@ -4922,14 +4983,14 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_root *root, return to_reclaim; to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M); - if (can_overcommit(root, space_info, to_reclaim, - BTRFS_RESERVE_FLUSH_ALL)) + if (can_overcommit(fs_info, space_info, to_reclaim, + BTRFS_RESERVE_FLUSH_ALL, system_chunk)) return 0; - used = space_info->bytes_used + space_info->bytes_reserved + - space_info->bytes_pinned + space_info->bytes_readonly + - space_info->bytes_may_use; - if (can_overcommit(root, space_info, SZ_1M, BTRFS_RESERVE_FLUSH_ALL)) + used = btrfs_space_info_used(space_info, true); + + if (can_overcommit(fs_info, space_info, SZ_1M, + BTRFS_RESERVE_FLUSH_ALL, system_chunk)) expected = 
div_factor_fine(space_info->total_bytes, 95); else expected = div_factor_fine(space_info->total_bytes, 90); @@ -4943,17 +5004,18 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_root *root, return to_reclaim; } -static inline int need_do_async_reclaim(struct btrfs_space_info *space_info, - struct btrfs_root *root, u64 used) +static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + u64 used, bool system_chunk) { - struct btrfs_fs_info *fs_info = root->fs_info; u64 thresh = div_factor_fine(space_info->total_bytes, 98); /* If we're just plain full then async reclaim just slows us down. */ if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh) return 0; - if (!btrfs_calc_reclaim_metadata_size(root, space_info)) + if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info, + system_chunk)) return 0; return (used >= thresh && !btrfs_fs_closing(fs_info) && @@ -4990,8 +5052,8 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); spin_lock(&space_info->lock); - to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root, - space_info); + to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info, + false); if (!to_reclaim) { space_info->flush = 0; spin_unlock(&space_info->lock); @@ -5013,8 +5075,9 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) spin_unlock(&space_info->lock); return; } - to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root, - space_info); + to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, + space_info, + false); ticket = list_first_entry(&space_info->tickets, struct reserve_ticket, list); if (last_tickets_id == space_info->tickets_id) { @@ -5052,8 +5115,8 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, int flush_state = FLUSH_DELAYED_ITEMS_NR; spin_lock(&space_info->lock); - to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root, - space_info); + to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info, + false); if (!to_reclaim) { spin_unlock(&space_info->lock); return; @@ -5132,12 +5195,12 @@ static int wait_reserve_ticket(struct btrfs_fs_info *fs_info, * regain reservations will be made and this will fail if there is not enough * space already. 
*/ -static int __reserve_metadata_bytes(struct btrfs_root *root, +static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, u64 orig_bytes, - enum btrfs_reserve_flush_enum flush) + enum btrfs_reserve_flush_enum flush, + bool system_chunk) { - struct btrfs_fs_info *fs_info = root->fs_info; struct reserve_ticket ticket; u64 used; int ret = 0; @@ -5159,7 +5222,8 @@ static int __reserve_metadata_bytes(struct btrfs_root *root, trace_btrfs_space_reservation(fs_info, "space_info", space_info->flags, orig_bytes, 1); ret = 0; - } else if (can_overcommit(root, space_info, orig_bytes, flush)) { + } else if (can_overcommit(fs_info, space_info, orig_bytes, flush, + system_chunk)) { space_info->bytes_may_use += orig_bytes; trace_btrfs_space_reservation(fs_info, "space_info", space_info->flags, orig_bytes, 1); @@ -5186,7 +5250,7 @@ static int __reserve_metadata_bytes(struct btrfs_root *root, orig_bytes, flush, "enospc"); queue_work(system_unbound_wq, - &root->fs_info->async_reclaim_work); + &fs_info->async_reclaim_work); } } else { list_add_tail(&ticket.list, @@ -5200,7 +5264,8 @@ static int __reserve_metadata_bytes(struct btrfs_root *root, * the async reclaim as we will panic. */ if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) && - need_do_async_reclaim(space_info, root, used) && + need_do_async_reclaim(fs_info, space_info, + used, system_chunk) && !work_busy(&fs_info->async_reclaim_work)) { trace_btrfs_trigger_flush(fs_info, space_info->flags, orig_bytes, flush, "preempt"); @@ -5258,9 +5323,10 @@ static int reserve_metadata_bytes(struct btrfs_root *root, struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; int ret; + bool system_chunk = (root == fs_info->chunk_root); - ret = __reserve_metadata_bytes(root, block_rsv->space_info, orig_bytes, - flush); + ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info, + orig_bytes, flush, system_chunk); if (ret == -ENOSPC && unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) { if (block_rsv != global_rsv && @@ -5369,9 +5435,7 @@ static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info, * overcommit, and if we can't then we just need to free up our space * and not satisfy any requests. */ - used = space_info->bytes_used + space_info->bytes_reserved + - space_info->bytes_pinned + space_info->bytes_readonly + - space_info->bytes_may_use; + used = btrfs_space_info_used(space_info, true); if (used - num_bytes >= space_info->total_bytes) check_overcommit = true; again: @@ -5383,8 +5447,7 @@ again: * adding the ticket space would be a double count. */ if (check_overcommit && - !can_overcommit(fs_info->extent_root, space_info, 0, - flush)) + !can_overcommit(fs_info, space_info, 0, flush, false)) break; if (num_bytes >= ticket->bytes) { list_del_init(&ticket->list); @@ -6113,6 +6176,8 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes) * @inode: inode we're writing to * @start: start range we are writing to * @len: how long the range we are writing to + * @reserved: mandatory parameter, record actually reserved qgroup ranges of + * current reservation. 
* * This will do the following things * @@ -6130,16 +6195,17 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes) * Return 0 for success * Return <0 for error(-ENOSPC or -EQUOT) */ -int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len) +int btrfs_delalloc_reserve_space(struct inode *inode, + struct extent_changeset **reserved, u64 start, u64 len) { int ret; - ret = btrfs_check_data_free_space(inode, start, len); + ret = btrfs_check_data_free_space(inode, reserved, start, len); if (ret < 0) return ret; ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len); if (ret < 0) - btrfs_free_reserved_data_space(inode, start, len); + btrfs_free_reserved_data_space(inode, *reserved, start, len); return ret; } @@ -6158,10 +6224,11 @@ int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len) * list if there are no delalloc bytes left. * Also it will handle the qgroup reserved space. */ -void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len) +void btrfs_delalloc_release_space(struct inode *inode, + struct extent_changeset *reserved, u64 start, u64 len) { btrfs_delalloc_release_metadata(BTRFS_I(inode), len); - btrfs_free_reserved_data_space(inode, start, len); + btrfs_free_reserved_data_space(inode, reserved, start, len); } static int update_block_group(struct btrfs_trans_handle *trans, @@ -6237,6 +6304,8 @@ static int update_block_group(struct btrfs_trans_handle *trans, trace_btrfs_space_reservation(info, "pinned", cache->space_info->flags, num_bytes, 1); + percpu_counter_add(&cache->space_info->total_bytes_pinned, + num_bytes); set_extent_dirty(info->pinned_extents, bytenr, bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); @@ -6313,6 +6382,7 @@ static int pin_down_extent(struct btrfs_fs_info *fs_info, trace_btrfs_space_reservation(fs_info, "pinned", cache->space_info->flags, num_bytes, 1); + percpu_counter_add(&cache->space_info->total_bytes_pinned, num_bytes); set_extent_dirty(fs_info->pinned_extents, bytenr, bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); return 0; @@ -6783,27 +6853,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, return 0; } -static void add_pinned_bytes(struct btrfs_fs_info *fs_info, u64 num_bytes, - u64 owner, u64 root_objectid) -{ - struct btrfs_space_info *space_info; - u64 flags; - - if (owner < BTRFS_FIRST_FREE_OBJECTID) { - if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID) - flags = BTRFS_BLOCK_GROUP_SYSTEM; - else - flags = BTRFS_BLOCK_GROUP_METADATA; - } else { - flags = BTRFS_BLOCK_GROUP_DATA; - } - - space_info = __find_space_info(fs_info, flags); - BUG_ON(!space_info); /* Logic bug */ - percpu_counter_add(&space_info->total_bytes_pinned, num_bytes); -} - - static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_fs_info *info, struct btrfs_delayed_ref_node *node, u64 parent, @@ -7026,8 +7075,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, goto out; } } - add_pinned_bytes(info, -num_bytes, owner_objectid, - root_objectid); } else { if (found_extent) { BUG_ON(is_data && refs_to_drop != @@ -7159,19 +7206,19 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, int ret; if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { - ret = btrfs_add_delayed_tree_ref(fs_info, trans, - buf->start, buf->len, - parent, + int old_ref_mod, new_ref_mod; + + ret = btrfs_add_delayed_tree_ref(fs_info, trans, buf->start, + buf->len, parent, root->root_key.objectid, btrfs_header_level(buf), - BTRFS_DROP_DELAYED_REF, NULL); + 
BTRFS_DROP_DELAYED_REF, NULL, + &old_ref_mod, &new_ref_mod); BUG_ON(ret); /* -ENOMEM */ + pin = old_ref_mod >= 0 && new_ref_mod < 0; } - if (!last_ref) - return; - - if (btrfs_header_generation(buf) == trans->transid) { + if (last_ref && btrfs_header_generation(buf) == trans->transid) { struct btrfs_block_group_cache *cache; if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { @@ -7180,6 +7227,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, goto out; } + pin = 0; cache = btrfs_lookup_block_group(fs_info, buf->start); if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { @@ -7195,18 +7243,19 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, btrfs_free_reserved_bytes(cache, buf->len, 0); btrfs_put_block_group(cache); trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len); - pin = 0; } out: if (pin) add_pinned_bytes(fs_info, buf->len, btrfs_header_level(buf), root->root_key.objectid); - /* - * Deleting the buffer, clear the corrupt flag since it doesn't matter - * anymore. - */ - clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags); + if (last_ref) { + /* + * Deleting the buffer, clear the corrupt flag since it doesn't + * matter anymore. + */ + clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags); + } } /* Can return -ENOMEM */ @@ -7215,12 +7264,12 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, u64 owner, u64 offset) { + int old_ref_mod, new_ref_mod; int ret; if (btrfs_is_testing(fs_info)) return 0; - add_pinned_bytes(fs_info, num_bytes, owner, root_objectid); /* * tree log blocks never actually go into the extent allocation @@ -7230,19 +7279,25 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID); /* unlocks the pinned mutex */ btrfs_pin_extent(fs_info, bytenr, num_bytes, 1); + old_ref_mod = new_ref_mod = 0; ret = 0; } else if (owner < BTRFS_FIRST_FREE_OBJECTID) { ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, - num_bytes, - parent, root_objectid, (int)owner, - BTRFS_DROP_DELAYED_REF, NULL); + num_bytes, parent, + root_objectid, (int)owner, + BTRFS_DROP_DELAYED_REF, NULL, + &old_ref_mod, &new_ref_mod); } else { ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, - num_bytes, - parent, root_objectid, owner, - offset, 0, - BTRFS_DROP_DELAYED_REF); + num_bytes, parent, + root_objectid, owner, offset, + 0, BTRFS_DROP_DELAYED_REF, + &old_ref_mod, &new_ref_mod); } + + if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0) + add_pinned_bytes(fs_info, num_bytes, owner, root_objectid); + return ret; } @@ -7945,7 +8000,7 @@ int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, u64 flags; int ret; - flags = btrfs_get_alloc_profile(root, is_data); + flags = get_alloc_profile_by_root(root, is_data); again: WARN_ON(num_bytes < fs_info->sectorsize); ret = find_free_extent(fs_info, ram_bytes, num_bytes, empty_size, @@ -8189,9 +8244,9 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID); ret = btrfs_add_delayed_data_ref(fs_info, trans, ins->objectid, - ins->offset, 0, - root_objectid, owner, offset, - ram_bytes, BTRFS_ADD_DELAYED_EXTENT); + ins->offset, 0, root_objectid, owner, + offset, ram_bytes, + BTRFS_ADD_DELAYED_EXTENT, NULL, NULL); return ret; } @@ -8411,11 +8466,11 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, extent_op->is_data = false; extent_op->level = level; - ret = btrfs_add_delayed_tree_ref(fs_info, trans, - 
ins.objectid, ins.offset, - parent, root_objectid, level, + ret = btrfs_add_delayed_tree_ref(fs_info, trans, ins.objectid, + ins.offset, parent, + root_objectid, level, BTRFS_ADD_DELAYED_EXTENT, - extent_op); + extent_op, NULL, NULL); if (ret) goto out_free_delayed; } @@ -9917,6 +9972,7 @@ btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info, btrfs_init_free_space_ctl(cache); atomic_set(&cache->trimming, 0); mutex_init(&cache->free_space_lock); + btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root); return cache; } @@ -10047,19 +10103,9 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) } trace_btrfs_add_block_group(info, cache, 0); - ret = update_space_info(info, cache->flags, found_key.offset, - btrfs_block_group_used(&cache->item), - cache->bytes_super, &space_info); - if (ret) { - btrfs_remove_free_space_cache(cache); - spin_lock(&info->block_group_cache_lock); - rb_erase(&cache->cache_node, - &info->block_group_cache_tree); - RB_CLEAR_NODE(&cache->cache_node); - spin_unlock(&info->block_group_cache_lock); - btrfs_put_block_group(cache); - goto error; - } + update_space_info(info, cache->flags, found_key.offset, + btrfs_block_group_used(&cache->item), + cache->bytes_super, &space_info); cache->space_info = space_info; @@ -10191,16 +10237,19 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, } #endif /* - * Call to ensure the corresponding space_info object is created and - * assigned to our block group, but don't update its counters just yet. - * We want our bg to be added to the rbtree with its ->space_info set. + * Ensure the corresponding space_info object is created and + * assigned to our block group. We want our bg to be added to the rbtree + * with its ->space_info set. */ - ret = update_space_info(fs_info, cache->flags, 0, 0, 0, - &cache->space_info); - if (ret) { - btrfs_remove_free_space_cache(cache); - btrfs_put_block_group(cache); - return ret; + cache->space_info = __find_space_info(fs_info, cache->flags); + if (!cache->space_info) { + ret = create_space_info(fs_info, cache->flags, + &cache->space_info); + if (ret) { + btrfs_remove_free_space_cache(cache); + btrfs_put_block_group(cache); + return ret; + } } ret = btrfs_add_block_group_cache(fs_info, cache); @@ -10215,18 +10264,8 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, * the rbtree, update the space info's counters. 
*/ trace_btrfs_add_block_group(fs_info, cache, 1); - ret = update_space_info(fs_info, cache->flags, size, bytes_used, + update_space_info(fs_info, cache->flags, size, bytes_used, cache->bytes_super, &cache->space_info); - if (ret) { - btrfs_remove_free_space_cache(cache); - spin_lock(&fs_info->block_group_cache_lock); - rb_erase(&cache->cache_node, - &fs_info->block_group_cache_tree); - RB_CLEAR_NODE(&cache->cache_node); - spin_unlock(&fs_info->block_group_cache_lock); - btrfs_put_block_group(cache); - return ret; - } update_global_block_rsv(fs_info); __link_block_group(cache->space_info, cache); @@ -10416,7 +10455,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, &fs_info->caching_block_groups, list) if (ctl->block_group == block_group) { caching_ctl = ctl; - atomic_inc(&caching_ctl->count); + refcount_inc(&caching_ctl->count); break; } } @@ -10774,21 +10813,21 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info) mixed = 1; flags = BTRFS_BLOCK_GROUP_SYSTEM; - ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info); + ret = create_space_info(fs_info, flags, &space_info); if (ret) goto out; if (mixed) { flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA; - ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info); + ret = create_space_info(fs_info, flags, &space_info); } else { flags = BTRFS_BLOCK_GROUP_METADATA; - ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info); + ret = create_space_info(fs_info, flags, &space_info); if (ret) goto out; flags = BTRFS_BLOCK_GROUP_DATA; - ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info); + ret = create_space_info(fs_info, flags, &space_info); } out: return ret; @@ -10850,7 +10889,7 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, spin_lock(&fs_info->trans_lock); trans = fs_info->running_transaction; if (trans) - atomic_inc(&trans->use_count); + refcount_inc(&trans->use_count); spin_unlock(&fs_info->trans_lock); ret = find_free_dev_extent_start(trans, device, minlen, start, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index a3455d216a1d..556484cf5d93 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -68,7 +68,7 @@ void btrfs_leak_debug_check(void) pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n", state->start, state->end, state->state, extent_state_in_tree(state), - atomic_read(&state->refs)); + refcount_read(&state->refs)); list_del(&state->leak_list); kmem_cache_free(extent_state_cache, state); } @@ -87,19 +87,9 @@ void btrfs_leak_debug_check(void) static inline void __btrfs_debug_check_extent_io_range(const char *caller, struct extent_io_tree *tree, u64 start, u64 end) { - struct inode *inode; - u64 isize; - - if (!tree->mapping) - return; - - inode = tree->mapping->host; - isize = i_size_read(inode); - if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) { - btrfs_debug_rl(BTRFS_I(inode)->root->fs_info, - "%s: ino %llu isize %llu odd range [%llu,%llu]", - caller, btrfs_ino(BTRFS_I(inode)), isize, start, end); - } + if (tree->ops && tree->ops->check_extent_io_range) + tree->ops->check_extent_io_range(tree->private_data, caller, + start, end); } #else #define btrfs_leak_debug_add(new, head) do {} while (0) @@ -154,9 +144,9 @@ static noinline void flush_write_bio(void *data); static inline struct btrfs_fs_info * tree_fs_info(struct extent_io_tree *tree) { - if (!tree->mapping) - return NULL; - return btrfs_sb(tree->mapping->host->i_sb); + if (tree->ops) + return tree->ops->tree_fs_info(tree->private_data); + 
return NULL; } int __init extent_io_init(void) @@ -174,7 +164,8 @@ int __init extent_io_init(void) goto free_state_cache; btrfs_bioset = bioset_create(BIO_POOL_SIZE, - offsetof(struct btrfs_io_bio, bio)); + offsetof(struct btrfs_io_bio, bio), + BIOSET_NEED_BVECS); if (!btrfs_bioset) goto free_buffer_cache; @@ -213,13 +204,13 @@ void extent_io_exit(void) } void extent_io_tree_init(struct extent_io_tree *tree, - struct address_space *mapping) + void *private_data) { tree->state = RB_ROOT; tree->ops = NULL; tree->dirty_bytes = 0; spin_lock_init(&tree->lock); - tree->mapping = mapping; + tree->private_data = private_data; } static struct extent_state *alloc_extent_state(gfp_t mask) @@ -238,7 +229,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask) state->failrec = NULL; RB_CLEAR_NODE(&state->rb_node); btrfs_leak_debug_add(&state->leak_list, &states); - atomic_set(&state->refs, 1); + refcount_set(&state->refs, 1); init_waitqueue_head(&state->wq); trace_alloc_extent_state(state, mask, _RET_IP_); return state; @@ -248,7 +239,7 @@ void free_extent_state(struct extent_state *state) { if (!state) return; - if (atomic_dec_and_test(&state->refs)) { + if (refcount_dec_and_test(&state->refs)) { WARN_ON(extent_state_in_tree(state)); btrfs_leak_debug_del(&state->leak_list); trace_free_extent_state(state, _RET_IP_); @@ -369,8 +360,7 @@ static void merge_cb(struct extent_io_tree *tree, struct extent_state *new, struct extent_state *other) { if (tree->ops && tree->ops->merge_extent_hook) - tree->ops->merge_extent_hook(tree->mapping->host, new, - other); + tree->ops->merge_extent_hook(tree->private_data, new, other); } /* @@ -421,15 +411,14 @@ static void set_state_cb(struct extent_io_tree *tree, struct extent_state *state, unsigned *bits) { if (tree->ops && tree->ops->set_bit_hook) - tree->ops->set_bit_hook(tree->mapping->host, state, bits); + tree->ops->set_bit_hook(tree->private_data, state, bits); } static void clear_state_cb(struct extent_io_tree *tree, struct extent_state *state, unsigned *bits) { if (tree->ops && tree->ops->clear_bit_hook) - tree->ops->clear_bit_hook(BTRFS_I(tree->mapping->host), - state, bits); + tree->ops->clear_bit_hook(tree->private_data, state, bits); } static void set_state_bits(struct extent_io_tree *tree, @@ -478,7 +467,7 @@ static void split_cb(struct extent_io_tree *tree, struct extent_state *orig, u64 split) { if (tree->ops && tree->ops->split_extent_hook) - tree->ops->split_extent_hook(tree->mapping->host, orig, split); + tree->ops->split_extent_hook(tree->private_data, orig, split); } /* @@ -641,7 +630,7 @@ again: if (cached && extent_state_in_tree(cached) && cached->start <= start && cached->end > start) { if (clear) - atomic_dec(&cached->refs); + refcount_dec(&cached->refs); state = cached; goto hit_next; } @@ -793,7 +782,7 @@ process_node: if (state->state & bits) { start = state->start; - atomic_inc(&state->refs); + refcount_inc(&state->refs); wait_on_state(tree, state); free_extent_state(state); goto again; @@ -834,7 +823,7 @@ static void cache_state_if_flags(struct extent_state *state, if (cached_ptr && !(*cached_ptr)) { if (!flags || (state->state & flags)) { *cached_ptr = state; - atomic_inc(&state->refs); + refcount_inc(&state->refs); } } } @@ -1402,17 +1391,7 @@ void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end) */ static void set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) { - unsigned long index = start >> PAGE_SHIFT; - unsigned long end_index = end >> PAGE_SHIFT; - struct page *page; - - while (index <= 
end_index) { - page = find_get_page(tree->mapping, index); - BUG_ON(!page); /* Pages should be in the extent_io_tree */ - set_page_writeback(page); - put_page(page); - index++; - } + tree->ops->set_range_writeback(tree->private_data, start, end); } /* find the first state struct with 'bits' set after 'start', and @@ -1538,7 +1517,7 @@ static noinline u64 find_delalloc_range(struct extent_io_tree *tree, if (!found) { *start = state->start; *cached_state = state; - atomic_inc(&state->refs); + refcount_inc(&state->refs); } found++; *end = state->end; @@ -1961,11 +1940,12 @@ static void check_page_uptodate(struct extent_io_tree *tree, struct page *page) SetPageUptodate(page); } -int free_io_failure(struct btrfs_inode *inode, struct io_failure_record *rec) +int free_io_failure(struct extent_io_tree *failure_tree, + struct extent_io_tree *io_tree, + struct io_failure_record *rec) { int ret; int err = 0; - struct extent_io_tree *failure_tree = &inode->io_failure_tree; set_state_failrec(failure_tree, rec->start, NULL); ret = clear_extent_bits(failure_tree, rec->start, @@ -1974,7 +1954,7 @@ int free_io_failure(struct btrfs_inode *inode, struct io_failure_record *rec) if (ret) err = ret; - ret = clear_extent_bits(&inode->io_tree, rec->start, + ret = clear_extent_bits(io_tree, rec->start, rec->start + rec->len - 1, EXTENT_DAMAGED); if (ret && !err) @@ -1994,29 +1974,21 @@ int free_io_failure(struct btrfs_inode *inode, struct io_failure_record *rec) * currently, there can be no more than two copies of every data bit. thus, * exactly one rewrite is required. */ -int repair_io_failure(struct btrfs_inode *inode, u64 start, u64 length, - u64 logical, struct page *page, - unsigned int pg_offset, int mirror_num) +int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, + u64 length, u64 logical, struct page *page, + unsigned int pg_offset, int mirror_num) { - struct btrfs_fs_info *fs_info = inode->root->fs_info; struct bio *bio; struct btrfs_device *dev; u64 map_length = 0; u64 sector; struct btrfs_bio *bbio = NULL; - struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; int ret; ASSERT(!(fs_info->sb->s_flags & MS_RDONLY)); BUG_ON(!mirror_num); - /* we can't repair anything in raid56 yet */ - if (btrfs_is_parity_mirror(map_tree, logical, length, mirror_num)) - return 0; - - bio = btrfs_io_bio_alloc(GFP_NOFS, 1); - if (!bio) - return -EIO; + bio = btrfs_io_bio_alloc(1); bio->bi_iter.bi_size = 0; map_length = length; @@ -2026,17 +1998,35 @@ int repair_io_failure(struct btrfs_inode *inode, u64 start, u64 length, * read repair operation. */ btrfs_bio_counter_inc_blocked(fs_info); - ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, - &map_length, &bbio, mirror_num); - if (ret) { - btrfs_bio_counter_dec(fs_info); - bio_put(bio); - return -EIO; + if (btrfs_is_parity_mirror(fs_info, logical, length, mirror_num)) { + /* + * Note that we don't use BTRFS_MAP_WRITE because it's supposed + * to update all raid stripes, but here we just want to correct + * bad stripe, thus BTRFS_MAP_READ is abused to only get the bad + * stripe's dev and sector. 
+ */ + ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical, + &map_length, &bbio, 0); + if (ret) { + btrfs_bio_counter_dec(fs_info); + bio_put(bio); + return -EIO; + } + ASSERT(bbio->mirror_num == 1); + } else { + ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, + &map_length, &bbio, mirror_num); + if (ret) { + btrfs_bio_counter_dec(fs_info); + bio_put(bio); + return -EIO; + } + BUG_ON(mirror_num != bbio->mirror_num); } - BUG_ON(mirror_num != bbio->mirror_num); - sector = bbio->stripes[mirror_num-1].physical >> 9; + + sector = bbio->stripes[bbio->mirror_num - 1].physical >> 9; bio->bi_iter.bi_sector = sector; - dev = bbio->stripes[mirror_num-1].dev; + dev = bbio->stripes[bbio->mirror_num - 1].dev; btrfs_put_bbio(bbio); if (!dev || !dev->bdev || !dev->writeable) { btrfs_bio_counter_dec(fs_info); @@ -2057,7 +2047,7 @@ int repair_io_failure(struct btrfs_inode *inode, u64 start, u64 length, btrfs_info_rl_in_rcu(fs_info, "read error corrected: ino %llu off %llu (dev %s sector %llu)", - btrfs_ino(inode), start, + ino, start, rcu_str_deref(dev->name), sector); btrfs_bio_counter_dec(fs_info); bio_put(bio); @@ -2077,8 +2067,7 @@ int repair_eb_io_failure(struct btrfs_fs_info *fs_info, for (i = 0; i < num_pages; i++) { struct page *p = eb->pages[i]; - ret = repair_io_failure(BTRFS_I(fs_info->btree_inode), start, - PAGE_SIZE, start, p, + ret = repair_io_failure(fs_info, 0, start, PAGE_SIZE, start, p, start - page_offset(p), mirror_num); if (ret) break; @@ -2092,24 +2081,24 @@ int repair_eb_io_failure(struct btrfs_fs_info *fs_info, * each time an IO finishes, we do a fast check in the IO failure tree * to see if we need to process or clean up an io_failure_record */ -int clean_io_failure(struct btrfs_inode *inode, u64 start, struct page *page, - unsigned int pg_offset) +int clean_io_failure(struct btrfs_fs_info *fs_info, + struct extent_io_tree *failure_tree, + struct extent_io_tree *io_tree, u64 start, + struct page *page, u64 ino, unsigned int pg_offset) { u64 private; struct io_failure_record *failrec; - struct btrfs_fs_info *fs_info = inode->root->fs_info; struct extent_state *state; int num_copies; int ret; private = 0; - ret = count_range_bits(&inode->io_failure_tree, &private, - (u64)-1, 1, EXTENT_DIRTY, 0); + ret = count_range_bits(failure_tree, &private, (u64)-1, 1, + EXTENT_DIRTY, 0); if (!ret) return 0; - ret = get_state_failrec(&inode->io_failure_tree, start, - &failrec); + ret = get_state_failrec(failure_tree, start, &failrec); if (ret) return 0; @@ -2125,25 +2114,25 @@ int clean_io_failure(struct btrfs_inode *inode, u64 start, struct page *page, if (fs_info->sb->s_flags & MS_RDONLY) goto out; - spin_lock(&inode->io_tree.lock); - state = find_first_extent_bit_state(&inode->io_tree, + spin_lock(&io_tree->lock); + state = find_first_extent_bit_state(io_tree, failrec->start, EXTENT_LOCKED); - spin_unlock(&inode->io_tree.lock); + spin_unlock(&io_tree->lock); if (state && state->start <= failrec->start && state->end >= failrec->start + failrec->len - 1) { num_copies = btrfs_num_copies(fs_info, failrec->logical, failrec->len); if (num_copies > 1) { - repair_io_failure(inode, start, failrec->len, - failrec->logical, page, - pg_offset, failrec->failed_mirror); + repair_io_failure(fs_info, ino, start, failrec->len, + failrec->logical, page, pg_offset, + failrec->failed_mirror); } } out: - free_io_failure(inode, failrec); + free_io_failure(failure_tree, io_tree, failrec); return 0; } @@ -2343,10 +2332,7 @@ struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio, 
struct btrfs_io_bio *btrfs_failed_bio; struct btrfs_io_bio *btrfs_bio; - bio = btrfs_io_bio_alloc(GFP_NOFS, 1); - if (!bio) - return NULL; - + bio = btrfs_io_bio_alloc(1); bio->bi_end_io = endio_func; bio->bi_iter.bi_sector = failrec->logical >> 9; bio->bi_bdev = fs_info->fs_devices->latest_bdev; @@ -2384,8 +2370,10 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, struct io_failure_record *failrec; struct inode *inode = page->mapping->host; struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; + struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; struct bio *bio; int read_mode = 0; + blk_status_t status; int ret; BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); @@ -2396,7 +2384,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, ret = btrfs_check_repairable(inode, failed_bio, failrec, failed_mirror); if (!ret) { - free_io_failure(BTRFS_I(inode), failrec); + free_io_failure(failure_tree, tree, failrec); return -EIO; } @@ -2409,7 +2397,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, (int)phy_offset, failed_bio->bi_end_io, NULL); if (!bio) { - free_io_failure(BTRFS_I(inode), failrec); + free_io_failure(failure_tree, tree, failrec); return -EIO; } bio_set_op_attrs(bio, REQ_OP_READ, read_mode); @@ -2418,11 +2406,12 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, "Repair Read Error: submitting new read[%#x] to this_mirror=%d, in_validation=%d", read_mode, failrec->this_mirror, failrec->in_validation); - ret = tree->ops->submit_bio_hook(inode, bio, failrec->this_mirror, + status = tree->ops->submit_bio_hook(tree->private_data, bio, failrec->this_mirror, failrec->bio_flags, 0); - if (ret) { - free_io_failure(BTRFS_I(inode), failrec); + if (status) { + free_io_failure(failure_tree, tree, failrec); bio_put(bio); + ret = blk_status_to_errno(status); } return ret; @@ -2445,7 +2434,7 @@ void end_extent_writepage(struct page *page, int err, u64 start, u64 end) if (!uptodate) { ClearPageUptodate(page); SetPageError(page); - ret = ret < 0 ? ret : -EIO; + ret = err < 0 ? 
err : -EIO; mapping_set_error(page->mapping, ret); } } @@ -2461,6 +2450,7 @@ void end_extent_writepage(struct page *page, int err, u64 start, u64 end) */ static void end_bio_extent_writepage(struct bio *bio) { + int error = blk_status_to_errno(bio->bi_status); struct bio_vec *bvec; u64 start; u64 end; @@ -2490,7 +2480,7 @@ static void end_bio_extent_writepage(struct bio *bio) start = page_offset(page); end = start + bvec->bv_offset + bvec->bv_len - 1; - end_extent_writepage(page, bio->bi_error, start, end); + end_extent_writepage(page, error, start, end); end_page_writeback(page); } @@ -2523,9 +2513,9 @@ endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len, static void end_bio_extent_readpage(struct bio *bio) { struct bio_vec *bvec; - int uptodate = !bio->bi_error; + int uptodate = !bio->bi_status; struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); - struct extent_io_tree *tree; + struct extent_io_tree *tree, *failure_tree; u64 offset = 0; u64 start; u64 end; @@ -2543,9 +2533,10 @@ static void end_bio_extent_readpage(struct bio *bio) btrfs_debug(fs_info, "end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u", - (u64)bio->bi_iter.bi_sector, bio->bi_error, + (u64)bio->bi_iter.bi_sector, bio->bi_status, io_bio->mirror_num); tree = &BTRFS_I(inode)->io_tree; + failure_tree = &BTRFS_I(inode)->io_failure_tree; /* We always issue full-page reads, but if some block * in a page fails to read, blk_update_request() will @@ -2575,8 +2566,10 @@ static void end_bio_extent_readpage(struct bio *bio) if (ret) uptodate = 0; else - clean_io_failure(BTRFS_I(inode), start, - page, 0); + clean_io_failure(BTRFS_I(inode)->root->fs_info, + failure_tree, tree, start, + page, + btrfs_ino(BTRFS_I(inode)), 0); } if (likely(uptodate)) @@ -2602,7 +2595,7 @@ static void end_bio_extent_readpage(struct bio *bio) ret = bio_readpage_error(bio, offset, page, start, end, mirror); if (ret == 0) { - uptodate = !bio->bi_error; + uptodate = !bio->bi_status; offset += len; continue; } @@ -2660,77 +2653,80 @@ readpage_ok: endio_readpage_release_extent(tree, extent_start, extent_len, uptodate); if (io_bio->end_io) - io_bio->end_io(io_bio, bio->bi_error); + io_bio->end_io(io_bio, blk_status_to_errno(bio->bi_status)); bio_put(bio); } /* - * this allocates from the btrfs_bioset. We're returning a bio right now - * but you can call btrfs_io_bio for the appropriate container_of magic + * Initialize the members up to but not including 'bio'. Use after allocating a + * new bio by bio_alloc_bioset as it does not initialize the bytes outside of + * 'bio' because use of __GFP_ZERO is not supported. */ -struct bio * -btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, - gfp_t gfp_flags) +static inline void btrfs_io_bio_init(struct btrfs_io_bio *btrfs_bio) { - struct btrfs_io_bio *btrfs_bio; - struct bio *bio; - - bio = bio_alloc_bioset(gfp_flags, nr_vecs, btrfs_bioset); + memset(btrfs_bio, 0, offsetof(struct btrfs_io_bio, bio)); +} - if (bio == NULL && (current->flags & PF_MEMALLOC)) { - while (!bio && (nr_vecs /= 2)) { - bio = bio_alloc_bioset(gfp_flags, - nr_vecs, btrfs_bioset); - } - } +/* + * The following helpers allocate a bio. As it's backed by a bioset, it'll + * never fail. 
We're returning a bio right now but you can call btrfs_io_bio + * for the appropriate container_of magic + */ +struct bio *btrfs_bio_alloc(struct block_device *bdev, u64 first_byte) +{ + struct bio *bio; - if (bio) { - bio->bi_bdev = bdev; - bio->bi_iter.bi_sector = first_sector; - btrfs_bio = btrfs_io_bio(bio); - btrfs_bio->csum = NULL; - btrfs_bio->csum_allocated = NULL; - btrfs_bio->end_io = NULL; - } + bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, btrfs_bioset); + bio->bi_bdev = bdev; + bio->bi_iter.bi_sector = first_byte >> 9; + btrfs_io_bio_init(btrfs_io_bio(bio)); return bio; } -struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask) +struct bio *btrfs_bio_clone(struct bio *bio) { struct btrfs_io_bio *btrfs_bio; struct bio *new; - new = bio_clone_bioset(bio, gfp_mask, btrfs_bioset); - if (new) { - btrfs_bio = btrfs_io_bio(new); - btrfs_bio->csum = NULL; - btrfs_bio->csum_allocated = NULL; - btrfs_bio->end_io = NULL; - } + /* Bio allocation backed by a bioset does not fail */ + new = bio_clone_fast(bio, GFP_NOFS, btrfs_bioset); + btrfs_bio = btrfs_io_bio(new); + btrfs_io_bio_init(btrfs_bio); + btrfs_bio->iter = bio->bi_iter; return new; } -/* this also allocates from the btrfs_bioset */ -struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs) +struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs) { - struct btrfs_io_bio *btrfs_bio; struct bio *bio; - bio = bio_alloc_bioset(gfp_mask, nr_iovecs, btrfs_bioset); - if (bio) { - btrfs_bio = btrfs_io_bio(bio); - btrfs_bio->csum = NULL; - btrfs_bio->csum_allocated = NULL; - btrfs_bio->end_io = NULL; - } + /* Bio allocation backed by a bioset does not fail */ + bio = bio_alloc_bioset(GFP_NOFS, nr_iovecs, btrfs_bioset); + btrfs_io_bio_init(btrfs_io_bio(bio)); return bio; } +struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size) +{ + struct bio *bio; + struct btrfs_io_bio *btrfs_bio; + + /* this will never fail when it's backed by a bioset */ + bio = bio_clone_fast(orig, GFP_NOFS, btrfs_bioset); + ASSERT(bio); + + btrfs_bio = btrfs_io_bio(bio); + btrfs_io_bio_init(btrfs_bio); + + bio_trim(bio, offset >> 9, size >> 9); + btrfs_bio->iter = bio->bi_iter; + return bio; +} static int __must_check submit_one_bio(struct bio *bio, int mirror_num, unsigned long bio_flags) { - int ret = 0; + blk_status_t ret = 0; struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; struct page *page = bvec->bv_page; struct extent_io_tree *tree = bio->bi_private; @@ -2742,13 +2738,13 @@ static int __must_check submit_one_bio(struct bio *bio, int mirror_num, bio_get(bio); if (tree->ops) - ret = tree->ops->submit_bio_hook(page->mapping->host, bio, + ret = tree->ops->submit_bio_hook(tree->private_data, bio, mirror_num, bio_flags, start); else btrfsic_submit_bio(bio); bio_put(bio); - return ret; + return blk_status_to_errno(ret); } static int merge_bio(struct extent_io_tree *tree, struct page *page, @@ -2805,14 +2801,11 @@ static int submit_extent_page(int op, int op_flags, struct extent_io_tree *tree, } } - bio = btrfs_bio_alloc(bdev, sector, BIO_MAX_PAGES, - GFP_NOFS | __GFP_HIGH); - if (!bio) - return -ENOMEM; - + bio = btrfs_bio_alloc(bdev, sector << 9); bio_add_page(bio, page, page_size, offset); bio->bi_end_io = end_io_func; bio->bi_private = tree; + bio->bi_write_hint = page->mapping->host->i_write_hint; bio_set_op_attrs(bio, op, op_flags); if (wbc) { wbc_init_bio(wbc, bio); @@ -2859,7 +2852,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, em = *em_cached; if (extent_map_in_tree(em) && 
start >= em->start && start < extent_map_end(em)) { - atomic_inc(&em->refs); + refcount_inc(&em->refs); return em; } @@ -2870,7 +2863,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, em = get_extent(BTRFS_I(inode), page, pg_offset, start, len, 0); if (em_cached && !IS_ERR_OR_NULL(em)) { BUG_ON(*em_cached); - atomic_inc(&em->refs); + refcount_inc(&em->refs); *em_cached = em; } return em; @@ -3694,7 +3687,7 @@ static void end_bio_extent_buffer_writepage(struct bio *bio) BUG_ON(!eb); done = atomic_dec_and_test(&eb->io_pages); - if (bio->bi_error || + if (bio->bi_status || test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) { ClearPageUptodate(page); set_btree_ioerr(page); @@ -3744,7 +3737,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, * header 0 1 2 .. N ... data_N .. data_2 data_1 data_0 */ start = btrfs_item_nr_offset(nritems); - end = btrfs_leaf_data(eb) + leaf_data_end(fs_info, eb); + end = BTRFS_LEAF_DATA_OFFSET + leaf_data_end(fs_info, eb); memzero_extent_buffer(eb, start, end - start); } @@ -4364,6 +4357,119 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode, return NULL; } +/* + * To cache previous fiemap extent + * + * Will be used for merging fiemap extent + */ +struct fiemap_cache { + u64 offset; + u64 phys; + u64 len; + u32 flags; + bool cached; +}; + +/* + * Helper to submit fiemap extent. + * + * Will try to merge current fiemap extent specified by @offset, @phys, + * @len and @flags with cached one. + * And only when we fails to merge, cached one will be submitted as + * fiemap extent. + * + * Return value is the same as fiemap_fill_next_extent(). + */ +static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo, + struct fiemap_cache *cache, + u64 offset, u64 phys, u64 len, u32 flags) +{ + int ret = 0; + + if (!cache->cached) + goto assign; + + /* + * Sanity check, extent_fiemap() should have ensured that new + * fiemap extent won't overlap with cahced one. + * Not recoverable. + * + * NOTE: Physical address can overlap, due to compression + */ + if (cache->offset + cache->len > offset) { + WARN_ON(1); + return -EINVAL; + } + + /* + * Only merges fiemap extents if + * 1) Their logical addresses are continuous + * + * 2) Their physical addresses are continuous + * So truly compressed (physical size smaller than logical size) + * extents won't get merged with each other + * + * 3) Share same flags except FIEMAP_EXTENT_LAST + * So regular extent won't get merged with prealloc extent + */ + if (cache->offset + cache->len == offset && + cache->phys + cache->len == phys && + (cache->flags & ~FIEMAP_EXTENT_LAST) == + (flags & ~FIEMAP_EXTENT_LAST)) { + cache->len += len; + cache->flags |= flags; + goto try_submit_last; + } + + /* Not mergeable, need to submit cached one */ + ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys, + cache->len, cache->flags); + cache->cached = false; + if (ret) + return ret; +assign: + cache->cached = true; + cache->offset = offset; + cache->phys = phys; + cache->len = len; + cache->flags = flags; +try_submit_last: + if (cache->flags & FIEMAP_EXTENT_LAST) { + ret = fiemap_fill_next_extent(fieinfo, cache->offset, + cache->phys, cache->len, cache->flags); + cache->cached = false; + } + return ret; +} + +/* + * Emit last fiemap cache + * + * The last fiemap cache may still be cached in the following case: + * 0 4k 8k + * |<- Fiemap range ->| + * |<------------ First extent ----------->| + * + * In this case, the first extent range will be cached but not emitted. 
+ * So we must emit it before ending extent_fiemap(). + */ +static int emit_last_fiemap_cache(struct btrfs_fs_info *fs_info, + struct fiemap_extent_info *fieinfo, + struct fiemap_cache *cache) +{ + int ret; + + if (!cache->cached) + return 0; + + ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys, + cache->len, cache->flags); + cache->cached = false; + if (ret > 0) + ret = 0; + return ret; +} + int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len, get_extent_t *get_extent) { @@ -4381,6 +4487,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, struct extent_state *cached_state = NULL; struct btrfs_path *path; struct btrfs_root *root = BTRFS_I(inode)->root; + struct fiemap_cache cache = { 0 }; int end = 0; u64 em_start = 0; u64 em_len = 0; @@ -4560,8 +4667,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, flags |= FIEMAP_EXTENT_LAST; end = 1; } - ret = fiemap_fill_next_extent(fieinfo, em_start, disko, - em_len, flags); + ret = emit_fiemap_extent(fieinfo, &cache, em_start, disko, + em_len, flags); if (ret) { if (ret == 1) ret = 0; @@ -4569,6 +4676,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, } } out_free: + if (!ret) + ret = emit_last_fiemap_cache(root->fs_info, fieinfo, &cache); free_extent_map(em); out: btrfs_free_path(path); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 3e4fad4a909d..3fb8513bf02e 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -2,6 +2,7 @@ #define __EXTENTIO__ #include <linux/rbtree.h> +#include <linux/refcount.h> #include "ulist.h" /* bits for the extent state */ @@ -14,14 +15,17 @@ #define EXTENT_DEFRAG (1U << 6) #define EXTENT_BOUNDARY (1U << 9) #define EXTENT_NODATASUM (1U << 10) -#define EXTENT_DO_ACCOUNTING (1U << 11) +#define EXTENT_CLEAR_META_RESV (1U << 11) #define EXTENT_FIRST_DELALLOC (1U << 12) #define EXTENT_NEED_WAIT (1U << 13) #define EXTENT_DAMAGED (1U << 14) #define EXTENT_NORESERVE (1U << 15) #define EXTENT_QGROUP_RESERVED (1U << 16) #define EXTENT_CLEAR_DATA_RESV (1U << 17) +#define EXTENT_DELALLOC_NEW (1U << 18) #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) +#define EXTENT_DO_ACCOUNTING (EXTENT_CLEAR_META_RESV | \ + EXTENT_CLEAR_DATA_RESV) #define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) /* @@ -88,7 +92,7 @@ struct btrfs_inode; struct btrfs_io_bio; struct io_failure_record; -typedef int (extent_submit_bio_hook_t)(struct inode *inode, struct bio *bio, +typedef blk_status_t (extent_submit_bio_hook_t)(void *private_data, struct bio *bio, int mirror_num, unsigned long bio_flags, u64 bio_offset); struct extent_io_ops { @@ -104,32 +108,36 @@ struct extent_io_ops { size_t size, struct bio *bio, unsigned long bio_flags); int (*readpage_io_failed_hook)(struct page *page, int failed_mirror); + struct btrfs_fs_info *(*tree_fs_info)(void *private_data); + void (*set_range_writeback)(void *private_data, u64 start, u64 end); /* * Optional hooks, called if the pointer is not NULL */ - int (*fill_delalloc)(struct inode *inode, struct page *locked_page, + int (*fill_delalloc)(void *private_data, struct page *locked_page, u64 start, u64 end, int *page_started, unsigned long *nr_written); int (*writepage_start_hook)(struct page *page, u64 start, u64 end); void (*writepage_end_io_hook)(struct page *page, u64 start, u64 end, struct extent_state *state, int uptodate); - void (*set_bit_hook)(struct inode *inode, struct extent_state *state, + void 
(*set_bit_hook)(void *private_data, struct extent_state *state, unsigned *bits); - void (*clear_bit_hook)(struct btrfs_inode *inode, + void (*clear_bit_hook)(void *private_data, struct extent_state *state, unsigned *bits); - void (*merge_extent_hook)(struct inode *inode, + void (*merge_extent_hook)(void *private_data, struct extent_state *new, struct extent_state *other); - void (*split_extent_hook)(struct inode *inode, + void (*split_extent_hook)(void *private_data, struct extent_state *orig, u64 split); + void (*check_extent_io_range)(void *private_data, const char *caller, + u64 start, u64 end); }; struct extent_io_tree { struct rb_root state; - struct address_space *mapping; + void *private_data; u64 dirty_bytes; int track_uptodate; spinlock_t lock; @@ -143,7 +151,7 @@ struct extent_state { /* ADD NEW ELEMENTS AFTER THIS */ wait_queue_head_t wq; - atomic_t refs; + refcount_t refs; unsigned state; struct io_failure_record *failrec; @@ -201,12 +209,46 @@ struct extent_buffer { */ struct extent_changeset { /* How many bytes are set/cleared in this operation */ - u64 bytes_changed; + unsigned int bytes_changed; /* Changed ranges */ struct ulist range_changed; }; +static inline void extent_changeset_init(struct extent_changeset *changeset) +{ + changeset->bytes_changed = 0; + ulist_init(&changeset->range_changed); +} + +static inline struct extent_changeset *extent_changeset_alloc(void) +{ + struct extent_changeset *ret; + + ret = kmalloc(sizeof(*ret), GFP_KERNEL); + if (!ret) + return NULL; + + extent_changeset_init(ret); + return ret; +} + +static inline void extent_changeset_release(struct extent_changeset *changeset) +{ + if (!changeset) + return; + changeset->bytes_changed = 0; + ulist_release(&changeset->range_changed); +} + +static inline void extent_changeset_free(struct extent_changeset *changeset) +{ + if (!changeset) + return; + extent_changeset_release(changeset); + kfree(changeset); +} + static inline void extent_set_compress_type(unsigned long *bio_flags, int compress_type) { @@ -226,8 +268,7 @@ typedef struct extent_map *(get_extent_t)(struct btrfs_inode *inode, u64 start, u64 len, int create); -void extent_io_tree_init(struct extent_io_tree *tree, - struct address_space *mapping); +void extent_io_tree_init(struct extent_io_tree *tree, void *private_data); int try_release_extent_mapping(struct extent_map_tree *map, struct extent_io_tree *tree, struct page *page, gfp_t mask); @@ -455,20 +496,21 @@ void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, u64 delalloc_end, struct page *locked_page, unsigned bits_to_clear, unsigned long page_ops); -struct bio * -btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, - gfp_t gfp_flags); -struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs); -struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask); +struct bio *btrfs_bio_alloc(struct block_device *bdev, u64 first_byte); +struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs); +struct bio *btrfs_bio_clone(struct bio *bio); +struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size); struct btrfs_fs_info; struct btrfs_inode; -int repair_io_failure(struct btrfs_inode *inode, u64 start, u64 length, - u64 logical, struct page *page, - unsigned int pg_offset, int mirror_num); -int clean_io_failure(struct btrfs_inode *inode, u64 start, - struct page *page, unsigned int pg_offset); +int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, + u64 length, u64 logical, struct page *page, + unsigned int 
pg_offset, int mirror_num); +int clean_io_failure(struct btrfs_fs_info *fs_info, + struct extent_io_tree *failure_tree, + struct extent_io_tree *io_tree, u64 start, + struct page *page, u64 ino, unsigned int pg_offset); void end_extent_writepage(struct page *page, int err, u64 start, u64 end); int repair_eb_io_failure(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, int mirror_num); @@ -503,7 +545,9 @@ struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio, struct io_failure_record *failrec, struct page *page, int pg_offset, int icsum, bio_end_io_t *endio_func, void *data); -int free_io_failure(struct btrfs_inode *inode, struct io_failure_record *rec); +int free_io_failure(struct extent_io_tree *failure_tree, + struct extent_io_tree *io_tree, + struct io_failure_record *rec); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS noinline u64 find_lock_delalloc_range(struct inode *inode, struct extent_io_tree *tree, diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 26f9ac719d20..69850155870c 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -55,7 +55,7 @@ struct extent_map *alloc_extent_map(void) em->flags = 0; em->compress_type = BTRFS_COMPRESS_NONE; em->generation = 0; - atomic_set(&em->refs, 1); + refcount_set(&em->refs, 1); INIT_LIST_HEAD(&em->list); return em; } @@ -71,8 +71,8 @@ void free_extent_map(struct extent_map *em) { if (!em) return; - WARN_ON(atomic_read(&em->refs) == 0); - if (atomic_dec_and_test(&em->refs)) { + WARN_ON(refcount_read(&em->refs) == 0); + if (refcount_dec_and_test(&em->refs)) { WARN_ON(extent_map_in_tree(em)); WARN_ON(!list_empty(&em->list)); if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags)) @@ -322,7 +322,7 @@ static inline void setup_extent_mapping(struct extent_map_tree *tree, struct extent_map *em, int modified) { - atomic_inc(&em->refs); + refcount_inc(&em->refs); em->mod_start = em->start; em->mod_len = em->len; @@ -381,7 +381,7 @@ __lookup_extent_mapping(struct extent_map_tree *tree, if (strict && !(end > em->start && start < extent_map_end(em))) return NULL; - atomic_inc(&em->refs); + refcount_inc(&em->refs); return em; } diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index eb8b8fae036b..a67b2def5413 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -2,6 +2,7 @@ #define __EXTENTMAP__ #include <linux/rbtree.h> +#include <linux/refcount.h> #define EXTENT_MAP_LAST_BYTE ((u64)-4) #define EXTENT_MAP_HOLE ((u64)-3) @@ -41,7 +42,7 @@ struct extent_map { */ struct map_lookup *map_lookup; }; - atomic_t refs; + refcount_t refs; unsigned int compress_type; struct list_head list; }; diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 64fcb31d7163..fdcb41002623 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -160,11 +160,12 @@ static void btrfs_io_bio_endio_readpage(struct btrfs_io_bio *bio, int err) kfree(bio->csum_allocated); } -static int __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, +static blk_status_t __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u64 logical_offset, u32 *dst, int dio) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct bio_vec *bvec; + struct bio_vec bvec; + struct bvec_iter iter; struct btrfs_io_bio *btrfs_bio = btrfs_io_bio(bio); struct btrfs_csum_item *item = NULL; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; @@ -177,12 +178,12 @@ static int __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u64 page_bytes_left; u32 diff; int nblocks; - int count = 0, i; + int count = 
0; u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); path = btrfs_alloc_path(); if (!path) - return -ENOMEM; + return BLK_STS_RESOURCE; nblocks = bio->bi_iter.bi_size >> inode->i_sb->s_blocksize_bits; if (!dst) { @@ -191,7 +192,7 @@ static int __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, csum_size, GFP_NOFS); if (!btrfs_bio->csum_allocated) { btrfs_free_path(path); - return -ENOMEM; + return BLK_STS_RESOURCE; } btrfs_bio->csum = btrfs_bio->csum_allocated; btrfs_bio->end_io = btrfs_io_bio_endio_readpage; @@ -206,8 +207,6 @@ static int __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, if (bio->bi_iter.bi_size > PAGE_SIZE * 8) path->reada = READA_FORWARD; - WARN_ON(bio->bi_vcnt <= 0); - /* * the free space stuff is only read when it hasn't been * updated in the current transaction. So, we can safely @@ -223,13 +222,13 @@ static int __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, if (dio) offset = logical_offset; - bio_for_each_segment_all(bvec, bio, i) { - page_bytes_left = bvec->bv_len; + bio_for_each_segment(bvec, bio, iter) { + page_bytes_left = bvec.bv_len; if (count) goto next; if (!dio) - offset = page_offset(bvec->bv_page) + bvec->bv_offset; + offset = page_offset(bvec.bv_page) + bvec.bv_offset; count = btrfs_find_ordered_sum(inode, offset, disk_bytenr, (u32 *)csum, nblocks); if (count) @@ -303,12 +302,12 @@ next: return 0; } -int btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u32 *dst) +blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u32 *dst) { return __btrfs_lookup_bio_sums(inode, bio, 0, dst, 0); } -int btrfs_lookup_bio_sums_dio(struct inode *inode, struct bio *bio, u64 offset) +blk_status_t btrfs_lookup_bio_sums_dio(struct inode *inode, struct bio *bio, u64 offset) { return __btrfs_lookup_bio_sums(inode, bio, offset, NULL, 1); } @@ -433,26 +432,26 @@ fail: return ret; } -int btrfs_csum_one_bio(struct inode *inode, struct bio *bio, +blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio, u64 file_start, int contig) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ordered_sum *sums; struct btrfs_ordered_extent *ordered = NULL; char *data; - struct bio_vec *bvec; + struct bvec_iter iter; + struct bio_vec bvec; int index; int nr_sectors; - int i, j; unsigned long total_bytes = 0; unsigned long this_sum_bytes = 0; + int i; u64 offset; - WARN_ON(bio->bi_vcnt <= 0); sums = kzalloc(btrfs_ordered_sum_size(fs_info, bio->bi_iter.bi_size), GFP_NOFS); if (!sums) - return -ENOMEM; + return BLK_STS_RESOURCE; sums->len = bio->bi_iter.bi_size; INIT_LIST_HEAD(&sums->list); @@ -465,19 +464,19 @@ int btrfs_csum_one_bio(struct inode *inode, struct bio *bio, sums->bytenr = (u64)bio->bi_iter.bi_sector << 9; index = 0; - bio_for_each_segment_all(bvec, bio, j) { + bio_for_each_segment(bvec, bio, iter) { if (!contig) - offset = page_offset(bvec->bv_page) + bvec->bv_offset; + offset = page_offset(bvec.bv_page) + bvec.bv_offset; if (!ordered) { ordered = btrfs_lookup_ordered_extent(inode, offset); BUG_ON(!ordered); /* Logic error */ } - data = kmap_atomic(bvec->bv_page); + data = kmap_atomic(bvec.bv_page); nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, - bvec->bv_len + fs_info->sectorsize + bvec.bv_len + fs_info->sectorsize - 1); for (i = 0; i < nr_sectors; i++) { @@ -504,12 +503,12 @@ int btrfs_csum_one_bio(struct inode *inode, struct bio *bio, + total_bytes; index = 0; - data = kmap_atomic(bvec->bv_page); + data = kmap_atomic(bvec.bv_page); } sums->sums[index] = ~(u32)0; 
sums->sums[index] - = btrfs_csum_data(data + bvec->bv_offset + = btrfs_csum_data(data + bvec.bv_offset + (i * fs_info->sectorsize), sums->sums[index], fs_info->sectorsize); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 520cb7230b2d..24338702ea5b 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1404,6 +1404,47 @@ fail: } +static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode, + const u64 start, + const u64 len, + struct extent_state **cached_state) +{ + u64 search_start = start; + const u64 end = start + len - 1; + + while (search_start < end) { + const u64 search_len = end - search_start + 1; + struct extent_map *em; + u64 em_len; + int ret = 0; + + em = btrfs_get_extent(inode, NULL, 0, search_start, + search_len, 0); + if (IS_ERR(em)) + return PTR_ERR(em); + + if (em->block_start != EXTENT_MAP_HOLE) + goto next; + + em_len = em->len; + if (em->start < search_start) + em_len -= search_start - em->start; + if (em_len > search_len) + em_len = search_len; + + ret = set_extent_bit(&inode->io_tree, search_start, + search_start + em_len - 1, + EXTENT_DELALLOC_NEW, + NULL, cached_state, GFP_NOFS); +next: + search_start = extent_map_end(em); + free_extent_map(em); + if (ret) + return ret; + } + return 0; +} + /* * This function locks the extent and properly waits for data=ordered extents * to finish before allowing the pages to be modified if need. @@ -1432,8 +1473,11 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, + round_up(pos + write_bytes - start_pos, fs_info->sectorsize) - 1; - if (start_pos < inode->vfs_inode.i_size) { + if (start_pos < inode->vfs_inode.i_size || + (inode->flags & BTRFS_INODE_PREALLOC)) { struct btrfs_ordered_extent *ordered; + unsigned int clear_bits; + lock_extent_bits(&inode->io_tree, start_pos, last_pos, cached_state); ordered = btrfs_lookup_ordered_range(inode, start_pos, @@ -1454,11 +1498,19 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, } if (ordered) btrfs_put_ordered_extent(ordered); - + ret = btrfs_find_new_delalloc_bytes(inode, start_pos, + last_pos - start_pos + 1, + cached_state); + clear_bits = EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG; + if (ret) + clear_bits |= EXTENT_DELALLOC_NEW | EXTENT_LOCKED; clear_extent_bit(&inode->io_tree, start_pos, - last_pos, EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, - 0, 0, cached_state, GFP_NOFS); + last_pos, clear_bits, + (clear_bits & EXTENT_LOCKED) ? 
1 : 0, + 0, cached_state, GFP_NOFS); + if (ret) + return ret; *lockstart = start_pos; *lockend = last_pos; ret = 1; @@ -1529,6 +1581,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, struct btrfs_root *root = BTRFS_I(inode)->root; struct page **pages = NULL; struct extent_state *cached_state = NULL; + struct extent_changeset *data_reserved = NULL; u64 release_bytes = 0; u64 lockstart; u64 lockend; @@ -1576,7 +1629,9 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, reserve_bytes = round_up(write_bytes + sector_offset, fs_info->sectorsize); - ret = btrfs_check_data_free_space(inode, pos, write_bytes); + extent_changeset_release(data_reserved); + ret = btrfs_check_data_free_space(inode, &data_reserved, pos, + write_bytes); if (ret < 0) { if ((BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) && @@ -1605,8 +1660,9 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, reserve_bytes); if (ret) { if (!only_release_metadata) - btrfs_free_reserved_data_space(inode, pos, - write_bytes); + btrfs_free_reserved_data_space(inode, + data_reserved, pos, + write_bytes); else btrfs_end_write_no_snapshoting(root); break; @@ -1688,8 +1744,9 @@ again: __pos = round_down(pos, fs_info->sectorsize) + (dirty_pages << PAGE_SHIFT); - btrfs_delalloc_release_space(inode, __pos, - release_bytes); + btrfs_delalloc_release_space(inode, + data_reserved, __pos, + release_bytes); } } @@ -1744,12 +1801,13 @@ again: btrfs_delalloc_release_metadata(BTRFS_I(inode), release_bytes); } else { - btrfs_delalloc_release_space(inode, - round_down(pos, fs_info->sectorsize), - release_bytes); + btrfs_delalloc_release_space(inode, data_reserved, + round_down(pos, fs_info->sectorsize), + release_bytes); } } + extent_changeset_free(data_reserved); return num_written ? 
num_written : ret; } @@ -1823,12 +1881,29 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, ssize_t num_written = 0; bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host); ssize_t err; - loff_t pos; - size_t count; + loff_t pos = iocb->ki_pos; + size_t count = iov_iter_count(from); loff_t oldsize; int clean_page = 0; - inode_lock(inode); + if ((iocb->ki_flags & IOCB_NOWAIT) && + (iocb->ki_flags & IOCB_DIRECT)) { + /* Don't sleep on inode rwsem */ + if (!inode_trylock(inode)) + return -EAGAIN; + /* + * We will allocate space in case nodatacow is not set, + * so bail + */ + if (!(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | + BTRFS_INODE_PREALLOC)) || + check_can_nocow(BTRFS_I(inode), pos, &count) <= 0) { + inode_unlock(inode); + return -EAGAIN; + } + } else + inode_lock(inode); + err = generic_write_checks(iocb, from); if (err <= 0) { inode_unlock(inode); @@ -1862,8 +1937,6 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, */ update_time_for_write(inode); - pos = iocb->ki_pos; - count = iov_iter_count(from); start_pos = round_down(pos, fs_info->sectorsize); oldsize = i_size_read(inode); if (start_pos > oldsize) { @@ -2338,17 +2411,15 @@ out: */ static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_map *em; int ret = 0; - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, *start, *len, 0); - if (IS_ERR_OR_NULL(em)) { - if (!em) - ret = -ENOMEM; - else - ret = PTR_ERR(em); - return ret; - } + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, + round_down(*start, fs_info->sectorsize), + round_up(*len, fs_info->sectorsize), 0); + if (IS_ERR(em)) + return PTR_ERR(em); /* Hole or vacuum extent(only exists in no-hole mode) */ if (em->block_start == EXTENT_MAP_HOLE) { @@ -2722,6 +2793,7 @@ static long btrfs_fallocate(struct file *file, int mode, { struct inode *inode = file_inode(file); struct extent_state *cached_state = NULL; + struct extent_changeset *data_reserved = NULL; struct falloc_range *range; struct falloc_range *tmp; struct list_head reserve_list; @@ -2835,11 +2907,8 @@ static long btrfs_fallocate(struct file *file, int mode, while (1) { em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset, alloc_end - cur_offset, 0); - if (IS_ERR_OR_NULL(em)) { - if (!em) - ret = -ENOMEM; - else - ret = PTR_ERR(em); + if (IS_ERR(em)) { + ret = PTR_ERR(em); break; } last_byte = min(extent_map_end(em), alloc_end); @@ -2854,18 +2923,20 @@ static long btrfs_fallocate(struct file *file, int mode, free_extent_map(em); break; } - ret = btrfs_qgroup_reserve_data(inode, cur_offset, - last_byte - cur_offset); - if (ret < 0) + ret = btrfs_qgroup_reserve_data(inode, &data_reserved, + cur_offset, last_byte - cur_offset); + if (ret < 0) { + free_extent_map(em); break; + } } else { /* * Do not need to reserve unwritten extent for this * range, free reserved data space first, otherwise * it'll result in false ENOSPC error. 
*/ - btrfs_free_reserved_data_space(inode, cur_offset, - last_byte - cur_offset); + btrfs_free_reserved_data_space(inode, data_reserved, + cur_offset, last_byte - cur_offset); } free_extent_map(em); cur_offset = last_byte; @@ -2884,8 +2955,9 @@ static long btrfs_fallocate(struct file *file, int mode, range->len, i_blocksize(inode), offset + len, &alloc_hint); else - btrfs_free_reserved_data_space(inode, range->start, - range->len); + btrfs_free_reserved_data_space(inode, + data_reserved, range->start, + range->len); list_del(&range->list); kfree(range); } @@ -2923,8 +2995,9 @@ out: inode_unlock(inode); /* Let go of our reservation. */ if (ret != 0) - btrfs_free_reserved_data_space(inode, alloc_start, - alloc_end - cur_offset); + btrfs_free_reserved_data_space(inode, data_reserved, + alloc_start, alloc_end - cur_offset); + extent_changeset_free(data_reserved); return ret; } @@ -3025,13 +3098,19 @@ out: return offset; } +static int btrfs_file_open(struct inode *inode, struct file *filp) +{ + filp->f_mode |= FMODE_AIO_NOWAIT; + return generic_file_open(inode, filp); +} + const struct file_operations btrfs_file_operations = { .llseek = btrfs_file_llseek, .read_iter = generic_file_read_iter, .splice_read = generic_file_splice_read, .write_iter = btrfs_file_write_iter, .mmap = btrfs_file_mmap, - .open = generic_file_open, + .open = btrfs_file_open, .release = btrfs_release_file, .fsync = btrfs_sync_file, .fallocate = btrfs_fallocate, diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index da6841efac26..c5e6180cdb8c 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -355,7 +355,7 @@ static void io_ctl_map_page(struct btrfs_io_ctl *io_ctl, int clear) io_ctl->orig = io_ctl->cur; io_ctl->size = PAGE_SIZE; if (clear) - memset(io_ctl->cur, 0, PAGE_SIZE); + clear_page(io_ctl->cur); } static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl) diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index fc0bd8406758..a5e34de06c2f 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -17,7 +17,7 @@ */ #include <linux/kernel.h> -#include <linux/vmalloc.h> +#include <linux/sched/mm.h> #include "ctree.h" #include "disk-io.h" #include "locking.h" @@ -153,21 +153,21 @@ static inline u32 free_space_bitmap_size(u64 size, u32 sectorsize) static u8 *alloc_bitmap(u32 bitmap_size) { - void *mem; + u8 *ret; + unsigned int nofs_flag; /* - * The allocation size varies, observed numbers were < 4K up to 16K. - * Using vmalloc unconditionally would be too heavy, we'll try - * contiguous allocations first. + * GFP_NOFS doesn't work with kvmalloc(), but we really can't recurse + * into the filesystem as the free space bitmap can be modified in the + * critical section of a transaction commit. + * + * TODO: push the memalloc_nofs_{save,restore}() to the caller where we + * know that recursion is unsafe. 
*/ - if (bitmap_size <= PAGE_SIZE) - return kzalloc(bitmap_size, GFP_NOFS); - - mem = kzalloc(bitmap_size, GFP_NOFS | __GFP_NOWARN); - if (mem) - return mem; - - return __vmalloc(bitmap_size, GFP_NOFS | __GFP_ZERO, PAGE_KERNEL); + nofs_flag = memalloc_nofs_save(); + ret = kvzalloc(bitmap_size, GFP_KERNEL); + memalloc_nofs_restore(nofs_flag); + return ret; } int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, @@ -1188,11 +1188,7 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info) btrfs_set_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID); clear_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags); - ret = btrfs_commit_transaction(trans); - if (ret) - return ret; - - return 0; + return btrfs_commit_transaction(trans); abort: clear_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags); @@ -1277,11 +1273,7 @@ int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info) free_extent_buffer(free_space_root->commit_root); kfree(free_space_root); - ret = btrfs_commit_transaction(trans); - if (ret) - return ret; - - return 0; + return btrfs_commit_transaction(trans); abort: btrfs_abort_transaction(trans, ret); diff --git a/fs/btrfs/hash.c b/fs/btrfs/hash.c index a97fdc156a03..baacc1866861 100644 --- a/fs/btrfs/hash.c +++ b/fs/btrfs/hash.c @@ -38,6 +38,7 @@ u32 btrfs_crc32c(u32 crc, const void *address, unsigned int length) { SHASH_DESC_ON_STACK(shash, tfm); u32 *ctx = (u32 *)shash_desc_ctx(shash); + u32 retval; int err; shash->tfm = tfm; @@ -47,5 +48,7 @@ u32 btrfs_crc32c(u32 crc, const void *address, unsigned int length) err = crypto_shash_update(shash, address, length); BUG_ON(err); - return *ctx; + retval = *ctx; + barrier_data(ctx); + return retval; } diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 5c6c20ec64d8..d02019747d00 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -400,6 +400,7 @@ int btrfs_save_ino_cache(struct btrfs_root *root, struct btrfs_path *path; struct inode *inode; struct btrfs_block_rsv *rsv; + struct extent_changeset *data_reserved = NULL; u64 num_bytes; u64 alloc_hint = 0; int ret; @@ -492,7 +493,7 @@ again: /* Just to make sure we have enough space */ prealloc += 8 * PAGE_SIZE; - ret = btrfs_delalloc_reserve_space(inode, 0, prealloc); + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, 0, prealloc); if (ret) goto out_put; @@ -516,6 +517,7 @@ out: trans->bytes_reserved = num_bytes; btrfs_free_path(path); + extent_changeset_free(data_reserved); return ret; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index aabdcb9f234f..06dea7c89bbd 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -86,7 +86,6 @@ static const struct extent_io_ops btrfs_extent_io_ops; static struct kmem_cache *btrfs_inode_cachep; struct kmem_cache *btrfs_trans_handle_cachep; -struct kmem_cache *btrfs_transaction_cachep; struct kmem_cache *btrfs_path_cachep; struct kmem_cache *btrfs_free_space_cachep; @@ -115,6 +114,31 @@ static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len, u64 ram_bytes, int compress_type, int type); +static void __endio_write_update_ordered(struct inode *inode, + const u64 offset, const u64 bytes, + const bool uptodate); + +/* + * Cleanup all submitted ordered extents in specified range to handle errors + * from the fill_dellaloc() callback. 
+ * + * NOTE: caller must ensure that when an error happens, it can not call + * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING + * and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata + * to be released, which we want to happen only when finishing the ordered + * extent (btrfs_finish_ordered_io()). Also note that the caller of the + * fill_delalloc() callback already does proper cleanup for the first page of + * the range, that is, it invokes the callback writepage_end_io_hook() for the + * range of the first page. + */ +static inline void btrfs_cleanup_ordered_extents(struct inode *inode, + const u64 offset, + const u64 bytes) +{ + return __endio_write_update_ordered(inode, offset + PAGE_SIZE, + bytes - PAGE_SIZE, false); +} + static int btrfs_dirty_inode(struct inode *inode); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS @@ -153,7 +177,6 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, char *kaddr; unsigned long ptr; struct btrfs_file_extent_item *ei; - int err = 0; int ret; size_t cur_size = size; unsigned long offset; @@ -175,10 +198,8 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, path->leave_spinning = 1; ret = btrfs_insert_empty_item(trans, root, path, &key, datasize); - if (ret) { - err = ret; + if (ret) goto fail; - } } leaf = path->nodes[0]; ei = btrfs_item_ptr(leaf, path->slots[0], @@ -233,9 +254,8 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, BTRFS_I(inode)->disk_i_size = inode->i_size; ret = btrfs_update_inode(trans, root, inode); - return ret; fail: - return err; + return ret; } @@ -325,7 +345,7 @@ out: * And at reserve time, it's always aligned to page size, so * just free one page here. */ - btrfs_qgroup_free_data(inode, 0, PAGE_SIZE); + btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE); btrfs_free_path(path); btrfs_end_transaction(trans); return ret; @@ -547,7 +567,7 @@ cont: } if (ret <= 0) { unsigned long clear_flags = EXTENT_DELALLOC | - EXTENT_DEFRAG; + EXTENT_DELALLOC_NEW | EXTENT_DEFRAG; unsigned long page_error_op; clear_flags |= (ret < 0) ? 
EXTENT_DO_ACCOUNTING : 0; @@ -565,8 +585,10 @@ cont: PAGE_SET_WRITEBACK | page_error_op | PAGE_END_WRITEBACK); - btrfs_free_reserved_data_space_noquota(inode, start, - end - start + 1); + if (ret == 0) + btrfs_free_reserved_data_space_noquota(inode, + start, + end - start + 1); goto free_pages_out; } } @@ -581,12 +603,11 @@ cont: /* * one last check to make sure the compression is really a - * win, compare the page count read with the blocks on disk + * win, compare the page count read with the blocks on disk, + * compression must free at least one sector size */ total_in = ALIGN(total_in, PAGE_SIZE); - if (total_compressed >= total_in) { - will_compress = 0; - } else { + if (total_compressed + blocksize <= total_in) { num_bytes = total_in; *num_added += 1; @@ -815,13 +836,12 @@ retry: NULL, EXTENT_LOCKED | EXTENT_DELALLOC, PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK); - ret = btrfs_submit_compressed_write(inode, + if (btrfs_submit_compressed_write(inode, async_extent->start, async_extent->ram_size, ins.objectid, ins.offset, async_extent->pages, - async_extent->nr_pages); - if (ret) { + async_extent->nr_pages)) { struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; struct page *p = async_extent->pages[0]; const u64 start = async_extent->start; @@ -852,6 +872,7 @@ out_free: async_extent->start + async_extent->ram_size - 1, NULL, EXTENT_LOCKED | EXTENT_DELALLOC | + EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK | @@ -918,10 +939,13 @@ static noinline int cow_file_range(struct inode *inode, u64 num_bytes; unsigned long ram_size; u64 disk_num_bytes; - u64 cur_alloc_size; + u64 cur_alloc_size = 0; u64 blocksize = fs_info->sectorsize; struct btrfs_key ins; struct extent_map *em; + unsigned clear_bits; + unsigned long page_ops; + bool extent_reserved = false; int ret = 0; if (btrfs_is_free_space_inode(BTRFS_I(inode))) { @@ -944,6 +968,7 @@ static noinline int cow_file_range(struct inode *inode, extent_clear_unlock_delalloc(inode, start, end, delalloc_end, NULL, EXTENT_LOCKED | EXTENT_DELALLOC | + EXTENT_DELALLOC_NEW | EXTENT_DEFRAG, PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK); @@ -966,14 +991,14 @@ static noinline int cow_file_range(struct inode *inode, start + num_bytes - 1, 0); while (disk_num_bytes > 0) { - unsigned long op; - cur_alloc_size = disk_num_bytes; ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size, fs_info->sectorsize, 0, alloc_hint, &ins, 1, 1); if (ret < 0) goto out_unlock; + cur_alloc_size = ins.offset; + extent_reserved = true; ram_size = ins.offset; em = create_io_em(inode, start, ins.offset, /* len */ @@ -988,7 +1013,6 @@ static noinline int cow_file_range(struct inode *inode, goto out_reserve; free_extent_map(em); - cur_alloc_size = ins.offset; ret = btrfs_add_ordered_extent(inode, start, ins.objectid, ram_size, cur_alloc_size, 0); if (ret) @@ -998,15 +1022,24 @@ static noinline int cow_file_range(struct inode *inode, BTRFS_DATA_RELOC_TREE_OBJECTID) { ret = btrfs_reloc_clone_csums(inode, start, cur_alloc_size); + /* + * Only drop cache here, and process as normal. + * + * We must not allow extent_clear_unlock_delalloc() + * at out_unlock label to free meta of this ordered + * extent, as its meta should be freed by + * btrfs_finish_ordered_io(). + * + * So we must continue until @start is increased to + * skip current ordered extent. 
+ */ if (ret) - goto out_drop_extent_cache; + btrfs_drop_extent_cache(BTRFS_I(inode), start, + start + ram_size - 1, 0); } btrfs_dec_block_group_reservations(fs_info, ins.objectid); - if (disk_num_bytes < cur_alloc_size) - break; - /* we're not doing compressed IO, don't unlock the first * page (which the caller expects to stay locked), don't * clear any dirty bits and don't set any writeback bits @@ -1014,18 +1047,30 @@ static noinline int cow_file_range(struct inode *inode, * Do set the Private2 bit so we know this page was properly * setup for writepage */ - op = unlock ? PAGE_UNLOCK : 0; - op |= PAGE_SET_PRIVATE2; + page_ops = unlock ? PAGE_UNLOCK : 0; + page_ops |= PAGE_SET_PRIVATE2; extent_clear_unlock_delalloc(inode, start, start + ram_size - 1, delalloc_end, locked_page, EXTENT_LOCKED | EXTENT_DELALLOC, - op); - disk_num_bytes -= cur_alloc_size; + page_ops); + if (disk_num_bytes < cur_alloc_size) + disk_num_bytes = 0; + else + disk_num_bytes -= cur_alloc_size; num_bytes -= cur_alloc_size; alloc_hint = ins.objectid + ins.offset; start += cur_alloc_size; + extent_reserved = false; + + /* + * btrfs_reloc_clone_csums() error, since start is increased + * extent_clear_unlock_delalloc() at out_unlock label won't + * free metadata of current ordered extent, we're OK to exit. + */ + if (ret) + goto out_unlock; } out: return ret; @@ -1036,12 +1081,35 @@ out_reserve: btrfs_dec_block_group_reservations(fs_info, ins.objectid); btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); out_unlock: + clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | + EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV; + page_ops = PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK | + PAGE_END_WRITEBACK; + /* + * If we reserved an extent for our delalloc range (or a subrange) and + * failed to create the respective ordered extent, then it means that + * when we reserved the extent we decremented the extent's size from + * the data space_info's bytes_may_use counter and incremented the + * space_info's bytes_reserved counter by the same amount. We must make + * sure extent_clear_unlock_delalloc() does not try to decrement again + * the data space_info's bytes_may_use counter, therefore we do not pass + * it the flag EXTENT_CLEAR_DATA_RESV. + */ + if (extent_reserved) { + extent_clear_unlock_delalloc(inode, start, + start + cur_alloc_size, + start + cur_alloc_size, + locked_page, + clear_bits, + page_ops); + start += cur_alloc_size; + if (start >= end) + goto out; + } extent_clear_unlock_delalloc(inode, start, end, delalloc_end, locked_page, - EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | - EXTENT_DELALLOC | EXTENT_DEFRAG, - PAGE_UNLOCK | PAGE_CLEAR_DIRTY | - PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK); + clear_bits | EXTENT_CLEAR_DATA_RESV, + page_ops); goto out; } @@ -1414,15 +1482,14 @@ out_check: BUG_ON(ret); /* -ENOMEM */ if (root->root_key.objectid == - BTRFS_DATA_RELOC_TREE_OBJECTID) { + BTRFS_DATA_RELOC_TREE_OBJECTID) + /* + * Error handled later, as we must prevent + * extent_clear_unlock_delalloc() in error handler + * from freeing metadata of created ordered extent. 
+ */ ret = btrfs_reloc_clone_csums(inode, cur_offset, num_bytes); - if (ret) { - if (!nolock && nocow) - btrfs_end_write_no_snapshoting(root); - goto error; - } - } extent_clear_unlock_delalloc(inode, cur_offset, cur_offset + num_bytes - 1, end, @@ -1434,6 +1501,14 @@ out_check: if (!nolock && nocow) btrfs_end_write_no_snapshoting(root); cur_offset = extent_end; + + /* + * btrfs_reloc_clone_csums() error, now we're OK to call error + * handler, as metadata for created ordered extent will only + * be freed by btrfs_finish_ordered_io(). + */ + if (ret) + goto error; if (cur_offset > end) break; } @@ -1487,10 +1562,11 @@ static inline int need_force_cow(struct inode *inode, u64 start, u64 end) /* * extent_io.c call back to do delayed allocation processing */ -static int run_delalloc_range(struct inode *inode, struct page *locked_page, +static int run_delalloc_range(void *private_data, struct page *locked_page, u64 start, u64 end, int *page_started, unsigned long *nr_written) { + struct inode *inode = private_data; int ret; int force_cow = need_force_cow(inode, start, end); @@ -1509,12 +1585,15 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page, ret = cow_file_range_async(inode, locked_page, start, end, page_started, nr_written); } + if (ret) + btrfs_cleanup_ordered_extents(inode, start, end - start + 1); return ret; } -static void btrfs_split_extent_hook(struct inode *inode, +static void btrfs_split_extent_hook(void *private_data, struct extent_state *orig, u64 split) { + struct inode *inode = private_data; u64 size; /* not delalloc, ignore it */ @@ -1549,10 +1628,11 @@ static void btrfs_split_extent_hook(struct inode *inode, * extents, such as when we are doing sequential writes, so we can properly * account for the metadata space we'll need. */ -static void btrfs_merge_extent_hook(struct inode *inode, +static void btrfs_merge_extent_hook(void *private_data, struct extent_state *new, struct extent_state *other) { + struct inode *inode = private_data; u64 new_size, old_size; u32 num_extents; @@ -1652,9 +1732,10 @@ static void btrfs_del_delalloc_inode(struct btrfs_root *root, * bytes in this file, and to maintain the list of inodes that * have pending delalloc work to be done. 
*/ -static void btrfs_set_bit_hook(struct inode *inode, +static void btrfs_set_bit_hook(void *private_data, struct extent_state *state, unsigned *bits) { + struct inode *inode = private_data; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -1693,15 +1774,24 @@ static void btrfs_set_bit_hook(struct inode *inode, btrfs_add_delalloc_inodes(root, inode); spin_unlock(&BTRFS_I(inode)->lock); } + + if (!(state->state & EXTENT_DELALLOC_NEW) && + (*bits & EXTENT_DELALLOC_NEW)) { + spin_lock(&BTRFS_I(inode)->lock); + BTRFS_I(inode)->new_delalloc_bytes += state->end + 1 - + state->start; + spin_unlock(&BTRFS_I(inode)->lock); + } } /* * extent_io.c clear_bit_hook, see set_bit_hook for why */ -static void btrfs_clear_bit_hook(struct btrfs_inode *inode, +static void btrfs_clear_bit_hook(void *private_data, struct extent_state *state, unsigned *bits) { + struct btrfs_inode *inode = BTRFS_I((struct inode *)private_data); struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); u64 len = state->end + 1 - state->start; u32 num_extents = count_max_extents(len); @@ -1722,7 +1812,7 @@ static void btrfs_clear_bit_hook(struct btrfs_inode *inode, if (*bits & EXTENT_FIRST_DELALLOC) { *bits &= ~EXTENT_FIRST_DELALLOC; - } else if (!(*bits & EXTENT_DO_ACCOUNTING)) { + } else if (!(*bits & EXTENT_CLEAR_META_RESV)) { spin_lock(&inode->lock); inode->outstanding_extents -= num_extents; spin_unlock(&inode->lock); @@ -1733,7 +1823,7 @@ static void btrfs_clear_bit_hook(struct btrfs_inode *inode, * don't need to call dellalloc_release_metadata if there is an * error. */ - if (*bits & EXTENT_DO_ACCOUNTING && + if (*bits & EXTENT_CLEAR_META_RESV && root != fs_info->tree_root) btrfs_delalloc_release_metadata(inode, len); @@ -1741,10 +1831,9 @@ static void btrfs_clear_bit_hook(struct btrfs_inode *inode, if (btrfs_is_testing(fs_info)) return; - if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID - && do_list && !(state->state & EXTENT_NORESERVE) - && (*bits & (EXTENT_DO_ACCOUNTING | - EXTENT_CLEAR_DATA_RESV))) + if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID && + do_list && !(state->state & EXTENT_NORESERVE) && + (*bits & EXTENT_CLEAR_DATA_RESV)) btrfs_free_reserved_data_space_noquota( &inode->vfs_inode, state->start, len); @@ -1759,6 +1848,14 @@ static void btrfs_clear_bit_hook(struct btrfs_inode *inode, btrfs_del_delalloc_inode(root, inode); spin_unlock(&inode->lock); } + + if ((state->state & EXTENT_DELALLOC_NEW) && + (*bits & EXTENT_DELALLOC_NEW)) { + spin_lock(&inode->lock); + ASSERT(inode->new_delalloc_bytes >= len); + inode->new_delalloc_bytes -= len; + spin_unlock(&inode->lock); + } } /* @@ -1802,11 +1899,12 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset, * At IO completion time the cums attached on the ordered extent record * are inserted into the btree */ -static int __btrfs_submit_bio_start(struct inode *inode, struct bio *bio, +static blk_status_t __btrfs_submit_bio_start(void *private_data, struct bio *bio, int mirror_num, unsigned long bio_flags, u64 bio_offset) { - int ret = 0; + struct inode *inode = private_data; + blk_status_t ret = 0; ret = btrfs_csum_one_bio(inode, bio, 0, 0); BUG_ON(ret); /* -ENOMEM */ @@ -1821,16 +1919,17 @@ static int __btrfs_submit_bio_start(struct inode *inode, struct bio *bio, * At IO completion time the cums attached on the ordered extent record * are inserted into the btree */ -static int __btrfs_submit_bio_done(struct inode *inode, struct bio *bio, +static blk_status_t __btrfs_submit_bio_done(void *private_data, struct 
bio *bio, int mirror_num, unsigned long bio_flags, u64 bio_offset) { + struct inode *inode = private_data; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - int ret; + blk_status_t ret; ret = btrfs_map_bio(fs_info, bio, mirror_num, 1); if (ret) { - bio->bi_error = ret; + bio->bi_status = ret; bio_endio(bio); } return ret; @@ -1840,14 +1939,15 @@ static int __btrfs_submit_bio_done(struct inode *inode, struct bio *bio, * extent_io.c submission hook. This does the right thing for csum calculation * on write, or reading the csums from the tree before a read */ -static int btrfs_submit_bio_hook(struct inode *inode, struct bio *bio, - int mirror_num, unsigned long bio_flags, - u64 bio_offset) +static blk_status_t btrfs_submit_bio_hook(void *private_data, struct bio *bio, + int mirror_num, unsigned long bio_flags, + u64 bio_offset) { + struct inode *inode = private_data; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA; - int ret = 0; + blk_status_t ret = 0; int skip_sum; int async = !atomic_read(&BTRFS_I(inode)->sync_writers); @@ -1877,8 +1977,8 @@ static int btrfs_submit_bio_hook(struct inode *inode, struct bio *bio, if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) goto mapit; /* we're doing a write, do the async checksumming */ - ret = btrfs_wq_submit_bio(fs_info, inode, bio, mirror_num, - bio_flags, bio_offset, + ret = btrfs_wq_submit_bio(fs_info, bio, mirror_num, bio_flags, + bio_offset, inode, __btrfs_submit_bio_start, __btrfs_submit_bio_done); goto out; @@ -1892,8 +1992,8 @@ mapit: ret = btrfs_map_bio(fs_info, bio, mirror_num, 0); out: - if (ret < 0) { - bio->bi_error = ret; + if (ret) { + bio->bi_status = ret; bio_endio(bio); } return ret; @@ -1936,6 +2036,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work) struct btrfs_writepage_fixup *fixup; struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; + struct extent_changeset *data_reserved = NULL; struct page *page; struct inode *inode; u64 page_start; @@ -1973,7 +2074,7 @@ again: goto again; } - ret = btrfs_delalloc_reserve_space(inode, page_start, + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start, PAGE_SIZE); if (ret) { mapping_set_error(page->mapping, ret); @@ -1993,6 +2094,7 @@ out_page: unlock_page(page); put_page(page); kfree(fixup); + extent_changeset_free(data_reserved); } /* @@ -2044,6 +2146,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_key ins; + u64 qg_released; int extent_inserted = 0; int ret; @@ -2099,13 +2202,17 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, ins.objectid = disk_bytenr; ins.offset = disk_num_bytes; ins.type = BTRFS_EXTENT_ITEM_KEY; - ret = btrfs_alloc_reserved_file_extent(trans, root->root_key.objectid, - btrfs_ino(BTRFS_I(inode)), file_pos, ram_bytes, &ins); + /* * Release the reserved range from inode dirty range map, as it is * already moved into delayed_ref_head */ - btrfs_qgroup_release_data(inode, file_pos, ram_bytes); + ret = btrfs_qgroup_release_data(inode, file_pos, ram_bytes); + if (ret < 0) + goto out; + qg_released = ret; + ret = btrfs_alloc_reserved_file_extent(trans, root->root_key.objectid, + btrfs_ino(BTRFS_I(inode)), file_pos, qg_released, &ins); out: btrfs_free_path(path); @@ -2791,6 +2898,13 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) 
u64 logical_len = ordered_extent->len; bool nolock; bool truncated = false; + bool range_locked = false; + bool clear_new_delalloc_bytes = false; + + if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) && + !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) && + !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags)) + clear_new_delalloc_bytes = true; nolock = btrfs_is_free_space_inode(BTRFS_I(inode)); @@ -2820,7 +2934,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) * space for NOCOW range. * As NOCOW won't cause a new delayed ref, just free the space */ - btrfs_qgroup_free_data(inode, ordered_extent->file_offset, + btrfs_qgroup_free_data(inode, NULL, ordered_extent->file_offset, ordered_extent->len); btrfs_ordered_update_i_size(inode, 0, ordered_extent); if (nolock) @@ -2839,13 +2953,14 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) goto out; } + range_locked = true; lock_extent_bits(io_tree, ordered_extent->file_offset, ordered_extent->file_offset + ordered_extent->len - 1, &cached_state); ret = test_range_bit(io_tree, ordered_extent->file_offset, ordered_extent->file_offset + ordered_extent->len - 1, - EXTENT_DEFRAG, 1, cached_state); + EXTENT_DEFRAG, 0, cached_state); if (ret) { u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item); if (0 && last_snapshot >= BTRFS_I(inode)->generation) @@ -2864,7 +2979,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) if (IS_ERR(trans)) { ret = PTR_ERR(trans); trans = NULL; - goto out_unlock; + goto out; } trans->block_rsv = &fs_info->delalloc_block_rsv; @@ -2896,7 +3011,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) trans->transid); if (ret < 0) { btrfs_abort_transaction(trans, ret); - goto out_unlock; + goto out; } add_pending_csums(trans, inode, &ordered_extent->list); @@ -2905,14 +3020,26 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) ret = btrfs_update_inode_fallback(trans, root, inode); if (ret) { /* -ENOMEM or corruption */ btrfs_abort_transaction(trans, ret); - goto out_unlock; + goto out; } ret = 0; -out_unlock: - unlock_extent_cached(io_tree, ordered_extent->file_offset, - ordered_extent->file_offset + - ordered_extent->len - 1, &cached_state, GFP_NOFS); out: + if (range_locked || clear_new_delalloc_bytes) { + unsigned int clear_bits = 0; + + if (range_locked) + clear_bits |= EXTENT_LOCKED; + if (clear_new_delalloc_bytes) + clear_bits |= EXTENT_DELALLOC_NEW; + clear_extent_bit(&BTRFS_I(inode)->io_tree, + ordered_extent->file_offset, + ordered_extent->file_offset + + ordered_extent->len - 1, + clear_bits, + (clear_bits & EXTENT_LOCKED) ? 
1 : 0, + 0, &cached_state, GFP_NOFS); + } + if (root != fs_info->tree_root) btrfs_delalloc_release_metadata(BTRFS_I(inode), ordered_extent->len); @@ -4401,9 +4528,17 @@ search_again: if (extent_type != BTRFS_FILE_EXTENT_INLINE) { item_end += btrfs_file_extent_num_bytes(leaf, fi); + + trace_btrfs_truncate_show_fi_regular( + BTRFS_I(inode), leaf, fi, + found_key.offset); } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { item_end += btrfs_file_extent_inline_len(leaf, path->slots[0], fi); + + trace_btrfs_truncate_show_fi_inline( + BTRFS_I(inode), leaf, fi, path->slots[0], + found_key.offset); } item_end--; } @@ -4603,13 +4738,6 @@ error: btrfs_free_path(path); - if (err == 0) { - /* only inline file may have last_size != new_size */ - if (new_size >= fs_info->sectorsize || - new_size > fs_info->max_inline) - ASSERT(last_size == new_size); - } - if (be_nice && bytes_deleted > SZ_32M) { unsigned long updates = trans->delayed_ref_updates; if (updates) { @@ -4642,6 +4770,7 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len, struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; + struct extent_changeset *data_reserved = NULL; char *kaddr; u32 blocksize = fs_info->sectorsize; pgoff_t index = from >> PAGE_SHIFT; @@ -4656,7 +4785,7 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len, (!len || ((len & (blocksize - 1)) == 0))) goto out; - ret = btrfs_delalloc_reserve_space(inode, + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, round_down(from, blocksize), blocksize); if (ret) goto out; @@ -4664,7 +4793,7 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len, again: page = find_or_create_page(mapping, index, mask); if (!page) { - btrfs_delalloc_release_space(inode, + btrfs_delalloc_release_space(inode, data_reserved, round_down(from, blocksize), blocksize); ret = -ENOMEM; @@ -4736,11 +4865,12 @@ again: out_unlock: if (ret) - btrfs_delalloc_release_space(inode, block_start, + btrfs_delalloc_release_space(inode, data_reserved, block_start, blocksize); unlock_page(page); put_page(page); out: + extent_changeset_free(data_reserved); return ret; } @@ -5135,7 +5265,7 @@ static void evict_inode_truncate_pages(struct inode *inode) * Note, end is the bytenr of last byte, so we need + 1 here. 
*/ if (state->state & EXTENT_DELALLOC) - btrfs_qgroup_free_data(inode, start, end - start + 1); + btrfs_qgroup_free_data(inode, NULL, start, end - start + 1); clear_extent_bit(io_tree, start, end, EXTENT_LOCKED | EXTENT_DIRTY | @@ -5748,7 +5878,6 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_item *item; struct btrfs_dir_item *di; struct btrfs_key key; struct btrfs_key found_key; @@ -5799,7 +5928,6 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) continue; } - item = btrfs_item_nr(slot); btrfs_item_key_to_cpu(leaf, &found_key, slot); if (found_key.objectid != key.objectid) @@ -5814,7 +5942,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) ctx->pos = found_key.offset; di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); - if (verify_dir_item(fs_info, leaf, di)) + if (verify_dir_item(fs_info, leaf, slot, di)) goto next; name_len = btrfs_dir_name_len(leaf, di); @@ -6735,7 +6863,6 @@ static noinline int uncompress_inline(struct btrfs_path *path, * * This also copies inline extents directly into the page. */ - struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, struct page *page, size_t pg_offset, u64 start, u64 len, @@ -6835,11 +6962,18 @@ again: found_type == BTRFS_FILE_EXTENT_PREALLOC) { extent_end = extent_start + btrfs_file_extent_num_bytes(leaf, item); + + trace_btrfs_get_extent_show_fi_regular(inode, leaf, item, + extent_start); } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { size_t size; size = btrfs_file_extent_inline_len(leaf, path->slots[0], item); extent_end = ALIGN(extent_start + size, fs_info->sectorsize); + + trace_btrfs_get_extent_show_fi_inline(inode, leaf, item, + path->slots[0], + extent_start); } next: if (start >= extent_end) { @@ -7037,19 +7171,17 @@ struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode, em = btrfs_get_extent(inode, page, pg_offset, start, len, create); if (IS_ERR(em)) return em; - if (em) { - /* - * if our em maps to - * - a hole or - * - a pre-alloc extent, - * there might actually be delalloc bytes behind it. - */ - if (em->block_start != EXTENT_MAP_HOLE && - !test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) - return em; - else - hole_em = em; - } + /* + * If our em maps to: + * - a hole or + * - a pre-alloc extent, + * there might actually be delalloc bytes behind it. 
+ */ + if (em->block_start != EXTENT_MAP_HOLE && + !test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + return em; + else + hole_em = em; /* check to see if we've wrapped (len == -1 or similar) */ end = start + len; @@ -7356,11 +7488,11 @@ out: bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end) { struct radix_tree_root *root = &inode->i_mapping->page_tree; - int found = false; + bool found = false; void **pagep = NULL; struct page *page = NULL; - int start_idx; - int end_idx; + unsigned long start_idx; + unsigned long end_idx; start_idx = start >> PAGE_SHIFT; @@ -7854,9 +7986,12 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio, bio_end_io_t *repair_endio, void *repair_arg) { struct io_failure_record *failrec; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; struct bio *bio; int isector; int read_mode = 0; + int segs; int ret; BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); @@ -7868,13 +8003,13 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio, ret = btrfs_check_dio_repairable(inode, failed_bio, failrec, failed_mirror); if (!ret) { - free_io_failure(BTRFS_I(inode), failrec); + free_io_failure(failure_tree, io_tree, failrec); return -EIO; } - if ((failed_bio->bi_vcnt > 1) - || (failed_bio->bi_io_vec->bv_len - > btrfs_inode_sectorsize(inode))) + segs = bio_segments(failed_bio); + if (segs > 1 || + (failed_bio->bi_io_vec->bv_len > btrfs_inode_sectorsize(inode))) read_mode |= REQ_FAILFAST_DEV; isector = start - btrfs_io_bio(failed_bio)->logical; @@ -7882,7 +8017,7 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio, bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page, pgoff, isector, repair_endio, repair_arg); if (!bio) { - free_io_failure(BTRFS_I(inode), failrec); + free_io_failure(failure_tree, io_tree, failrec); return -EIO; } bio_set_op_attrs(bio, REQ_OP_READ, read_mode); @@ -7893,7 +8028,7 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio, ret = submit_dio_repair_bio(inode, bio, failrec->this_mirror); if (ret) { - free_io_failure(BTRFS_I(inode), failrec); + free_io_failure(failure_tree, io_tree, failrec); bio_put(bio); } @@ -7910,19 +8045,24 @@ struct btrfs_retry_complete { static void btrfs_retry_endio_nocsum(struct bio *bio) { struct btrfs_retry_complete *done = bio->bi_private; + struct inode *inode = done->inode; struct bio_vec *bvec; + struct extent_io_tree *io_tree, *failure_tree; int i; - if (bio->bi_error) + if (bio->bi_status) goto end; ASSERT(bio->bi_vcnt == 1); - ASSERT(bio->bi_io_vec->bv_len == btrfs_inode_sectorsize(done->inode)); + io_tree = &BTRFS_I(inode)->io_tree; + failure_tree = &BTRFS_I(inode)->io_failure_tree; + ASSERT(bio->bi_io_vec->bv_len == btrfs_inode_sectorsize(inode)); done->uptodate = 1; bio_for_each_segment_all(bvec, bio, i) - clean_io_failure(BTRFS_I(done->inode), done->start, - bvec->bv_page, 0); + clean_io_failure(BTRFS_I(inode)->root->fs_info, failure_tree, + io_tree, done->start, bvec->bv_page, + btrfs_ino(BTRFS_I(inode)), 0); end: complete(&done->done); bio_put(bio); @@ -7932,36 +8072,40 @@ static int __btrfs_correct_data_nocsum(struct inode *inode, struct btrfs_io_bio *io_bio) { struct btrfs_fs_info *fs_info; - struct bio_vec *bvec; + struct bio_vec bvec; + struct bvec_iter iter; struct btrfs_retry_complete done; u64 start; unsigned int pgoff; u32 sectorsize; int nr_sectors; - int i; int ret; + int err = 0; fs_info = BTRFS_I(inode)->root->fs_info; 
sectorsize = fs_info->sectorsize; start = io_bio->logical; done.inode = inode; + io_bio->bio.bi_iter = io_bio->iter; - bio_for_each_segment_all(bvec, &io_bio->bio, i) { - nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len); - pgoff = bvec->bv_offset; + bio_for_each_segment(bvec, &io_bio->bio, iter) { + nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len); + pgoff = bvec.bv_offset; next_block_or_try_again: done.uptodate = 0; done.start = start; init_completion(&done.done); - ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, + ret = dio_read_error(inode, &io_bio->bio, bvec.bv_page, pgoff, start, start + sectorsize - 1, io_bio->mirror_num, btrfs_retry_endio_nocsum, &done); - if (ret) - return ret; + if (ret) { + err = ret; + goto next; + } wait_for_completion(&done.done); @@ -7970,6 +8114,7 @@ next_block_or_try_again: goto next_block_or_try_again; } +next: start += sectorsize; nr_sectors--; @@ -7980,19 +8125,21 @@ next_block_or_try_again: } } - return 0; + return err; } static void btrfs_retry_endio(struct bio *bio) { struct btrfs_retry_complete *done = bio->bi_private; struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); + struct extent_io_tree *io_tree, *failure_tree; + struct inode *inode = done->inode; struct bio_vec *bvec; int uptodate; int ret; int i; - if (bio->bi_error) + if (bio->bi_status) goto end; uptodate = 1; @@ -8000,13 +8147,19 @@ static void btrfs_retry_endio(struct bio *bio) ASSERT(bio->bi_vcnt == 1); ASSERT(bio->bi_io_vec->bv_len == btrfs_inode_sectorsize(done->inode)); + io_tree = &BTRFS_I(inode)->io_tree; + failure_tree = &BTRFS_I(inode)->io_failure_tree; + bio_for_each_segment_all(bvec, bio, i) { - ret = __readpage_endio_check(done->inode, io_bio, i, - bvec->bv_page, bvec->bv_offset, - done->start, bvec->bv_len); + ret = __readpage_endio_check(inode, io_bio, i, bvec->bv_page, + bvec->bv_offset, done->start, + bvec->bv_len); if (!ret) - clean_io_failure(BTRFS_I(done->inode), done->start, - bvec->bv_page, bvec->bv_offset); + clean_io_failure(BTRFS_I(inode)->root->fs_info, + failure_tree, io_tree, done->start, + bvec->bv_page, + btrfs_ino(BTRFS_I(inode)), + bvec->bv_offset); else uptodate = 0; } @@ -8017,11 +8170,12 @@ end: bio_put(bio); } -static int __btrfs_subio_endio_read(struct inode *inode, - struct btrfs_io_bio *io_bio, int err) +static blk_status_t __btrfs_subio_endio_read(struct inode *inode, + struct btrfs_io_bio *io_bio, blk_status_t err) { struct btrfs_fs_info *fs_info; - struct bio_vec *bvec; + struct bio_vec bvec; + struct bvec_iter iter; struct btrfs_retry_complete done; u64 start; u64 offset = 0; @@ -8029,7 +8183,7 @@ static int __btrfs_subio_endio_read(struct inode *inode, int nr_sectors; unsigned int pgoff; int csum_pos; - int i; + bool uptodate = (err == 0); int ret; fs_info = BTRFS_I(inode)->root->fs_info; @@ -8038,29 +8192,31 @@ static int __btrfs_subio_endio_read(struct inode *inode, err = 0; start = io_bio->logical; done.inode = inode; + io_bio->bio.bi_iter = io_bio->iter; - bio_for_each_segment_all(bvec, &io_bio->bio, i) { - nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len); + bio_for_each_segment(bvec, &io_bio->bio, iter) { + nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len); - pgoff = bvec->bv_offset; + pgoff = bvec.bv_offset; next_block: - csum_pos = BTRFS_BYTES_TO_BLKS(fs_info, offset); - ret = __readpage_endio_check(inode, io_bio, csum_pos, - bvec->bv_page, pgoff, start, - sectorsize); - if (likely(!ret)) - goto next; + if (uptodate) { + csum_pos = BTRFS_BYTES_TO_BLKS(fs_info, offset); + ret = __readpage_endio_check(inode, 
io_bio, csum_pos, + bvec.bv_page, pgoff, start, sectorsize); + if (likely(!ret)) + goto next; + } try_again: done.uptodate = 0; done.start = start; init_completion(&done.done); - ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, + ret = dio_read_error(inode, &io_bio->bio, bvec.bv_page, pgoff, start, start + sectorsize - 1, io_bio->mirror_num, btrfs_retry_endio, &done); if (ret) { - err = ret; + err = errno_to_blk_status(ret); goto next; } @@ -8087,8 +8243,8 @@ next: return err; } -static int btrfs_subio_endio_read(struct inode *inode, - struct btrfs_io_bio *io_bio, int err) +static blk_status_t btrfs_subio_endio_read(struct inode *inode, + struct btrfs_io_bio *io_bio, blk_status_t err) { bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; @@ -8108,10 +8264,13 @@ static void btrfs_endio_direct_read(struct bio *bio) struct inode *inode = dip->inode; struct bio *dio_bio; struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); - int err = bio->bi_error; + blk_status_t err = bio->bi_status; - if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED) + if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED) { err = btrfs_subio_endio_read(inode, io_bio, err); + if (!err) + bio->bi_status = 0; + } unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset, dip->logical_offset + dip->bytes - 1); @@ -8119,25 +8278,34 @@ static void btrfs_endio_direct_read(struct bio *bio) kfree(dip); - dio_bio->bi_error = bio->bi_error; - dio_end_io(dio_bio, bio->bi_error); + dio_bio->bi_status = bio->bi_status; + dio_end_io(dio_bio); if (io_bio->end_io) - io_bio->end_io(io_bio, err); + io_bio->end_io(io_bio, blk_status_to_errno(err)); bio_put(bio); } -static void btrfs_endio_direct_write_update_ordered(struct inode *inode, - const u64 offset, - const u64 bytes, - const int uptodate) +static void __endio_write_update_ordered(struct inode *inode, + const u64 offset, const u64 bytes, + const bool uptodate) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ordered_extent *ordered = NULL; + struct btrfs_workqueue *wq; + btrfs_work_func_t func; u64 ordered_offset = offset; u64 ordered_bytes = bytes; int ret; + if (btrfs_is_free_space_inode(BTRFS_I(inode))) { + wq = fs_info->endio_freespace_worker; + func = btrfs_freespace_write_helper; + } else { + wq = fs_info->endio_write_workers; + func = btrfs_endio_write_helper; + } + again: ret = btrfs_dec_test_first_ordered_pending(inode, &ordered, &ordered_offset, @@ -8146,9 +8314,8 @@ again: if (!ret) goto out_test; - btrfs_init_work(&ordered->work, btrfs_endio_write_helper, - finish_ordered_fn, NULL, NULL); - btrfs_queue_work(fs_info->endio_write_workers, &ordered->work); + btrfs_init_work(&ordered->work, func, finish_ordered_fn, NULL, NULL); + btrfs_queue_work(wq, &ordered->work); out_test: /* * our bio might span multiple ordered extents. 
If we haven't @@ -8166,23 +8333,22 @@ static void btrfs_endio_direct_write(struct bio *bio) struct btrfs_dio_private *dip = bio->bi_private; struct bio *dio_bio = dip->dio_bio; - btrfs_endio_direct_write_update_ordered(dip->inode, - dip->logical_offset, - dip->bytes, - !bio->bi_error); + __endio_write_update_ordered(dip->inode, dip->logical_offset, + dip->bytes, !bio->bi_status); kfree(dip); - dio_bio->bi_error = bio->bi_error; - dio_end_io(dio_bio, bio->bi_error); + dio_bio->bi_status = bio->bi_status; + dio_end_io(dio_bio); bio_put(bio); } -static int __btrfs_submit_bio_start_direct_io(struct inode *inode, +static blk_status_t __btrfs_submit_bio_start_direct_io(void *private_data, struct bio *bio, int mirror_num, unsigned long bio_flags, u64 offset) { - int ret; + struct inode *inode = private_data; + blk_status_t ret; ret = btrfs_csum_one_bio(inode, bio, offset, 1); BUG_ON(ret); /* -ENOMEM */ return 0; @@ -8191,7 +8357,7 @@ static int __btrfs_submit_bio_start_direct_io(struct inode *inode, static void btrfs_end_dio_bio(struct bio *bio) { struct btrfs_dio_private *dip = bio->bi_private; - int err = bio->bi_error; + blk_status_t err = bio->bi_status; if (err) btrfs_warn(BTRFS_I(dip->inode)->root->fs_info, @@ -8221,31 +8387,21 @@ static void btrfs_end_dio_bio(struct bio *bio) if (dip->errors) { bio_io_error(dip->orig_bio); } else { - dip->dio_bio->bi_error = 0; + dip->dio_bio->bi_status = 0; bio_endio(dip->orig_bio); } out: bio_put(bio); } -static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev, - u64 first_sector, gfp_t gfp_flags) -{ - struct bio *bio; - bio = btrfs_bio_alloc(bdev, first_sector, BIO_MAX_PAGES, gfp_flags); - if (bio) - bio_associate_current(bio); - return bio; -} - -static inline int btrfs_lookup_and_bind_dio_csum(struct inode *inode, +static inline blk_status_t btrfs_lookup_and_bind_dio_csum(struct inode *inode, struct btrfs_dio_private *dip, struct bio *bio, u64 file_offset) { struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); struct btrfs_io_bio *orig_io_bio = btrfs_io_bio(dip->orig_bio); - int ret; + blk_status_t ret; /* * We load all the csum data we need when we submit @@ -8276,7 +8432,7 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_dio_private *dip = bio->bi_private; bool write = bio_op(bio) == REQ_OP_WRITE; - int ret; + blk_status_t ret; if (async_submit) async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers); @@ -8293,8 +8449,8 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, goto map; if (write && async_submit) { - ret = btrfs_wq_submit_bio(fs_info, inode, bio, 0, 0, - file_offset, + ret = btrfs_wq_submit_bio(fs_info, bio, 0, 0, + file_offset, inode, __btrfs_submit_bio_start_direct_io, __btrfs_submit_bio_done); goto err; @@ -8324,103 +8480,83 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip, { struct inode *inode = dip->inode; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; struct bio *bio; struct bio *orig_bio = dip->orig_bio; - struct bio_vec *bvec; u64 start_sector = orig_bio->bi_iter.bi_sector; u64 file_offset = dip->logical_offset; - u64 submit_len = 0; u64 map_length; - u32 blocksize = fs_info->sectorsize; int async_submit = 0; - int nr_sectors; + u64 submit_len; + int clone_offset = 0; + int clone_len; int ret; - int i, j; map_length = orig_bio->bi_iter.bi_size; + submit_len = map_length; ret = btrfs_map_block(fs_info, 
btrfs_op(orig_bio), start_sector << 9, &map_length, NULL, 0); if (ret) return -EIO; - if (map_length >= orig_bio->bi_iter.bi_size) { + if (map_length >= submit_len) { bio = orig_bio; dip->flags |= BTRFS_DIO_ORIG_BIO_SUBMITTED; goto submit; } /* async crcs make it difficult to collect full stripe writes. */ - if (btrfs_get_alloc_profile(root, 1) & BTRFS_BLOCK_GROUP_RAID56_MASK) + if (btrfs_data_alloc_profile(fs_info) & BTRFS_BLOCK_GROUP_RAID56_MASK) async_submit = 0; else async_submit = 1; - bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS); - if (!bio) - return -ENOMEM; - - bio->bi_opf = orig_bio->bi_opf; - bio->bi_private = dip; - bio->bi_end_io = btrfs_end_dio_bio; - btrfs_io_bio(bio)->logical = file_offset; + /* bio split */ + ASSERT(map_length <= INT_MAX); atomic_inc(&dip->pending_bios); + do { + clone_len = min_t(int, submit_len, map_length); - bio_for_each_segment_all(bvec, orig_bio, j) { - nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len); - i = 0; -next_block: - if (unlikely(map_length < submit_len + blocksize || - bio_add_page(bio, bvec->bv_page, blocksize, - bvec->bv_offset + (i * blocksize)) < blocksize)) { - /* - * inc the count before we submit the bio so - * we know the end IO handler won't happen before - * we inc the count. Otherwise, the dip might get freed - * before we're done setting it up - */ - atomic_inc(&dip->pending_bios); - ret = __btrfs_submit_dio_bio(bio, inode, - file_offset, skip_sum, - async_submit); - if (ret) { - bio_put(bio); - atomic_dec(&dip->pending_bios); - goto out_err; - } - - start_sector += submit_len >> 9; - file_offset += submit_len; + /* + * This will never fail as it's passing GPF_NOFS and + * the allocation is backed by btrfs_bioset. + */ + bio = btrfs_bio_clone_partial(orig_bio, clone_offset, + clone_len); + bio->bi_private = dip; + bio->bi_end_io = btrfs_end_dio_bio; + btrfs_io_bio(bio)->logical = file_offset; + + ASSERT(submit_len >= clone_len); + submit_len -= clone_len; + if (submit_len == 0) + break; - submit_len = 0; + /* + * Increase the count before we submit the bio so we know + * the end IO handler won't happen before we increase the + * count. Otherwise, the dip might get freed before we're + * done setting it up. 
+ */ + atomic_inc(&dip->pending_bios); - bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, - start_sector, GFP_NOFS); - if (!bio) - goto out_err; - bio->bi_opf = orig_bio->bi_opf; - bio->bi_private = dip; - bio->bi_end_io = btrfs_end_dio_bio; - btrfs_io_bio(bio)->logical = file_offset; + ret = __btrfs_submit_dio_bio(bio, inode, file_offset, skip_sum, + async_submit); + if (ret) { + bio_put(bio); + atomic_dec(&dip->pending_bios); + goto out_err; + } - map_length = orig_bio->bi_iter.bi_size; - ret = btrfs_map_block(fs_info, btrfs_op(orig_bio), - start_sector << 9, - &map_length, NULL, 0); - if (ret) { - bio_put(bio); - goto out_err; - } + clone_offset += clone_len; + start_sector += clone_len >> 9; + file_offset += clone_len; - goto next_block; - } else { - submit_len += blocksize; - if (--nr_sectors) { - i++; - goto next_block; - } - } - } + map_length = submit_len; + ret = btrfs_map_block(fs_info, btrfs_op(orig_bio), + start_sector << 9, &map_length, NULL, 0); + if (ret) + goto out_err; + } while (submit_len > 0); submit: ret = __btrfs_submit_dio_bio(bio, inode, file_offset, skip_sum, @@ -8447,19 +8583,15 @@ static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode, loff_t file_offset) { struct btrfs_dio_private *dip = NULL; - struct bio *io_bio = NULL; - struct btrfs_io_bio *btrfs_bio; + struct bio *bio = NULL; + struct btrfs_io_bio *io_bio; int skip_sum; bool write = (bio_op(dio_bio) == REQ_OP_WRITE); int ret = 0; skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; - io_bio = btrfs_bio_clone(dio_bio, GFP_NOFS); - if (!io_bio) { - ret = -ENOMEM; - goto free_ordered; - } + bio = btrfs_bio_clone(dio_bio); dip = kzalloc(sizeof(*dip), GFP_NOFS); if (!dip) { @@ -8472,17 +8604,17 @@ static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode, dip->logical_offset = file_offset; dip->bytes = dio_bio->bi_iter.bi_size; dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9; - io_bio->bi_private = dip; - dip->orig_bio = io_bio; + bio->bi_private = dip; + dip->orig_bio = bio; dip->dio_bio = dio_bio; atomic_set(&dip->pending_bios, 0); - btrfs_bio = btrfs_io_bio(io_bio); - btrfs_bio->logical = file_offset; + io_bio = btrfs_io_bio(bio); + io_bio->logical = file_offset; if (write) { - io_bio->bi_end_io = btrfs_endio_direct_write; + bio->bi_end_io = btrfs_endio_direct_write; } else { - io_bio->bi_end_io = btrfs_endio_direct_read; + bio->bi_end_io = btrfs_endio_direct_read; dip->subio_endio = btrfs_subio_endio_read; } @@ -8505,8 +8637,8 @@ static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode, if (!ret) return; - if (btrfs_bio->end_io) - btrfs_bio->end_io(btrfs_bio, ret); + if (io_bio->end_io) + io_bio->end_io(io_bio, ret); free_ordered: /* @@ -8518,35 +8650,34 @@ free_ordered: * same as btrfs_endio_direct_[write|read] because we can't call these * callbacks - they require an allocated dip and a clone of dio_bio. */ - if (io_bio && dip) { - io_bio->bi_error = -EIO; - bio_endio(io_bio); + if (bio && dip) { + bio_io_error(bio); /* - * The end io callbacks free our dip, do the final put on io_bio + * The end io callbacks free our dip, do the final put on bio * and all the cleanup and final put for dio_bio (through * dio_end_io()). 
*/ dip = NULL; - io_bio = NULL; + bio = NULL; } else { if (write) - btrfs_endio_direct_write_update_ordered(inode, + __endio_write_update_ordered(inode, file_offset, dio_bio->bi_iter.bi_size, - 0); + false); else unlock_extent(&BTRFS_I(inode)->io_tree, file_offset, file_offset + dio_bio->bi_iter.bi_size - 1); - dio_bio->bi_error = -EIO; + dio_bio->bi_status = BLK_STS_IOERR; /* * Releases and cleans up our dio_bio, no need to bio_put() * nor bio_endio()/bio_io_error() against dio_bio. */ - dio_end_io(dio_bio, ret); + dio_end_io(dio_bio); } - if (io_bio) - bio_put(io_bio); + if (bio) + bio_put(bio); kfree(dip); } @@ -8590,6 +8721,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) struct inode *inode = file->f_mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_dio_data dio_data = { 0 }; + struct extent_changeset *data_reserved = NULL; loff_t offset = iocb->ki_pos; size_t count = 0; int flags = 0; @@ -8625,8 +8757,12 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) dio_data.overwrite = 1; inode_unlock(inode); relock = true; + } else if (iocb->ki_flags & IOCB_NOWAIT) { + ret = -EAGAIN; + goto out; } - ret = btrfs_delalloc_reserve_space(inode, offset, count); + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, + offset, count); if (ret) goto out; dio_data.outstanding_extents = count_max_extents(count); @@ -8658,8 +8794,8 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) current->journal_info = NULL; if (ret < 0 && ret != -EIOCBQUEUED) { if (dio_data.reserve) - btrfs_delalloc_release_space(inode, offset, - dio_data.reserve); + btrfs_delalloc_release_space(inode, data_reserved, + offset, dio_data.reserve); /* * On error we might have left some ordered extents * without submitting corresponding bios for them, so @@ -8668,14 +8804,14 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) */ if (dio_data.unsubmitted_oe_range_start < dio_data.unsubmitted_oe_range_end) - btrfs_endio_direct_write_update_ordered(inode, + __endio_write_update_ordered(inode, dio_data.unsubmitted_oe_range_start, dio_data.unsubmitted_oe_range_end - dio_data.unsubmitted_oe_range_start, - 0); + false); } else if (ret >= 0 && (size_t)ret < count) - btrfs_delalloc_release_space(inode, offset, - count - (size_t)ret); + btrfs_delalloc_release_space(inode, data_reserved, + offset, count - (size_t)ret); } out: if (wakeup) @@ -8683,6 +8819,7 @@ out: if (relock) inode_lock(inode); + extent_changeset_free(data_reserved); return ret; } @@ -8819,6 +8956,7 @@ again: if (!inode_evicting) clear_extent_bit(tree, start, end, EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DELALLOC_NEW | EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 0, &cached_state, GFP_NOFS); @@ -8872,12 +9010,12 @@ again: * free the entire extent. 
*/ if (PageDirty(page)) - btrfs_qgroup_free_data(inode, page_start, PAGE_SIZE); + btrfs_qgroup_free_data(inode, NULL, page_start, PAGE_SIZE); if (!inode_evicting) { clear_extent_bit(tree, page_start, page_end, EXTENT_LOCKED | EXTENT_DIRTY | - EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | - EXTENT_DEFRAG, 1, 1, + EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | + EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1, &cached_state, GFP_NOFS); __btrfs_releasepage(page, GFP_NOFS); @@ -8914,6 +9052,7 @@ int btrfs_page_mkwrite(struct vm_fault *vmf) struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; + struct extent_changeset *data_reserved = NULL; char *kaddr; unsigned long zero_start; loff_t size; @@ -8939,7 +9078,7 @@ int btrfs_page_mkwrite(struct vm_fault *vmf) * end up waiting indefinitely to get a lock on the page currently * being processed by btrfs_page_mkwrite() function. */ - ret = btrfs_delalloc_reserve_space(inode, page_start, + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start, reserved_space); if (!ret) { ret = file_update_time(vmf->vma->vm_file); @@ -8993,8 +9132,8 @@ again: spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->outstanding_extents++; spin_unlock(&BTRFS_I(inode)->lock); - btrfs_delalloc_release_space(inode, page_start, - PAGE_SIZE - reserved_space); + btrfs_delalloc_release_space(inode, data_reserved, + page_start, PAGE_SIZE - reserved_space); } } @@ -9045,13 +9184,16 @@ again: out_unlock: if (!ret) { sb_end_pagefault(inode->i_sb); + extent_changeset_free(data_reserved); return VM_FAULT_LOCKED; } unlock_page(page); out: - btrfs_delalloc_release_space(inode, page_start, reserved_space); + btrfs_delalloc_release_space(inode, data_reserved, page_start, + reserved_space); out_noreserve: sb_end_pagefault(inode->i_sb); + extent_changeset_free(data_reserved); return ret; } @@ -9248,6 +9390,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->last_sub_trans = 0; ei->logged_trans = 0; ei->delalloc_bytes = 0; + ei->new_delalloc_bytes = 0; ei->defrag_bytes = 0; ei->disk_i_size = 0; ei->flags = 0; @@ -9272,8 +9415,8 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) inode = &ei->vfs_inode; extent_map_tree_init(&ei->extent_tree); - extent_io_tree_init(&ei->io_tree, &inode->i_data); - extent_io_tree_init(&ei->io_failure_tree, &inode->i_data); + extent_io_tree_init(&ei->io_tree, inode); + extent_io_tree_init(&ei->io_failure_tree, inode); ei->io_tree.track_uptodate = 1; ei->io_failure_tree.track_uptodate = 1; atomic_set(&ei->sync_writers, 0); @@ -9313,6 +9456,7 @@ void btrfs_destroy_inode(struct inode *inode) WARN_ON(BTRFS_I(inode)->outstanding_extents); WARN_ON(BTRFS_I(inode)->reserved_extents); WARN_ON(BTRFS_I(inode)->delalloc_bytes); + WARN_ON(BTRFS_I(inode)->new_delalloc_bytes); WARN_ON(BTRFS_I(inode)->csum_bytes); WARN_ON(BTRFS_I(inode)->defrag_bytes); @@ -9381,7 +9525,6 @@ void btrfs_destroy_cachep(void) rcu_barrier(); kmem_cache_destroy(btrfs_inode_cachep); kmem_cache_destroy(btrfs_trans_handle_cachep); - kmem_cache_destroy(btrfs_transaction_cachep); kmem_cache_destroy(btrfs_path_cachep); kmem_cache_destroy(btrfs_free_space_cachep); } @@ -9401,12 +9544,6 @@ int btrfs_init_cachep(void) if (!btrfs_trans_handle_cachep) goto fail; - btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction", - sizeof(struct btrfs_transaction), 0, - SLAB_TEMPORARY | SLAB_MEM_SPREAD, NULL); - if (!btrfs_transaction_cachep) - goto fail; - btrfs_path_cachep = kmem_cache_create("btrfs_path", 
sizeof(struct btrfs_path), 0, SLAB_MEM_SPREAD, NULL); @@ -9431,12 +9568,30 @@ static int btrfs_getattr(const struct path *path, struct kstat *stat, u64 delalloc_bytes; struct inode *inode = d_inode(path->dentry); u32 blocksize = inode->i_sb->s_blocksize; + u32 bi_flags = BTRFS_I(inode)->flags; + + stat->result_mask |= STATX_BTIME; + stat->btime.tv_sec = BTRFS_I(inode)->i_otime.tv_sec; + stat->btime.tv_nsec = BTRFS_I(inode)->i_otime.tv_nsec; + if (bi_flags & BTRFS_INODE_APPEND) + stat->attributes |= STATX_ATTR_APPEND; + if (bi_flags & BTRFS_INODE_COMPRESS) + stat->attributes |= STATX_ATTR_COMPRESSED; + if (bi_flags & BTRFS_INODE_IMMUTABLE) + stat->attributes |= STATX_ATTR_IMMUTABLE; + if (bi_flags & BTRFS_INODE_NODUMP) + stat->attributes |= STATX_ATTR_NODUMP; + + stat->attributes_mask |= (STATX_ATTR_APPEND | + STATX_ATTR_COMPRESSED | + STATX_ATTR_IMMUTABLE | + STATX_ATTR_NODUMP); generic_fillattr(inode, stat); stat->dev = BTRFS_I(inode)->root->anon_dev; spin_lock(&BTRFS_I(inode)->lock); - delalloc_bytes = BTRFS_I(inode)->delalloc_bytes; + delalloc_bytes = BTRFS_I(inode)->new_delalloc_bytes; spin_unlock(&BTRFS_I(inode)->lock); stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) + ALIGN(delalloc_bytes, blocksize)) >> 9; @@ -10405,7 +10560,7 @@ next: btrfs_end_transaction(trans); } if (cur_offset < end) - btrfs_free_reserved_data_space(inode, cur_offset, + btrfs_free_reserved_data_space(inode, NULL, cur_offset, end - cur_offset + 1); return ret; } @@ -10526,6 +10681,42 @@ static int btrfs_readpage_io_failed_hook(struct page *page, int failed_mirror) return -EAGAIN; } +static struct btrfs_fs_info *iotree_fs_info(void *private_data) +{ + struct inode *inode = private_data; + return btrfs_sb(inode->i_sb); +} + +static void btrfs_check_extent_io_range(void *private_data, const char *caller, + u64 start, u64 end) +{ + struct inode *inode = private_data; + u64 isize; + + isize = i_size_read(inode); + if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) { + btrfs_debug_rl(BTRFS_I(inode)->root->fs_info, + "%s: ino %llu isize %llu odd range [%llu,%llu]", + caller, btrfs_ino(BTRFS_I(inode)), isize, start, end); + } +} + +void btrfs_set_range_writeback(void *private_data, u64 start, u64 end) +{ + struct inode *inode = private_data; + unsigned long index = start >> PAGE_SHIFT; + unsigned long end_index = end >> PAGE_SHIFT; + struct page *page; + + while (index <= end_index) { + page = find_get_page(inode->i_mapping, index); + ASSERT(page); /* Pages should be in the extent_io_tree */ + set_page_writeback(page); + put_page(page); + index++; + } +} + static const struct inode_operations btrfs_dir_inode_operations = { .getattr = btrfs_getattr, .lookup = btrfs_lookup, @@ -10569,6 +10760,8 @@ static const struct extent_io_ops btrfs_extent_io_ops = { .readpage_end_io_hook = btrfs_readpage_end_io_hook, .merge_bio_hook = btrfs_merge_bio_hook, .readpage_io_failed_hook = btrfs_readpage_io_failed_hook, + .tree_fs_info = iotree_fs_info, + .set_range_writeback = btrfs_set_range_writeback, /* optional callbacks */ .fill_delalloc = run_delalloc_range, @@ -10578,6 +10771,7 @@ static const struct extent_io_ops btrfs_extent_io_ops = { .clear_bit_hook = btrfs_clear_bit_hook, .merge_extent_hook = btrfs_merge_extent_hook, .split_extent_hook = btrfs_split_extent_hook, + .check_extent_io_range = btrfs_check_extent_io_range, }; /* diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 922a66fce401..fa1b78cf25f6 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -37,7 +37,7 @@ #include <linux/bit_spinlock.h> 
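
[Editorial note, not part of the commit diff.] Several hunks above (btrfs_direct_IO, btrfs_page_mkwrite, and cluster_pages_for_defrag below) now thread a struct extent_changeset through the data reservation calls so that only the ranges actually reserved are released later. A minimal sketch of the caller pattern, using only the signatures visible in this diff; the function name and the write-path placeholder are hypothetical:

static int example_reserved_write(struct inode *inode, u64 start, u64 len)
{
	struct extent_changeset *data_reserved = NULL;	/* filled in by the reserve call */
	int ret;

	/* Reserve data + metadata space; the qgroup-reserved ranges are recorded
	 * in *data_reserved so a later release only touches what was reserved. */
	ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start, len);
	if (ret)
		goto out;	/* nothing reserved for this range */

	/*
	 * ... dirty pages / create ordered extents for [start, start + len);
	 * on failure in that step, give back exactly what was reserved above:
	 *
	 *	btrfs_delalloc_release_space(inode, data_reserved, start, len);
	 */
out:
	extent_changeset_free(data_reserved);	/* NULL-safe, as in btrfs_direct_IO above */
	return ret;
}
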
#include <linux/security.h> #include <linux/xattr.h> -#include <linux/vmalloc.h> +#include <linux/mm.h> #include <linux/slab.h> #include <linux/blkdev.h> #include <linux/uuid.h> @@ -689,7 +689,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, if (ret) goto dec_and_free; - btrfs_wait_ordered_extents(root, -1, 0, (u64)-1); + btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1); btrfs_init_block_rsv(&pending_snapshot->block_rsv, BTRFS_BLOCK_RSV_TEMP); @@ -1127,6 +1127,7 @@ static int cluster_pages_for_defrag(struct inode *inode, struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; struct extent_io_tree *tree; + struct extent_changeset *data_reserved = NULL; gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); file_end = (isize - 1) >> PAGE_SHIFT; @@ -1135,7 +1136,7 @@ static int cluster_pages_for_defrag(struct inode *inode, page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1); - ret = btrfs_delalloc_reserve_space(inode, + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start_index << PAGE_SHIFT, page_cnt << PAGE_SHIFT); if (ret) @@ -1226,7 +1227,7 @@ again: spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->outstanding_extents++; spin_unlock(&BTRFS_I(inode)->lock); - btrfs_delalloc_release_space(inode, + btrfs_delalloc_release_space(inode, data_reserved, start_index << PAGE_SHIFT, (page_cnt - i_done) << PAGE_SHIFT); } @@ -1247,15 +1248,17 @@ again: unlock_page(pages[i]); put_page(pages[i]); } + extent_changeset_free(data_reserved); return i_done; out: for (i = 0; i < i_done; i++) { unlock_page(pages[i]); put_page(pages[i]); } - btrfs_delalloc_release_space(inode, + btrfs_delalloc_release_space(inode, data_reserved, start_index << PAGE_SHIFT, page_cnt << PAGE_SHIFT); + extent_changeset_free(data_reserved); return ret; } @@ -1504,7 +1507,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, if (ret) return ret; - if (atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) { + if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { mnt_drop_write_file(file); return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; } @@ -1619,7 +1622,7 @@ out_free: kfree(vol_args); out: mutex_unlock(&fs_info->volume_mutex); - atomic_set(&fs_info->mutually_exclusive_operation_running, 0); + clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); mnt_drop_write_file(file); return ret; } @@ -2661,7 +2664,7 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) + if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; mutex_lock(&fs_info->volume_mutex); @@ -2680,7 +2683,7 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg) kfree(vol_args); out: mutex_unlock(&fs_info->volume_mutex); - atomic_set(&fs_info->mutually_exclusive_operation_running, 0); + clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); return ret; } @@ -2708,7 +2711,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) if (vol_args->flags & ~BTRFS_VOL_ARG_V2_FLAGS_SUPPORTED) return -EOPNOTSUPP; - if (atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) { + if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; goto out; } @@ -2721,7 +2724,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) ret = btrfs_rm_device(fs_info, vol_args->name, 0); } 
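
[Editorial note, not part of the commit diff.] The ioctl hunks in this file replace the dedicated atomic mutually_exclusive_operation_running counter with a single BTRFS_FS_EXCL_OP bit in fs_info->flags. test_and_set_bit() returns the previous bit value, so it behaves as a non-blocking try-lock; clear_bit() releases it. A condensed sketch of the pairing used by the device add/remove, resize, replace and balance paths (the wrapper function is hypothetical):

static int example_excl_op(struct btrfs_fs_info *fs_info)
{
	/* Atomically claim the slot; a set old value means another exclusive
	 * operation is already running. */
	if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags))
		return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;

	/* ... perform the mutually exclusive operation ... */

	clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);	/* let the next ioctl in */
	return 0;
}
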
mutex_unlock(&fs_info->volume_mutex); - atomic_set(&fs_info->mutually_exclusive_operation_running, 0); + clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); if (!ret) { if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) @@ -2752,7 +2755,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) if (ret) return ret; - if (atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) { + if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; goto out_drop_write; } @@ -2772,7 +2775,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) btrfs_info(fs_info, "disk deleted %s", vol_args->name); kfree(vol_args); out: - atomic_set(&fs_info->mutually_exclusive_operation_running, 0); + clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); out_drop_write: mnt_drop_write_file(file); @@ -4439,13 +4442,11 @@ static long btrfs_ioctl_dev_replace(struct btrfs_fs_info *fs_info, ret = -EROFS; goto out; } - if (atomic_xchg( - &fs_info->mutually_exclusive_operation_running, 1)) { + if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; } else { ret = btrfs_dev_replace_by_ioctl(fs_info, p); - atomic_set( - &fs_info->mutually_exclusive_operation_running, 0); + clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); } break; case BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS: @@ -4590,7 +4591,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info, out: btrfs_free_path(path); - vfree(inodes); + kvfree(inodes); kfree(loi); return ret; @@ -4640,7 +4641,7 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) return ret; again: - if (!atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) { + if (!test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { mutex_lock(&fs_info->volume_mutex); mutex_lock(&fs_info->balance_mutex); need_unlock = true; @@ -4686,7 +4687,7 @@ again: } locked: - BUG_ON(!atomic_read(&fs_info->mutually_exclusive_operation_running)); + BUG_ON(!test_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)); if (arg) { bargs = memdup_user(arg, sizeof(*bargs)); @@ -4742,11 +4743,10 @@ locked: do_balance: /* - * Ownership of bctl and mutually_exclusive_operation_running + * Ownership of bctl and filesystem flag BTRFS_FS_EXCL_OP * goes to to btrfs_balance. bctl is freed in __cancel_balance, * or, if restriper was paused all the way until unmount, in - * free_fs_info. mutually_exclusive_operation_running is - * cleared in __cancel_balance. + * free_fs_info. The flag is cleared in __cancel_balance. 
*/ need_unlock = false; @@ -4766,7 +4766,7 @@ out_unlock: mutex_unlock(&fs_info->balance_mutex); mutex_unlock(&fs_info->volume_mutex); if (need_unlock) - atomic_set(&fs_info->mutually_exclusive_operation_running, 0); + clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); out: mnt_drop_write_file(file); return ret; @@ -4900,7 +4900,6 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg) goto out; } - /* FIXME: check if the IDs really exist */ if (sa->assign) { ret = btrfs_add_qgroup_relation(trans, fs_info, sa->src, sa->dst); @@ -4959,7 +4958,6 @@ static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg) goto out; } - /* FIXME: check if the IDs really exist */ if (sa->create) { ret = btrfs_create_qgroup(trans, fs_info, sa->qgroupid); } else { @@ -5013,7 +5011,6 @@ static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg) qgroupid = root->root_key.objectid; } - /* FIXME: check if the IDs really exist */ ret = btrfs_limit_qgroup(trans, fs_info, qgroupid, &sa->lim); err = btrfs_end_transaction(trans); diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index f48c8c14dc14..d433e75d489a 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -18,13 +18,14 @@ #include <linux/kernel.h> #include <linux/slab.h> -#include <linux/vmalloc.h> +#include <linux/mm.h> #include <linux/init.h> #include <linux/err.h> #include <linux/sched.h> #include <linux/pagemap.h> #include <linux/bio.h> #include <linux/lzo.h> +#include <linux/refcount.h> #include "compression.h" #define LZO_LEN 4 @@ -40,9 +41,9 @@ static void lzo_free_workspace(struct list_head *ws) { struct workspace *workspace = list_entry(ws, struct workspace, list); - vfree(workspace->buf); - vfree(workspace->cbuf); - vfree(workspace->mem); + kvfree(workspace->buf); + kvfree(workspace->cbuf); + kvfree(workspace->mem); kfree(workspace); } @@ -50,13 +51,13 @@ static struct list_head *lzo_alloc_workspace(void) { struct workspace *workspace; - workspace = kzalloc(sizeof(*workspace), GFP_NOFS); + workspace = kzalloc(sizeof(*workspace), GFP_KERNEL); if (!workspace) return ERR_PTR(-ENOMEM); - workspace->mem = vmalloc(LZO1X_MEM_COMPRESS); - workspace->buf = vmalloc(lzo1x_worst_compress(PAGE_SIZE)); - workspace->cbuf = vmalloc(lzo1x_worst_compress(PAGE_SIZE)); + workspace->mem = kvmalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL); + workspace->buf = kvmalloc(lzo1x_worst_compress(PAGE_SIZE), GFP_KERNEL); + workspace->cbuf = kvmalloc(lzo1x_worst_compress(PAGE_SIZE), GFP_KERNEL); if (!workspace->mem || !workspace->buf || !workspace->cbuf) goto fail; @@ -141,7 +142,7 @@ static int lzo_compress_pages(struct list_head *ws, ret = lzo1x_1_compress(data_in, in_len, workspace->cbuf, &out_len, workspace->mem); if (ret != LZO_E_OK) { - pr_debug("BTRFS: deflate in loop returned %d\n", + pr_debug("BTRFS: lzo in loop returned %d\n", ret); ret = -EIO; goto out; @@ -229,8 +230,10 @@ static int lzo_compress_pages(struct list_head *ws, in_len = min(bytes_left, PAGE_SIZE); } - if (tot_out > tot_in) + if (tot_out >= tot_in) { + ret = -E2BIG; goto out; + } /* store the size of all chunks of compressed data */ cpage_out = kmap(pages[0]); @@ -254,16 +257,13 @@ out: return ret; } -static int lzo_decompress_bio(struct list_head *ws, - struct page **pages_in, - u64 disk_start, - struct bio *orig_bio, - size_t srclen) +static int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) { struct workspace *workspace = list_entry(ws, struct workspace, list); int ret = 0, ret2; char *data_in; unsigned long page_in_index = 0; + size_t srclen = 
cb->compressed_len; unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE); unsigned long buf_start; unsigned long buf_offset = 0; @@ -278,6 +278,9 @@ static int lzo_decompress_bio(struct list_head *ws, unsigned long tot_len; char *buf; bool may_late_unmap, need_unmap; + struct page **pages_in = cb->compressed_pages; + u64 disk_start = cb->start; + struct bio *orig_bio = cb->orig_bio; data_in = kmap(pages_in[0]); tot_len = read_compress_length(data_in); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 9a46878ba60f..a3aca495e33e 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -212,7 +212,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, set_bit(BTRFS_ORDERED_DIRECT, &entry->flags); /* one ref for the tree */ - atomic_set(&entry->refs, 1); + refcount_set(&entry->refs, 1); init_waitqueue_head(&entry->wait); INIT_LIST_HEAD(&entry->list); INIT_LIST_HEAD(&entry->root_extent_list); @@ -358,7 +358,7 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode, out: if (!ret && cached && entry) { *cached = entry; - atomic_inc(&entry->refs); + refcount_inc(&entry->refs); } spin_unlock_irqrestore(&tree->lock, flags); return ret == 0; @@ -425,7 +425,7 @@ have_entry: out: if (!ret && cached && entry) { *cached = entry; - atomic_inc(&entry->refs); + refcount_inc(&entry->refs); } spin_unlock_irqrestore(&tree->lock, flags); return ret == 0; @@ -456,7 +456,7 @@ void btrfs_get_logged_extents(struct btrfs_inode *inode, if (test_and_set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags)) continue; list_add(&ordered->log_list, logged_list); - atomic_inc(&ordered->refs); + refcount_inc(&ordered->refs); } spin_unlock_irq(&tree->lock); } @@ -565,7 +565,7 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) trace_btrfs_ordered_extent_put(entry->inode, entry); - if (atomic_dec_and_test(&entry->refs)) { + if (refcount_dec_and_test(&entry->refs)) { ASSERT(list_empty(&entry->log_list)); ASSERT(list_empty(&entry->trans_list)); ASSERT(list_empty(&entry->root_extent_list)); @@ -623,7 +623,7 @@ void btrfs_remove_ordered_extent(struct inode *inode, spin_lock(&fs_info->trans_lock); trans = fs_info->running_transaction; if (trans) - atomic_inc(&trans->use_count); + refcount_inc(&trans->use_count); spin_unlock(&fs_info->trans_lock); ASSERT(trans); @@ -663,7 +663,7 @@ static void btrfs_run_ordered_extent_work(struct btrfs_work *work) * wait for all the ordered extents in a root. This is done when balancing * space between drives. 
*/ -int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr, +u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr, const u64 range_start, const u64 range_len) { struct btrfs_fs_info *fs_info = root->fs_info; @@ -671,7 +671,7 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr, LIST_HEAD(skipped); LIST_HEAD(works); struct btrfs_ordered_extent *ordered, *next; - int count = 0; + u64 count = 0; const u64 range_end = range_start + range_len; mutex_lock(&root->ordered_extent_mutex); @@ -690,7 +690,7 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr, list_move_tail(&ordered->root_extent_list, &root->ordered_extents); - atomic_inc(&ordered->refs); + refcount_inc(&ordered->refs); spin_unlock(&root->ordered_extent_lock); btrfs_init_work(&ordered->flush_work, @@ -701,7 +701,7 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr, cond_resched(); spin_lock(&root->ordered_extent_lock); - if (nr != -1) + if (nr != U64_MAX) nr--; count++; } @@ -720,13 +720,13 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr, return count; } -int btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr, - const u64 range_start, const u64 range_len) +u64 btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, + const u64 range_start, const u64 range_len) { struct btrfs_root *root; struct list_head splice; - int done; - int total_done = 0; + u64 total_done = 0; + u64 done; INIT_LIST_HEAD(&splice); @@ -748,9 +748,8 @@ int btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr, total_done += done; spin_lock(&fs_info->ordered_root_lock); - if (nr != -1) { + if (nr != U64_MAX) { nr -= done; - WARN_ON(nr < 0); } } list_splice_tail(&splice, &fs_info->ordered_roots); @@ -870,7 +869,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode, if (!offset_in_entry(entry, file_offset)) entry = NULL; if (entry) - atomic_inc(&entry->refs); + refcount_inc(&entry->refs); out: spin_unlock_irq(&tree->lock); return entry; @@ -911,7 +910,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range( } out: if (entry) - atomic_inc(&entry->refs); + refcount_inc(&entry->refs); spin_unlock_irq(&tree->lock); return entry; } @@ -948,7 +947,7 @@ btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset) goto out; entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); - atomic_inc(&entry->refs); + refcount_inc(&entry->refs); out: spin_unlock_irq(&tree->lock); return entry; diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 195c93b67fe0..56c4c0ee6381 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -113,7 +113,7 @@ struct btrfs_ordered_extent { int compress_type; /* reference count */ - atomic_t refs; + refcount_t refs; /* the inode we belong to */ struct inode *inode; @@ -200,9 +200,9 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, struct btrfs_ordered_extent *ordered); int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum, int len); -int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr, +u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr, const u64 range_start, const u64 range_len); -int btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr, +u64 btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, const u64 range_start, const u64 range_len); void btrfs_get_logged_extents(struct btrfs_inode *inode, struct list_head *logged_list, diff --git a/fs/btrfs/print-tree.c 
b/fs/btrfs/print-tree.c index cdafbf92ef0c..fcae61e175f3 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -261,8 +261,11 @@ void btrfs_print_leaf(struct btrfs_fs_info *fs_info, struct extent_buffer *l) case BTRFS_BLOCK_GROUP_ITEM_KEY: bi = btrfs_item_ptr(l, i, struct btrfs_block_group_item); - pr_info("\t\tblock group used %llu\n", - btrfs_disk_block_group_used(l, bi)); + pr_info( + "\t\tblock group used %llu chunk_objectid %llu flags %llu\n", + btrfs_disk_block_group_used(l, bi), + btrfs_disk_block_group_chunk_objectid(l, bi), + btrfs_disk_block_group_flags(l, bi)); break; case BTRFS_CHUNK_ITEM_KEY: print_chunk(l, btrfs_item_ptr(l, i, diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index d6cb155ef7a1..4b23ae5d0e5c 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -164,6 +164,7 @@ static int iterate_object_props(struct btrfs_root *root, size_t), void *ctx) { + struct btrfs_fs_info *fs_info = root->fs_info; int ret; char *name_buf = NULL; char *value_buf = NULL; @@ -214,6 +215,12 @@ static int iterate_object_props(struct btrfs_root *root, name_ptr = (unsigned long)(di + 1); data_ptr = name_ptr + name_len; + if (verify_dir_item(fs_info, leaf, + path->slots[0], di)) { + ret = -EIO; + goto out; + } + if (name_len <= XATTR_BTRFS_PREFIX_LEN || memcmp_extent_buffer(leaf, XATTR_BTRFS_PREFIX, name_ptr, diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index afbea61d957e..4ce351efe281 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -47,50 +47,6 @@ * - check all ioctl parameters */ -/* - * one struct for each qgroup, organized in fs_info->qgroup_tree. - */ -struct btrfs_qgroup { - u64 qgroupid; - - /* - * state - */ - u64 rfer; /* referenced */ - u64 rfer_cmpr; /* referenced compressed */ - u64 excl; /* exclusive */ - u64 excl_cmpr; /* exclusive compressed */ - - /* - * limits - */ - u64 lim_flags; /* which limits are set */ - u64 max_rfer; - u64 max_excl; - u64 rsv_rfer; - u64 rsv_excl; - - /* - * reservation tracking - */ - u64 reserved; - - /* - * lists - */ - struct list_head groups; /* groups this group is member of */ - struct list_head members; /* groups that are members of this group */ - struct list_head dirty; /* dirty groups */ - struct rb_node node; /* tree of qgroups */ - - /* - * temp variables for accounting operations - * Refer to qgroup_shared_accounting() for details. 
- */ - u64 old_refcnt; - u64 new_refcnt; -}; - static void btrfs_qgroup_update_old_refcnt(struct btrfs_qgroup *qg, u64 seq, int mod) { @@ -1078,6 +1034,7 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, qgroup->excl += sign * num_bytes; qgroup->excl_cmpr += sign * num_bytes; if (sign > 0) { + trace_qgroup_update_reserve(fs_info, qgroup, -(s64)num_bytes); if (qgroup->reserved < num_bytes) report_reserved_underflow(fs_info, qgroup, num_bytes); else @@ -1103,6 +1060,8 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, WARN_ON(sign < 0 && qgroup->excl < num_bytes); qgroup->excl += sign * num_bytes; if (sign > 0) { + trace_qgroup_update_reserve(fs_info, qgroup, + -(s64)num_bytes); if (qgroup->reserved < num_bytes) report_reserved_underflow(fs_info, qgroup, num_bytes); @@ -1447,38 +1406,6 @@ out: return ret; } -int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) -{ - struct btrfs_qgroup_extent_record *record; - struct btrfs_delayed_ref_root *delayed_refs; - struct rb_node *node; - u64 qgroup_to_skip; - int ret = 0; - - delayed_refs = &trans->transaction->delayed_refs; - qgroup_to_skip = delayed_refs->qgroup_to_skip; - - /* - * No need to do lock, since this function will only be called in - * btrfs_commit_transaction(). - */ - node = rb_first(&delayed_refs->dirty_extent_root); - while (node) { - record = rb_entry(node, struct btrfs_qgroup_extent_record, - node); - if (WARN_ON(!record->old_roots)) - ret = btrfs_find_all_roots(NULL, fs_info, - record->bytenr, 0, &record->old_roots); - if (ret < 0) - break; - if (qgroup_to_skip) - ulist_del(record->old_roots, qgroup_to_skip, 0); - node = rb_next(node); - } - return ret; -} - int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_qgroup_extent_record *record) @@ -1600,6 +1527,7 @@ int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans, if (ret) return ret; } + cond_resched(); return 0; } @@ -1959,6 +1887,35 @@ static int qgroup_update_counters(struct btrfs_fs_info *fs_info, return 0; } +/* + * Check if the @roots potentially is a list of fs tree roots + * + * Return 0 for definitely not a fs/subvol tree roots ulist + * Return 1 for possible fs/subvol tree roots in the list (considering an empty + * one as well) + */ +static int maybe_fs_roots(struct ulist *roots) +{ + struct ulist_node *unode; + struct ulist_iterator uiter; + + /* Empty one, still possible for fs roots */ + if (!roots || roots->nnodes == 0) + return 1; + + ULIST_ITER_INIT(&uiter); + unode = ulist_next(roots, &uiter); + if (!unode) + return 1; + + /* + * If it contains fs tree roots, then it must belong to fs/subvol + * trees. + * If it contains a non-fs tree, it won't be shared with fs/subvol trees. 
+ */ + return is_fstree(unode->val); +} + int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, @@ -1975,10 +1932,20 @@ btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) return 0; - if (new_roots) + if (new_roots) { + if (!maybe_fs_roots(new_roots)) + goto out_free; nr_new_roots = new_roots->nnodes; - if (old_roots) + } + if (old_roots) { + if (!maybe_fs_roots(old_roots)) + goto out_free; nr_old_roots = old_roots->nnodes; + } + + /* Quick exit, either not fs tree roots, or won't affect any qgroup */ + if (nr_old_roots == 0 && nr_new_roots == 0) + goto out_free; BUG_ON(!fs_info->quota_root); @@ -2058,16 +2025,32 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans, if (!ret) { /* - * Use (u64)-1 as time_seq to do special search, which + * Old roots should be searched when inserting qgroup + * extent record + */ + if (WARN_ON(!record->old_roots)) { + /* Search commit root to find old_roots */ + ret = btrfs_find_all_roots(NULL, fs_info, + record->bytenr, 0, + &record->old_roots); + if (ret < 0) + goto cleanup; + } + + /* + * Use SEQ_LAST as time_seq to do special search, which * doesn't lock tree or delayed_refs and search current * root. It's safe inside commit_transaction(). */ ret = btrfs_find_all_roots(trans, fs_info, - record->bytenr, (u64)-1, &new_roots); + record->bytenr, SEQ_LAST, &new_roots); if (ret < 0) goto cleanup; - if (qgroup_to_skip) + if (qgroup_to_skip) { ulist_del(new_roots, qgroup_to_skip, 0); + ulist_del(record->old_roots, qgroup_to_skip, + 0); + } ret = btrfs_qgroup_account_extent(trans, fs_info, record->bytenr, record->num_bytes, record->old_roots, new_roots); @@ -2370,6 +2353,7 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce) struct btrfs_fs_info *fs_info = root->fs_info; u64 ref_root = root->root_key.objectid; int ret = 0; + int retried = 0; struct ulist_node *unode; struct ulist_iterator uiter; @@ -2379,6 +2363,11 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce) if (num_bytes == 0) return 0; + if (test_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags) && + capable(CAP_SYS_RESOURCE)) + enforce = false; + +retry: spin_lock(&fs_info->qgroup_lock); quota_root = fs_info->quota_root; if (!quota_root) @@ -2405,6 +2394,27 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce) qg = unode_aux_to_qgroup(unode); if (enforce && !qgroup_check_limits(qg, num_bytes)) { + /* + * Commit the tree and retry, since we may have + * deletions which would free up space. 
+ */ + if (!retried && qg->reserved > 0) { + struct btrfs_trans_handle *trans; + + spin_unlock(&fs_info->qgroup_lock); + ret = btrfs_start_delalloc_inodes(root, 0); + if (ret) + return ret; + btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1); + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) + return PTR_ERR(trans); + ret = btrfs_commit_transaction(trans); + if (ret) + return ret; + retried++; + goto retry; + } ret = -EDQUOT; goto out; } @@ -2427,6 +2437,7 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce) qg = unode_aux_to_qgroup(unode); + trace_qgroup_update_reserve(fs_info, qg, num_bytes); qg->reserved += num_bytes; } @@ -2472,6 +2483,7 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, qg = unode_aux_to_qgroup(unode); + trace_qgroup_update_reserve(fs_info, qg, -(s64)num_bytes); if (qg->reserved < num_bytes) report_reserved_underflow(fs_info, qg, num_bytes); else @@ -2490,18 +2502,6 @@ out: spin_unlock(&fs_info->qgroup_lock); } -void assert_qgroups_uptodate(struct btrfs_trans_handle *trans) -{ - if (list_empty(&trans->qgroup_ref_list) && !trans->delayed_ref_elem.seq) - return; - btrfs_err(trans->fs_info, - "qgroups not uptodate in trans handle %p: list is%s empty, seq is %#x.%x", - trans, list_empty(&trans->qgroup_ref_list) ? "" : " not", - (u32)(trans->delayed_ref_elem.seq >> 32), - (u32)trans->delayed_ref_elem.seq); - BUG(); -} - /* * returns < 0 on error, 0 when more leafs are to be scanned. * returns 1 when done. @@ -2835,70 +2835,146 @@ btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info) * Return <0 for error (including -EQUOT) * * NOTE: this function may sleep for memory allocation. + * if btrfs_qgroup_reserve_data() is called multiple times with + * same @reserved, caller must ensure when error happens it's OK + * to free *ALL* reserved space. 
*/ -int btrfs_qgroup_reserve_data(struct inode *inode, u64 start, u64 len) +int btrfs_qgroup_reserve_data(struct inode *inode, + struct extent_changeset **reserved_ret, u64 start, + u64 len) { struct btrfs_root *root = BTRFS_I(inode)->root; - struct extent_changeset changeset; struct ulist_node *unode; struct ulist_iterator uiter; + struct extent_changeset *reserved; + u64 orig_reserved; + u64 to_reserve; int ret; if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags) || !is_fstree(root->objectid) || len == 0) return 0; - changeset.bytes_changed = 0; - ulist_init(&changeset.range_changed); + /* @reserved parameter is mandatory for qgroup */ + if (WARN_ON(!reserved_ret)) + return -EINVAL; + if (!*reserved_ret) { + *reserved_ret = extent_changeset_alloc(); + if (!*reserved_ret) + return -ENOMEM; + } + reserved = *reserved_ret; + /* Record already reserved space */ + orig_reserved = reserved->bytes_changed; ret = set_record_extent_bits(&BTRFS_I(inode)->io_tree, start, - start + len -1, EXTENT_QGROUP_RESERVED, &changeset); + start + len -1, EXTENT_QGROUP_RESERVED, reserved); + + /* Newly reserved space */ + to_reserve = reserved->bytes_changed - orig_reserved; trace_btrfs_qgroup_reserve_data(inode, start, len, - changeset.bytes_changed, - QGROUP_RESERVE); + to_reserve, QGROUP_RESERVE); if (ret < 0) goto cleanup; - ret = qgroup_reserve(root, changeset.bytes_changed, true); + ret = qgroup_reserve(root, to_reserve, true); if (ret < 0) goto cleanup; - ulist_release(&changeset.range_changed); return ret; cleanup: - /* cleanup already reserved ranges */ + /* cleanup *ALL* already reserved ranges */ ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(&changeset.range_changed, &uiter))) + while ((unode = ulist_next(&reserved->range_changed, &uiter))) clear_extent_bit(&BTRFS_I(inode)->io_tree, unode->val, unode->aux, EXTENT_QGROUP_RESERVED, 0, 0, NULL, GFP_NOFS); - ulist_release(&changeset.range_changed); + extent_changeset_release(reserved); return ret; } -static int __btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len, - int free) +/* Free ranges specified by @reserved, normally in error path */ +static int qgroup_free_reserved_data(struct inode *inode, + struct extent_changeset *reserved, u64 start, u64 len) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct ulist_node *unode; + struct ulist_iterator uiter; + struct extent_changeset changeset; + int freed = 0; + int ret; + + extent_changeset_init(&changeset); + len = round_up(start + len, root->fs_info->sectorsize); + start = round_down(start, root->fs_info->sectorsize); + + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(&reserved->range_changed, &uiter))) { + u64 range_start = unode->val; + /* unode->aux is the inclusive end */ + u64 range_len = unode->aux - range_start + 1; + u64 free_start; + u64 free_len; + + extent_changeset_release(&changeset); + + /* Only free range in range [start, start + len) */ + if (range_start >= start + len || + range_start + range_len <= start) + continue; + free_start = max(range_start, start); + free_len = min(start + len, range_start + range_len) - + free_start; + /* + * TODO: To also modify reserved->ranges_reserved to reflect + * the modification. + * + * However as long as we free qgroup reserved according to + * EXTENT_QGROUP_RESERVED, we won't double free. + * So not need to rush. 
+ */ + ret = clear_record_extent_bits(&BTRFS_I(inode)->io_failure_tree, + free_start, free_start + free_len - 1, + EXTENT_QGROUP_RESERVED, &changeset); + if (ret < 0) + goto out; + freed += changeset.bytes_changed; + } + btrfs_qgroup_free_refroot(root->fs_info, root->objectid, freed); + ret = freed; +out: + extent_changeset_release(&changeset); + return ret; +} + +static int __btrfs_qgroup_release_data(struct inode *inode, + struct extent_changeset *reserved, u64 start, u64 len, + int free) { struct extent_changeset changeset; int trace_op = QGROUP_RELEASE; int ret; - changeset.bytes_changed = 0; - ulist_init(&changeset.range_changed); + /* In release case, we shouldn't have @reserved */ + WARN_ON(!free && reserved); + if (free && reserved) + return qgroup_free_reserved_data(inode, reserved, start, len); + extent_changeset_init(&changeset); ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len -1, EXTENT_QGROUP_RESERVED, &changeset); if (ret < 0) goto out; - if (free) { - btrfs_qgroup_free_refroot(BTRFS_I(inode)->root->fs_info, - BTRFS_I(inode)->root->objectid, - changeset.bytes_changed); + if (free) trace_op = QGROUP_FREE; - } trace_btrfs_qgroup_release_data(inode, start, len, changeset.bytes_changed, trace_op); + if (free) + btrfs_qgroup_free_refroot(BTRFS_I(inode)->root->fs_info, + BTRFS_I(inode)->root->objectid, + changeset.bytes_changed); + ret = changeset.bytes_changed; out: - ulist_release(&changeset.range_changed); + extent_changeset_release(&changeset); return ret; } @@ -2907,14 +2983,17 @@ out: * * Should be called when a range of pages get invalidated before reaching disk. * Or for error cleanup case. + * if @reserved is given, only reserved range in [@start, @start + @len) will + * be freed. * * For data written to disk, use btrfs_qgroup_release_data(). * * NOTE: This function may sleep for memory allocation. 
*/ -int btrfs_qgroup_free_data(struct inode *inode, u64 start, u64 len) +int btrfs_qgroup_free_data(struct inode *inode, + struct extent_changeset *reserved, u64 start, u64 len) { - return __btrfs_qgroup_release_data(inode, start, len, 1); + return __btrfs_qgroup_release_data(inode, reserved, start, len, 1); } /* @@ -2934,7 +3013,7 @@ int btrfs_qgroup_free_data(struct inode *inode, u64 start, u64 len) */ int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len) { - return __btrfs_qgroup_release_data(inode, start, len, 0); + return __btrfs_qgroup_release_data(inode, NULL, start, len, 0); } int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, @@ -2948,6 +3027,7 @@ int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, return 0; BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize)); + trace_qgroup_meta_reserve(root, (s64)num_bytes); ret = qgroup_reserve(root, num_bytes, enforce); if (ret < 0) return ret; @@ -2967,6 +3047,7 @@ void btrfs_qgroup_free_meta_all(struct btrfs_root *root) reserved = atomic64_xchg(&root->qgroup_meta_rsv, 0); if (reserved == 0) return; + trace_qgroup_meta_reserve(root, -(s64)reserved); btrfs_qgroup_free_refroot(fs_info, root->objectid, reserved); } @@ -2981,6 +3062,7 @@ void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes) BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize)); WARN_ON(atomic64_read(&root->qgroup_meta_rsv) < num_bytes); atomic64_sub(num_bytes, &root->qgroup_meta_rsv); + trace_qgroup_meta_reserve(root, -(s64)num_bytes); btrfs_qgroup_free_refroot(fs_info, root->objectid, num_bytes); } @@ -2995,8 +3077,7 @@ void btrfs_qgroup_check_reserved_leak(struct inode *inode) struct ulist_iterator iter; int ret; - changeset.bytes_changed = 0; - ulist_init(&changeset.range_changed); + extent_changeset_init(&changeset); ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, 0, (u64)-1, EXTENT_QGROUP_RESERVED, &changeset); @@ -3013,5 +3094,5 @@ void btrfs_qgroup_check_reserved_leak(struct inode *inode) changeset.bytes_changed); } - ulist_release(&changeset.range_changed); + extent_changeset_release(&changeset); } diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index 26932a8a1993..d9984e87cddf 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -62,6 +62,50 @@ struct btrfs_qgroup_extent_record { }; /* + * one struct for each qgroup, organized in fs_info->qgroup_tree. + */ +struct btrfs_qgroup { + u64 qgroupid; + + /* + * state + */ + u64 rfer; /* referenced */ + u64 rfer_cmpr; /* referenced compressed */ + u64 excl; /* exclusive */ + u64 excl_cmpr; /* exclusive compressed */ + + /* + * limits + */ + u64 lim_flags; /* which limits are set */ + u64 max_rfer; + u64 max_excl; + u64 rsv_rfer; + u64 rsv_excl; + + /* + * reservation tracking + */ + u64 reserved; + + /* + * lists + */ + struct list_head groups; /* groups this group is member of */ + struct list_head members; /* groups that are members of this group */ + struct list_head dirty; /* dirty groups */ + struct rb_node node; /* tree of qgroups */ + + /* + * temp variables for accounting operations + * Refer to qgroup_shared_accounting() for details. 
+ */ + u64 old_refcnt; + u64 new_refcnt; +}; + +/* * For qgroup event trace points only */ #define QGROUP_RESERVE (1<<0) @@ -90,8 +134,7 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info); void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info); struct btrfs_delayed_extent_op; -int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info); + /* * Inform qgroup to trace one dirty extent, its info is recorded in @record. * So qgroup can account it at transaction committing time. @@ -186,17 +229,12 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, struct btrfs_qgroup_inherit *inherit); void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, u64 ref_root, u64 num_bytes); -/* - * TODO: Add proper trace point for it, as btrfs_qgroup_free() is - * called by everywhere, can't provide good trace for delayed ref case. - */ static inline void btrfs_qgroup_free_delayed_ref(struct btrfs_fs_info *fs_info, u64 ref_root, u64 num_bytes) { - btrfs_qgroup_free_refroot(fs_info, ref_root, num_bytes); trace_btrfs_qgroup_free_delayed_ref(fs_info, ref_root, num_bytes); + btrfs_qgroup_free_refroot(fs_info, ref_root, num_bytes); } -void assert_qgroups_uptodate(struct btrfs_trans_handle *trans); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, @@ -204,9 +242,11 @@ int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, #endif /* New io_tree based accurate qgroup reserve API */ -int btrfs_qgroup_reserve_data(struct inode *inode, u64 start, u64 len); +int btrfs_qgroup_reserve_data(struct inode *inode, + struct extent_changeset **reserved, u64 start, u64 len); int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len); -int btrfs_qgroup_free_data(struct inode *inode, u64 start, u64 len); +int btrfs_qgroup_free_data(struct inode *inode, + struct extent_changeset *reserved, u64 start, u64 len); int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, bool enforce); diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 1571bf26dc07..6f845d219cd6 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -31,7 +31,7 @@ #include <linux/hash.h> #include <linux/list_sort.h> #include <linux/raid/xor.h> -#include <linux/vmalloc.h> +#include <linux/mm.h> #include <asm/div64.h> #include "ctree.h" #include "extent_map.h" @@ -149,7 +149,7 @@ struct btrfs_raid_bio { int generic_bio_cnt; - atomic_t refs; + refcount_t refs; atomic_t stripes_pending; @@ -218,12 +218,9 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info) * of a failing mount. 
*/ table_size = sizeof(*table) + sizeof(*h) * num_entries; - table = kzalloc(table_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); - if (!table) { - table = vzalloc(table_size); - if (!table) - return -ENOMEM; - } + table = kvzalloc(table_size, GFP_KERNEL); + if (!table) + return -ENOMEM; spin_lock_init(&table->cache_lock); INIT_LIST_HEAD(&table->stripe_cache); @@ -389,7 +386,7 @@ static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio) if (bio_list_empty(&rbio->bio_list)) { if (!list_empty(&rbio->hash_list)) { list_del_init(&rbio->hash_list); - atomic_dec(&rbio->refs); + refcount_dec(&rbio->refs); BUG_ON(!list_empty(&rbio->plug_list)); } } @@ -480,7 +477,7 @@ static void cache_rbio(struct btrfs_raid_bio *rbio) /* bump our ref if we were not in the list before */ if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags)) - atomic_inc(&rbio->refs); + refcount_inc(&rbio->refs); if (!list_empty(&rbio->stripe_cache)){ list_move(&rbio->stripe_cache, &table->stripe_cache); @@ -689,7 +686,7 @@ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio) test_bit(RBIO_CACHE_BIT, &cur->flags) && !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) { list_del_init(&cur->hash_list); - atomic_dec(&cur->refs); + refcount_dec(&cur->refs); steal_rbio(cur, rbio); cache_drop = cur; @@ -738,7 +735,7 @@ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio) } } lockit: - atomic_inc(&rbio->refs); + refcount_inc(&rbio->refs); list_add(&rbio->hash_list, &h->hash_list); out: spin_unlock_irqrestore(&h->lock, flags); @@ -784,7 +781,7 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) } list_del_init(&rbio->hash_list); - atomic_dec(&rbio->refs); + refcount_dec(&rbio->refs); /* * we use the plug list to hold all the rbios @@ -801,7 +798,7 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) list_del_init(&rbio->plug_list); list_add(&next->hash_list, &h->hash_list); - atomic_inc(&next->refs); + refcount_inc(&next->refs); spin_unlock(&rbio->bio_list_lock); spin_unlock_irqrestore(&h->lock, flags); @@ -843,8 +840,7 @@ static void __free_raid_bio(struct btrfs_raid_bio *rbio) { int i; - WARN_ON(atomic_read(&rbio->refs) < 0); - if (!atomic_dec_and_test(&rbio->refs)) + if (!refcount_dec_and_test(&rbio->refs)) return; WARN_ON(!list_empty(&rbio->stripe_cache)); @@ -872,7 +868,7 @@ static void free_raid_bio(struct btrfs_raid_bio *rbio) * this frees the rbio and runs through all the bios in the * bio_list and calls end_io on them */ -static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err) +static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err) { struct bio *cur = bio_list_get(&rbio->bio_list); struct bio *next; @@ -885,7 +881,7 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err) while (cur) { next = cur->bi_next; cur->bi_next = NULL; - cur->bi_error = err; + cur->bi_status = err; bio_endio(cur); cur = next; } @@ -898,7 +894,7 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err) static void raid_write_end_io(struct bio *bio) { struct btrfs_raid_bio *rbio = bio->bi_private; - int err = bio->bi_error; + blk_status_t err = bio->bi_status; int max_errors; if (err) @@ -915,7 +911,7 @@ static void raid_write_end_io(struct bio *bio) max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ? 
0 : rbio->bbio->max_errors; if (atomic_read(&rbio->error) > max_errors) - err = -EIO; + err = BLK_STS_IOERR; rbio_orig_end_io(rbio, err); } @@ -997,7 +993,7 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, rbio->stripe_npages = stripe_npages; rbio->faila = -1; rbio->failb = -1; - atomic_set(&rbio->refs, 1); + refcount_set(&rbio->refs, 1); atomic_set(&rbio->error, 0); atomic_set(&rbio->stripes_pending, 0); @@ -1093,7 +1089,7 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio, * devices or if they are not contiguous */ if (last_end == disk_start && stripe->dev->bdev && - !last->bi_error && + !last->bi_status && last->bi_bdev == stripe->dev->bdev) { ret = bio_add_page(last, page, PAGE_SIZE, 0); if (ret == PAGE_SIZE) @@ -1102,10 +1098,7 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio, } /* put a new bio on the list */ - bio = btrfs_io_bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT?:1); - if (!bio) - return -ENOMEM; - + bio = btrfs_io_bio_alloc(bio_max_len >> PAGE_SHIFT ?: 1); bio->bi_iter.bi_size = 0; bio->bi_bdev = stripe->dev->bdev; bio->bi_iter.bi_sector = disk_start >> 9; @@ -1449,7 +1442,7 @@ static void raid_rmw_end_io(struct bio *bio) { struct btrfs_raid_bio *rbio = bio->bi_private; - if (bio->bi_error) + if (bio->bi_status) fail_bio_stripe(rbio, bio); else set_bio_pages_uptodate(bio); @@ -1992,7 +1985,7 @@ static void raid_recover_end_io(struct bio *bio) * we only read stripe pages off the disk, set them * up to date if there were no errors */ - if (bio->bi_error) + if (bio->bi_status) fail_bio_stripe(rbio, bio); else set_bio_pages_uptodate(bio); @@ -2118,6 +2111,11 @@ int raid56_parity_recover(struct btrfs_fs_info *fs_info, struct bio *bio, struct btrfs_raid_bio *rbio; int ret; + if (generic_io) { + ASSERT(bbio->mirror_num == mirror_num); + btrfs_io_bio(bio)->mirror_num = mirror_num; + } + rbio = alloc_rbio(fs_info, bbio, stripe_len); if (IS_ERR(rbio)) { if (generic_io) @@ -2194,6 +2192,8 @@ static void read_rebuild_work(struct btrfs_work *work) /* * The following code is used to scrub/replace the parity stripe * + * Caller must have already increased bio_counter for getting @bbio. + * * Note: We need make sure all the pages that add into the scrub/replace * raid bio are correct and not be changed during the scrub/replace. That * is those pages just hold metadata or file data with checksum. @@ -2231,6 +2231,12 @@ raid56_parity_alloc_scrub_rbio(struct btrfs_fs_info *fs_info, struct bio *bio, ASSERT(rbio->stripe_npages == stripe_nsectors); bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors); + /* + * We have already increased bio_counter when getting bbio, record it + * so we can free it at rbio_orig_end_io(). 
+ */ + rbio->generic_bio_cnt = 1; + return rbio; } @@ -2518,7 +2524,7 @@ static void raid56_parity_scrub_end_io(struct bio *bio) { struct btrfs_raid_bio *rbio = bio->bi_private; - if (bio->bi_error) + if (bio->bi_status) fail_bio_stripe(rbio, bio); else set_bio_pages_uptodate(bio); @@ -2673,6 +2679,12 @@ raid56_alloc_missing_rbio(struct btrfs_fs_info *fs_info, struct bio *bio, return NULL; } + /* + * When we get bbio, we have already increased bio_counter, record it + * so we can free it at rbio_orig_end_io() + */ + rbio->generic_bio_cnt = 1; + return rbio; } diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index e88bca87f5d2..ab852b8e3e37 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -66,7 +66,6 @@ struct reada_extctl { struct reada_extent { u64 logical; struct btrfs_key top; - int err; struct list_head extctl; int refcnt; spinlock_t lock; @@ -209,9 +208,9 @@ cleanup: return; } -int btree_readahead_hook(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb, int err) +int btree_readahead_hook(struct extent_buffer *eb, int err) { + struct btrfs_fs_info *fs_info = eb->fs_info; int ret = 0; struct reada_extent *re; @@ -235,10 +234,10 @@ start_machine: return ret; } -static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info, - struct btrfs_device *dev, u64 logical, +static struct reada_zone *reada_find_zone(struct btrfs_device *dev, u64 logical, struct btrfs_bio *bbio) { + struct btrfs_fs_info *fs_info = dev->fs_info; int ret; struct reada_zone *zone; struct btrfs_block_group_cache *cache = NULL; @@ -270,6 +269,12 @@ static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info, if (!zone) return NULL; + ret = radix_tree_preload(GFP_KERNEL); + if (ret) { + kfree(zone); + return NULL; + } + zone->start = start; zone->end = end; INIT_LIST_HEAD(&zone->list); @@ -299,6 +304,7 @@ static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info, zone = NULL; } spin_unlock(&fs_info->reada_lock); + radix_tree_preload_end(); return zone; } @@ -313,7 +319,6 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info, struct btrfs_bio *bbio = NULL; struct btrfs_device *dev; struct btrfs_device *prev_dev; - u32 blocksize; u64 length; int real_stripes; int nzones = 0; @@ -334,7 +339,6 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info, if (!re) return NULL; - blocksize = fs_info->nodesize; re->logical = logical; re->top = *top; INIT_LIST_HEAD(&re->extctl); @@ -344,10 +348,10 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info, /* * map block */ - length = blocksize; + length = fs_info->nodesize; ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical, &length, &bbio, 0); - if (ret || !bbio || length < blocksize) + if (ret || !bbio || length < fs_info->nodesize) goto error; if (bbio->num_stripes > BTRFS_MAX_MIRRORS) { @@ -367,7 +371,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info, if (!dev->bdev) continue; - zone = reada_find_zone(fs_info, dev, logical, bbio); + zone = reada_find_zone(dev, logical, bbio); if (!zone) continue; @@ -386,6 +390,10 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info, goto error; } + ret = radix_tree_preload(GFP_KERNEL); + if (ret) + goto error; + /* insert extent in reada_tree + all per-device trees, all or nothing */ btrfs_dev_replace_lock(&fs_info->dev_replace, 0); spin_lock(&fs_info->reada_lock); @@ -395,13 +403,16 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info 
*fs_info, re_exist->refcnt++; spin_unlock(&fs_info->reada_lock); btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); + radix_tree_preload_end(); goto error; } if (ret) { spin_unlock(&fs_info->reada_lock); btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); + radix_tree_preload_end(); goto error; } + radix_tree_preload_end(); prev_dev = NULL; dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing( &fs_info->dev_replace); @@ -639,9 +650,9 @@ static int reada_pick_zone(struct btrfs_device *dev) return 1; } -static int reada_start_machine_dev(struct btrfs_fs_info *fs_info, - struct btrfs_device *dev) +static int reada_start_machine_dev(struct btrfs_device *dev) { + struct btrfs_fs_info *fs_info = dev->fs_info; struct reada_extent *re = NULL; int mirror_num = 0; struct extent_buffer *eb = NULL; @@ -754,8 +765,7 @@ static void __reada_start_machine(struct btrfs_fs_info *fs_info) list_for_each_entry(device, &fs_devices->devices, dev_list) { if (atomic_read(&device->reada_in_flight) < MAX_IN_FLIGHT) - enqueued += reada_start_machine_dev(fs_info, - device); + enqueued += reada_start_machine_dev(device); } mutex_unlock(&fs_devices->device_list_mutex); total += enqueued; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index d60df51959f7..65661d1aae4e 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3093,11 +3093,12 @@ int prealloc_file_extent_cluster(struct inode *inode, u64 prealloc_start = cluster->start - offset; u64 prealloc_end = cluster->end - offset; u64 cur_offset; + struct extent_changeset *data_reserved = NULL; BUG_ON(cluster->start != cluster->boundary[0]); inode_lock(inode); - ret = btrfs_check_data_free_space(inode, prealloc_start, + ret = btrfs_check_data_free_space(inode, &data_reserved, prealloc_start, prealloc_end + 1 - prealloc_start); if (ret) goto out; @@ -3113,8 +3114,8 @@ int prealloc_file_extent_cluster(struct inode *inode, lock_extent(&BTRFS_I(inode)->io_tree, start, end); num_bytes = end + 1 - start; if (cur_offset < start) - btrfs_free_reserved_data_space(inode, cur_offset, - start - cur_offset); + btrfs_free_reserved_data_space(inode, data_reserved, + cur_offset, start - cur_offset); ret = btrfs_prealloc_file_range(inode, 0, start, num_bytes, num_bytes, end + 1, &alloc_hint); @@ -3125,10 +3126,11 @@ int prealloc_file_extent_cluster(struct inode *inode, nr++; } if (cur_offset < prealloc_end) - btrfs_free_reserved_data_space(inode, cur_offset, - prealloc_end + 1 - cur_offset); + btrfs_free_reserved_data_space(inode, data_reserved, + cur_offset, prealloc_end + 1 - cur_offset); out: inode_unlock(inode); + extent_changeset_free(data_reserved); return ret; } @@ -4269,8 +4271,7 @@ static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info) INIT_LIST_HEAD(&rc->reloc_roots); backref_cache_init(&rc->backref_cache); mapping_tree_init(&rc->reloc_root_tree); - extent_io_tree_init(&rc->processed_blocks, - fs_info->btree_inode->i_mapping); + extent_io_tree_init(&rc->processed_blocks, NULL); return rc; } @@ -4372,7 +4373,7 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) btrfs_wait_block_group_reservations(rc->block_group); btrfs_wait_nocow_writers(rc->block_group); - btrfs_wait_ordered_roots(fs_info, -1, + btrfs_wait_ordered_roots(fs_info, U64_MAX, rc->block_group->key.objectid, rc->block_group->key.offset); diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index a08224eab8b4..460db0cb2d07 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -390,6 +390,13 @@ again: 
WARN_ON(btrfs_root_ref_dirid(leaf, ref) != dirid); WARN_ON(btrfs_root_ref_name_len(leaf, ref) != name_len); ptr = (unsigned long)(ref + 1); + ret = btrfs_is_name_len_valid(leaf, path->slots[0], ptr, + name_len); + if (!ret) { + err = -EIO; + goto out; + } + WARN_ON(memcmp_extent_buffer(leaf, name, ptr, name_len)); *sequence = btrfs_root_ref_sequence(leaf, ref); @@ -501,8 +508,9 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans, struct btrfs_root *root) { struct btrfs_root_item *item = &root->root_item; - struct timespec ct = current_fs_time(root->fs_info->sb); + struct timespec ct; + ktime_get_real_ts(&ct); spin_lock(&root->root_item_lock); btrfs_set_root_ctransid(item, trans->transid); btrfs_set_stack_timespec_sec(&item->ctime, ct.tv_sec); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index b0251eb1239f..6f1e4c984b94 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -18,6 +18,7 @@ #include <linux/blkdev.h> #include <linux/ratelimit.h> +#include <linux/sched/mm.h> #include "ctree.h" #include "volumes.h" #include "disk-io.h" @@ -64,7 +65,7 @@ struct scrub_ctx; #define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */ struct scrub_recover { - atomic_t refs; + refcount_t refs; struct btrfs_bio *bbio; u64 map_length; }; @@ -95,7 +96,7 @@ struct scrub_bio { struct scrub_ctx *sctx; struct btrfs_device *dev; struct bio *bio; - int err; + blk_status_t status; u64 logical; u64 physical; #if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO @@ -112,7 +113,7 @@ struct scrub_block { struct scrub_page *pagev[SCRUB_MAX_PAGES_PER_BLOCK]; int page_count; atomic_t outstanding_pages; - atomic_t refs; /* free mem on transition to zero */ + refcount_t refs; /* free mem on transition to zero */ struct scrub_ctx *sctx; struct scrub_parity *sparity; struct { @@ -140,9 +141,9 @@ struct scrub_parity { int nsectors; - int stripe_len; + u64 stripe_len; - atomic_t refs; + refcount_t refs; struct list_head spages; @@ -161,14 +162,6 @@ struct scrub_parity { unsigned long bitmap[0]; }; -struct scrub_wr_ctx { - struct scrub_bio *wr_curr_bio; - struct btrfs_device *tgtdev; - int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */ - atomic_t flush_all_writes; - struct mutex wr_lock; -}; - struct scrub_ctx { struct scrub_bio *bios[SCRUB_BIOS_PER_SCTX]; struct btrfs_fs_info *fs_info; @@ -183,11 +176,14 @@ struct scrub_ctx { atomic_t cancel_req; int readonly; int pages_per_rd_bio; - u32 sectorsize; - u32 nodesize; int is_dev_replace; - struct scrub_wr_ctx wr_ctx; + + struct scrub_bio *wr_curr_bio; + struct mutex wr_lock; + int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */ + atomic_t flush_all_writes; + struct btrfs_device *wr_tgtdev; /* * statistics @@ -202,7 +198,7 @@ struct scrub_ctx { * doesn't free the scrub context before or while the workers are * doing the wakeup() call. 
*/ - atomic_t refs; + refcount_t refs; }; struct scrub_fixup_nodatasum { @@ -240,6 +236,13 @@ struct scrub_warning { struct btrfs_device *dev; }; +struct full_stripe_lock { + struct rb_node node; + u64 logical; + u64 refs; + struct mutex mutex; +}; + static void scrub_pending_bio_inc(struct scrub_ctx *sctx); static void scrub_pending_bio_dec(struct scrub_ctx *sctx); static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx); @@ -282,10 +285,6 @@ static void scrub_remap_extent(struct btrfs_fs_info *fs_info, u64 *extent_physical, struct btrfs_device **extent_dev, int *extent_mirror_num); -static int scrub_setup_wr_ctx(struct scrub_wr_ctx *wr_ctx, - struct btrfs_device *dev, - int is_dev_replace); -static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx); static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx, struct scrub_page *spage); static void scrub_wr_submit(struct scrub_ctx *sctx); @@ -305,7 +304,7 @@ static void scrub_put_ctx(struct scrub_ctx *sctx); static void scrub_pending_bio_inc(struct scrub_ctx *sctx) { - atomic_inc(&sctx->refs); + refcount_inc(&sctx->refs); atomic_inc(&sctx->bios_in_flight); } @@ -349,6 +348,222 @@ static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info) } /* + * Insert new full stripe lock into full stripe locks tree + * + * Return pointer to existing or newly inserted full_stripe_lock structure if + * everything works well. + * Return ERR_PTR(-ENOMEM) if we failed to allocate memory + * + * NOTE: caller must hold full_stripe_locks_root->lock before calling this + * function + */ +static struct full_stripe_lock *insert_full_stripe_lock( + struct btrfs_full_stripe_locks_tree *locks_root, + u64 fstripe_logical) +{ + struct rb_node **p; + struct rb_node *parent = NULL; + struct full_stripe_lock *entry; + struct full_stripe_lock *ret; + + WARN_ON(!mutex_is_locked(&locks_root->lock)); + + p = &locks_root->root.rb_node; + while (*p) { + parent = *p; + entry = rb_entry(parent, struct full_stripe_lock, node); + if (fstripe_logical < entry->logical) { + p = &(*p)->rb_left; + } else if (fstripe_logical > entry->logical) { + p = &(*p)->rb_right; + } else { + entry->refs++; + return entry; + } + } + + /* Insert new lock */ + ret = kmalloc(sizeof(*ret), GFP_KERNEL); + if (!ret) + return ERR_PTR(-ENOMEM); + ret->logical = fstripe_logical; + ret->refs = 1; + mutex_init(&ret->mutex); + + rb_link_node(&ret->node, parent, p); + rb_insert_color(&ret->node, &locks_root->root); + return ret; +} + +/* + * Search for a full stripe lock of a block group + * + * Return pointer to existing full stripe lock if found + * Return NULL if not found + */ +static struct full_stripe_lock *search_full_stripe_lock( + struct btrfs_full_stripe_locks_tree *locks_root, + u64 fstripe_logical) +{ + struct rb_node *node; + struct full_stripe_lock *entry; + + WARN_ON(!mutex_is_locked(&locks_root->lock)); + + node = locks_root->root.rb_node; + while (node) { + entry = rb_entry(node, struct full_stripe_lock, node); + if (fstripe_logical < entry->logical) + node = node->rb_left; + else if (fstripe_logical > entry->logical) + node = node->rb_right; + else + return entry; + } + return NULL; +} + +/* + * Helper to get full stripe logical from a normal bytenr. + * + * Caller must ensure @cache is a RAID56 block group. + */ +static u64 get_full_stripe_logical(struct btrfs_block_group_cache *cache, + u64 bytenr) +{ + u64 ret; + + /* + * Due to chunk item size limit, full stripe length should not be + * larger than U32_MAX. Just a sanity check here. 
+ */ + WARN_ON_ONCE(cache->full_stripe_len >= U32_MAX); + + /* + * round_down() can only handle power of 2, while RAID56 full + * stripe length can be 64KiB * n, so we need to manually round down. + */ + ret = div64_u64(bytenr - cache->key.objectid, cache->full_stripe_len) * + cache->full_stripe_len + cache->key.objectid; + return ret; +} + +/* + * Lock a full stripe to avoid concurrency of recovery and read + * + * It's only used for profiles with parities (RAID5/6), for other profiles it + * does nothing. + * + * Return 0 if we locked full stripe covering @bytenr, with a mutex held. + * So caller must call unlock_full_stripe() at the same context. + * + * Return <0 if encounters error. + */ +static int lock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr, + bool *locked_ret) +{ + struct btrfs_block_group_cache *bg_cache; + struct btrfs_full_stripe_locks_tree *locks_root; + struct full_stripe_lock *existing; + u64 fstripe_start; + int ret = 0; + + *locked_ret = false; + bg_cache = btrfs_lookup_block_group(fs_info, bytenr); + if (!bg_cache) { + ASSERT(0); + return -ENOENT; + } + + /* Profiles not based on parity don't need full stripe lock */ + if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK)) + goto out; + locks_root = &bg_cache->full_stripe_locks_root; + + fstripe_start = get_full_stripe_logical(bg_cache, bytenr); + + /* Now insert the full stripe lock */ + mutex_lock(&locks_root->lock); + existing = insert_full_stripe_lock(locks_root, fstripe_start); + mutex_unlock(&locks_root->lock); + if (IS_ERR(existing)) { + ret = PTR_ERR(existing); + goto out; + } + mutex_lock(&existing->mutex); + *locked_ret = true; +out: + btrfs_put_block_group(bg_cache); + return ret; +} + +/* + * Unlock a full stripe. + * + * NOTE: Caller must ensure it's the same context calling corresponding + * lock_full_stripe(). + * + * Return 0 if we unlock full stripe without problem. 
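As the comment in get_full_stripe_logical() notes, round_down() only handles power-of-two alignments, while a RAID5/6 full stripe is 64KiB times the number of data stripes. A short worked example of the manual round-down, with hypothetical numbers (1 GiB block group start, three data stripes):

        /* Hypothetical values, for illustration only. */
        u64 objectid        = 1ULL << 30;             /* block group start       */
        u64 full_stripe_len = 3 * SZ_64K;              /* 196608, not a power of 2 */
        u64 bytenr          = objectid + 500 * SZ_1K;  /* some logical address    */
        u64 fstripe_start;

        /* 500 KiB / 192 KiB = 2, so we land at the start of full stripe #2 */
        fstripe_start = div64_u64(bytenr - objectid, full_stripe_len) *
                        full_stripe_len + objectid;    /* objectid + 384 KiB      */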
+ * Return <0 for error + */ +static int unlock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr, + bool locked) +{ + struct btrfs_block_group_cache *bg_cache; + struct btrfs_full_stripe_locks_tree *locks_root; + struct full_stripe_lock *fstripe_lock; + u64 fstripe_start; + bool freeit = false; + int ret = 0; + + /* If we didn't acquire full stripe lock, no need to continue */ + if (!locked) + return 0; + + bg_cache = btrfs_lookup_block_group(fs_info, bytenr); + if (!bg_cache) { + ASSERT(0); + return -ENOENT; + } + if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK)) + goto out; + + locks_root = &bg_cache->full_stripe_locks_root; + fstripe_start = get_full_stripe_logical(bg_cache, bytenr); + + mutex_lock(&locks_root->lock); + fstripe_lock = search_full_stripe_lock(locks_root, fstripe_start); + /* Unpaired unlock_full_stripe() detected */ + if (!fstripe_lock) { + WARN_ON(1); + ret = -ENOENT; + mutex_unlock(&locks_root->lock); + goto out; + } + + if (fstripe_lock->refs == 0) { + WARN_ON(1); + btrfs_warn(fs_info, "full stripe lock at %llu refcount underflow", + fstripe_lock->logical); + } else { + fstripe_lock->refs--; + } + + if (fstripe_lock->refs == 0) { + rb_erase(&fstripe_lock->node, &locks_root->root); + freeit = true; + } + mutex_unlock(&locks_root->lock); + + mutex_unlock(&fstripe_lock->mutex); + if (freeit) + kfree(fstripe_lock); +out: + btrfs_put_block_group(bg_cache); + return ret; +} + +/* * used for workers that require transaction commits (i.e., for the * NOCOW case) */ @@ -356,7 +571,7 @@ static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx) { struct btrfs_fs_info *fs_info = sctx->fs_info; - atomic_inc(&sctx->refs); + refcount_inc(&sctx->refs); /* * increment scrubs_running to prevent cancel requests from * completing as long as a worker is running. 
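lock_full_stripe() and unlock_full_stripe() are meant to be paired in the same context, exactly as the scrub_handle_errored_block() hunk further down uses them; the boolean tells the unlock side whether a lock was actually taken (non-RAID56 block groups skip it). A condensed usage sketch:

        bool locked = false;
        int ret;

        ret = lock_full_stripe(fs_info, logical, &locked);
        if (ret < 0)
                return ret;             /* no mutex held on error            */

        /* ... recheck/repair the block at @logical ... */

        ret = unlock_full_stripe(fs_info, logical, locked);
        if (ret < 0)
                return ret;             /* unpaired unlock would WARN above  */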
we must also @@ -420,8 +635,6 @@ static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx) if (!sctx) return; - scrub_free_wr_ctx(&sctx->wr_ctx); - /* this can happen when scrub is cancelled */ if (sctx->curr != -1) { struct scrub_bio *sbio = sctx->bios[sctx->curr]; @@ -441,13 +654,14 @@ static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx) kfree(sbio); } + kfree(sctx->wr_curr_bio); scrub_free_csums(sctx); kfree(sctx); } static void scrub_put_ctx(struct scrub_ctx *sctx) { - if (atomic_dec_and_test(&sctx->refs)) + if (refcount_dec_and_test(&sctx->refs)) scrub_free_ctx(sctx); } @@ -457,12 +671,11 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) struct scrub_ctx *sctx; int i; struct btrfs_fs_info *fs_info = dev->fs_info; - int ret; sctx = kzalloc(sizeof(*sctx), GFP_KERNEL); if (!sctx) goto nomem; - atomic_set(&sctx->refs, 1); + refcount_set(&sctx->refs, 1); sctx->is_dev_replace = is_dev_replace; sctx->pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO; sctx->curr = -1; @@ -487,8 +700,6 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) sctx->bios[i]->next_free = -1; } sctx->first_free = 0; - sctx->nodesize = fs_info->nodesize; - sctx->sectorsize = fs_info->sectorsize; atomic_set(&sctx->bios_in_flight, 0); atomic_set(&sctx->workers_pending, 0); atomic_set(&sctx->cancel_req, 0); @@ -499,12 +710,16 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) spin_lock_init(&sctx->stat_lock); init_waitqueue_head(&sctx->list_wait); - ret = scrub_setup_wr_ctx(&sctx->wr_ctx, - fs_info->dev_replace.tgtdev, is_dev_replace); - if (ret) { - scrub_free_ctx(sctx); - return ERR_PTR(ret); + WARN_ON(sctx->wr_curr_bio != NULL); + mutex_init(&sctx->wr_lock); + sctx->wr_curr_bio = NULL; + if (is_dev_replace) { + WARN_ON(!fs_info->dev_replace.tgtdev); + sctx->pages_per_wr_bio = SCRUB_PAGES_PER_WR_BIO; + sctx->wr_tgtdev = fs_info->dev_replace.tgtdev; + atomic_set(&sctx->flush_all_writes, 0); } + return sctx; nomem: @@ -519,6 +734,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, u32 nlink; int ret; int i; + unsigned nofs_flag; struct extent_buffer *eb; struct btrfs_inode_item *inode_item; struct scrub_warning *swarn = warn_ctx; @@ -557,7 +773,14 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, nlink = btrfs_inode_nlink(eb, inode_item); btrfs_release_path(swarn->path); + /* + * init_path might indirectly call vmalloc, or use GFP_KERNEL. Scrub + * uses GFP_NOFS in this context, so we keep it consistent but it does + * not seem to be strictly necessary. 
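The init_ipath() call below is wrapped in memalloc_nofs_save()/memalloc_nofs_restore(), the scoped way of forcing GFP_NOFS semantics on allocations made by callees that pass GFP_KERNEL (or go through vmalloc). A minimal sketch of the pattern; some_helper_that_allocates() is a made-up placeholder:

#include <linux/sched/mm.h>

        unsigned int nofs_flag;
        void *p;

        nofs_flag = memalloc_nofs_save();       /* allocations below behave  */
        p = some_helper_that_allocates();       /* as if they were GFP_NOFS  */
        memalloc_nofs_restore(nofs_flag);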
+ */ + nofs_flag = memalloc_nofs_save(); ipath = init_ipath(4096, local_root, swarn->path); + memalloc_nofs_restore(nofs_flag); if (IS_ERR(ipath)) { ret = PTR_ERR(ipath); ipath = NULL; @@ -731,7 +954,7 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx) ret = -EIO; goto out; } - ret = repair_io_failure(BTRFS_I(inode), offset, PAGE_SIZE, + ret = repair_io_failure(fs_info, inum, offset, PAGE_SIZE, fixup->logical, page, offset - page_offset(page), fixup->mirror_num); @@ -857,12 +1080,14 @@ out: static inline void scrub_get_recover(struct scrub_recover *recover) { - atomic_inc(&recover->refs); + refcount_inc(&recover->refs); } -static inline void scrub_put_recover(struct scrub_recover *recover) +static inline void scrub_put_recover(struct btrfs_fs_info *fs_info, + struct scrub_recover *recover) { - if (atomic_dec_and_test(&recover->refs)) { + if (refcount_dec_and_test(&recover->refs)) { + btrfs_bio_counter_dec(fs_info); btrfs_put_bbio(recover->bbio); kfree(recover); } @@ -892,6 +1117,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) int mirror_index; int page_num; int success; + bool full_stripe_locked; static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); @@ -917,6 +1143,24 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) have_csum = sblock_to_check->pagev[0]->have_csum; dev = sblock_to_check->pagev[0]->dev; + /* + * For RAID5/6, race can happen for a different device scrub thread. + * For data corruption, Parity and Data threads will both try + * to recovery the data. + * Race can lead to doubly added csum error, or even unrecoverable + * error. + */ + ret = lock_full_stripe(fs_info, logical, &full_stripe_locked); + if (ret < 0) { + spin_lock(&sctx->stat_lock); + if (ret == -ENOMEM) + sctx->stat.malloc_errors++; + sctx->stat.read_errors++; + sctx->stat.uncorrectable_errors++; + spin_unlock(&sctx->stat_lock); + return ret; + } + if (sctx->is_dev_replace && !is_metadata && !have_csum) { sblocks_for_recheck = NULL; goto nodatasum_case; @@ -1241,7 +1485,7 @@ out: sblock->pagev[page_index]->sblock = NULL; recover = sblock->pagev[page_index]->recover; if (recover) { - scrub_put_recover(recover); + scrub_put_recover(fs_info, recover); sblock->pagev[page_index]->recover = NULL; } @@ -1251,6 +1495,9 @@ out: kfree(sblocks_for_recheck); } + ret = unlock_full_stripe(fs_info, logical, full_stripe_locked); + if (ret < 0) + return ret; return 0; } @@ -1330,20 +1577,23 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock, * with a length of PAGE_SIZE, each returned stripe * represents one mirror */ + btrfs_bio_counter_inc_blocked(fs_info); ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, - logical, &mapped_length, &bbio, 0, 1); + logical, &mapped_length, &bbio); if (ret || !bbio || mapped_length < sublen) { btrfs_put_bbio(bbio); + btrfs_bio_counter_dec(fs_info); return -EIO; } recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS); if (!recover) { btrfs_put_bbio(bbio); + btrfs_bio_counter_dec(fs_info); return -ENOMEM; } - atomic_set(&recover->refs, 1); + refcount_set(&recover->refs, 1); recover->bbio = bbio; recover->map_length = mapped_length; @@ -1365,7 +1615,7 @@ leave_nomem: spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; spin_unlock(&sctx->stat_lock); - scrub_put_recover(recover); + scrub_put_recover(fs_info, recover); return -ENOMEM; } scrub_page_get(page); @@ -1407,7 +1657,7 @@ leave_nomem: scrub_get_recover(recover); page->recover 
= recover; } - scrub_put_recover(recover); + scrub_put_recover(fs_info, recover); length -= sublen; logical += sublen; page_index++; @@ -1418,14 +1668,14 @@ leave_nomem: struct scrub_bio_ret { struct completion event; - int error; + blk_status_t status; }; static void scrub_bio_wait_endio(struct bio *bio) { struct scrub_bio_ret *ret = bio->bi_private; - ret->error = bio->bi_error; + ret->status = bio->bi_status; complete(&ret->event); } @@ -1443,7 +1693,7 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info, int ret; init_completion(&done.event); - done.error = 0; + done.status = 0; bio->bi_iter.bi_sector = page->logical >> 9; bio->bi_private = &done; bio->bi_end_io = scrub_bio_wait_endio; @@ -1455,7 +1705,7 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info, return ret; wait_for_completion(&done.event); - if (done.error) + if (done.status) return -EIO; return 0; @@ -1487,24 +1737,23 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info, } WARN_ON(!page->page); - bio = btrfs_io_bio_alloc(GFP_NOFS, 1); - if (!bio) { - page->io_error = 1; - sblock->no_io_error_seen = 0; - continue; - } + bio = btrfs_io_bio_alloc(1); bio->bi_bdev = page->dev->bdev; bio_add_page(bio, page->page, PAGE_SIZE, 0); if (!retry_failed_mirror && scrub_is_page_on_raid56(page)) { - if (scrub_submit_raid56_bio_wait(fs_info, bio, page)) + if (scrub_submit_raid56_bio_wait(fs_info, bio, page)) { + page->io_error = 1; sblock->no_io_error_seen = 0; + } } else { bio->bi_iter.bi_sector = page->physical >> 9; bio_set_op_attrs(bio, REQ_OP_READ, 0); - if (btrfsic_submit_bio_wait(bio)) + if (btrfsic_submit_bio_wait(bio)) { + page->io_error = 1; sblock->no_io_error_seen = 0; + } } bio_put(bio); @@ -1576,9 +1825,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, return -EIO; } - bio = btrfs_io_bio_alloc(GFP_NOFS, 1); - if (!bio) - return -EIO; + bio = btrfs_io_bio_alloc(1); bio->bi_bdev = page_bad->dev->bdev; bio->bi_iter.bi_sector = page_bad->physical >> 9; bio_set_op_attrs(bio, REQ_OP_WRITE, 0); @@ -1634,7 +1881,7 @@ static int scrub_write_page_to_dev_replace(struct scrub_block *sblock, if (spage->io_error) { void *mapped_buffer = kmap_atomic(spage->page); - memset(mapped_buffer, 0, PAGE_SIZE); + clear_page(mapped_buffer); flush_dcache_page(spage->page); kunmap_atomic(mapped_buffer); } @@ -1644,37 +1891,31 @@ static int scrub_write_page_to_dev_replace(struct scrub_block *sblock, static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx, struct scrub_page *spage) { - struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx; struct scrub_bio *sbio; int ret; - mutex_lock(&wr_ctx->wr_lock); + mutex_lock(&sctx->wr_lock); again: - if (!wr_ctx->wr_curr_bio) { - wr_ctx->wr_curr_bio = kzalloc(sizeof(*wr_ctx->wr_curr_bio), + if (!sctx->wr_curr_bio) { + sctx->wr_curr_bio = kzalloc(sizeof(*sctx->wr_curr_bio), GFP_KERNEL); - if (!wr_ctx->wr_curr_bio) { - mutex_unlock(&wr_ctx->wr_lock); + if (!sctx->wr_curr_bio) { + mutex_unlock(&sctx->wr_lock); return -ENOMEM; } - wr_ctx->wr_curr_bio->sctx = sctx; - wr_ctx->wr_curr_bio->page_count = 0; + sctx->wr_curr_bio->sctx = sctx; + sctx->wr_curr_bio->page_count = 0; } - sbio = wr_ctx->wr_curr_bio; + sbio = sctx->wr_curr_bio; if (sbio->page_count == 0) { struct bio *bio; sbio->physical = spage->physical_for_dev_replace; sbio->logical = spage->logical; - sbio->dev = wr_ctx->tgtdev; + sbio->dev = sctx->wr_tgtdev; bio = sbio->bio; if (!bio) { - bio = btrfs_io_bio_alloc(GFP_KERNEL, - wr_ctx->pages_per_wr_bio); - if (!bio) { - 
mutex_unlock(&wr_ctx->wr_lock); - return -ENOMEM; - } + bio = btrfs_io_bio_alloc(sctx->pages_per_wr_bio); sbio->bio = bio; } @@ -1683,7 +1924,7 @@ again: bio->bi_bdev = sbio->dev->bdev; bio->bi_iter.bi_sector = sbio->physical >> 9; bio_set_op_attrs(bio, REQ_OP_WRITE, 0); - sbio->err = 0; + sbio->status = 0; } else if (sbio->physical + sbio->page_count * PAGE_SIZE != spage->physical_for_dev_replace || sbio->logical + sbio->page_count * PAGE_SIZE != @@ -1697,7 +1938,7 @@ again: if (sbio->page_count < 1) { bio_put(sbio->bio); sbio->bio = NULL; - mutex_unlock(&wr_ctx->wr_lock); + mutex_unlock(&sctx->wr_lock); return -EIO; } scrub_wr_submit(sctx); @@ -1707,23 +1948,22 @@ again: sbio->pagev[sbio->page_count] = spage; scrub_page_get(spage); sbio->page_count++; - if (sbio->page_count == wr_ctx->pages_per_wr_bio) + if (sbio->page_count == sctx->pages_per_wr_bio) scrub_wr_submit(sctx); - mutex_unlock(&wr_ctx->wr_lock); + mutex_unlock(&sctx->wr_lock); return 0; } static void scrub_wr_submit(struct scrub_ctx *sctx) { - struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx; struct scrub_bio *sbio; - if (!wr_ctx->wr_curr_bio) + if (!sctx->wr_curr_bio) return; - sbio = wr_ctx->wr_curr_bio; - wr_ctx->wr_curr_bio = NULL; + sbio = sctx->wr_curr_bio; + sctx->wr_curr_bio = NULL; WARN_ON(!sbio->bio->bi_bdev); scrub_pending_bio_inc(sctx); /* process all writes in a single worker thread. Then the block layer @@ -1738,7 +1978,7 @@ static void scrub_wr_bio_end_io(struct bio *bio) struct scrub_bio *sbio = bio->bi_private; struct btrfs_fs_info *fs_info = sbio->dev->fs_info; - sbio->err = bio->bi_error; + sbio->status = bio->bi_status; sbio->bio = bio; btrfs_init_work(&sbio->work, btrfs_scrubwrc_helper, @@ -1753,7 +1993,7 @@ static void scrub_wr_bio_end_io_worker(struct btrfs_work *work) int i; WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO); - if (sbio->err) { + if (sbio->status) { struct btrfs_dev_replace *dev_replace = &sbio->sctx->fs_info->dev_replace; @@ -1827,7 +2067,7 @@ static int scrub_checksum_data(struct scrub_block *sblock) page = sblock->pagev[0]->page; buffer = kmap_atomic(page); - len = sctx->sectorsize; + len = sctx->fs_info->sectorsize; index = 0; for (;;) { u64 l = min_t(u64, len, PAGE_SIZE); @@ -1892,7 +2132,7 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) BTRFS_UUID_SIZE)) sblock->header_error = 1; - len = sctx->nodesize - BTRFS_CSUM_SIZE; + len = sctx->fs_info->nodesize - BTRFS_CSUM_SIZE; mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE; p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE; index = 0; @@ -1998,12 +2238,12 @@ static int scrub_checksum_super(struct scrub_block *sblock) static void scrub_block_get(struct scrub_block *sblock) { - atomic_inc(&sblock->refs); + refcount_inc(&sblock->refs); } static void scrub_block_put(struct scrub_block *sblock) { - if (atomic_dec_and_test(&sblock->refs)) { + if (refcount_dec_and_test(&sblock->refs)) { int i; if (sblock->sparity) @@ -2075,10 +2315,7 @@ again: sbio->dev = spage->dev; bio = sbio->bio; if (!bio) { - bio = btrfs_io_bio_alloc(GFP_KERNEL, - sctx->pages_per_rd_bio); - if (!bio) - return -ENOMEM; + bio = btrfs_io_bio_alloc(sctx->pages_per_rd_bio); sbio->bio = bio; } @@ -2087,7 +2324,7 @@ again: bio->bi_bdev = sbio->dev->bdev; bio->bi_iter.bi_sector = sbio->physical >> 9; bio_set_op_attrs(bio, REQ_OP_READ, 0); - sbio->err = 0; + sbio->status = 0; } else if (sbio->physical + sbio->page_count * PAGE_SIZE != spage->physical || sbio->logical + sbio->page_count * PAGE_SIZE != @@ -2123,7 +2360,7 @@ static void scrub_missing_raid56_end_io(struct bio 
*bio) struct scrub_block *sblock = bio->bi_private; struct btrfs_fs_info *fs_info = sblock->sctx->fs_info; - if (bio->bi_error) + if (bio->bi_status) sblock->no_io_error_seen = 0; bio_put(bio); @@ -2166,10 +2403,10 @@ static void scrub_missing_raid56_worker(struct btrfs_work *work) scrub_block_put(sblock); if (sctx->is_dev_replace && - atomic_read(&sctx->wr_ctx.flush_all_writes)) { - mutex_lock(&sctx->wr_ctx.wr_lock); + atomic_read(&sctx->flush_all_writes)) { + mutex_lock(&sctx->wr_lock); scrub_wr_submit(sctx); - mutex_unlock(&sctx->wr_ctx.wr_lock); + mutex_unlock(&sctx->wr_lock); } scrub_pending_bio_dec(sctx); @@ -2187,8 +2424,9 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock) int ret; int i; + btrfs_bio_counter_inc_blocked(fs_info); ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical, - &length, &bbio, 0, 1); + &length, &bbio); if (ret || !bbio || !bbio->raid_map) goto bbio_out; @@ -2203,10 +2441,7 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock) goto bbio_out; } - bio = btrfs_io_bio_alloc(GFP_NOFS, 0); - if (!bio) - goto bbio_out; - + bio = btrfs_io_bio_alloc(0); bio->bi_iter.bi_sector = logical >> 9; bio->bi_private = sblock; bio->bi_end_io = scrub_missing_raid56_end_io; @@ -2231,6 +2466,7 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock) rbio_out: bio_put(bio); bbio_out: + btrfs_bio_counter_dec(fs_info); btrfs_put_bbio(bbio); spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; @@ -2255,7 +2491,7 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, /* one ref inside this function, plus one for each page added to * a bio later on */ - atomic_set(&sblock->refs, 1); + refcount_set(&sblock->refs, 1); sblock->sctx = sctx; sblock->no_io_error_seen = 1; @@ -2332,7 +2568,7 @@ static void scrub_bio_end_io(struct bio *bio) struct scrub_bio *sbio = bio->bi_private; struct btrfs_fs_info *fs_info = sbio->dev->fs_info; - sbio->err = bio->bi_error; + sbio->status = bio->bi_status; sbio->bio = bio; btrfs_queue_work(fs_info->scrub_workers, &sbio->work); @@ -2345,7 +2581,7 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work) int i; BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO); - if (sbio->err) { + if (sbio->status) { for (i = 0; i < sbio->page_count; i++) { struct scrub_page *spage = sbio->pagev[i]; @@ -2372,10 +2608,10 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work) spin_unlock(&sctx->list_lock); if (sctx->is_dev_replace && - atomic_read(&sctx->wr_ctx.flush_all_writes)) { - mutex_lock(&sctx->wr_ctx.wr_lock); + atomic_read(&sctx->flush_all_writes)) { + mutex_lock(&sctx->wr_lock); scrub_wr_submit(sctx); - mutex_unlock(&sctx->wr_ctx.wr_lock); + mutex_unlock(&sctx->wr_lock); } scrub_pending_bio_dec(sctx); @@ -2385,7 +2621,7 @@ static inline void __scrub_mark_bitmap(struct scrub_parity *sparity, unsigned long *bitmap, u64 start, u64 len) { - u32 offset; + u64 offset; int nsectors; int sectorsize = sparity->sctx->fs_info->sectorsize; @@ -2395,8 +2631,8 @@ static inline void __scrub_mark_bitmap(struct scrub_parity *sparity, } start -= sparity->logic_start; - start = div_u64_rem(start, sparity->stripe_len, &offset); - offset /= sectorsize; + start = div64_u64_rem(start, sparity->stripe_len, &offset); + offset = div_u64(offset, sectorsize); nsectors = (int)len / sectorsize; if (offset + nsectors <= sparity->nsectors) { @@ -2470,8 +2706,8 @@ static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum) if (!sum) return 0; - index = ((u32)(logical - sum->bytenr)) / 
sctx->sectorsize; - num_sectors = sum->len / sctx->sectorsize; + index = ((u32)(logical - sum->bytenr)) / sctx->fs_info->sectorsize; + num_sectors = sum->len / sctx->fs_info->sectorsize; memcpy(csum, sum->sums + index, sctx->csum_size); if (index == num_sectors - 1) { list_del(&sum->list); @@ -2490,19 +2726,19 @@ static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len, u32 blocksize; if (flags & BTRFS_EXTENT_FLAG_DATA) { - blocksize = sctx->sectorsize; + blocksize = sctx->fs_info->sectorsize; spin_lock(&sctx->stat_lock); sctx->stat.data_extents_scrubbed++; sctx->stat.data_bytes_scrubbed += len; spin_unlock(&sctx->stat_lock); } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { - blocksize = sctx->nodesize; + blocksize = sctx->fs_info->nodesize; spin_lock(&sctx->stat_lock); sctx->stat.tree_extents_scrubbed++; sctx->stat.tree_bytes_scrubbed += len; spin_unlock(&sctx->stat_lock); } else { - blocksize = sctx->sectorsize; + blocksize = sctx->fs_info->sectorsize; WARN_ON(1); } @@ -2555,7 +2791,7 @@ static int scrub_pages_for_parity(struct scrub_parity *sparity, /* one ref inside this function, plus one for each page added to * a bio later on */ - atomic_set(&sblock->refs, 1); + refcount_set(&sblock->refs, 1); sblock->sctx = sctx; sblock->no_io_error_seen = 1; sblock->sparity = sparity; @@ -2636,11 +2872,11 @@ static int scrub_extent_for_parity(struct scrub_parity *sparity, } if (flags & BTRFS_EXTENT_FLAG_DATA) { - blocksize = sctx->sectorsize; + blocksize = sctx->fs_info->sectorsize; } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { - blocksize = sctx->nodesize; + blocksize = sctx->fs_info->nodesize; } else { - blocksize = sctx->sectorsize; + blocksize = sctx->fs_info->sectorsize; WARN_ON(1); } @@ -2694,7 +2930,7 @@ static int get_raid56_logic_offset(u64 physical, int num, for (i = 0; i < nr_data_stripes(map); i++) { *offset = last_offset + i * map->stripe_len; - stripe_nr = div_u64(*offset, map->stripe_len); + stripe_nr = div64_u64(*offset, map->stripe_len); stripe_nr = div_u64(stripe_nr, nr_data_stripes(map)); /* Work out the disk rotation on this stripe-set */ @@ -2748,7 +2984,7 @@ static void scrub_parity_bio_endio(struct bio *bio) struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private; struct btrfs_fs_info *fs_info = sparity->sctx->fs_info; - if (bio->bi_error) + if (bio->bi_status) bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap, sparity->nsectors); @@ -2765,7 +3001,6 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity) struct btrfs_fs_info *fs_info = sctx->fs_info; struct bio *bio; struct btrfs_raid_bio *rbio; - struct scrub_page *spage; struct btrfs_bio *bbio = NULL; u64 length; int ret; @@ -2775,15 +3010,14 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity) goto out; length = sparity->logic_end - sparity->logic_start; + + btrfs_bio_counter_inc_blocked(fs_info); ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, sparity->logic_start, - &length, &bbio, 0, 1); + &length, &bbio); if (ret || !bbio || !bbio->raid_map) goto bbio_out; - bio = btrfs_io_bio_alloc(GFP_NOFS, 0); - if (!bio) - goto bbio_out; - + bio = btrfs_io_bio_alloc(0); bio->bi_iter.bi_sector = sparity->logic_start >> 9; bio->bi_private = sparity; bio->bi_end_io = scrub_parity_bio_endio; @@ -2795,9 +3029,6 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity) if (!rbio) goto rbio_out; - list_for_each_entry(spage, &sparity->spages, list) - raid56_add_scrub_pages(rbio, spage->page, spage->logical); - 
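The bio end_io paths in this file now propagate blk_status_t via bio->bi_status instead of the old integer bi_error, and scrub_submit_raid56_bio_wait() above shows the usual submit-and-wait shape. A stripped-down sketch of that completion pattern, with a hypothetical wait_ctx:

#include <linux/bio.h>
#include <linux/completion.h>

struct wait_ctx {                               /* hypothetical             */
        struct completion done;
        blk_status_t status;
};

static void my_end_io(struct bio *bio)
{
        struct wait_ctx *ctx = bio->bi_private;

        ctx->status = bio->bi_status;           /* blk_status_t, not errno  */
        complete(&ctx->done);
}

        /* caller side (fragment) */
        struct wait_ctx ctx;

        init_completion(&ctx.done);
        ctx.status = 0;
        bio->bi_private = &ctx;
        bio->bi_end_io = my_end_io;
        submit_bio(bio);
        wait_for_completion(&ctx.done);
        if (ctx.status)
                return -EIO;                    /* or blk_status_to_errno() */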
scrub_pending_bio_inc(sctx); raid56_parity_submit_scrub_rbio(rbio); return; @@ -2805,6 +3036,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity) rbio_out: bio_put(bio); bbio_out: + btrfs_bio_counter_dec(fs_info); btrfs_put_bbio(bbio); bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap, sparity->nsectors); @@ -2822,12 +3054,12 @@ static inline int scrub_calc_parity_bitmap_len(int nsectors) static void scrub_parity_get(struct scrub_parity *sparity) { - atomic_inc(&sparity->refs); + refcount_inc(&sparity->refs); } static void scrub_parity_put(struct scrub_parity *sparity) { - if (!atomic_dec_and_test(&sparity->refs)) + if (!refcount_dec_and_test(&sparity->refs)) return; scrub_parity_check_and_repair(sparity); @@ -2879,7 +3111,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, sparity->scrub_dev = sdev; sparity->logic_start = logic_start; sparity->logic_end = logic_end; - atomic_set(&sparity->refs, 1); + refcount_set(&sparity->refs, 1); INIT_LIST_HEAD(&sparity->spages); sparity->dbitmap = sparity->bitmap; sparity->ebitmap = (void *)sparity->bitmap + bitmap_len; @@ -3050,9 +3282,9 @@ out: logic_end - logic_start); scrub_parity_put(sparity); scrub_submit(sctx); - mutex_lock(&sctx->wr_ctx.wr_lock); + mutex_lock(&sctx->wr_lock); scrub_wr_submit(sctx); - mutex_unlock(&sctx->wr_ctx.wr_lock); + mutex_unlock(&sctx->wr_lock); btrfs_release_path(path); return ret < 0 ? ret : 0; @@ -3098,7 +3330,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, physical = map->stripes[num].physical; offset = 0; - nstripes = div_u64(length, map->stripe_len); + nstripes = div64_u64(length, map->stripe_len); if (map->type & BTRFS_BLOCK_GROUP_RAID0) { offset = map->stripe_len * num; increment = map->stripe_len * map->num_stripes; @@ -3208,14 +3440,14 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, */ if (atomic_read(&fs_info->scrub_pause_req)) { /* push queued extents */ - atomic_set(&sctx->wr_ctx.flush_all_writes, 1); + atomic_set(&sctx->flush_all_writes, 1); scrub_submit(sctx); - mutex_lock(&sctx->wr_ctx.wr_lock); + mutex_lock(&sctx->wr_lock); scrub_wr_submit(sctx); - mutex_unlock(&sctx->wr_ctx.wr_lock); + mutex_unlock(&sctx->wr_lock); wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); - atomic_set(&sctx->wr_ctx.flush_all_writes, 0); + atomic_set(&sctx->flush_all_writes, 0); scrub_blocked_if_needed(fs_info); } @@ -3422,9 +3654,9 @@ skip: out: /* push queued extents */ scrub_submit(sctx); - mutex_lock(&sctx->wr_ctx.wr_lock); + mutex_lock(&sctx->wr_lock); scrub_wr_submit(sctx); - mutex_unlock(&sctx->wr_ctx.wr_lock); + mutex_unlock(&sctx->wr_lock); blk_finish_plug(&plug); btrfs_free_path(path); @@ -3604,7 +3836,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, */ btrfs_wait_block_group_reservations(cache); btrfs_wait_nocow_writers(cache); - ret = btrfs_wait_ordered_roots(fs_info, -1, + ret = btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->key.objectid, cache->key.offset); if (ret > 0) { @@ -3661,11 +3893,11 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, * write requests are really completed when bios_in_flight * changes to 0. 
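With scrub_parity::stripe_len widened to u64, __scrub_mark_bitmap() above switches from div_u64_rem(), whose divisor and remainder are 32-bit, to div64_u64_rem(), whose remainder is a full u64. A small sketch of the difference, with made-up values:

#include <linux/math64.h>

        u64 start      = 5ULL * SZ_1G + 4096;  /* arbitrary logical offset   */
        u64 stripe_len = SZ_64K;                /* u64 divisor                */
        u64 offset;

        /* both divisor and remainder are 64-bit */
        start  = div64_u64_rem(start, stripe_len, &offset);

        /*
         * The old div_u64_rem() takes a u32 divisor and u32 *remainder,
         * which would silently truncate once stripe_len no longer fits
         * in 32 bits.
         */
        offset = div_u64(offset, 4096);         /* sector index in the stripe */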
*/ - atomic_set(&sctx->wr_ctx.flush_all_writes, 1); + atomic_set(&sctx->flush_all_writes, 1); scrub_submit(sctx); - mutex_lock(&sctx->wr_ctx.wr_lock); + mutex_lock(&sctx->wr_lock); scrub_wr_submit(sctx); - mutex_unlock(&sctx->wr_ctx.wr_lock); + mutex_unlock(&sctx->wr_lock); wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); @@ -3679,7 +3911,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, */ wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0); - atomic_set(&sctx->wr_ctx.flush_all_writes, 0); + atomic_set(&sctx->flush_all_writes, 0); scrub_pause_off(fs_info); @@ -4082,32 +4314,6 @@ static void scrub_remap_extent(struct btrfs_fs_info *fs_info, btrfs_put_bbio(bbio); } -static int scrub_setup_wr_ctx(struct scrub_wr_ctx *wr_ctx, - struct btrfs_device *dev, - int is_dev_replace) -{ - WARN_ON(wr_ctx->wr_curr_bio != NULL); - - mutex_init(&wr_ctx->wr_lock); - wr_ctx->wr_curr_bio = NULL; - if (!is_dev_replace) - return 0; - - WARN_ON(!dev->bdev); - wr_ctx->pages_per_wr_bio = SCRUB_PAGES_PER_WR_BIO; - wr_ctx->tgtdev = dev; - atomic_set(&wr_ctx->flush_all_writes, 0); - return 0; -} - -static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx) -{ - mutex_lock(&wr_ctx->wr_lock); - kfree(wr_ctx->wr_curr_bio); - wr_ctx->wr_curr_bio = NULL; - mutex_unlock(&wr_ctx->wr_lock); -} - static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len, int mirror_num, u64 physical_for_dev_replace) { @@ -4410,7 +4616,7 @@ static int write_page_nocow(struct scrub_ctx *sctx, struct btrfs_device *dev; int ret; - dev = sctx->wr_ctx.tgtdev; + dev = sctx->wr_tgtdev; if (!dev) return -EIO; if (!dev->bdev) { @@ -4418,13 +4624,7 @@ static int write_page_nocow(struct scrub_ctx *sctx, "scrub write_page_nocow(bdev == NULL) is unexpected"); return -EIO; } - bio = btrfs_io_bio_alloc(GFP_NOFS, 1); - if (!bio) { - spin_lock(&sctx->stat_lock); - sctx->stat.malloc_errors++; - spin_unlock(&sctx->stat_lock); - return -ENOMEM; - } + bio = btrfs_io_bio_alloc(1); bio->bi_iter.bi_size = 0; bio->bi_iter.bi_sector = physical_for_dev_replace >> 9; bio->bi_bdev = dev->bdev; diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 3f645cd67b54..e937c10b8287 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1069,6 +1069,12 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, } } + ret = btrfs_is_name_len_valid(eb, path->slots[0], + (unsigned long)(di + 1), name_len + data_len); + if (!ret) { + ret = -EIO; + goto out; + } if (name_len + data_len > buf_len) { buf_len = name_len + data_len; if (is_vmalloc_addr(buf)) { @@ -1083,7 +1089,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, buf = tmp; } if (!buf) { - buf = vmalloc(buf_len); + buf = kvmalloc(buf_len, GFP_KERNEL); if (!buf) { ret = -ENOMEM; goto out; @@ -2769,15 +2775,20 @@ out: struct recorded_ref { struct list_head list; - char *dir_path; char *name; struct fs_path *full_path; u64 dir; u64 dir_gen; - int dir_path_len; int name_len; }; +static void set_ref_path(struct recorded_ref *ref, struct fs_path *path) +{ + ref->full_path = path; + ref->name = (char *)kbasename(ref->full_path->start); + ref->name_len = ref->full_path->end - ref->name; +} + /* * We need to process new refs before deleted refs, but compare_tree gives us * everything mixed. So we first record all refs and later process them. 
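The iterate_dir_item() hunk above replaces the bare vmalloc() fallback with kvmalloc(), which attempts a kmalloc() first and falls back to vmalloc() transparently; kvfree() releases either kind, so the caller no longer needs is_vmalloc_addr() checks. A minimal sketch, assuming a size variable buf_len as in the surrounding code:

#include <linux/mm.h>

        void *buf;

        buf = kvmalloc(buf_len, GFP_KERNEL);    /* kmalloc, else vmalloc    */
        if (!buf)
                return -ENOMEM;
        /* ... fill and use buf ... */
        kvfree(buf);                            /* works for both backends  */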
@@ -2794,17 +2805,7 @@ static int __record_ref(struct list_head *head, u64 dir, ref->dir = dir; ref->dir_gen = dir_gen; - ref->full_path = path; - - ref->name = (char *)kbasename(ref->full_path->start); - ref->name_len = ref->full_path->end - ref->name; - ref->dir_path = ref->full_path->start; - if (ref->name == ref->full_path->start) - ref->dir_path_len = 0; - else - ref->dir_path_len = ref->full_path->end - - ref->full_path->start - 1 - ref->name_len; - + set_ref_path(ref, path); list_add_tail(&ref->list, head); return 0; } @@ -3546,9 +3547,17 @@ static int is_ancestor(struct btrfs_root *root, struct fs_path *fs_path) { u64 ino = ino2; + bool free_path = false; + int ret = 0; + + if (!fs_path) { + fs_path = fs_path_alloc(); + if (!fs_path) + return -ENOMEM; + free_path = true; + } while (ino > BTRFS_FIRST_FREE_OBJECTID) { - int ret; u64 parent; u64 parent_gen; @@ -3557,13 +3566,18 @@ static int is_ancestor(struct btrfs_root *root, if (ret < 0) { if (ret == -ENOENT && ino == ino2) ret = 0; - return ret; + goto out; + } + if (parent == ino1) { + ret = parent_gen == ino1_gen ? 1 : 0; + goto out; } - if (parent == ino1) - return parent_gen == ino1_gen ? 1 : 0; ino = parent; } - return 0; + out: + if (free_path) + fs_path_free(fs_path); + return ret; } static int wait_for_parent_move(struct send_ctx *sctx, @@ -3686,6 +3700,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) int is_orphan = 0; u64 last_dir_ino_rm = 0; bool can_rename = true; + bool orphanized_ancestor = false; btrfs_debug(fs_info, "process_recorded_refs %llu", sctx->cur_ino); @@ -3837,9 +3852,16 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) * might contain the pre-orphanization name of * ow_inode, which is no longer valid. */ - fs_path_reset(valid_path); - ret = get_cur_path(sctx, sctx->cur_ino, - sctx->cur_inode_gen, valid_path); + ret = is_ancestor(sctx->parent_root, + ow_inode, ow_gen, + sctx->cur_ino, NULL); + if (ret > 0) { + orphanized_ancestor = true; + fs_path_reset(valid_path); + ret = get_cur_path(sctx, sctx->cur_ino, + sctx->cur_inode_gen, + valid_path); + } if (ret < 0) goto out; } else { @@ -3960,6 +3982,43 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) if (ret < 0) goto out; if (!ret) { + /* + * If we orphanized any ancestor before, we need + * to recompute the full path for deleted names, + * since any such path was computed before we + * processed any references and orphanized any + * ancestor inode. + */ + if (orphanized_ancestor) { + struct fs_path *new_path; + + /* + * Our reference's name member points to + * its full_path member string, so we + * use here a new path. 
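The new set_ref_path() helper derives the name from the full path with kbasename() instead of tracking a separate dir_path/dir_path_len. A tiny sketch of what kbasename() returns, with a hypothetical path literal:

#include <linux/string.h>

        const char *full = "snap/dir/file";     /* hypothetical full path    */
        const char *name = kbasename(full);     /* points at "file"          */
        size_t name_len  = strlen(name);        /* analogue of ref->name_len */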
+ */ + new_path = fs_path_alloc(); + if (!new_path) { + ret = -ENOMEM; + goto out; + } + ret = get_cur_path(sctx, cur->dir, + cur->dir_gen, + new_path); + if (ret < 0) { + fs_path_free(new_path); + goto out; + } + ret = fs_path_add(new_path, + cur->name, + cur->name_len); + if (ret < 0) { + fs_path_free(new_path); + goto out; + } + fs_path_free(cur->full_path); + set_ref_path(cur, new_path); + } ret = send_unlink(sctx, cur->full_path); if (ret < 0) goto out; @@ -5184,13 +5243,19 @@ static int is_extent_unchanged(struct send_ctx *sctx, while (key.offset < ekey->offset + left_len) { ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); right_type = btrfs_file_extent_type(eb, ei); - if (right_type != BTRFS_FILE_EXTENT_REG) { + if (right_type != BTRFS_FILE_EXTENT_REG && + right_type != BTRFS_FILE_EXTENT_INLINE) { ret = 0; goto out; } right_disknr = btrfs_file_extent_disk_bytenr(eb, ei); - right_len = btrfs_file_extent_num_bytes(eb, ei); + if (right_type == BTRFS_FILE_EXTENT_INLINE) { + right_len = btrfs_file_extent_inline_len(eb, slot, ei); + right_len = PAGE_ALIGN(right_len); + } else { + right_len = btrfs_file_extent_num_bytes(eb, ei); + } right_offset = btrfs_file_extent_offset(eb, ei); right_gen = btrfs_file_extent_generation(eb, ei); @@ -5204,6 +5269,19 @@ static int is_extent_unchanged(struct send_ctx *sctx, goto out; } + /* + * We just wanted to see if when we have an inline extent, what + * follows it is a regular extent (wanted to check the above + * condition for inline extents too). This should normally not + * happen but it's possible for example when we have an inline + * compressed extent representing data with a size matching + * the page size (currently the same as sector size). + */ + if (right_type == BTRFS_FILE_EXTENT_INLINE) { + ret = 0; + goto out; + } + left_offset_fixed = left_offset; if (key.offset < ekey->offset) { /* Fix the right offset for 2a and 7. 
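In is_extent_unchanged() above, an inline extent's length is read with btrfs_file_extent_inline_len() and then rounded up with PAGE_ALIGN() so it can be compared against block-granular extent lengths on the send side. A one-line worked example with a hypothetical size:

        u64 right_len = 3000;                   /* hypothetical inline size */

        right_len = PAGE_ALIGN(right_len);      /* 4096 with 4 KiB pages    */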
*/ @@ -6378,13 +6456,10 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) alloc_size = sizeof(struct clone_root) * (arg->clone_sources_count + 1); - sctx->clone_roots = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN); + sctx->clone_roots = kzalloc(alloc_size, GFP_KERNEL); if (!sctx->clone_roots) { - sctx->clone_roots = vzalloc(alloc_size); - if (!sctx->clone_roots) { - ret = -ENOMEM; - goto out; - } + ret = -ENOMEM; + goto out; } alloc_size = arg->clone_sources_count * sizeof(*arg->clone_sources); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 72a053c9a7f0..74e47794e63f 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -601,18 +601,8 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, } break; case Opt_alloc_start: - num = match_strdup(&args[0]); - if (num) { - mutex_lock(&info->chunk_mutex); - info->alloc_start = memparse(num, NULL); - mutex_unlock(&info->chunk_mutex); - kfree(num); - btrfs_info(info, "allocations start at %llu", - info->alloc_start); - } else { - ret = -ENOMEM; - goto out; - } + btrfs_info(info, + "option alloc_start is obsolete, ignored"); break; case Opt_acl: #ifdef CONFIG_BTRFS_FS_POSIX_ACL @@ -1187,7 +1177,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait) return 0; } - btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1); + btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); trans = btrfs_attach_transaction_barrier(root); if (IS_ERR(trans)) { @@ -1232,8 +1222,6 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) seq_puts(seq, ",nobarrier"); if (info->max_inline != BTRFS_DEFAULT_MAX_INLINE) seq_printf(seq, ",max_inline=%llu", info->max_inline); - if (info->alloc_start != 0) - seq_printf(seq, ",alloc_start=%llu", info->alloc_start); if (info->thread_pool_size != min_t(unsigned long, num_online_cpus() + 2, 8)) seq_printf(seq, ",thread_pool=%d", info->thread_pool_size); @@ -1716,7 +1704,6 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) unsigned long old_opts = fs_info->mount_opt; unsigned long old_compress_type = fs_info->compress_type; u64 old_max_inline = fs_info->max_inline; - u64 old_alloc_start = fs_info->alloc_start; int old_thread_pool_size = fs_info->thread_pool_size; unsigned int old_metadata_ratio = fs_info->metadata_ratio; int ret; @@ -1795,8 +1782,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) } if (fs_info->fs_devices->missing_devices > - fs_info->num_tolerated_disk_barrier_failures && - !(*flags & MS_RDONLY)) { + fs_info->num_tolerated_disk_barrier_failures) { btrfs_warn(fs_info, "too many missing devices, writeable remount is not allowed"); ret = -EACCES; @@ -1856,9 +1842,6 @@ restore: fs_info->mount_opt = old_opts; fs_info->compress_type = old_compress_type; fs_info->max_inline = old_max_inline; - mutex_lock(&fs_info->chunk_mutex); - fs_info->alloc_start = old_alloc_start; - mutex_unlock(&fs_info->chunk_mutex); btrfs_resize_thread_pool(fs_info, old_thread_pool_size, fs_info->thread_pool_size); fs_info->metadata_ratio = old_metadata_ratio; @@ -1899,18 +1882,15 @@ static inline void btrfs_descending_sort_devices( static int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info, u64 *free_bytes) { - struct btrfs_root *root = fs_info->tree_root; struct btrfs_device_info *devices_info; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *device; u64 skip_space; u64 type; u64 avail_space; - u64 used_space; u64 min_stripe_size; int min_stripes = 1, num_stripes = 1; int i = 0, nr_devices; - int 
ret; /* * We aren't under the device list lock, so this is racy-ish, but good @@ -1928,12 +1908,12 @@ static int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info, } devices_info = kmalloc_array(nr_devices, sizeof(*devices_info), - GFP_NOFS); + GFP_KERNEL); if (!devices_info) return -ENOMEM; /* calc min stripe number for data space allocation */ - type = btrfs_get_alloc_profile(root, 1); + type = btrfs_data_alloc_profile(fs_info); if (type & BTRFS_BLOCK_GROUP_RAID0) { min_stripes = 2; num_stripes = nr_devices; @@ -1950,8 +1930,6 @@ static int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info, else min_stripe_size = BTRFS_STRIPE_LEN; - if (fs_info->alloc_start) - mutex_lock(&fs_devices->device_list_mutex); rcu_read_lock(); list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) { if (!device->in_fs_metadata || !device->bdev || @@ -1974,34 +1952,6 @@ static int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info, */ skip_space = SZ_1M; - /* user can set the offset in fs_info->alloc_start. */ - if (fs_info->alloc_start && - fs_info->alloc_start + BTRFS_STRIPE_LEN <= - device->total_bytes) { - rcu_read_unlock(); - skip_space = max(fs_info->alloc_start, skip_space); - - /* - * btrfs can not use the free space in - * [0, skip_space - 1], we must subtract it from the - * total. In order to implement it, we account the used - * space in this range first. - */ - ret = btrfs_account_dev_extents_size(device, 0, - skip_space - 1, - &used_space); - if (ret) { - kfree(devices_info); - mutex_unlock(&fs_devices->device_list_mutex); - return ret; - } - - rcu_read_lock(); - - /* calc the free space in [0, skip_space - 1] */ - skip_space -= used_space; - } - /* * we can use the free space in [0, skip_space - 1], subtract * it from the total. @@ -2020,8 +1970,6 @@ static int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info, i++; } rcu_read_unlock(); - if (fs_info->alloc_start) - mutex_unlock(&fs_devices->device_list_mutex); nr_devices = i; @@ -2058,10 +2006,9 @@ static int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info, * multiplier to scale the sizes. * * Unused device space usage is based on simulating the chunk allocator - * algorithm that respects the device sizes, order of allocations and the - * 'alloc_start' value, this is a close approximation of the actual use but - * there are other factors that may change the result (like a new metadata - * chunk). + * algorithm that respects the device sizes and order of allocations. This is + * a close approximation of the actual use but there are other factors that may + * change the result (like a new metadata chunk). * * If metadata is exhausted, f_bavail will be 0. 
*/ @@ -2244,7 +2191,7 @@ static int btrfs_freeze(struct super_block *sb) struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_root *root = fs_info->tree_root; - fs_info->fs_frozen = 1; + set_bit(BTRFS_FS_FROZEN, &fs_info->flags); /* * We don't need a barrier here, we'll wait for any transaction that * could be in progress on other threads (and do delayed iputs that @@ -2263,7 +2210,9 @@ static int btrfs_freeze(struct super_block *sb) static int btrfs_unfreeze(struct super_block *sb) { - btrfs_sb(sb)->fs_frozen = 0; + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + + clear_bit(BTRFS_FS_FROZEN, &fs_info->flags); return 0; } diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 1f157fba8940..c2d5f3580b4c 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -447,11 +447,52 @@ static ssize_t btrfs_clone_alignment_show(struct kobject *kobj, BTRFS_ATTR(clone_alignment, btrfs_clone_alignment_show); +static ssize_t quota_override_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + int quota_override; + + quota_override = test_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags); + return snprintf(buf, PAGE_SIZE, "%d\n", quota_override); +} + +static ssize_t quota_override_store(struct kobject *kobj, + struct kobj_attribute *a, + const char *buf, size_t len) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + unsigned long knob; + int err; + + if (!fs_info) + return -EPERM; + + if (!capable(CAP_SYS_RESOURCE)) + return -EPERM; + + err = kstrtoul(buf, 10, &knob); + if (err) + return err; + if (knob > 1) + return -EINVAL; + + if (knob) + set_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags); + else + clear_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags); + + return len; +} + +BTRFS_ATTR_RW(quota_override, quota_override_show, quota_override_store); + static const struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(label), BTRFS_ATTR_PTR(nodesize), BTRFS_ATTR_PTR(sectorsize), BTRFS_ATTR_PTR(clone_alignment), + BTRFS_ATTR_PTR(quota_override), NULL, }; diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index ea272432c930..b18ab8f327a5 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -237,7 +237,6 @@ void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans) { memset(trans, 0, sizeof(*trans)); trans->transid = 1; - INIT_LIST_HEAD(&trans->qgroup_ref_list); trans->type = __TRANS_DUMMY; } diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index 133753232a94..d06b1c931d05 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -87,7 +87,7 @@ static int test_find_delalloc(u32 sectorsize) return -ENOMEM; } - extent_io_tree_init(&tmp, &inode->i_data); + extent_io_tree_init(&tmp, inode); /* * First go through and create and mark all of our pages dirty, we pin diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 61b807de3e16..f615d59b0489 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -60,8 +60,8 @@ static const unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = { void btrfs_put_transaction(struct btrfs_transaction *transaction) { - WARN_ON(atomic_read(&transaction->use_count) == 0); - if (atomic_dec_and_test(&transaction->use_count)) { + WARN_ON(refcount_read(&transaction->use_count) == 0); + if (refcount_dec_and_test(&transaction->use_count)) { BUG_ON(!list_empty(&transaction->list)); WARN_ON(!RB_EMPTY_ROOT(&transaction->delayed_refs.href_root)); if 
(transaction->delayed_refs.pending_csums) @@ -93,7 +93,7 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction) btrfs_put_block_group_trimming(cache); btrfs_put_block_group(cache); } - kmem_cache_free(btrfs_transaction_cachep, transaction); + kfree(transaction); } } @@ -207,7 +207,7 @@ loop: spin_unlock(&fs_info->trans_lock); return -EBUSY; } - atomic_inc(&cur_trans->use_count); + refcount_inc(&cur_trans->use_count); atomic_inc(&cur_trans->num_writers); extwriter_counter_inc(cur_trans, type); spin_unlock(&fs_info->trans_lock); @@ -228,7 +228,7 @@ loop: */ BUG_ON(type == TRANS_JOIN_NOLOCK); - cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS); + cur_trans = kmalloc(sizeof(*cur_trans), GFP_NOFS); if (!cur_trans) return -ENOMEM; @@ -238,11 +238,11 @@ loop: * someone started a transaction after we unlocked. Make sure * to redo the checks above */ - kmem_cache_free(btrfs_transaction_cachep, cur_trans); + kfree(cur_trans); goto loop; } else if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { spin_unlock(&fs_info->trans_lock); - kmem_cache_free(btrfs_transaction_cachep, cur_trans); + kfree(cur_trans); return -EROFS; } @@ -257,7 +257,7 @@ loop: * One for this trans handle, one so it will live on until we * commit the transaction. */ - atomic_set(&cur_trans->use_count, 2); + refcount_set(&cur_trans->use_count, 2); atomic_set(&cur_trans->pending_ordered, 0); cur_trans->flags = 0; cur_trans->start_time = get_seconds(); @@ -294,7 +294,7 @@ loop: spin_lock_init(&cur_trans->dropped_roots_lock); list_add_tail(&cur_trans->list, &fs_info->trans_list); extent_io_tree_init(&cur_trans->dirty_pages, - fs_info->btree_inode->i_mapping); + fs_info->btree_inode); fs_info->generation++; cur_trans->transid = fs_info->generation; fs_info->running_transaction = cur_trans; @@ -432,7 +432,7 @@ static void wait_current_trans(struct btrfs_fs_info *fs_info) spin_lock(&fs_info->trans_lock); cur_trans = fs_info->running_transaction; if (cur_trans && is_transaction_blocked(cur_trans)) { - atomic_inc(&cur_trans->use_count); + refcount_inc(&cur_trans->use_count); spin_unlock(&fs_info->trans_lock); wait_event(fs_info->transaction_wait, @@ -572,7 +572,6 @@ again: h->type = type; h->can_flush_pending_bgs = true; - INIT_LIST_HEAD(&h->qgroup_ref_list); INIT_LIST_HEAD(&h->new_bgs); smp_mb(); @@ -744,7 +743,7 @@ int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid) list_for_each_entry(t, &fs_info->trans_list, list) { if (t->transid == transid) { cur_trans = t; - atomic_inc(&cur_trans->use_count); + refcount_inc(&cur_trans->use_count); ret = 0; break; } @@ -773,7 +772,7 @@ int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid) if (t->state == TRANS_STATE_COMPLETED) break; cur_trans = t; - atomic_inc(&cur_trans->use_count); + refcount_inc(&cur_trans->use_count); break; } } @@ -917,7 +916,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, wake_up_process(info->transaction_kthread); err = -EIO; } - assert_qgroups_uptodate(trans); kmem_cache_free(btrfs_trans_handle_cachep, trans); if (must_run_delayed_refs) { @@ -1376,9 +1374,6 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans, ret = commit_fs_roots(trans, fs_info); if (ret) goto out; - ret = btrfs_qgroup_prepare_account_extents(trans, fs_info); - if (ret < 0) - goto out; ret = btrfs_qgroup_account_extents(trans, fs_info); if (ret < 0) goto out; @@ -1839,7 +1834,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, /* take transaction reference */ cur_trans = 
trans->transaction; - atomic_inc(&cur_trans->use_count); + refcount_inc(&cur_trans->use_count); btrfs_end_transaction(trans); @@ -1928,7 +1923,7 @@ static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info) static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info) { if (btrfs_test_opt(fs_info, FLUSHONCOMMIT)) - btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1); + btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); } static inline void @@ -2015,7 +2010,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) spin_lock(&fs_info->trans_lock); if (cur_trans->state >= TRANS_STATE_COMMIT_START) { spin_unlock(&fs_info->trans_lock); - atomic_inc(&cur_trans->use_count); + refcount_inc(&cur_trans->use_count); ret = btrfs_end_transaction(trans); wait_for_commit(cur_trans); @@ -2035,7 +2030,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) prev_trans = list_entry(cur_trans->list.prev, struct btrfs_transaction, list); if (prev_trans->state != TRANS_STATE_COMPLETED) { - atomic_inc(&prev_trans->use_count); + refcount_inc(&prev_trans->use_count); spin_unlock(&fs_info->trans_lock); wait_for_commit(prev_trans); @@ -2130,13 +2125,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) goto scrub_continue; } - /* Reocrd old roots for later qgroup accounting */ - ret = btrfs_qgroup_prepare_account_extents(trans, fs_info); - if (ret) { - mutex_unlock(&fs_info->reloc_mutex); - goto scrub_continue; - } - /* * make sure none of the code above managed to slip in a * delayed item @@ -2179,6 +2167,17 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) btrfs_free_log_root_tree(trans, fs_info); /* + * commit_fs_roots() can call btrfs_save_ino_cache(), which generates + * new delayed refs. Must handle them or qgroup can be wrong. + */ + ret = btrfs_run_delayed_refs(trans, fs_info, (unsigned long)-1); + if (ret) { + mutex_unlock(&fs_info->tree_log_mutex); + mutex_unlock(&fs_info->reloc_mutex); + goto scrub_continue; + } + + /* * Since fs roots are all committed, we can get a quite accurate * new_roots. So let's do quota accounting. */ @@ -2223,7 +2222,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) switch_commit_roots(cur_trans, fs_info); - assert_qgroups_uptodate(trans); ASSERT(list_empty(&cur_trans->dirty_bgs)); ASSERT(list_empty(&cur_trans->io_bgs)); update_super_roots(fs_info); @@ -2306,7 +2304,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) * it'll result in deadlock about SB_FREEZE_FS. 
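The btrfs_freeze()/btrfs_unfreeze() hunks earlier replace the plain fs_frozen integer with a BTRFS_FS_FROZEN bit in fs_info->flags, which the commit path below checks with test_bit() before running delayed iputs; set_bit()/clear_bit()/test_bit() are atomic, so a single flag needs no extra locking or barriers. A minimal sketch with a hypothetical flags word and helper:

#include <linux/bitops.h>

#define MY_FROZEN       0                       /* hypothetical bit number  */
        unsigned long flags = 0;

        set_bit(MY_FROZEN, &flags);             /* freeze                   */
        if (!test_bit(MY_FROZEN, &flags))
                run_delayed_work();             /* hypothetical helper      */
        clear_bit(MY_FROZEN, &flags);           /* unfreeze                 */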
*/ if (current != fs_info->transaction_kthread && - current != fs_info->cleaner_kthread && !fs_info->fs_frozen) + current != fs_info->cleaner_kthread && + !test_bit(BTRFS_FS_FROZEN, &fs_info->flags)) btrfs_run_delayed_iputs(fs_info); return ret; diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 5dfb5590fff6..c55e44560103 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -18,6 +18,8 @@ #ifndef __BTRFS_TRANSACTION__ #define __BTRFS_TRANSACTION__ + +#include <linux/refcount.h> #include "btrfs_inode.h" #include "delayed-ref.h" #include "ctree.h" @@ -49,7 +51,7 @@ struct btrfs_transaction { * transaction can end */ atomic_t num_writers; - atomic_t use_count; + refcount_t use_count; atomic_t pending_ordered; unsigned long flags; @@ -125,8 +127,6 @@ struct btrfs_trans_handle { unsigned int type; struct btrfs_root *root; struct btrfs_fs_info *fs_info; - struct seq_list delayed_ref_elem; - struct list_head qgroup_ref_list; struct list_head new_bgs; }; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index a59674c3e69e..f20ef211a73d 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1175,15 +1175,19 @@ next: return 0; } -static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, - u32 *namelen, char **name, u64 *index, - u64 *parent_objectid) +static int extref_get_fields(struct extent_buffer *eb, int slot, + unsigned long ref_ptr, u32 *namelen, char **name, + u64 *index, u64 *parent_objectid) { struct btrfs_inode_extref *extref; extref = (struct btrfs_inode_extref *)ref_ptr; *namelen = btrfs_inode_extref_name_len(eb, extref); + if (!btrfs_is_name_len_valid(eb, slot, (unsigned long)&extref->name, + *namelen)) + return -EIO; + *name = kmalloc(*namelen, GFP_NOFS); if (*name == NULL) return -ENOMEM; @@ -1198,14 +1202,19 @@ static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, return 0; } -static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, - u32 *namelen, char **name, u64 *index) +static int ref_get_fields(struct extent_buffer *eb, int slot, + unsigned long ref_ptr, u32 *namelen, char **name, + u64 *index) { struct btrfs_inode_ref *ref; ref = (struct btrfs_inode_ref *)ref_ptr; *namelen = btrfs_inode_ref_name_len(eb, ref); + if (!btrfs_is_name_len_valid(eb, slot, (unsigned long)(ref + 1), + *namelen)) + return -EIO; + *name = kmalloc(*namelen, GFP_NOFS); if (*name == NULL) return -ENOMEM; @@ -1280,8 +1289,8 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, while (ref_ptr < ref_end) { if (log_ref_ver) { - ret = extref_get_fields(eb, ref_ptr, &namelen, &name, - &ref_index, &parent_objectid); + ret = extref_get_fields(eb, slot, ref_ptr, &namelen, + &name, &ref_index, &parent_objectid); /* * parent object can change from one array * item to another. 
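extref_get_fields() and ref_get_fields() below now take the leaf slot so they can validate the name length with btrfs_is_name_len_valid() before allocating and copying; conceptually the check only has to ensure the name cannot run past the end of the item. A deliberately simplified, hypothetical version of that idea (not the actual helper added by this series):

/* Hypothetical simplification: reject names that overrun the item. */
static bool name_fits_in_item(unsigned long name_ptr, u32 name_len,
                              unsigned long item_start, u32 item_size)
{
        return name_ptr >= item_start &&
               name_ptr + name_len <= item_start + item_size;
}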
@@ -1293,8 +1302,8 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, goto out; } } else { - ret = ref_get_fields(eb, ref_ptr, &namelen, &name, - &ref_index); + ret = ref_get_fields(eb, slot, ref_ptr, &namelen, + &name, &ref_index); } if (ret) goto out; @@ -1841,7 +1850,7 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, ptr_end = ptr + item_size; while (ptr < ptr_end) { di = (struct btrfs_dir_item *)ptr; - if (verify_dir_item(fs_info, eb, di)) + if (verify_dir_item(fs_info, eb, slot, di)) return -EIO; name_len = btrfs_dir_name_len(eb, di); ret = replay_one_name(trans, root, path, eb, di, key); @@ -2017,7 +2026,7 @@ again: ptr_end = ptr + item_size; while (ptr < ptr_end) { di = (struct btrfs_dir_item *)ptr; - if (verify_dir_item(fs_info, eb, di)) { + if (verify_dir_item(fs_info, eb, slot, di)) { ret = -EIO; goto out; } @@ -2102,6 +2111,7 @@ static int replay_xattr_deletes(struct btrfs_trans_handle *trans, struct btrfs_path *path, const u64 ino) { + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_key search_key; struct btrfs_path *log_path; int i; @@ -2143,6 +2153,12 @@ process_leaf: u32 this_len = sizeof(*di) + name_len + data_len; char *name; + ret = verify_dir_item(fs_info, path->nodes[0], + path->slots[0], di); + if (ret) { + ret = -EIO; + goto out; + } name = kmalloc(name_len, GFP_NOFS); if (!name) { ret = -ENOMEM; @@ -4196,7 +4212,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, if (em->generation <= test_gen) continue; /* Need a ref to keep it from getting evicted from cache */ - atomic_inc(&em->refs); + refcount_inc(&em->refs); set_bit(EXTENT_FLAG_LOGGING, &em->flags); list_add_tail(&em->list, &extents); num++; @@ -4546,6 +4562,12 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb, this_len = sizeof(*extref) + this_name_len; } + ret = btrfs_is_name_len_valid(eb, slot, name_ptr, + this_name_len); + if (!ret) { + ret = -EIO; + goto out; + } if (this_name_len > name_len) { char *new_name; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index ab8a66d852f9..5eb7217738ed 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -139,6 +139,11 @@ static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info); static void __btrfs_reset_dev_stats(struct btrfs_device *dev); static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev); static void btrfs_dev_stat_print_on_load(struct btrfs_device *device); +static int __btrfs_map_block(struct btrfs_fs_info *fs_info, + enum btrfs_map_op op, + u64 logical, u64 *length, + struct btrfs_bio **bbio_ret, + int mirror_num, int need_raid_map); DEFINE_MUTEX(uuid_mutex); static LIST_HEAD(fs_uuids); @@ -237,6 +242,17 @@ static struct btrfs_device *__alloc_device(void) if (!dev) return ERR_PTR(-ENOMEM); + /* + * Preallocate a bio that's always going to be used for flushing device + * barriers and matches the device lifespan + */ + dev->flush_bio = bio_alloc_bioset(GFP_KERNEL, 0, NULL); + if (!dev->flush_bio) { + kfree(dev); + return ERR_PTR(-ENOMEM); + } + bio_get(dev->flush_bio); + INIT_LIST_HEAD(&dev->dev_list); INIT_LIST_HEAD(&dev->dev_alloc_list); INIT_LIST_HEAD(&dev->resized_list); @@ -833,6 +849,7 @@ static void __free_device(struct work_struct *work) device = container_of(work, struct btrfs_device, rcu_work); rcu_string_free(device->name); + bio_put(device->flush_bio); kfree(device); } @@ -1008,14 +1025,13 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, q = bdev_get_queue(bdev); if (blk_queue_discard(q)) 
device->can_discard = 1; + if (!blk_queue_nonrot(q)) + fs_devices->rotating = 1; device->bdev = bdev; device->in_fs_metadata = 0; device->mode = flags; - if (!blk_queue_nonrot(bdev_get_queue(bdev))) - fs_devices->rotating = 1; - fs_devices->open_devices++; if (device->writeable && device->devid != BTRFS_DEV_REPLACE_DEVID) { @@ -1349,15 +1365,13 @@ int find_free_dev_extent_start(struct btrfs_transaction *transaction, int ret; int slot; struct extent_buffer *l; - u64 min_search_start; /* * We don't want to overwrite the superblock on the drive nor any area * used by the boot loader (grub for example), so we make sure to start * at an offset of at least 1MB. */ - min_search_start = max(fs_info->alloc_start, 1024ull * 1024); - search_start = max(search_start, min_search_start); + search_start = max_t(u64, search_start, SZ_1M); path = btrfs_alloc_path(); if (!path) @@ -2383,7 +2397,8 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path device->io_width = fs_info->sectorsize; device->io_align = fs_info->sectorsize; device->sector_size = fs_info->sectorsize; - device->total_bytes = i_size_read(bdev->bd_inode); + device->total_bytes = round_down(i_size_read(bdev->bd_inode), + fs_info->sectorsize); device->disk_total_bytes = device->total_bytes; device->commit_total_bytes = device->total_bytes; device->fs_info = fs_info; @@ -2413,16 +2428,14 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path fs_info->fs_devices->total_devices++; fs_info->fs_devices->total_rw_bytes += device->total_bytes; - spin_lock(&fs_info->free_chunk_lock); - fs_info->free_chunk_space += device->total_bytes; - spin_unlock(&fs_info->free_chunk_lock); + atomic64_add(device->total_bytes, &fs_info->free_chunk_space); - if (!blk_queue_nonrot(bdev_get_queue(bdev))) + if (!blk_queue_nonrot(q)) fs_info->fs_devices->rotating = 1; tmp = btrfs_super_total_bytes(fs_info->super_copy); btrfs_set_super_total_bytes(fs_info->super_copy, - tmp + device->total_bytes); + round_down(tmp + device->total_bytes, fs_info->sectorsize)); tmp = btrfs_super_num_devices(fs_info->super_copy); btrfs_set_super_num_devices(fs_info->super_copy, tmp + 1); @@ -2570,7 +2583,7 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, goto error; } - name = rcu_string_strdup(device_path, GFP_NOFS); + name = rcu_string_strdup(device_path, GFP_KERNEL); if (!name) { kfree(device); ret = -ENOMEM; @@ -2685,6 +2698,8 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans, if (!device->writeable) return -EACCES; + new_size = round_down(new_size, fs_info->sectorsize); + mutex_lock(&fs_info->chunk_mutex); old_total = btrfs_super_total_bytes(super_copy); diff = new_size - device->total_bytes; @@ -2697,7 +2712,8 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans, fs_devices = fs_info->fs_devices; - btrfs_set_super_total_bytes(super_copy, old_total + diff); + btrfs_set_super_total_bytes(super_copy, + round_down(old_total + diff, fs_info->sectorsize)); device->fs_devices->total_rw_bytes += diff; btrfs_device_set_total_bytes(device, new_size); @@ -2795,10 +2811,38 @@ static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, return ret; } +static struct extent_map *get_chunk_map(struct btrfs_fs_info *fs_info, + u64 logical, u64 length) +{ + struct extent_map_tree *em_tree; + struct extent_map *em; + + em_tree = &fs_info->mapping_tree.map_tree; + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, logical, length); + read_unlock(&em_tree->lock); + + if (!em) { + btrfs_crit(fs_info, 
"unable to find logical %llu length %llu", + logical, length); + return ERR_PTR(-EINVAL); + } + + if (em->start > logical || em->start + em->len < logical) { + btrfs_crit(fs_info, + "found a bad mapping, wanted %llu-%llu, found %llu-%llu", + logical, length, em->start, em->start + em->len); + free_extent_map(em); + return ERR_PTR(-EINVAL); + } + + /* callers are responsible for dropping em's ref. */ + return em; +} + int btrfs_remove_chunk(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 chunk_offset) { - struct extent_map_tree *em_tree; struct extent_map *em; struct map_lookup *map; u64 dev_extent_len = 0; @@ -2806,23 +2850,15 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, int i, ret = 0; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; - em_tree = &fs_info->mapping_tree.map_tree; - - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, chunk_offset, 1); - read_unlock(&em_tree->lock); - - if (!em || em->start > chunk_offset || - em->start + em->len < chunk_offset) { + em = get_chunk_map(fs_info, chunk_offset, 1); + if (IS_ERR(em)) { /* * This is a logic error, but we don't want to just rely on the * user having built with ASSERT enabled, so if ASSERT doesn't * do anything we still error out. */ ASSERT(0); - if (em) - free_extent_map(em); - return -EINVAL; + return PTR_ERR(em); } map = em->map_lookup; mutex_lock(&fs_info->chunk_mutex); @@ -2850,9 +2886,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, mutex_lock(&fs_info->chunk_mutex); btrfs_device_set_bytes_used(device, device->bytes_used - dev_extent_len); - spin_lock(&fs_info->free_chunk_lock); - fs_info->free_chunk_space += dev_extent_len; - spin_unlock(&fs_info->free_chunk_lock); + atomic64_add(dev_extent_len, &fs_info->free_chunk_space); btrfs_clear_space_info_full(fs_info); mutex_unlock(&fs_info->chunk_mutex); } @@ -3736,7 +3770,7 @@ static void __cancel_balance(struct btrfs_fs_info *fs_info) if (ret) btrfs_handle_fs_error(fs_info, ret, NULL); - atomic_set(&fs_info->mutually_exclusive_operation_running, 0); + clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); } /* Non-zero return value signifies invalidity */ @@ -3755,6 +3789,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl, struct btrfs_ioctl_balance_args *bargs) { struct btrfs_fs_info *fs_info = bctl->fs_info; + u64 meta_target, data_target; u64 allowed; int mixed = 0; int ret; @@ -3851,11 +3886,16 @@ int btrfs_balance(struct btrfs_balance_control *bctl, } } while (read_seqretry(&fs_info->profiles_lock, seq)); - if (btrfs_get_num_tolerated_disk_barrier_failures(bctl->meta.target) < - btrfs_get_num_tolerated_disk_barrier_failures(bctl->data.target)) { + /* if we're not converting, the target field is uninitialized */ + meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ? + bctl->meta.target : fs_info->avail_metadata_alloc_bits; + data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ? 
+ bctl->data.target : fs_info->avail_data_alloc_bits; + if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) < + btrfs_get_num_tolerated_disk_barrier_failures(data_target)) { btrfs_warn(fs_info, "metadata profile 0x%llx has lower redundancy than data profile 0x%llx", - bctl->meta.target, bctl->data.target); + meta_target, data_target); } if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { @@ -3910,7 +3950,7 @@ out: __cancel_balance(fs_info); else { kfree(bctl); - atomic_set(&fs_info->mutually_exclusive_operation_running, 0); + clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); } return ret; } @@ -4000,7 +4040,7 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info) btrfs_balance_sys(leaf, item, &disk_bargs); btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs); - WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)); + WARN_ON(test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)); mutex_lock(&fs_info->volume_mutex); mutex_lock(&fs_info->balance_mutex); @@ -4363,7 +4403,10 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) struct btrfs_super_block *super_copy = fs_info->super_copy; u64 old_total = btrfs_super_total_bytes(super_copy); u64 old_size = btrfs_device_get_total_bytes(device); - u64 diff = old_size - new_size; + u64 diff; + + new_size = round_down(new_size, fs_info->sectorsize); + diff = old_size - new_size; if (device->is_tgtdev_for_dev_replace) return -EINVAL; @@ -4379,9 +4422,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) btrfs_device_set_total_bytes(device, new_size); if (device->writeable) { device->fs_devices->total_rw_bytes -= diff; - spin_lock(&fs_info->free_chunk_lock); - fs_info->free_chunk_space -= diff; - spin_unlock(&fs_info->free_chunk_lock); + atomic64_sub(diff, &fs_info->free_chunk_space); } mutex_unlock(&fs_info->chunk_mutex); @@ -4492,7 +4533,8 @@ again: &fs_info->fs_devices->resized_devices); WARN_ON(diff > old_total); - btrfs_set_super_total_bytes(super_copy, old_total - diff); + btrfs_set_super_total_bytes(super_copy, + round_down(old_total - diff, fs_info->sectorsize)); mutex_unlock(&fs_info->chunk_mutex); /* Now btrfs_update_device() will change the on-disk size. 
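btrfs_init_new_device(), btrfs_grow_device() and btrfs_shrink_device() now all round byte counts down to fs_info->sectorsize before they enter the device and superblock accounting, so a trailing partial sector is never handed out. The arithmetic is plain power-of-two masking; a tiny sketch with made-up sizes:

#include <stdint.h>
#include <stdio.h>

/* Power-of-two round_down, as applied to device and superblock sizes
 * in the hunks above. */
#define round_down(x, align)	((x) & ~((uint64_t)(align) - 1))

int main(void)
{
	uint64_t sectorsize = 4096;
	uint64_t bdev_size = 10ULL * 1024 * 1024 * 1024 + 513;	/* odd-sized disk */

	uint64_t total = round_down(bdev_size, sectorsize);

	printf("raw %llu -> usable %llu (%llu trailing bytes ignored)\n",
	       (unsigned long long)bdev_size,
	       (unsigned long long)total,
	       (unsigned long long)(bdev_size - total));
	return 0;
}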
*/ @@ -4505,9 +4547,7 @@ done: btrfs_device_set_total_bytes(device, old_size); if (device->writeable) device->fs_devices->total_rw_bytes += diff; - spin_lock(&fs_info->free_chunk_lock); - fs_info->free_chunk_space += diff; - spin_unlock(&fs_info->free_chunk_lock); + atomic64_add(diff, &fs_info->free_chunk_space); mutex_unlock(&fs_info->chunk_mutex); } return ret; @@ -4785,7 +4825,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, stripe_size = div_u64(stripe_size, dev_stripes); /* align to BTRFS_STRIPE_LEN */ - stripe_size = div_u64(stripe_size, raid_stripe_len); + stripe_size = div64_u64(stripe_size, raid_stripe_len); stripe_size *= raid_stripe_len; map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); @@ -4833,7 +4873,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, ret = add_extent_mapping(em_tree, em, 0); if (!ret) { list_add_tail(&em->list, &trans->transaction->pending_chunks); - atomic_inc(&em->refs); + refcount_inc(&em->refs); } write_unlock(&em_tree->lock); if (ret) { @@ -4852,9 +4892,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, btrfs_device_set_bytes_used(map->stripes[i].dev, num_bytes); } - spin_lock(&info->free_chunk_lock); - info->free_chunk_space -= (stripe_size * map->num_stripes); - spin_unlock(&info->free_chunk_lock); + atomic64_sub(stripe_size * map->num_stripes, &info->free_chunk_space); free_extent_map(em); check_raid56_incompat_flag(info, type); @@ -4888,7 +4926,6 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, struct btrfs_device *device; struct btrfs_chunk *chunk; struct btrfs_stripe *stripe; - struct extent_map_tree *em_tree; struct extent_map *em; struct map_lookup *map; size_t item_size; @@ -4897,24 +4934,9 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, int i = 0; int ret = 0; - em_tree = &fs_info->mapping_tree.map_tree; - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, chunk_offset, chunk_size); - read_unlock(&em_tree->lock); - - if (!em) { - btrfs_crit(fs_info, "unable to find logical %Lu len %Lu", - chunk_offset, chunk_size); - return -EINVAL; - } - - if (em->start != chunk_offset || em->len != chunk_size) { - btrfs_crit(fs_info, - "found a bad mapping, wanted %Lu-%Lu, found %Lu-%Lu", - chunk_offset, chunk_size, em->start, em->len); - free_extent_map(em); - return -EINVAL; - } + em = get_chunk_map(fs_info, chunk_offset, chunk_size); + if (IS_ERR(em)) + return PTR_ERR(em); map = em->map_lookup; item_size = btrfs_chunk_item_size(map->num_stripes); @@ -5015,20 +5037,19 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info) { - struct btrfs_root *extent_root = fs_info->extent_root; u64 chunk_offset; u64 sys_chunk_offset; u64 alloc_profile; int ret; chunk_offset = find_next_chunk(fs_info); - alloc_profile = btrfs_get_alloc_profile(extent_root, 0); + alloc_profile = btrfs_metadata_alloc_profile(fs_info); ret = __btrfs_alloc_chunk(trans, chunk_offset, alloc_profile); if (ret) return ret; sys_chunk_offset = find_next_chunk(fs_info); - alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0); + alloc_profile = btrfs_system_alloc_profile(fs_info); ret = __btrfs_alloc_chunk(trans, sys_chunk_offset, alloc_profile); return ret; } @@ -5055,15 +5076,12 @@ int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset) { struct extent_map *em; struct map_lookup *map; - struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; int 
readonly = 0; int miss_ndevs = 0; int i; - read_lock(&map_tree->map_tree.lock); - em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1); - read_unlock(&map_tree->map_tree.lock); - if (!em) + em = get_chunk_map(fs_info, chunk_offset, 1); + if (IS_ERR(em)) return 1; map = em->map_lookup; @@ -5117,34 +5135,19 @@ void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree) int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) { - struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; struct extent_map *em; struct map_lookup *map; - struct extent_map_tree *em_tree = &map_tree->map_tree; int ret; - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, logical, len); - read_unlock(&em_tree->lock); - - /* - * We could return errors for these cases, but that could get ugly and - * we'd probably do the same thing which is just not do anything else - * and exit, so return 1 so the callers don't try to use other copies. - */ - if (!em) { - btrfs_crit(fs_info, "No mapping for %Lu-%Lu", logical, - logical+len); - return 1; - } - - if (em->start > logical || em->start + em->len < logical) { - btrfs_crit(fs_info, "Invalid mapping for %Lu-%Lu, got %Lu-%Lu", - logical, logical+len, em->start, - em->start + em->len); - free_extent_map(em); + em = get_chunk_map(fs_info, logical, len); + if (IS_ERR(em)) + /* + * We could return errors for these cases, but that could get + * ugly and we'd probably do the same thing which is just not do + * anything else and exit, so return 1 so the callers don't try + * to use other copies. + */ return 1; - } map = em->map_lookup; if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1)) @@ -5160,7 +5163,8 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) free_extent_map(em); btrfs_dev_replace_lock(&fs_info->dev_replace, 0); - if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) + if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) && + fs_info->dev_replace.tgtdev) ret++; btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); @@ -5173,15 +5177,11 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, { struct extent_map *em; struct map_lookup *map; - struct extent_map_tree *em_tree = &map_tree->map_tree; unsigned long len = fs_info->sectorsize; - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, logical, len); - read_unlock(&em_tree->lock); - BUG_ON(!em); + em = get_chunk_map(fs_info, logical, len); + WARN_ON(IS_ERR(em)); - BUG_ON(em->start > logical || em->start + em->len < logical); map = em->map_lookup; if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) len = map->stripe_len * nr_data_stripes(map); @@ -5189,20 +5189,16 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, return len; } -int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree, +int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len, int mirror_num) { struct extent_map *em; struct map_lookup *map; - struct extent_map_tree *em_tree = &map_tree->map_tree; int ret = 0; - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, logical, len); - read_unlock(&em_tree->lock); - BUG_ON(!em); + em = get_chunk_map(fs_info, logical, len); + WARN_ON(IS_ERR(em)); - BUG_ON(em->start > logical || em->start + em->len < logical); map = em->map_lookup; if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) ret = 1; @@ -5295,25 +5291,353 @@ static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes) GFP_NOFS|__GFP_NOFAIL); atomic_set(&bbio->error, 
0); - atomic_set(&bbio->refs, 1); + refcount_set(&bbio->refs, 1); return bbio; } void btrfs_get_bbio(struct btrfs_bio *bbio) { - WARN_ON(!atomic_read(&bbio->refs)); - atomic_inc(&bbio->refs); + WARN_ON(!refcount_read(&bbio->refs)); + refcount_inc(&bbio->refs); } void btrfs_put_bbio(struct btrfs_bio *bbio) { if (!bbio) return; - if (atomic_dec_and_test(&bbio->refs)) + if (refcount_dec_and_test(&bbio->refs)) kfree(bbio); } +/* can REQ_OP_DISCARD be sent with other REQ like REQ_OP_WRITE? */ +/* + * Please note that, discard won't be sent to target device of device + * replace. + */ +static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, + u64 logical, u64 length, + struct btrfs_bio **bbio_ret) +{ + struct extent_map *em; + struct map_lookup *map; + struct btrfs_bio *bbio; + u64 offset; + u64 stripe_nr; + u64 stripe_nr_end; + u64 stripe_end_offset; + u64 stripe_cnt; + u64 stripe_len; + u64 stripe_offset; + u64 num_stripes; + u32 stripe_index; + u32 factor = 0; + u32 sub_stripes = 0; + u64 stripes_per_dev = 0; + u32 remaining_stripes = 0; + u32 last_stripe = 0; + int ret = 0; + int i; + + /* discard always return a bbio */ + ASSERT(bbio_ret); + + em = get_chunk_map(fs_info, logical, length); + if (IS_ERR(em)) + return PTR_ERR(em); + + map = em->map_lookup; + /* we don't discard raid56 yet */ + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { + ret = -EOPNOTSUPP; + goto out; + } + + offset = logical - em->start; + length = min_t(u64, em->len - offset, length); + + stripe_len = map->stripe_len; + /* + * stripe_nr counts the total number of stripes we have to stride + * to get to this block + */ + stripe_nr = div64_u64(offset, stripe_len); + + /* stripe_offset is the offset of this block in its stripe */ + stripe_offset = offset - stripe_nr * stripe_len; + + stripe_nr_end = round_up(offset + length, map->stripe_len); + stripe_nr_end = div64_u64(stripe_nr_end, map->stripe_len); + stripe_cnt = stripe_nr_end - stripe_nr; + stripe_end_offset = stripe_nr_end * map->stripe_len - + (offset + length); + /* + * after this, stripe_nr is the number of stripes on this + * device we have to walk to find the data, and stripe_index is + * the number of our device in the stripe array + */ + num_stripes = 1; + stripe_index = 0; + if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID10)) { + if (map->type & BTRFS_BLOCK_GROUP_RAID0) + sub_stripes = 1; + else + sub_stripes = map->sub_stripes; + + factor = map->num_stripes / sub_stripes; + num_stripes = min_t(u64, map->num_stripes, + sub_stripes * stripe_cnt); + stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); + stripe_index *= sub_stripes; + stripes_per_dev = div_u64_rem(stripe_cnt, factor, + &remaining_stripes); + div_u64_rem(stripe_nr_end - 1, factor, &last_stripe); + last_stripe *= sub_stripes; + } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_DUP)) { + num_stripes = map->num_stripes; + } else { + stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, + &stripe_index); + } + + bbio = alloc_btrfs_bio(num_stripes, 0); + if (!bbio) { + ret = -ENOMEM; + goto out; + } + + for (i = 0; i < num_stripes; i++) { + bbio->stripes[i].physical = + map->stripes[stripe_index].physical + + stripe_offset + stripe_nr * map->stripe_len; + bbio->stripes[i].dev = map->stripes[stripe_index].dev; + + if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID10)) { + bbio->stripes[i].length = stripes_per_dev * + map->stripe_len; + + if (i / sub_stripes < remaining_stripes) + bbio->stripes[i].length += + map->stripe_len; 
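__btrfs_map_block_for_discard(), added above, turns one logical byte range into the per-device ranges that actually receive the discard. A standalone sketch of the stripe arithmetic for the plain striped (RAID0-style, single copy) case, with an invented geometry, may make the stripe_nr/stripe_offset/stripe_end_offset bookkeeping easier to follow:

#include <stdint.h>
#include <stdio.h>

#define STRIPE_LEN	65536ULL
#define NUM_DEVS	3

static uint64_t round_up64(uint64_t x, uint64_t a) { return (x + a - 1) / a * a; }

int main(void)
{
	uint64_t offset = 100000, length = 300000;	/* range to discard */

	uint64_t stripe_nr = offset / STRIPE_LEN;
	uint64_t stripe_offset = offset - stripe_nr * STRIPE_LEN;
	uint64_t stripe_nr_end = round_up64(offset + length, STRIPE_LEN) / STRIPE_LEN;
	uint64_t stripe_cnt = stripe_nr_end - stripe_nr;
	uint64_t stripe_end_offset = stripe_nr_end * STRIPE_LEN - (offset + length);

	/* how many devices actually get a discard */
	uint64_t num_stripes = stripe_cnt < NUM_DEVS ? stripe_cnt : NUM_DEVS;

	uint32_t stripe_index = stripe_nr % NUM_DEVS;	/* first device hit */
	stripe_nr /= NUM_DEVS;				/* stripe # on that device */
	uint64_t stripes_per_dev = stripe_cnt / NUM_DEVS;
	uint32_t remaining = stripe_cnt % NUM_DEVS;
	uint32_t last_stripe = (uint32_t)((stripe_nr_end - 1) % NUM_DEVS);

	for (uint64_t i = 0; i < num_stripes; i++) {
		/* device offset; a real map would add the dev extent start */
		uint64_t phys = stripe_offset + stripe_nr * STRIPE_LEN;
		uint64_t len = stripes_per_dev * STRIPE_LEN;

		if (i < remaining)		/* this device gets one extra stripe */
			len += STRIPE_LEN;
		if (i == 0)			/* first stripe starts mid-stripe */
			len -= stripe_offset;
		if (stripe_index == last_stripe)	/* last stripe ends mid-stripe */
			len -= stripe_end_offset;
		if (i == 0)
			stripe_offset = 0;

		printf("dev %u: discard %llu bytes at device offset %llu\n",
		       stripe_index, (unsigned long long)len,
		       (unsigned long long)phys);

		if (++stripe_index == NUM_DEVS) {
			stripe_index = 0;
			stripe_nr++;
		}
	}
	return 0;
}

With this geometry the three per-device lengths sum back to the original 300000 bytes, which is a quick way to sanity-check the split.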
+ + /* + * Special for the first stripe and + * the last stripe: + * + * |-------|...|-------| + * |----------| + * off end_off + */ + if (i < sub_stripes) + bbio->stripes[i].length -= + stripe_offset; + + if (stripe_index >= last_stripe && + stripe_index <= (last_stripe + + sub_stripes - 1)) + bbio->stripes[i].length -= + stripe_end_offset; + + if (i == sub_stripes - 1) + stripe_offset = 0; + } else { + bbio->stripes[i].length = length; + } + + stripe_index++; + if (stripe_index == map->num_stripes) { + stripe_index = 0; + stripe_nr++; + } + } + + *bbio_ret = bbio; + bbio->map_type = map->type; + bbio->num_stripes = num_stripes; +out: + free_extent_map(em); + return ret; +} + +/* + * In dev-replace case, for repair case (that's the only case where the mirror + * is selected explicitly when calling btrfs_map_block), blocks left of the + * left cursor can also be read from the target drive. + * + * For REQ_GET_READ_MIRRORS, the target drive is added as the last one to the + * array of stripes. + * For READ, it also needs to be supported using the same mirror number. + * + * If the requested block is not left of the left cursor, EIO is returned. This + * can happen because btrfs_num_copies() returns one more in the dev-replace + * case. + */ +static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info, + u64 logical, u64 length, + u64 srcdev_devid, int *mirror_num, + u64 *physical) +{ + struct btrfs_bio *bbio = NULL; + int num_stripes; + int index_srcdev = 0; + int found = 0; + u64 physical_of_found = 0; + int i; + int ret = 0; + + ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, + logical, &length, &bbio, 0, 0); + if (ret) { + ASSERT(bbio == NULL); + return ret; + } + + num_stripes = bbio->num_stripes; + if (*mirror_num > num_stripes) { + /* + * BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror, + * that means that the requested area is not left of the left + * cursor + */ + btrfs_put_bbio(bbio); + return -EIO; + } + + /* + * process the rest of the function using the mirror_num of the source + * drive. Therefore look it up first. At the end, patch the device + * pointer to the one of the target drive. + */ + for (i = 0; i < num_stripes; i++) { + if (bbio->stripes[i].dev->devid != srcdev_devid) + continue; + + /* + * In case of DUP, in order to keep it simple, only add the + * mirror with the lowest physical address + */ + if (found && + physical_of_found <= bbio->stripes[i].physical) + continue; + + index_srcdev = i; + found = 1; + physical_of_found = bbio->stripes[i].physical; + } + + btrfs_put_bbio(bbio); + + ASSERT(found); + if (!found) + return -EIO; + + *mirror_num = index_srcdev + 1; + *physical = physical_of_found; + return ret; +} + +static void handle_ops_on_dev_replace(enum btrfs_map_op op, + struct btrfs_bio **bbio_ret, + struct btrfs_dev_replace *dev_replace, + int *num_stripes_ret, int *max_errors_ret) +{ + struct btrfs_bio *bbio = *bbio_ret; + u64 srcdev_devid = dev_replace->srcdev->devid; + int tgtdev_indexes = 0; + int num_stripes = *num_stripes_ret; + int max_errors = *max_errors_ret; + int i; + + if (op == BTRFS_MAP_WRITE) { + int index_where_to_add; + + /* + * duplicate the write operations while the dev replace + * procedure is running. Since the copying of the old disk to + * the new disk takes place at run time while the filesystem is + * mounted writable, the regular write operations to the old + * disk have to be duplicated to go to the new disk as well. 
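The write branch of the new handle_ops_on_dev_replace() helper (its body continues just below) appends one extra stripe for every stripe aimed at the device being replaced, so the same data also reaches the replacement target while the copy runs, and bumps max_errors accordingly. A rough standalone sketch of that duplication step; the stripe structure, device ids and fixed-size array are stand-ins, not the btrfs types:

#include <stdint.h>
#include <stdio.h>

struct stripe {
	int devid;
	uint64_t physical;
	uint64_t length;
};

#define MAX_STRIPES 8

static int dup_stripes_for_replace(struct stripe *stripes, int num,
				   int srcdev, int tgtdev, int *max_errors)
{
	int add_at = num;

	for (int i = 0; i < num; i++) {
		if (stripes[i].devid != srcdev)
			continue;
		if (add_at == MAX_STRIPES)	/* no room preallocated */
			break;
		stripes[add_at] = stripes[i];	/* same physical layout ... */
		stripes[add_at].devid = tgtdev;	/* ... but on the new device */
		add_at++;
		(*max_errors)++;		/* one more copy may fail safely */
	}
	return add_at;				/* new stripe count */
}

int main(void)
{
	struct stripe stripes[MAX_STRIPES] = {
		{ .devid = 1, .physical = 1048576, .length = 65536 },
		{ .devid = 2, .physical = 2097152, .length = 65536 },
	};
	int max_errors = 1;
	int num = dup_stripes_for_replace(stripes, 2, /*srcdev*/1, /*tgtdev*/9,
					  &max_errors);

	for (int i = 0; i < num; i++)
		printf("stripe %d -> dev %d @ %llu\n", i, stripes[i].devid,
		       (unsigned long long)stripes[i].physical);
	printf("max_errors now %d\n", max_errors);
	return 0;
}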
+ * + * Note that device->missing is handled by the caller, and that + * the write to the old disk is already set up in the stripes + * array. + */ + index_where_to_add = num_stripes; + for (i = 0; i < num_stripes; i++) { + if (bbio->stripes[i].dev->devid == srcdev_devid) { + /* write to new disk, too */ + struct btrfs_bio_stripe *new = + bbio->stripes + index_where_to_add; + struct btrfs_bio_stripe *old = + bbio->stripes + i; + + new->physical = old->physical; + new->length = old->length; + new->dev = dev_replace->tgtdev; + bbio->tgtdev_map[i] = index_where_to_add; + index_where_to_add++; + max_errors++; + tgtdev_indexes++; + } + } + num_stripes = index_where_to_add; + } else if (op == BTRFS_MAP_GET_READ_MIRRORS) { + int index_srcdev = 0; + int found = 0; + u64 physical_of_found = 0; + + /* + * During the dev-replace procedure, the target drive can also + * be used to read data in case it is needed to repair a corrupt + * block elsewhere. This is possible if the requested area is + * left of the left cursor. In this area, the target drive is a + * full copy of the source drive. + */ + for (i = 0; i < num_stripes; i++) { + if (bbio->stripes[i].dev->devid == srcdev_devid) { + /* + * In case of DUP, in order to keep it simple, + * only add the mirror with the lowest physical + * address + */ + if (found && + physical_of_found <= + bbio->stripes[i].physical) + continue; + index_srcdev = i; + found = 1; + physical_of_found = bbio->stripes[i].physical; + } + } + if (found) { + struct btrfs_bio_stripe *tgtdev_stripe = + bbio->stripes + num_stripes; + + tgtdev_stripe->physical = physical_of_found; + tgtdev_stripe->length = + bbio->stripes[index_srcdev].length; + tgtdev_stripe->dev = dev_replace->tgtdev; + bbio->tgtdev_map[index_srcdev] = num_stripes; + + tgtdev_indexes++; + num_stripes++; + } + } + + *num_stripes_ret = num_stripes; + *max_errors_ret = max_errors; + bbio->num_tgtdevs = tgtdev_indexes; + *bbio_ret = bbio; +} + +static bool need_full_stripe(enum btrfs_map_op op) +{ + return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS); +} + static int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, @@ -5322,14 +5646,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, { struct extent_map *em; struct map_lookup *map; - struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; - struct extent_map_tree *em_tree = &map_tree->map_tree; u64 offset; u64 stripe_offset; - u64 stripe_end_offset; u64 stripe_nr; - u64 stripe_nr_orig; - u64 stripe_nr_end; u64 stripe_len; u32 stripe_index; int i; @@ -5345,23 +5664,13 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, u64 physical_to_patch_in_first_stripe = 0; u64 raid56_full_stripe_start = (u64)-1; - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, logical, *length); - read_unlock(&em_tree->lock); - - if (!em) { - btrfs_crit(fs_info, "unable to find logical %llu len %llu", - logical, *length); - return -EINVAL; - } + if (op == BTRFS_MAP_DISCARD) + return __btrfs_map_block_for_discard(fs_info, logical, + *length, bbio_ret); - if (em->start > logical || em->start + em->len < logical) { - btrfs_crit(fs_info, - "found a bad mapping, wanted %Lu, found %Lu-%Lu", - logical, em->start, em->start + em->len); - free_extent_map(em); - return -EINVAL; - } + em = get_chunk_map(fs_info, logical, *length); + if (IS_ERR(em)) + return PTR_ERR(em); map = em->map_lookup; offset = logical - em->start; @@ -5400,14 +5709,7 @@ static int __btrfs_map_block(struct btrfs_fs_info 
*fs_info, raid56_full_stripe_start *= full_stripe_len; } - if (op == BTRFS_MAP_DISCARD) { - /* we don't discard raid56 yet */ - if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - ret = -EOPNOTSUPP; - goto out; - } - *length = min_t(u64, em->len - offset, *length); - } else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { + if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { u64 max_len; /* For writes to RAID[56], allow a full stripeset across all disks. For other RAID types and for RAID[56] reads, just allow a single @@ -5438,105 +5740,28 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, btrfs_dev_replace_set_lock_blocking(dev_replace); if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 && - op != BTRFS_MAP_WRITE && op != BTRFS_MAP_DISCARD && - op != BTRFS_MAP_GET_READ_MIRRORS && dev_replace->tgtdev != NULL) { - /* - * in dev-replace case, for repair case (that's the only - * case where the mirror is selected explicitly when - * calling btrfs_map_block), blocks left of the left cursor - * can also be read from the target drive. - * For REQ_GET_READ_MIRRORS, the target drive is added as - * the last one to the array of stripes. For READ, it also - * needs to be supported using the same mirror number. - * If the requested block is not left of the left cursor, - * EIO is returned. This can happen because btrfs_num_copies() - * returns one more in the dev-replace case. - */ - u64 tmp_length = *length; - struct btrfs_bio *tmp_bbio = NULL; - int tmp_num_stripes; - u64 srcdev_devid = dev_replace->srcdev->devid; - int index_srcdev = 0; - int found = 0; - u64 physical_of_found = 0; - - ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, - logical, &tmp_length, &tmp_bbio, 0, 0); - if (ret) { - WARN_ON(tmp_bbio != NULL); - goto out; - } - - tmp_num_stripes = tmp_bbio->num_stripes; - if (mirror_num > tmp_num_stripes) { - /* - * BTRFS_MAP_GET_READ_MIRRORS does not contain this - * mirror, that means that the requested area - * is not left of the left cursor - */ - ret = -EIO; - btrfs_put_bbio(tmp_bbio); - goto out; - } - - /* - * process the rest of the function using the mirror_num - * of the source drive. Therefore look it up first. - * At the end, patch the device pointer to the one of the - * target drive. 
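Several call sites in this file are rewritten around the new get_chunk_map() helper, which reports lookup failures through the kernel's ERR_PTR convention so callers handle a single return value instead of a NULL-plus-range check. A minimal userspace emulation of that convention, with a toy lookup standing in for the extent-map search:

#include <errno.h>
#include <stdio.h>

#define MAX_ERRNO 4095

static inline void *ERR_PTR(long err) { return (void *)err; }
static inline long PTR_ERR(const void *p) { return (long)p; }
static inline int IS_ERR(const void *p)
{
	return (unsigned long)p >= (unsigned long)-MAX_ERRNO;
}

struct chunk_map { unsigned long long start, len; };

/* Toy lookup: exactly one chunk exists; anything outside it fails. */
static struct chunk_map *get_chunk_map(unsigned long long logical)
{
	static struct chunk_map only = { .start = 1048576, .len = 1073741824 };

	if (logical < only.start || logical >= only.start + only.len)
		return ERR_PTR(-EINVAL);
	return &only;
}

int main(void)
{
	struct chunk_map *em;

	em = get_chunk_map(2 * 1048576);	/* inside the only chunk */
	if (!IS_ERR(em))
		printf("found chunk %llu+%llu\n", em->start, em->len);

	em = get_chunk_map(4096);		/* before any chunk */
	if (IS_ERR(em))
		fprintf(stderr, "lookup failed: %ld\n", PTR_ERR(em));
	return 0;
}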
- */ - for (i = 0; i < tmp_num_stripes; i++) { - if (tmp_bbio->stripes[i].dev->devid != srcdev_devid) - continue; - - /* - * In case of DUP, in order to keep it simple, only add - * the mirror with the lowest physical address - */ - if (found && - physical_of_found <= tmp_bbio->stripes[i].physical) - continue; - - index_srcdev = i; - found = 1; - physical_of_found = tmp_bbio->stripes[i].physical; - } - - btrfs_put_bbio(tmp_bbio); - - if (!found) { - WARN_ON(1); - ret = -EIO; + !need_full_stripe(op) && dev_replace->tgtdev != NULL) { + ret = get_extra_mirror_from_replace(fs_info, logical, *length, + dev_replace->srcdev->devid, + &mirror_num, + &physical_to_patch_in_first_stripe); + if (ret) goto out; - } - - mirror_num = index_srcdev + 1; - patch_the_first_stripe_for_dev_replace = 1; - physical_to_patch_in_first_stripe = physical_of_found; + else + patch_the_first_stripe_for_dev_replace = 1; } else if (mirror_num > map->num_stripes) { mirror_num = 0; } num_stripes = 1; stripe_index = 0; - stripe_nr_orig = stripe_nr; - stripe_nr_end = ALIGN(offset + *length, map->stripe_len); - stripe_nr_end = div_u64(stripe_nr_end, map->stripe_len); - stripe_end_offset = stripe_nr_end * map->stripe_len - - (offset + *length); - if (map->type & BTRFS_BLOCK_GROUP_RAID0) { - if (op == BTRFS_MAP_DISCARD) - num_stripes = min_t(u64, map->num_stripes, - stripe_nr_end - stripe_nr_orig); stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &stripe_index); - if (op != BTRFS_MAP_WRITE && op != BTRFS_MAP_DISCARD && - op != BTRFS_MAP_GET_READ_MIRRORS) + if (op != BTRFS_MAP_WRITE && op != BTRFS_MAP_GET_READ_MIRRORS) mirror_num = 1; } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { - if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_DISCARD || - op == BTRFS_MAP_GET_READ_MIRRORS) + if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS) num_stripes = map->num_stripes; else if (mirror_num) stripe_index = mirror_num - 1; @@ -5549,8 +5774,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, } } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { - if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_DISCARD || - op == BTRFS_MAP_GET_READ_MIRRORS) { + if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS) { num_stripes = map->num_stripes; } else if (mirror_num) { stripe_index = mirror_num - 1; @@ -5566,10 +5790,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS) num_stripes = map->sub_stripes; - else if (op == BTRFS_MAP_DISCARD) - num_stripes = min_t(u64, map->sub_stripes * - (stripe_nr_end - stripe_nr_orig), - map->num_stripes); else if (mirror_num) stripe_index += mirror_num - 1; else { @@ -5587,7 +5807,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS || mirror_num > 1)) { /* push stripe_nr back to the start of the full stripe */ - stripe_nr = div_u64(raid56_full_stripe_start, + stripe_nr = div64_u64(raid56_full_stripe_start, stripe_len * nr_data_stripes(map)); /* RAID[56] write or recovery. 
Return all stripes */ @@ -5612,8 +5832,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, /* We distribute the parity blocks across stripes */ div_u64_rem(stripe_nr + stripe_index, map->num_stripes, &stripe_index); - if ((op != BTRFS_MAP_WRITE && op != BTRFS_MAP_DISCARD && - op != BTRFS_MAP_GET_READ_MIRRORS) && mirror_num <= 1) + if ((op != BTRFS_MAP_WRITE && + op != BTRFS_MAP_GET_READ_MIRRORS) && + mirror_num <= 1) mirror_num = 1; } } else { @@ -5635,8 +5856,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, } num_alloc_stripes = num_stripes; - if (dev_replace_is_ongoing) { - if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_DISCARD) + if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) { + if (op == BTRFS_MAP_WRITE) num_alloc_stripes <<= 1; if (op == BTRFS_MAP_GET_READ_MIRRORS) num_alloc_stripes++; @@ -5648,14 +5869,12 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, ret = -ENOMEM; goto out; } - if (dev_replace_is_ongoing) + if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes); /* build raid_map */ - if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && - need_raid_map && - ((op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS) || - mirror_num > 1)) { + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map && + (need_full_stripe(op) || mirror_num > 1)) { u64 tmp; unsigned rot; @@ -5679,173 +5898,27 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, RAID6_Q_STRIPE; } - if (op == BTRFS_MAP_DISCARD) { - u32 factor = 0; - u32 sub_stripes = 0; - u64 stripes_per_dev = 0; - u32 remaining_stripes = 0; - u32 last_stripe = 0; - - if (map->type & - (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) { - if (map->type & BTRFS_BLOCK_GROUP_RAID0) - sub_stripes = 1; - else - sub_stripes = map->sub_stripes; - - factor = map->num_stripes / sub_stripes; - stripes_per_dev = div_u64_rem(stripe_nr_end - - stripe_nr_orig, - factor, - &remaining_stripes); - div_u64_rem(stripe_nr_end - 1, factor, &last_stripe); - last_stripe *= sub_stripes; - } - - for (i = 0; i < num_stripes; i++) { - bbio->stripes[i].physical = - map->stripes[stripe_index].physical + - stripe_offset + stripe_nr * map->stripe_len; - bbio->stripes[i].dev = map->stripes[stripe_index].dev; - - if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | - BTRFS_BLOCK_GROUP_RAID10)) { - bbio->stripes[i].length = stripes_per_dev * - map->stripe_len; - - if (i / sub_stripes < remaining_stripes) - bbio->stripes[i].length += - map->stripe_len; - /* - * Special for the first stripe and - * the last stripe: - * - * |-------|...|-------| - * |----------| - * off end_off - */ - if (i < sub_stripes) - bbio->stripes[i].length -= - stripe_offset; - - if (stripe_index >= last_stripe && - stripe_index <= (last_stripe + - sub_stripes - 1)) - bbio->stripes[i].length -= - stripe_end_offset; - - if (i == sub_stripes - 1) - stripe_offset = 0; - } else - bbio->stripes[i].length = *length; - - stripe_index++; - if (stripe_index == map->num_stripes) { - /* This could only happen for RAID0/10 */ - stripe_index = 0; - stripe_nr++; - } - } - } else { - for (i = 0; i < num_stripes; i++) { - bbio->stripes[i].physical = - map->stripes[stripe_index].physical + - stripe_offset + - stripe_nr * map->stripe_len; - bbio->stripes[i].dev = - map->stripes[stripe_index].dev; - stripe_index++; - } + for (i = 0; i < num_stripes; i++) { + bbio->stripes[i].physical = + map->stripes[stripe_index].physical + + stripe_offset + + stripe_nr * map->stripe_len; + 
bbio->stripes[i].dev = + map->stripes[stripe_index].dev; + stripe_index++; } - if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS) + if (need_full_stripe(op)) max_errors = btrfs_chunk_max_errors(map); if (bbio->raid_map) sort_parity_stripes(bbio, num_stripes); - tgtdev_indexes = 0; - if (dev_replace_is_ongoing && - (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_DISCARD) && - dev_replace->tgtdev != NULL) { - int index_where_to_add; - u64 srcdev_devid = dev_replace->srcdev->devid; - - /* - * duplicate the write operations while the dev replace - * procedure is running. Since the copying of the old disk - * to the new disk takes place at run time while the - * filesystem is mounted writable, the regular write - * operations to the old disk have to be duplicated to go - * to the new disk as well. - * Note that device->missing is handled by the caller, and - * that the write to the old disk is already set up in the - * stripes array. - */ - index_where_to_add = num_stripes; - for (i = 0; i < num_stripes; i++) { - if (bbio->stripes[i].dev->devid == srcdev_devid) { - /* write to new disk, too */ - struct btrfs_bio_stripe *new = - bbio->stripes + index_where_to_add; - struct btrfs_bio_stripe *old = - bbio->stripes + i; - - new->physical = old->physical; - new->length = old->length; - new->dev = dev_replace->tgtdev; - bbio->tgtdev_map[i] = index_where_to_add; - index_where_to_add++; - max_errors++; - tgtdev_indexes++; - } - } - num_stripes = index_where_to_add; - } else if (dev_replace_is_ongoing && - op == BTRFS_MAP_GET_READ_MIRRORS && - dev_replace->tgtdev != NULL) { - u64 srcdev_devid = dev_replace->srcdev->devid; - int index_srcdev = 0; - int found = 0; - u64 physical_of_found = 0; - - /* - * During the dev-replace procedure, the target drive can - * also be used to read data in case it is needed to repair - * a corrupt block elsewhere. This is possible if the - * requested area is left of the left cursor. In this area, - * the target drive is a full copy of the source drive. 
- */ - for (i = 0; i < num_stripes; i++) { - if (bbio->stripes[i].dev->devid == srcdev_devid) { - /* - * In case of DUP, in order to keep it - * simple, only add the mirror with the - * lowest physical address - */ - if (found && - physical_of_found <= - bbio->stripes[i].physical) - continue; - index_srcdev = i; - found = 1; - physical_of_found = bbio->stripes[i].physical; - } - } - if (found) { - struct btrfs_bio_stripe *tgtdev_stripe = - bbio->stripes + num_stripes; - - tgtdev_stripe->physical = physical_of_found; - tgtdev_stripe->length = - bbio->stripes[index_srcdev].length; - tgtdev_stripe->dev = dev_replace->tgtdev; - bbio->tgtdev_map[index_srcdev] = num_stripes; - - tgtdev_indexes++; - num_stripes++; - } + if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL && + need_full_stripe(op)) { + handle_ops_on_dev_replace(op, &bbio, dev_replace, &num_stripes, + &max_errors); } *bbio_ret = bbio; @@ -5853,7 +5926,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, bbio->num_stripes = num_stripes; bbio->max_errors = max_errors; bbio->mirror_num = mirror_num; - bbio->num_tgtdevs = tgtdev_indexes; /* * this is the case that REQ_READ && dev_replace_is_ongoing && @@ -5886,19 +5958,15 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, /* For Scrub/replace */ int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, - struct btrfs_bio **bbio_ret, int mirror_num, - int need_raid_map) + struct btrfs_bio **bbio_ret) { - return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, - mirror_num, need_raid_map); + return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1); } int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, u64 physical, u64 devid, u64 **logical, int *naddrs, int *stripe_len) { - struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; - struct extent_map_tree *em_tree = &map_tree->map_tree; struct extent_map *em; struct map_lookup *map; u64 *buf; @@ -5908,24 +5976,11 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 rmap_len; int i, j, nr = 0; - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, chunk_start, 1); - read_unlock(&em_tree->lock); - - if (!em) { - btrfs_err(fs_info, "couldn't find em for chunk %Lu", - chunk_start); + em = get_chunk_map(fs_info, chunk_start, 1); + if (IS_ERR(em)) return -EIO; - } - if (em->start != chunk_start) { - btrfs_err(fs_info, "bad chunk start, em=%Lu, wanted=%Lu", - em->start, chunk_start); - free_extent_map(em); - return -EIO; - } map = em->map_lookup; - length = em->len; rmap_len = map->stripe_len; @@ -5949,7 +6004,7 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, continue; stripe_nr = physical - map->stripes[i].physical; - stripe_nr = div_u64(stripe_nr, map->stripe_len); + stripe_nr = div64_u64(stripe_nr, map->stripe_len); if (map->type & BTRFS_BLOCK_GROUP_RAID10) { stripe_nr = stripe_nr * map->num_stripes + i; @@ -5994,9 +6049,10 @@ static void btrfs_end_bio(struct bio *bio) struct btrfs_bio *bbio = bio->bi_private; int is_orig_bio = 0; - if (bio->bi_error) { + if (bio->bi_status) { atomic_inc(&bbio->error); - if (bio->bi_error == -EIO || bio->bi_error == -EREMOTEIO) { + if (bio->bi_status == BLK_STS_IOERR || + bio->bi_status == BLK_STS_TARGET) { unsigned int stripe_index = btrfs_io_bio(bio)->stripe_index; struct btrfs_device *dev; @@ -6034,13 +6090,13 @@ static void btrfs_end_bio(struct bio *bio) * beyond the tolerance of the btrfs bio */ if (atomic_read(&bbio->error) > bbio->max_errors) { - 
bio->bi_error = -EIO; + bio->bi_status = BLK_STS_IOERR; } else { /* * this bio is actually up to date, we didn't * go over the max number of errors */ - bio->bi_error = 0; + bio->bi_status = 0; } btrfs_end_bbio(bbio, bio); @@ -6151,7 +6207,7 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; bio->bi_iter.bi_sector = logical >> 9; - bio->bi_error = -EIO; + bio->bi_status = BLK_STS_IOERR; btrfs_end_bbio(bbio, bio); } } @@ -6218,10 +6274,9 @@ int btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, continue; } - if (dev_nr < total_devs - 1) { - bio = btrfs_bio_clone(first_bio, GFP_NOFS); - BUG_ON(!bio); /* -ENOMEM */ - } else + if (dev_nr < total_devs - 1) + bio = btrfs_bio_clone(first_bio); + else bio = first_bio; submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical, @@ -6636,10 +6691,8 @@ static int read_one_dev(struct btrfs_fs_info *fs_info, device->in_fs_metadata = 1; if (device->writeable && !device->is_tgtdev_for_dev_replace) { device->fs_devices->total_rw_bytes += device->total_bytes; - spin_lock(&fs_info->free_chunk_lock); - fs_info->free_chunk_space += device->total_bytes - - device->bytes_used; - spin_unlock(&fs_info->free_chunk_lock); + atomic64_add(device->total_bytes - device->bytes_used, + &fs_info->free_chunk_space); } ret = 0; return ret; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 59be81206dd7..6f45fd60d15a 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -74,6 +74,8 @@ struct btrfs_device { int missing; int can_discard; int is_tgtdev_for_dev_replace; + int last_flush_error; + int flush_bio_sent; #ifdef __BTRFS_NEED_DEVICE_DATA_ORDERED seqcount_t data_seqcount; @@ -123,7 +125,6 @@ struct btrfs_device { struct list_head resized_list; /* for sending down flush barriers */ - int nobarriers; struct bio *flush_bio; struct completion flush_wait; @@ -280,6 +281,11 @@ struct btrfs_io_bio { u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE]; u8 *csum_allocated; btrfs_io_bio_end_io_t *end_io; + struct bvec_iter iter; + /* + * This member must come last, bio_alloc_bioset will allocate enough + * bytes for entire btrfs_io_bio but relies on bio being last. 
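The comment being added to btrfs_io_bio documents a layout invariant: one allocation covers the whole wrapper plus the bio's inline vector tail, and that only works while the embedded bio stays the last member. A standalone sketch of the same pattern; the toy structures below only mimic the shape, not the real bio layout:

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct toy_vec { void *page; unsigned int len; };

struct toy_bio {
	unsigned long long sector;
	unsigned int nr_vecs;
	unsigned int flags;
	/* inline vectors are assumed to live immediately after the bio */
};

struct toy_io_bio {
	unsigned long long logical;	/* "front pad": fs-private context */
	unsigned char csum[16];
	struct toy_bio bio;		/* must stay last, tail follows it */
};

static struct toy_vec *bio_inline_vecs(struct toy_bio *bio)
{
	return (struct toy_vec *)(bio + 1);	/* breaks if bio isn't last */
}

static struct toy_io_bio *alloc_io_bio(unsigned int nr_vecs)
{
	/* one buffer: wrapper, then nr_vecs inline vectors */
	struct toy_io_bio *ib = calloc(1, sizeof(*ib) +
					  nr_vecs * sizeof(struct toy_vec));
	if (ib)
		ib->bio.nr_vecs = nr_vecs;
	return ib;
}

static void end_io(struct toy_bio *bio)
{
	/* completion paths only see the bio; step back to the wrapper */
	struct toy_io_bio *ib = container_of(bio, struct toy_io_bio, bio);

	printf("bio for logical %llu done, first vec len %u\n",
	       ib->logical, bio_inline_vecs(bio)[0].len);
}

int main(void)
{
	struct toy_io_bio *ib = alloc_io_bio(4);

	if (!ib)
		return 1;
	ib->logical = 123456;
	bio_inline_vecs(&ib->bio)[0].len = 4096;
	end_io(&ib->bio);
	free(ib);
	return 0;
}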
+ */ struct bio bio; }; @@ -298,7 +304,7 @@ struct btrfs_bio; typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err); struct btrfs_bio { - atomic_t refs; + refcount_t refs; atomic_t stripes_pending; struct btrfs_fs_info *fs_info; u64 map_type; /* get from map_lookup->type */ @@ -400,8 +406,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, struct btrfs_bio **bbio_ret, int mirror_num); int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, - struct btrfs_bio **bbio_ret, int mirror_num, - int need_raid_map); + struct btrfs_bio **bbio_ret); int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, u64 physical, u64 devid, u64 **logical, int *naddrs, int *stripe_len); @@ -475,7 +480,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info, struct btrfs_device *tgtdev); void btrfs_scratch_superblocks(struct block_device *bdev, const char *device_path); -int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree, +int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len, int mirror_num); unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, struct btrfs_mapping_tree *map_tree, diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index b3cbf80c5acf..2c7e53f9ff1b 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -336,7 +336,7 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) u32 this_len = sizeof(*di) + name_len + data_len; unsigned long name_ptr = (unsigned long)(di + 1); - if (verify_dir_item(fs_info, leaf, di)) { + if (verify_dir_item(fs_info, leaf, slot, di)) { ret = -EIO; goto err; } diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 135b10823c6d..c248f9286366 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -24,12 +24,13 @@ #include <linux/slab.h> #include <linux/zlib.h> #include <linux/zutil.h> -#include <linux/vmalloc.h> +#include <linux/mm.h> #include <linux/init.h> #include <linux/err.h> #include <linux/sched.h> #include <linux/pagemap.h> #include <linux/bio.h> +#include <linux/refcount.h> #include "compression.h" struct workspace { @@ -42,7 +43,7 @@ static void zlib_free_workspace(struct list_head *ws) { struct workspace *workspace = list_entry(ws, struct workspace, list); - vfree(workspace->strm.workspace); + kvfree(workspace->strm.workspace); kfree(workspace->buf); kfree(workspace); } @@ -52,14 +53,14 @@ static struct list_head *zlib_alloc_workspace(void) struct workspace *workspace; int workspacesize; - workspace = kzalloc(sizeof(*workspace), GFP_NOFS); + workspace = kzalloc(sizeof(*workspace), GFP_KERNEL); if (!workspace) return ERR_PTR(-ENOMEM); workspacesize = max(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL), zlib_inflate_workspacesize()); - workspace->strm.workspace = vmalloc(workspacesize); - workspace->buf = kmalloc(PAGE_SIZE, GFP_NOFS); + workspace->strm.workspace = kvmalloc(workspacesize, GFP_KERNEL); + workspace->buf = kmalloc(PAGE_SIZE, GFP_KERNEL); if (!workspace->strm.workspace || !workspace->buf) goto fail; @@ -211,10 +212,7 @@ out: return ret; } -static int zlib_decompress_bio(struct list_head *ws, struct page **pages_in, - u64 disk_start, - struct bio *orig_bio, - size_t srclen) +static int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) { struct workspace *workspace = list_entry(ws, struct workspace, list); int ret = 0, ret2; @@ -222,8 +220,12 @@ static int 
zlib_decompress_bio(struct list_head *ws, struct page **pages_in, char *data_in; size_t total_out = 0; unsigned long page_in_index = 0; + size_t srclen = cb->compressed_len; unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE); unsigned long buf_start; + struct page **pages_in = cb->compressed_pages; + u64 disk_start = cb->start; + struct bio *orig_bio = cb->orig_bio; data_in = kmap(pages_in[page_in_index]); workspace->strm.next_in = data_in; diff --git a/fs/buffer.c b/fs/buffer.c index 161be58c5cb0..5c2cba8d2387 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -49,7 +49,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, - struct writeback_control *wbc); + enum rw_hint hint, struct writeback_control *wbc); #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers) @@ -1829,7 +1829,8 @@ int __block_write_full_page(struct inode *inode, struct page *page, do { struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { - submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, wbc); + submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, + inode->i_write_hint, wbc); nr_underway++; } bh = next; @@ -1883,7 +1884,8 @@ recover: struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { clear_buffer_dirty(bh); - submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, wbc); + submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, + inode->i_write_hint, wbc); nr_underway++; } bh = next; @@ -3038,7 +3040,7 @@ static void end_bio_bh_io_sync(struct bio *bio) if (unlikely(bio_flagged(bio, BIO_QUIET))) set_bit(BH_Quiet, &bh->b_state); - bh->b_end_io(bh, !bio->bi_error); + bh->b_end_io(bh, !bio->bi_status); bio_put(bio); } @@ -3091,7 +3093,7 @@ void guard_bio_eod(int op, struct bio *bio) } static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, - struct writeback_control *wbc) + enum rw_hint write_hint, struct writeback_control *wbc) { struct bio *bio; @@ -3120,6 +3122,7 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio->bi_bdev = bh->b_bdev; + bio->bi_write_hint = write_hint; bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh)); BUG_ON(bio->bi_iter.bi_size != bh->b_size); @@ -3142,7 +3145,7 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, int submit_bh(int op, int op_flags, struct buffer_head *bh) { - return submit_bh_wbc(op, op_flags, bh, NULL); + return submit_bh_wbc(op, op_flags, bh, 0, NULL); } EXPORT_SYMBOL(submit_bh); diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index 9bf90bcc56ac..bb3a02ca9da4 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -18,7 +18,7 @@ #include <linux/fscache-cache.h> #include <linux/timer.h> -#include <linux/wait.h> +#include <linux/wait_bit.h> #include <linux/cred.h> #include <linux/workqueue.h> #include <linux/security.h> @@ -97,7 +97,7 @@ struct cachefiles_cache { * backing file read tracking */ struct cachefiles_one_read { - wait_queue_t monitor; /* link into monitored waitqueue */ + wait_queue_entry_t monitor; /* link into monitored waitqueue */ struct page *back_page; /* backing file page we're waiting for */ struct page *netfs_page; /* netfs page we're going to fill */ struct fscache_retrieval *op; /* retrieval op covering this */ diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 41df8a27d7eb..3978b324cbca 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ 
-204,7 +204,7 @@ wait_for_old_object: wait_queue_head_t *wq; signed long timeout = 60 * HZ; - wait_queue_t wait; + wait_queue_entry_t wait; bool requeue; /* if the object we're waiting for is queued for processing, diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index afbdc418966d..18d7aa61ef0f 100644 --- a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -21,7 +21,7 @@ * - we use this to detect read completion of backing pages * - the caller holds the waitqueue lock */ -static int cachefiles_read_waiter(wait_queue_t *wait, unsigned mode, +static int cachefiles_read_waiter(wait_queue_entry_t *wait, unsigned mode, int sync, void *_key) { struct cachefiles_one_read *monitor = @@ -48,7 +48,7 @@ static int cachefiles_read_waiter(wait_queue_t *wait, unsigned mode, } /* remove from the waitqueue */ - list_del(&wait->task_list); + list_del(&wait->entry); /* move onto the action list and queue for FS-Cache thread pool */ ASSERT(monitor->op); diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c index 987044bca1c2..59cb307b15fb 100644 --- a/fs/ceph/acl.c +++ b/fs/ceph/acl.c @@ -131,6 +131,7 @@ int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type) } if (new_mode != old_mode) { + newattrs.ia_ctime = current_time(inode); newattrs.ia_mode = new_mode; newattrs.ia_valid = ATTR_MODE; ret = __ceph_setattr(inode, &newattrs); diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 9ecb2fd348cb..1e71e6ca5ddf 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -670,8 +670,12 @@ static void writepages_finish(struct ceph_osd_request *req) bool remove_page; dout("writepages_finish %p rc %d\n", inode, rc); - if (rc < 0) + if (rc < 0) { mapping_set_error(mapping, rc); + ceph_set_error_write(ci); + } else { + ceph_clear_error_write(ci); + } /* * We lost the cache cap, need to truncate the page before @@ -703,9 +707,6 @@ static void writepages_finish(struct ceph_osd_request *req) clear_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC); - if (rc < 0) - SetPageError(page); - ceph_put_snap_context(page_snap_context(page)); page->private = 0; ClearPagePrivate(page); @@ -1892,6 +1893,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, err = ceph_osdc_start_request(&fsc->client->osdc, rd_req, false); wr_req->r_mtime = ci->vfs_inode.i_mtime; + wr_req->r_abort_on_full = true; err2 = ceph_osdc_start_request(&fsc->client->osdc, wr_req, false); if (!err) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 68c78be19d5b..a3ebb632294e 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1015,6 +1015,7 @@ static int send_cap_msg(struct cap_msg_args *arg) void *p; size_t extra_len; struct timespec zerotime = {0}; + struct ceph_osd_client *osdc = &arg->session->s_mdsc->fsc->client->osdc; dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s" " seq %u/%u tid %llu/%llu mseq %u follows %lld size %llu/%llu" @@ -1076,8 +1077,12 @@ static int send_cap_msg(struct cap_msg_args *arg) ceph_encode_64(&p, arg->inline_data ? 
0 : CEPH_INLINE_NONE); /* inline data size */ ceph_encode_32(&p, 0); - /* osd_epoch_barrier (version 5) */ - ceph_encode_32(&p, 0); + /* + * osd_epoch_barrier (version 5) + * The epoch_barrier is protected osdc->lock, so READ_ONCE here in + * case it was recently changed + */ + ceph_encode_32(&p, READ_ONCE(osdc->epoch_barrier)); /* oldest_flush_tid (version 6) */ ceph_encode_64(&p, arg->oldest_flush_tid); @@ -1389,7 +1394,7 @@ static void __ceph_flush_snaps(struct ceph_inode_info *ci, first_tid = cf->tid + 1; capsnap = container_of(cf, struct ceph_cap_snap, cap_flush); - atomic_inc(&capsnap->nref); + refcount_inc(&capsnap->nref); spin_unlock(&ci->i_ceph_lock); dout("__flush_snaps %p capsnap %p tid %llu %s\n", @@ -2202,7 +2207,7 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc, inode, capsnap, cf->tid, ceph_cap_string(capsnap->dirty)); - atomic_inc(&capsnap->nref); + refcount_inc(&capsnap->nref); spin_unlock(&ci->i_ceph_lock); ret = __send_flush_snap(inode, session, capsnap, cap->mseq, @@ -3633,13 +3638,19 @@ void ceph_handle_caps(struct ceph_mds_session *session, p += inline_len; } + if (le16_to_cpu(msg->hdr.version) >= 5) { + struct ceph_osd_client *osdc = &mdsc->fsc->client->osdc; + u32 epoch_barrier; + + ceph_decode_32_safe(&p, end, epoch_barrier, bad); + ceph_osdc_update_epoch_barrier(osdc, epoch_barrier); + } + if (le16_to_cpu(msg->hdr.version) >= 8) { u64 flush_tid; u32 caller_uid, caller_gid; - u32 osd_epoch_barrier; u32 pool_ns_len; - /* version >= 5 */ - ceph_decode_32_safe(&p, end, osd_epoch_barrier, bad); + /* version >= 6 */ ceph_decode_64_safe(&p, end, flush_tid, bad); /* version >= 7 */ diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 3ef11bc8d728..4e2d112c982f 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -22,20 +22,19 @@ static int mdsmap_show(struct seq_file *s, void *p) { int i; struct ceph_fs_client *fsc = s->private; + struct ceph_mdsmap *mdsmap; if (fsc->mdsc == NULL || fsc->mdsc->mdsmap == NULL) return 0; - seq_printf(s, "epoch %d\n", fsc->mdsc->mdsmap->m_epoch); - seq_printf(s, "root %d\n", fsc->mdsc->mdsmap->m_root); - seq_printf(s, "session_timeout %d\n", - fsc->mdsc->mdsmap->m_session_timeout); - seq_printf(s, "session_autoclose %d\n", - fsc->mdsc->mdsmap->m_session_autoclose); - for (i = 0; i < fsc->mdsc->mdsmap->m_max_mds; i++) { - struct ceph_entity_addr *addr = - &fsc->mdsc->mdsmap->m_info[i].addr; - int state = fsc->mdsc->mdsmap->m_info[i].state; - + mdsmap = fsc->mdsc->mdsmap; + seq_printf(s, "epoch %d\n", mdsmap->m_epoch); + seq_printf(s, "root %d\n", mdsmap->m_root); + seq_printf(s, "max_mds %d\n", mdsmap->m_max_mds); + seq_printf(s, "session_timeout %d\n", mdsmap->m_session_timeout); + seq_printf(s, "session_autoclose %d\n", mdsmap->m_session_autoclose); + for (i = 0; i < mdsmap->m_num_mds; i++) { + struct ceph_entity_addr *addr = &mdsmap->m_info[i].addr; + int state = mdsmap->m_info[i].state; seq_printf(s, "\tmds%d\t%s\t(%s)\n", i, ceph_pr_addr(&addr->in_addr), ceph_mds_state_name(state)); diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 3e9ad501addf..e071d23f6148 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -294,7 +294,7 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) struct ceph_mds_client *mdsc = fsc->mdsc; int i; int err; - u32 ftype; + unsigned frag = -1; struct ceph_mds_reply_info_parsed *rinfo; dout("readdir %p file %p pos %llx\n", inode, file, ctx->pos); @@ -341,7 +341,6 @@ more: /* do we have the correct frag content buffered? 
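The cap-message handler above now decodes the OSD epoch barrier only when the sender's header version is at least 5, and every field read is bounds-checked (ceph_decode_32_safe) against the end of the message. A small sketch of that versioned, bounds-checked decode pattern; the message layout is simplified and a little-endian host is assumed:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct msg { uint16_t version; const uint8_t *p, *end; };

/* decode_32_safe-style read: fail instead of running past the buffer */
static int decode_32(struct msg *m, uint32_t *out)
{
	if (m->end - m->p < 4)
		return -1;
	memcpy(out, m->p, 4);
	m->p += 4;
	return 0;
}

static int handle_caps(struct msg *m)
{
	uint32_t epoch_barrier = 0;

	if (m->version >= 5) {
		if (decode_32(m, &epoch_barrier))
			return -1;		/* truncated message */
		printf("update epoch barrier to %u\n", epoch_barrier);
	} else {
		printf("old peer, no epoch barrier in message\n");
	}
	/* version >= 6 / >= 8 fields would be decoded here the same way */
	return 0;
}

int main(void)
{
	uint8_t payload[4] = { 42, 0, 0, 0 };	/* epoch barrier = 42 */
	struct msg new_peer = { 5, payload, payload + sizeof(payload) };
	struct msg old_peer = { 4, payload, payload };

	handle_caps(&new_peer);
	handle_caps(&old_peer);
	return 0;
}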
*/ if (need_send_readdir(fi, ctx->pos)) { struct ceph_mds_request *req; - unsigned frag; int op = ceph_snap(inode) == CEPH_SNAPDIR ? CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR; @@ -352,8 +351,11 @@ more: } if (is_hash_order(ctx->pos)) { - frag = ceph_choose_frag(ci, fpos_hash(ctx->pos), - NULL, NULL); + /* fragtree isn't always accurate. choose frag + * based on previous reply when possible. */ + if (frag == (unsigned)-1) + frag = ceph_choose_frag(ci, fpos_hash(ctx->pos), + NULL, NULL); } else { frag = fpos_frag(ctx->pos); } @@ -378,7 +380,11 @@ more: ceph_mdsc_put_request(req); return -ENOMEM; } + } else if (is_hash_order(ctx->pos)) { + req->r_args.readdir.offset_hash = + cpu_to_le32(fpos_hash(ctx->pos)); } + req->r_dir_release_cnt = fi->dir_release_count; req->r_dir_ordered_cnt = fi->dir_ordered_count; req->r_readdir_cache_idx = fi->readdir_cache_idx; @@ -476,6 +482,7 @@ more: struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i; struct ceph_vino vino; ino_t ino; + u32 ftype; BUG_ON(rde->offset < ctx->pos); @@ -498,15 +505,17 @@ more: ctx->pos++; } + ceph_mdsc_put_request(fi->last_readdir); + fi->last_readdir = NULL; + if (fi->next_offset > 2) { - ceph_mdsc_put_request(fi->last_readdir); - fi->last_readdir = NULL; + frag = fi->frag; goto more; } /* more frags? */ if (!ceph_frag_is_rightmost(fi->frag)) { - unsigned frag = ceph_frag_next(fi->frag); + frag = ceph_frag_next(fi->frag); if (is_hash_order(ctx->pos)) { loff_t new_pos = ceph_make_fpos(ceph_frag_value(frag), fi->next_offset, true); diff --git a/fs/ceph/export.c b/fs/ceph/export.c index e8f11fa565c5..7df550c13d7f 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -91,6 +91,10 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino) ceph_mdsc_put_request(req); if (!inode) return ERR_PTR(-ESTALE); + if (inode->i_nlink == 0) { + iput(inode); + return ERR_PTR(-ESTALE); + } } return d_obtain_alias(inode); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 18c045e2ead6..29308a80d66f 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -13,6 +13,38 @@ #include "mds_client.h" #include "cache.h" +static __le32 ceph_flags_sys2wire(u32 flags) +{ + u32 wire_flags = 0; + + switch (flags & O_ACCMODE) { + case O_RDONLY: + wire_flags |= CEPH_O_RDONLY; + break; + case O_WRONLY: + wire_flags |= CEPH_O_WRONLY; + break; + case O_RDWR: + wire_flags |= CEPH_O_RDWR; + break; + } + +#define ceph_sys2wire(a) if (flags & a) { wire_flags |= CEPH_##a; flags &= ~a; } + + ceph_sys2wire(O_CREAT); + ceph_sys2wire(O_EXCL); + ceph_sys2wire(O_TRUNC); + ceph_sys2wire(O_DIRECTORY); + ceph_sys2wire(O_NOFOLLOW); + +#undef ceph_sys2wire + + if (flags) + dout("unused open flags: %x", flags); + + return cpu_to_le32(wire_flags); +} + /* * Ceph file operations * @@ -120,7 +152,7 @@ prepare_open_request(struct super_block *sb, int flags, int create_mode) if (IS_ERR(req)) goto out; req->r_fmode = ceph_flags_to_mode(flags); - req->r_args.open.flags = cpu_to_le32(flags); + req->r_args.open.flags = ceph_flags_sys2wire(flags); req->r_args.open.mode = cpu_to_le32(create_mode); out: return req; @@ -189,7 +221,7 @@ int ceph_renew_caps(struct inode *inode) spin_lock(&ci->i_ceph_lock); wanted = __ceph_caps_file_wanted(ci); if (__ceph_is_any_real_caps(ci) && - (!(wanted & CEPH_CAP_ANY_WR) == 0 || ci->i_auth_cap)) { + (!(wanted & CEPH_CAP_ANY_WR) || ci->i_auth_cap)) { int issued = __ceph_caps_issued(ci, NULL); spin_unlock(&ci->i_ceph_lock); dout("renew caps %p want %s issued %s updating mds_wanted\n", @@ -778,6 +810,7 @@ static void ceph_aio_retry_work(struct 
work_struct *work) req->r_callback = ceph_aio_complete_req; req->r_inode = inode; req->r_priv = aio_req; + req->r_abort_on_full = true; ret = ceph_osdc_start_request(req->r_osdc, req, false); out: @@ -1085,19 +1118,22 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, out: ceph_osdc_put_request(req); - if (ret == 0) { - pos += len; - written += len; - - if (pos > i_size_read(inode)) { - check_caps = ceph_inode_set_size(inode, pos); - if (check_caps) - ceph_check_caps(ceph_inode(inode), - CHECK_CAPS_AUTHONLY, - NULL); - } - } else + if (ret != 0) { + ceph_set_error_write(ci); break; + } + + ceph_clear_error_write(ci); + pos += len; + written += len; + if (pos > i_size_read(inode)) { + check_caps = ceph_inode_set_size(inode, pos); + if (check_caps) + ceph_check_caps(ceph_inode(inode), + CHECK_CAPS_AUTHONLY, + NULL); + } + } if (ret != -EOLDSNAPC && written > 0) { @@ -1303,6 +1339,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) } retry_snap: + /* FIXME: not complete since it doesn't account for being at quota */ if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL)) { err = -ENOSPC; goto out; @@ -1324,7 +1361,8 @@ retry_snap: inode, ceph_vinop(inode), pos, count, ceph_cap_string(got)); if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 || - (iocb->ki_flags & IOCB_DIRECT) || (fi->flags & CEPH_F_SYNC)) { + (iocb->ki_flags & IOCB_DIRECT) || (fi->flags & CEPH_F_SYNC) || + (ci->i_ceph_flags & CEPH_I_ERROR_WRITE)) { struct ceph_snap_context *snapc; struct iov_iter data; inode_unlock(inode); @@ -1633,8 +1671,12 @@ static long ceph_fallocate(struct file *file, int mode, } size = i_size_read(inode); - if (!(mode & FALLOC_FL_KEEP_SIZE)) + if (!(mode & FALLOC_FL_KEEP_SIZE)) { endoff = offset + length; + ret = inode_newsize_ok(inode, endoff); + if (ret) + goto unlock; + } if (fi->fmode & CEPH_FILE_MODE_LAZY) want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index d3119fe3ab45..4de6cdddf059 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1482,10 +1482,17 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) return readdir_prepopulate_inodes_only(req, session); - if (rinfo->hash_order && req->r_path2) { - last_hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash, - req->r_path2, strlen(req->r_path2)); - last_hash = ceph_frag_value(last_hash); + if (rinfo->hash_order) { + if (req->r_path2) { + last_hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash, + req->r_path2, + strlen(req->r_path2)); + last_hash = ceph_frag_value(last_hash); + } else if (rinfo->offset_hash) { + /* mds understands offset_hash */ + WARN_ON_ONCE(req->r_readdir_offset != 2); + last_hash = le32_to_cpu(rhead->args.readdir.offset_hash); + } } if (rinfo->dir_dir && @@ -1510,7 +1517,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, } if (ceph_frag_is_leftmost(frag) && req->r_readdir_offset == 2 && - !(rinfo->hash_order && req->r_path2)) { + !(rinfo->hash_order && last_hash)) { /* note dir version at start of readdir so we can tell * if any dentries get dropped */ req->r_dir_release_cnt = atomic64_read(&ci->i_release_count); @@ -2015,7 +2022,6 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) attr->ia_size > inode->i_size) { i_size_write(inode, attr->ia_size); inode->i_blocks = calc_inode_blocks(attr->ia_size); - inode->i_ctime = attr->ia_ctime; ci->i_reported_size = attr->ia_size; dirtied |= CEPH_CAP_FILE_EXCL; } else if ((issued & 
CEPH_CAP_FILE_SHARED) == 0 || @@ -2037,7 +2043,6 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec, attr->ia_ctime.tv_sec, attr->ia_ctime.tv_nsec, only ? "ctime only" : "ignored"); - inode->i_ctime = attr->ia_ctime; if (only) { /* * if kernel wants to dirty ctime but nothing else, @@ -2060,7 +2065,7 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) if (dirtied) { inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied, &prealloc_cf); - inode->i_ctime = current_time(inode); + inode->i_ctime = attr->ia_ctime; } release &= issued; @@ -2078,6 +2083,7 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) req->r_inode_drop = release; req->r_args.setattr.mask = cpu_to_le32(mask); req->r_num_caps = 1; + req->r_stamp = attr->ia_ctime; err = ceph_mdsc_do_request(mdsc, NULL, req); } dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err, diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 1d3fa90d40b9..0c05df44cc6c 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -189,6 +189,7 @@ static int parse_reply_info_dir(void **p, void *end, info->dir_end = !!(flags & CEPH_READDIR_FRAG_END); info->dir_complete = !!(flags & CEPH_READDIR_FRAG_COMPLETE); info->hash_order = !!(flags & CEPH_READDIR_HASH_ORDER); + info->offset_hash = !!(flags & CEPH_READDIR_OFFSET_HASH); } if (num == 0) goto done; @@ -378,9 +379,9 @@ const char *ceph_session_state_name(int s) static struct ceph_mds_session *get_session(struct ceph_mds_session *s) { - if (atomic_inc_not_zero(&s->s_ref)) { + if (refcount_inc_not_zero(&s->s_ref)) { dout("mdsc get_session %p %d -> %d\n", s, - atomic_read(&s->s_ref)-1, atomic_read(&s->s_ref)); + refcount_read(&s->s_ref)-1, refcount_read(&s->s_ref)); return s; } else { dout("mdsc get_session %p 0 -- FAIL", s); @@ -391,8 +392,8 @@ static struct ceph_mds_session *get_session(struct ceph_mds_session *s) void ceph_put_mds_session(struct ceph_mds_session *s) { dout("mdsc put_session %p %d -> %d\n", s, - atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1); - if (atomic_dec_and_test(&s->s_ref)) { + refcount_read(&s->s_ref), refcount_read(&s->s_ref)-1); + if (refcount_dec_and_test(&s->s_ref)) { if (s->s_auth.authorizer) ceph_auth_destroy_authorizer(s->s_auth.authorizer); kfree(s); @@ -411,7 +412,7 @@ struct ceph_mds_session *__ceph_lookup_mds_session(struct ceph_mds_client *mdsc, return NULL; session = mdsc->sessions[mds]; dout("lookup_mds_session %p %d\n", session, - atomic_read(&session->s_ref)); + refcount_read(&session->s_ref)); get_session(session); return session; } @@ -441,7 +442,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, { struct ceph_mds_session *s; - if (mds >= mdsc->mdsmap->m_max_mds) + if (mds >= mdsc->mdsmap->m_num_mds) return ERR_PTR(-EINVAL); s = kzalloc(sizeof(*s), GFP_NOFS); @@ -466,7 +467,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, INIT_LIST_HEAD(&s->s_caps); s->s_nr_caps = 0; s->s_trim_caps = 0; - atomic_set(&s->s_ref, 1); + refcount_set(&s->s_ref, 1); INIT_LIST_HEAD(&s->s_waiting); INIT_LIST_HEAD(&s->s_unsafe); s->s_num_cap_releases = 0; @@ -494,7 +495,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, } mdsc->sessions[mds] = s; atomic_inc(&mdsc->num_sessions); - atomic_inc(&s->s_ref); /* one ref to sessions[], one to caller */ + refcount_inc(&s->s_ref); /* one ref to sessions[], one to caller */ ceph_con_open(&s->s_con, CEPH_ENTITY_TYPE_MDS, mds, 
ceph_mdsmap_get_addr(mdsc->mdsmap, mds)); @@ -1004,7 +1005,7 @@ static void __open_export_target_sessions(struct ceph_mds_client *mdsc, struct ceph_mds_session *ts; int i, mds = session->s_mds; - if (mds >= mdsc->mdsmap->m_max_mds) + if (mds >= mdsc->mdsmap->m_num_mds) return; mi = &mdsc->mdsmap->m_info[mds]; @@ -1551,9 +1552,15 @@ void ceph_send_cap_releases(struct ceph_mds_client *mdsc, struct ceph_msg *msg = NULL; struct ceph_mds_cap_release *head; struct ceph_mds_cap_item *item; + struct ceph_osd_client *osdc = &mdsc->fsc->client->osdc; struct ceph_cap *cap; LIST_HEAD(tmp_list); int num_cap_releases; + __le32 barrier, *cap_barrier; + + down_read(&osdc->lock); + barrier = cpu_to_le32(osdc->epoch_barrier); + up_read(&osdc->lock); spin_lock(&session->s_cap_lock); again: @@ -1571,7 +1578,11 @@ again: head = msg->front.iov_base; head->num = cpu_to_le32(0); msg->front.iov_len = sizeof(*head); + + msg->hdr.version = cpu_to_le16(2); + msg->hdr.compat_version = cpu_to_le16(1); } + cap = list_first_entry(&tmp_list, struct ceph_cap, session_caps); list_del(&cap->session_caps); @@ -1589,6 +1600,11 @@ again: ceph_put_cap(mdsc, cap); if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) { + // Append cap_barrier field + cap_barrier = msg->front.iov_base + msg->front.iov_len; + *cap_barrier = barrier; + msg->front.iov_len += sizeof(*cap_barrier); + msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); dout("send_cap_releases mds%d %p\n", session->s_mds, msg); ceph_con_send(&session->s_con, msg); @@ -1604,6 +1620,11 @@ again: spin_unlock(&session->s_cap_lock); if (msg) { + // Append cap_barrier field + cap_barrier = msg->front.iov_base + msg->front.iov_len; + *cap_barrier = barrier; + msg->front.iov_len += sizeof(*cap_barrier); + msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); dout("send_cap_releases mds%d %p\n", session->s_mds, msg); ceph_con_send(&session->s_con, msg); @@ -1666,7 +1687,6 @@ struct ceph_mds_request * ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode) { struct ceph_mds_request *req = kzalloc(sizeof(*req), GFP_NOFS); - struct timespec ts; if (!req) return ERR_PTR(-ENOMEM); @@ -1685,8 +1705,7 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode) init_completion(&req->r_safe_completion); INIT_LIST_HEAD(&req->r_unsafe_item); - ktime_get_real_ts(&ts); - req->r_stamp = timespec_trunc(ts, mdsc->fsc->sb->s_time_gran); + req->r_stamp = timespec_trunc(current_kernel_time(), mdsc->fsc->sb->s_time_gran); req->r_op = op; req->r_direct_mode = mode; @@ -1993,7 +2012,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, if (req->r_pagelist) { struct ceph_pagelist *pagelist = req->r_pagelist; - atomic_inc(&pagelist->refcnt); + refcount_inc(&pagelist->refcnt); ceph_msg_data_add_pagelist(msg, pagelist); msg->hdr.data_len = cpu_to_le32(pagelist->length); } else { @@ -2640,8 +2659,10 @@ static void handle_session(struct ceph_mds_session *session, seq = le64_to_cpu(h->seq); mutex_lock(&mdsc->mutex); - if (op == CEPH_SESSION_CLOSE) + if (op == CEPH_SESSION_CLOSE) { + get_session(session); __unregister_session(mdsc, session); + } /* FIXME: this ttl calculation is generous */ session->s_ttl = jiffies + HZ*mdsc->mdsmap->m_session_autoclose; mutex_unlock(&mdsc->mutex); @@ -2730,6 +2751,8 @@ static void handle_session(struct ceph_mds_session *session, kick_requests(mdsc, mds); mutex_unlock(&mdsc->mutex); } + if (op == CEPH_SESSION_CLOSE) + ceph_put_mds_session(session); return; bad: @@ -3109,7 +3132,7 @@ static void check_new_map(struct 
ceph_mds_client *mdsc, dout("check_new_map new %u old %u\n", newmap->m_epoch, oldmap->m_epoch); - for (i = 0; i < oldmap->m_max_mds && i < mdsc->max_sessions; i++) { + for (i = 0; i < oldmap->m_num_mds && i < mdsc->max_sessions; i++) { if (mdsc->sessions[i] == NULL) continue; s = mdsc->sessions[i]; @@ -3123,15 +3146,33 @@ static void check_new_map(struct ceph_mds_client *mdsc, ceph_mdsmap_is_laggy(newmap, i) ? " (laggy)" : "", ceph_session_state_name(s->s_state)); - if (i >= newmap->m_max_mds || + if (i >= newmap->m_num_mds || memcmp(ceph_mdsmap_get_addr(oldmap, i), ceph_mdsmap_get_addr(newmap, i), sizeof(struct ceph_entity_addr))) { if (s->s_state == CEPH_MDS_SESSION_OPENING) { /* the session never opened, just close it * out now */ + get_session(s); + __unregister_session(mdsc, s); __wake_requests(mdsc, &s->s_waiting); + ceph_put_mds_session(s); + } else if (i >= newmap->m_num_mds) { + /* force close session for stopped mds */ + get_session(s); __unregister_session(mdsc, s); + __wake_requests(mdsc, &s->s_waiting); + kick_requests(mdsc, i); + mutex_unlock(&mdsc->mutex); + + mutex_lock(&s->s_mutex); + cleanup_session_requests(mdsc, s); + remove_session_caps(s); + mutex_unlock(&s->s_mutex); + + ceph_put_mds_session(s); + + mutex_lock(&mdsc->mutex); } else { /* just close it */ mutex_unlock(&mdsc->mutex); @@ -3169,7 +3210,7 @@ static void check_new_map(struct ceph_mds_client *mdsc, } } - for (i = 0; i < newmap->m_max_mds && i < mdsc->max_sessions; i++) { + for (i = 0; i < newmap->m_num_mds && i < mdsc->max_sessions; i++) { s = mdsc->sessions[i]; if (!s) continue; @@ -3883,7 +3924,7 @@ static struct ceph_connection *con_get(struct ceph_connection *con) struct ceph_mds_session *s = con->private; if (get_session(s)) { - dout("mdsc con_get %p ok (%d)\n", s, atomic_read(&s->s_ref)); + dout("mdsc con_get %p ok (%d)\n", s, refcount_read(&s->s_ref)); return con; } dout("mdsc con_get %p FAIL\n", s); @@ -3894,7 +3935,7 @@ static void con_put(struct ceph_connection *con) { struct ceph_mds_session *s = con->private; - dout("mdsc con_put %p (%d)\n", s, atomic_read(&s->s_ref) - 1); + dout("mdsc con_put %p (%d)\n", s, refcount_read(&s->s_ref) - 1); ceph_put_mds_session(s); } diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index ac0475a2daa7..db57ae98ed34 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -7,6 +7,7 @@ #include <linux/mutex.h> #include <linux/rbtree.h> #include <linux/spinlock.h> +#include <linux/refcount.h> #include <linux/ceph/types.h> #include <linux/ceph/messenger.h> @@ -82,9 +83,10 @@ struct ceph_mds_reply_info_parsed { struct ceph_mds_reply_dirfrag *dir_dir; size_t dir_buf_size; int dir_nr; - bool dir_complete; bool dir_end; + bool dir_complete; bool hash_order; + bool offset_hash; struct ceph_mds_reply_dir_entry *dir_entries; }; @@ -104,10 +106,13 @@ struct ceph_mds_reply_info_parsed { /* * cap releases are batched and sent to the MDS en masse. + * + * Account for per-message overhead of mds_cap_release header + * and __le32 for osd epoch barrier trailing field. 
*/ -#define CEPH_CAPS_PER_RELEASE ((PAGE_SIZE - \ +#define CEPH_CAPS_PER_RELEASE ((PAGE_SIZE - sizeof(u32) - \ sizeof(struct ceph_mds_cap_release)) / \ - sizeof(struct ceph_mds_cap_item)) + sizeof(struct ceph_mds_cap_item)) /* @@ -156,7 +161,7 @@ struct ceph_mds_session { unsigned long s_renew_requested; /* last time we sent a renew req */ u64 s_renew_seq; - atomic_t s_ref; + refcount_t s_ref; struct list_head s_waiting; /* waiting requests */ struct list_head s_unsafe; /* unsafe requests */ }; @@ -373,7 +378,7 @@ __ceph_lookup_mds_session(struct ceph_mds_client *, int mds); static inline struct ceph_mds_session * ceph_get_mds_session(struct ceph_mds_session *s) { - atomic_inc(&s->s_ref); + refcount_inc(&s->s_ref); return s; } diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index 5454e2327a5f..1a748cf88535 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -22,11 +22,11 @@ int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m) int i; /* special case for one mds */ - if (1 == m->m_max_mds && m->m_info[0].state > 0) + if (1 == m->m_num_mds && m->m_info[0].state > 0) return 0; /* count */ - for (i = 0; i < m->m_max_mds; i++) + for (i = 0; i < m->m_num_mds; i++) if (m->m_info[i].state > 0) n++; if (n == 0) @@ -135,8 +135,9 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) m->m_session_autoclose = ceph_decode_32(p); m->m_max_file_size = ceph_decode_64(p); m->m_max_mds = ceph_decode_32(p); + m->m_num_mds = m->m_max_mds; - m->m_info = kcalloc(m->m_max_mds, sizeof(*m->m_info), GFP_NOFS); + m->m_info = kcalloc(m->m_num_mds, sizeof(*m->m_info), GFP_NOFS); if (m->m_info == NULL) goto nomem; @@ -207,9 +208,20 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) ceph_pr_addr(&addr.in_addr), ceph_mds_state_name(state)); - if (mds < 0 || mds >= m->m_max_mds || state <= 0) + if (mds < 0 || state <= 0) continue; + if (mds >= m->m_num_mds) { + int new_num = max(mds + 1, m->m_num_mds * 2); + void *new_m_info = krealloc(m->m_info, + new_num * sizeof(*m->m_info), + GFP_NOFS | __GFP_ZERO); + if (!new_m_info) + goto nomem; + m->m_info = new_m_info; + m->m_num_mds = new_num; + } + info = &m->m_info[mds]; info->global_id = global_id; info->state = state; @@ -229,6 +241,14 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) info->export_targets = NULL; } } + if (m->m_num_mds > m->m_max_mds) { + /* find max up mds */ + for (i = m->m_num_mds; i >= m->m_max_mds; i--) { + if (i == 0 || m->m_info[i-1].state > 0) + break; + } + m->m_num_mds = i; + } /* pg_pools */ ceph_decode_32_safe(p, end, n, bad); @@ -270,12 +290,22 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) for (i = 0; i < n; i++) { s32 mds = ceph_decode_32(p); - if (mds >= 0 && mds < m->m_max_mds) { + if (mds >= 0 && mds < m->m_num_mds) { if (m->m_info[mds].laggy) num_laggy++; } } m->m_num_laggy = num_laggy; + + if (n > m->m_num_mds) { + void *new_m_info = krealloc(m->m_info, + n * sizeof(*m->m_info), + GFP_NOFS | __GFP_ZERO); + if (!new_m_info) + goto nomem; + m->m_info = new_m_info; + } + m->m_num_mds = n; } /* inc */ @@ -341,7 +371,7 @@ void ceph_mdsmap_destroy(struct ceph_mdsmap *m) { int i; - for (i = 0; i < m->m_max_mds; i++) + for (i = 0; i < m->m_num_mds; i++) kfree(m->m_info[i].export_targets); kfree(m->m_info); kfree(m->m_data_pg_pools); @@ -357,7 +387,7 @@ bool ceph_mdsmap_is_cluster_available(struct ceph_mdsmap *m) return false; if (m->m_num_laggy > 0) return false; - for (i = 0; i < m->m_max_mds; i++) { + for (i = 0; i < m->m_num_mds; i++) { if (m->m_info[i].state == CEPH_MDS_STATE_ACTIVE) 
nr_active++; } diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 8f8b41c2ef0f..dab5d6732345 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -519,7 +519,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) capsnap->need_flush ? "" : "no_flush"); ihold(inode); - atomic_set(&capsnap->nref, 1); + refcount_set(&capsnap->nref, 1); INIT_LIST_HEAD(&capsnap->ci_item); capsnap->follows = old_snapc->seq; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index a8c81b2052ca..8d7918ce694a 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -544,10 +544,6 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, struct ceph_options *opt) { struct ceph_fs_client *fsc; - const u64 supported_features = - CEPH_FEATURE_FLOCK | CEPH_FEATURE_DIRLAYOUTHASH | - CEPH_FEATURE_MDSENC | CEPH_FEATURE_MDS_INLINE_DATA; - const u64 required_features = 0; int page_count; size_t size; int err = -ENOMEM; @@ -556,8 +552,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, if (!fsc) return ERR_PTR(-ENOMEM); - fsc->client = ceph_create_client(opt, fsc, supported_features, - required_features); + fsc->client = ceph_create_client(opt, fsc); if (IS_ERR(fsc->client)) { err = PTR_ERR(fsc->client); goto fail; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 176186b12457..a973acd8beaf 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -14,6 +14,7 @@ #include <linux/writeback.h> #include <linux/slab.h> #include <linux/posix_acl.h> +#include <linux/refcount.h> #include <linux/ceph/libceph.h> @@ -160,7 +161,7 @@ struct ceph_cap_flush { * data before flushing the snapped state (tracked here) back to the MDS. */ struct ceph_cap_snap { - atomic_t nref; + refcount_t nref; struct list_head ci_item; struct ceph_cap_flush cap_flush; @@ -189,7 +190,7 @@ struct ceph_cap_snap { static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap) { - if (atomic_dec_and_test(&capsnap->nref)) { + if (refcount_dec_and_test(&capsnap->nref)) { if (capsnap->xattr_blob) ceph_buffer_put(capsnap->xattr_blob); kfree(capsnap); @@ -471,6 +472,32 @@ static inline struct inode *ceph_find_inode(struct super_block *sb, #define CEPH_I_CAP_DROPPED (1 << 8) /* caps were forcibly dropped */ #define CEPH_I_KICK_FLUSH (1 << 9) /* kick flushing caps */ #define CEPH_I_FLUSH_SNAPS (1 << 10) /* need flush snapss */ +#define CEPH_I_ERROR_WRITE (1 << 11) /* have seen write errors */ + +/* + * We set the ERROR_WRITE bit when we start seeing write errors on an inode + * and then clear it when they start succeeding. Note that we do a lockless + * check first, and only take the lock if it looks like it needs to be changed. + * The write submission code just takes this as a hint, so we're not too + * worried if a few slip through in either direction. 
+ */ +static inline void ceph_set_error_write(struct ceph_inode_info *ci) +{ + if (!(READ_ONCE(ci->i_ceph_flags) & CEPH_I_ERROR_WRITE)) { + spin_lock(&ci->i_ceph_lock); + ci->i_ceph_flags |= CEPH_I_ERROR_WRITE; + spin_unlock(&ci->i_ceph_lock); + } +} + +static inline void ceph_clear_error_write(struct ceph_inode_info *ci) +{ + if (READ_ONCE(ci->i_ceph_flags) & CEPH_I_ERROR_WRITE) { + spin_lock(&ci->i_ceph_lock); + ci->i_ceph_flags &= ~CEPH_I_ERROR_WRITE; + spin_unlock(&ci->i_ceph_lock); + } +} static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci, long long release_count, diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index febc28f9e2c2..75267cdd5dfd 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -392,6 +392,7 @@ static int __set_xattr(struct ceph_inode_info *ci, if (update_xattr) { int err = 0; + if (xattr && (flags & XATTR_CREATE)) err = -EEXIST; else if (!xattr && (flags & XATTR_REPLACE)) @@ -399,12 +400,14 @@ static int __set_xattr(struct ceph_inode_info *ci, if (err) { kfree(name); kfree(val); + kfree(*newxattr); return err; } if (update_xattr < 0) { if (xattr) __remove_xattr(ci, xattr); kfree(name); + kfree(*newxattr); return 0; } } diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 15bac390dff9..b98436f5c7c7 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -1135,20 +1135,19 @@ cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, u32 acllen = 0; int rc = 0; struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); - struct cifs_tcon *tcon; + struct smb_version_operations *ops; cifs_dbg(NOISY, "converting ACL to mode for %s\n", path); if (IS_ERR(tlink)) return PTR_ERR(tlink); - tcon = tlink_tcon(tlink); - if (pfid && (tcon->ses->server->ops->get_acl_by_fid)) - pntsd = tcon->ses->server->ops->get_acl_by_fid(cifs_sb, pfid, - &acllen); - else if (tcon->ses->server->ops->get_acl) - pntsd = tcon->ses->server->ops->get_acl(cifs_sb, inode, path, - &acllen); + ops = tlink_tcon(tlink)->ses->server->ops; + + if (pfid && (ops->get_acl_by_fid)) + pntsd = ops->get_acl_by_fid(cifs_sb, pfid, &acllen); + else if (ops->get_acl) + pntsd = ops->get_acl(cifs_sb, inode, path, &acllen); else { cifs_put_tlink(tlink); return -EOPNOTSUPP; @@ -1181,23 +1180,23 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode, struct cifs_ntsd *pnntsd = NULL; /* modified acl to be sent to server */ struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); - struct cifs_tcon *tcon; + struct smb_version_operations *ops; if (IS_ERR(tlink)) return PTR_ERR(tlink); - tcon = tlink_tcon(tlink); + + ops = tlink_tcon(tlink)->ses->server->ops; cifs_dbg(NOISY, "set ACL from mode for %s\n", path); /* Get the security descriptor */ - if (tcon->ses->server->ops->get_acl == NULL) { + if (ops->get_acl == NULL) { cifs_put_tlink(tlink); return -EOPNOTSUPP; } - pntsd = tcon->ses->server->ops->get_acl(cifs_sb, inode, path, - &secdesclen); + pntsd = ops->get_acl(cifs_sb, inode, path, &secdesclen); if (IS_ERR(pntsd)) { rc = PTR_ERR(pntsd); cifs_dbg(VFS, "%s: error %d getting sec desc\n", __func__, rc); @@ -1224,13 +1223,12 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode, cifs_dbg(NOISY, "build_sec_desc rc: %d\n", rc); - if (tcon->ses->server->ops->set_acl == NULL) + if (ops->set_acl == NULL) rc = -EOPNOTSUPP; if (!rc) { /* Set the security descriptor */ - rc = tcon->ses->server->ops->set_acl(pnntsd, secdesclen, inode, - path, aclflag); + rc = ops->set_acl(pnntsd, secdesclen, inode, path, aclflag); 
cifs_dbg(NOISY, "set_cifs_acl rc: %d\n", rc); } cifs_put_tlink(tlink); diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 8be55be70faf..bcc7d9acad64 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -418,7 +418,7 @@ struct smb_version_operations { int (*validate_negotiate)(const unsigned int, struct cifs_tcon *); ssize_t (*query_all_EAs)(const unsigned int, struct cifs_tcon *, const unsigned char *, const unsigned char *, char *, - size_t, const struct nls_table *, int); + size_t, struct cifs_sb_info *); int (*set_EA)(const unsigned int, struct cifs_tcon *, const char *, const char *, const void *, const __u16, const struct nls_table *, int); diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index e49958c3f8bb..6eb3147132e3 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -480,8 +480,7 @@ extern int CIFSSMBCopy(unsigned int xid, extern ssize_t CIFSSMBQAllEAs(const unsigned int xid, struct cifs_tcon *tcon, const unsigned char *searchName, const unsigned char *ea_name, char *EAData, - size_t bufsize, const struct nls_table *nls_codepage, - int remap_special_chars); + size_t bufsize, struct cifs_sb_info *cifs_sb); extern int CIFSSMBSetEA(const unsigned int xid, struct cifs_tcon *tcon, const char *fileName, const char *ea_name, const void *ea_value, const __u16 ea_value_len, diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 4c01b3f9abf0..fbb0d4cbda41 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -697,9 +697,7 @@ cifs_echo_callback(struct mid_q_entry *mid) { struct TCP_Server_Info *server = mid->callback_data; - mutex_lock(&server->srv_mutex); DeleteMidQEntry(mid); - mutex_unlock(&server->srv_mutex); add_credits(server, 1, CIFS_ECHO_OP); } @@ -1599,9 +1597,7 @@ cifs_readv_callback(struct mid_q_entry *mid) } queue_work(cifsiod_wq, &rdata->work); - mutex_lock(&server->srv_mutex); DeleteMidQEntry(mid); - mutex_unlock(&server->srv_mutex); add_credits(server, 1, 0); } @@ -2058,7 +2054,6 @@ cifs_writev_callback(struct mid_q_entry *mid) { struct cifs_writedata *wdata = mid->callback_data; struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink); - struct TCP_Server_Info *server = tcon->ses->server; unsigned int written; WRITE_RSP *smb = (WRITE_RSP *)mid->resp_buf; @@ -2095,9 +2090,7 @@ cifs_writev_callback(struct mid_q_entry *mid) } queue_work(cifsiod_wq, &wdata->work); - mutex_lock(&server->srv_mutex); DeleteMidQEntry(mid); - mutex_unlock(&server->srv_mutex); add_credits(tcon->ses->server, 1, 0); } @@ -6076,11 +6069,13 @@ ssize_t CIFSSMBQAllEAs(const unsigned int xid, struct cifs_tcon *tcon, const unsigned char *searchName, const unsigned char *ea_name, char *EAData, size_t buf_size, - const struct nls_table *nls_codepage, int remap) + struct cifs_sb_info *cifs_sb) { /* BB assumes one setup word */ TRANSACTION2_QPI_REQ *pSMB = NULL; TRANSACTION2_QPI_RSP *pSMBr = NULL; + int remap = cifs_remap(cifs_sb); + struct nls_table *nls_codepage = cifs_sb->local_nls; int rc = 0; int bytes_returned; int list_len; diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 6ef78ad838e6..fcef70602b27 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -582,7 +582,7 @@ cifs_relock_file(struct cifsFileInfo *cfile) struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); int rc = 0; - down_read(&cinode->lock_sem); + down_read_nested(&cinode->lock_sem, SINGLE_DEPTH_NESTING); if (cinode->can_cache_brlcks) { /* can cache locks - no need to relock */ up_read(&cinode->lock_sem); @@ -3271,7 +3271,7 @@ ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to) if 
(!is_sync_kiocb(iocb)) ctx->iocb = iocb; - if (to->type & ITER_IOVEC) + if (to->type == ITER_IOVEC) ctx->should_dirty = true; rc = setup_aio_ctx_iter(ctx, to, READ); diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index c3b2fa0b2ec8..a8693632235f 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -24,6 +24,7 @@ #include <linux/pagemap.h> #include <linux/freezer.h> #include <linux/sched/signal.h> +#include <linux/wait_bit.h> #include <asm/div64.h> #include "cifsfs.h" @@ -563,8 +564,7 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path, rc = tcon->ses->server->ops->query_all_EAs(xid, tcon, path, "SETFILEBITS", ea_value, 4 /* size of buf */, - cifs_sb->local_nls, - cifs_remap(cifs_sb)); + cifs_sb); cifs_put_tlink(tlink); if (rc < 0) return (int)rc; diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index b08531977daa..3b147dc6af63 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -810,7 +810,7 @@ setup_aio_ctx_iter(struct cifs_aio_ctx *ctx, struct iov_iter *iter, int rw) if (!pages) { pages = vmalloc(max_pages * sizeof(struct page *)); - if (!bv) { + if (!pages) { kvfree(bv); return -ENOMEM; } diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index 27bc360c7ffd..a723df3e0197 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -849,8 +849,13 @@ cifs_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid *fid, __u16 search_flags, struct cifs_search_info *srch_inf) { - return CIFSFindFirst(xid, tcon, path, cifs_sb, - &fid->netfid, search_flags, srch_inf, true); + int rc; + + rc = CIFSFindFirst(xid, tcon, path, cifs_sb, + &fid->netfid, search_flags, srch_inf, true); + if (rc) + cifs_dbg(FYI, "find first failed=%d\n", rc); + return rc; } static int diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index c58691834eb2..7e48561abd29 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -982,7 +982,7 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon, rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL); kfree(utf16_path); if (rc) { - cifs_dbg(VFS, "open dir failed\n"); + cifs_dbg(FYI, "open dir failed rc=%d\n", rc); return rc; } @@ -992,7 +992,7 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon, rc = SMB2_query_directory(xid, tcon, fid->persistent_fid, fid->volatile_fid, 0, srch_inf); if (rc) { - cifs_dbg(VFS, "query directory failed\n"); + cifs_dbg(FYI, "query directory failed rc=%d\n", rc); SMB2_close(xid, tcon, fid->persistent_fid, fid->volatile_fid); } return rc; @@ -1809,7 +1809,8 @@ crypt_message(struct TCP_Server_Info *server, struct smb_rqst *rqst, int enc) sg = init_sg(rqst, sign); if (!sg) { - cifs_dbg(VFS, "%s: Failed to init sg %d", __func__, rc); + cifs_dbg(VFS, "%s: Failed to init sg", __func__); + rc = -ENOMEM; goto free_req; } @@ -1817,6 +1818,7 @@ crypt_message(struct TCP_Server_Info *server, struct smb_rqst *rqst, int enc) iv = kzalloc(iv_len, GFP_KERNEL); if (!iv) { cifs_dbg(VFS, "%s: Failed to alloc IV", __func__); + rc = -ENOMEM; goto free_sg; } iv[0] = 3; diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 48ff7703b919..e4afdaae743f 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -1240,15 +1240,19 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, goto tcon_exit; } - if (rsp->ShareType & SMB2_SHARE_TYPE_DISK) + switch (rsp->ShareType) { + case SMB2_SHARE_TYPE_DISK: cifs_dbg(FYI, "connection to disk share\n"); - else if (rsp->ShareType & SMB2_SHARE_TYPE_PIPE) { + break; + case SMB2_SHARE_TYPE_PIPE: tcon->ipc = true; 
cifs_dbg(FYI, "connection to pipe share\n"); - } else if (rsp->ShareType & SMB2_SHARE_TYPE_PRINT) { - tcon->print = true; + break; + case SMB2_SHARE_TYPE_PRINT: + tcon->ipc = true; cifs_dbg(FYI, "connection to printer\n"); - } else { + break; + default: cifs_dbg(VFS, "unknown share type %d\n", rsp->ShareType); rc = -EOPNOTSUPP; goto tcon_error_exit; @@ -2173,9 +2177,7 @@ smb2_echo_callback(struct mid_q_entry *mid) if (mid->mid_state == MID_RESPONSE_RECEIVED) credits_received = le16_to_cpu(rsp->hdr.sync_hdr.CreditRequest); - mutex_lock(&server->srv_mutex); DeleteMidQEntry(mid); - mutex_unlock(&server->srv_mutex); add_credits(server, credits_received, CIFS_ECHO_OP); } @@ -2433,9 +2435,7 @@ smb2_readv_callback(struct mid_q_entry *mid) cifs_stats_fail_inc(tcon, SMB2_READ_HE); queue_work(cifsiod_wq, &rdata->work); - mutex_lock(&server->srv_mutex); DeleteMidQEntry(mid); - mutex_unlock(&server->srv_mutex); add_credits(server, credits_received, 0); } @@ -2594,7 +2594,6 @@ smb2_writev_callback(struct mid_q_entry *mid) { struct cifs_writedata *wdata = mid->callback_data; struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink); - struct TCP_Server_Info *server = tcon->ses->server; unsigned int written; struct smb2_write_rsp *rsp = (struct smb2_write_rsp *)mid->resp_buf; unsigned int credits_received = 1; @@ -2634,9 +2633,7 @@ smb2_writev_callback(struct mid_q_entry *mid) cifs_stats_fail_inc(tcon, SMB2_WRITE_HE); queue_work(cifsiod_wq, &wdata->work); - mutex_lock(&server->srv_mutex); DeleteMidQEntry(mid); - mutex_unlock(&server->srv_mutex); add_credits(tcon->ses->server, credits_received, 0); } diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 4d64b5b8fc9c..47a125ece11e 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -94,7 +94,7 @@ DeleteMidQEntry(struct mid_q_entry *midEntry) now = jiffies; /* commands taking longer than one second are indications that something is wrong, unless it is quite a slow link or server */ - if ((now - midEntry->when_alloc) > HZ) { + if (time_after(now, midEntry->when_alloc + HZ)) { if ((cifsFYI & CIFS_TIMER) && (midEntry->command != command)) { pr_debug(" CIFS slow rsp: cmd %d mid %llu", midEntry->command, midEntry->mid); @@ -613,9 +613,7 @@ cifs_sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server) } spin_unlock(&GlobalMid_Lock); - mutex_lock(&server->srv_mutex); DeleteMidQEntry(mid); - mutex_unlock(&server->srv_mutex); return rc; } diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index 20af5187ba63..de50e749ff05 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -188,8 +188,6 @@ static int cifs_creation_time_get(struct dentry *dentry, struct inode *inode, pcreatetime = (__u64 *)value; *pcreatetime = CIFS_I(inode)->createtime; return sizeof(__u64); - - return rc; } @@ -235,8 +233,7 @@ static int cifs_xattr_get(const struct xattr_handler *handler, if (pTcon->ses->server->ops->query_all_EAs) rc = pTcon->ses->server->ops->query_all_EAs(xid, pTcon, - full_path, name, value, size, - cifs_sb->local_nls, cifs_remap(cifs_sb)); + full_path, name, value, size, cifs_sb); break; case XATTR_CIFS_ACL: { @@ -336,8 +333,7 @@ ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size) if (pTcon->ses->server->ops->query_all_EAs) rc = pTcon->ses->server->ops->query_all_EAs(xid, pTcon, - full_path, NULL, data, buf_size, - cifs_sb->local_nls, cifs_remap(cifs_sb)); + full_path, NULL, data, buf_size, cifs_sb); list_ea_exit: kfree(full_path); free_xid(xid); diff --git a/fs/coda/file.c b/fs/coda/file.c index 9d956cd6d46f..363402fcb3ed 
100644 --- a/fs/coda/file.c +++ b/fs/coda/file.c @@ -34,7 +34,7 @@ coda_file_read_iter(struct kiocb *iocb, struct iov_iter *to) BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); - return vfs_iter_read(cfi->cfi_container, to, &iocb->ki_pos); + return vfs_iter_read(cfi->cfi_container, to, &iocb->ki_pos, 0); } static ssize_t @@ -51,7 +51,7 @@ coda_file_write_iter(struct kiocb *iocb, struct iov_iter *to) host_file = cfi->cfi_container; file_start_write(host_file); inode_lock(coda_inode); - ret = vfs_iter_write(cfi->cfi_container, to, &iocb->ki_pos); + ret = vfs_iter_write(cfi->cfi_container, to, &iocb->ki_pos, 0); coda_inode->i_size = file_inode(host_file)->i_size; coda_inode->i_blocks = (coda_inode->i_size + 511) >> 9; coda_inode->i_mtime = coda_inode->i_ctime = current_time(coda_inode); diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 6116d5275a3e..112b3e1e20e3 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -866,8 +866,6 @@ COMPATIBLE_IOCTL(TIOCGDEV) COMPATIBLE_IOCTL(TIOCCBRK) COMPATIBLE_IOCTL(TIOCGSID) COMPATIBLE_IOCTL(TIOCGICOUNT) -COMPATIBLE_IOCTL(TIOCGPKT) -COMPATIBLE_IOCTL(TIOCGPTLCK) COMPATIBLE_IOCTL(TIOCGEXCL) /* Little t */ COMPATIBLE_IOCTL(TIOCGETD) @@ -883,16 +881,12 @@ COMPATIBLE_IOCTL(TIOCMGET) COMPATIBLE_IOCTL(TIOCMBIC) COMPATIBLE_IOCTL(TIOCMBIS) COMPATIBLE_IOCTL(TIOCMSET) -COMPATIBLE_IOCTL(TIOCPKT) COMPATIBLE_IOCTL(TIOCNOTTY) COMPATIBLE_IOCTL(TIOCSTI) COMPATIBLE_IOCTL(TIOCOUTQ) COMPATIBLE_IOCTL(TIOCSPGRP) COMPATIBLE_IOCTL(TIOCGPGRP) -COMPATIBLE_IOCTL(TIOCGPTN) -COMPATIBLE_IOCTL(TIOCSPTLCK) COMPATIBLE_IOCTL(TIOCSERGETLSR) -COMPATIBLE_IOCTL(TIOCSIG) #ifdef TIOCSRS485 COMPATIBLE_IOCTL(TIOCSRS485) #endif diff --git a/fs/configfs/item.c b/fs/configfs/item.c index 8b2a994042dd..a66f6624d899 100644 --- a/fs/configfs/item.c +++ b/fs/configfs/item.c @@ -138,6 +138,14 @@ struct config_item *config_item_get(struct config_item *item) } EXPORT_SYMBOL(config_item_get); +struct config_item *config_item_get_unless_zero(struct config_item *item) +{ + if (item && kref_get_unless_zero(&item->ci_kref)) + return item; + return NULL; +} +EXPORT_SYMBOL(config_item_get_unless_zero); + static void config_item_cleanup(struct config_item *item) { struct config_item_type *t = item->ci_type; diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c index a6ab012a2c6a..c8aabba502f6 100644 --- a/fs/configfs/symlink.c +++ b/fs/configfs/symlink.c @@ -83,14 +83,13 @@ static int create_link(struct config_item *parent_item, ret = -ENOMEM; sl = kmalloc(sizeof(struct configfs_symlink), GFP_KERNEL); if (sl) { - sl->sl_target = config_item_get(item); spin_lock(&configfs_dirent_lock); if (target_sd->s_type & CONFIGFS_USET_DROPPING) { spin_unlock(&configfs_dirent_lock); - config_item_put(item); kfree(sl); return -ENOENT; } + sl->sl_target = config_item_get(item); list_add(&sl->sl_list, &target_sd->s_links); spin_unlock(&configfs_dirent_lock); ret = configfs_create_link(sl, parent_item->ci_dentry, diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c index a409a84f1bca..6181e9526860 100644 --- a/fs/crypto/bio.c +++ b/fs/crypto/bio.c @@ -129,7 +129,7 @@ int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, goto errout; } err = submit_bio_wait(bio); - if ((err == 0) && bio->bi_error) + if (err == 0 && bio->bi_status) err = -EIO; bio_put(bio); if (err) @@ -84,7 +84,7 @@ struct exceptional_entry_key { }; struct wait_exceptional_entry_queue { - wait_queue_t wait; + wait_queue_entry_t wait; struct exceptional_entry_key key; }; @@ -108,7 +108,7 @@ static wait_queue_head_t *dax_entry_waitqueue(struct 
address_space *mapping, return wait_table + hash; } -static int wake_exceptional_entry_func(wait_queue_t *wait, unsigned int mode, +static int wake_exceptional_entry_func(wait_queue_entry_t *wait, unsigned int mode, int sync, void *keyp) { struct exceptional_entry_key *key = keyp; @@ -461,35 +461,6 @@ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index) } /* - * Invalidate exceptional DAX entry if easily possible. This handles DAX - * entries for invalidate_inode_pages() so we evict the entry only if we can - * do so without blocking. - */ -int dax_invalidate_mapping_entry(struct address_space *mapping, pgoff_t index) -{ - int ret = 0; - void *entry, **slot; - struct radix_tree_root *page_tree = &mapping->page_tree; - - spin_lock_irq(&mapping->tree_lock); - entry = __radix_tree_lookup(page_tree, index, NULL, &slot); - if (!entry || !radix_tree_exceptional_entry(entry) || - slot_locked(mapping, slot)) - goto out; - if (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) || - radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE)) - goto out; - radix_tree_delete(page_tree, index); - mapping->nrexceptional--; - ret = 1; -out: - spin_unlock_irq(&mapping->tree_lock); - if (ret) - dax_wake_mapping_entry_waiter(mapping, index, entry, true); - return ret; -} - -/* * Invalidate exceptional DAX entry if it is clean. */ int dax_invalidate_mapping_entry_sync(struct address_space *mapping, @@ -888,6 +859,7 @@ int dax_writeback_mapping_range(struct address_space *mapping, if (ret < 0) goto out; } + start_index = indices[pvec.nr - 1] + 1; } out: put_dax(dax_dev); @@ -993,12 +965,12 @@ int __dax_zero_page_range(struct block_device *bdev, void *kaddr; pfn_t pfn; - rc = bdev_dax_pgoff(bdev, sector, size, &pgoff); + rc = bdev_dax_pgoff(bdev, sector, PAGE_SIZE, &pgoff); if (rc) return rc; id = dax_read_lock(); - rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, + rc = dax_direct_access(dax_dev, pgoff, 1, &kaddr, &pfn); if (rc < 0) { dax_read_unlock(id); @@ -1044,7 +1016,7 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, * into page tables. We have to tear down these mappings so that data * written by write(2) is visible in mmap. */ - if ((iomap->flags & IOMAP_F_NEW) && inode->i_mapping->nrpages) { + if (iomap->flags & IOMAP_F_NEW) { invalidate_inode_pages2_range(inode->i_mapping, pos >> PAGE_SHIFT, (end - 1) >> PAGE_SHIFT); @@ -1177,6 +1149,23 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page) flags |= IOMAP_WRITE; + entry = grab_mapping_entry(mapping, vmf->pgoff, 0); + if (IS_ERR(entry)) { + vmf_ret = dax_fault_return(PTR_ERR(entry)); + goto out; + } + + /* + * It is possible, particularly with mixed reads & writes to private + * mappings, that we have raced with a PMD fault that overlaps with + * the PTE we need to set up. If so just return and the fault will be + * retried. + */ + if (pmd_trans_huge(*vmf->pmd) || pmd_devmap(*vmf->pmd)) { + vmf_ret = VM_FAULT_NOPAGE; + goto unlock_entry; + } + /* * Note that we don't bother to use iomap_apply here: DAX required * the file system block size to be equal the page size, which means @@ -1185,17 +1174,11 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap); if (error) { vmf_ret = dax_fault_return(error); - goto out; + goto unlock_entry; } if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) { - vmf_ret = dax_fault_return(-EIO); /* fs corruption? 
*/ - goto finish_iomap; - } - - entry = grab_mapping_entry(mapping, vmf->pgoff, 0); - if (IS_ERR(entry)) { - vmf_ret = dax_fault_return(PTR_ERR(entry)); - goto finish_iomap; + error = -EIO; /* fs corruption? */ + goto error_finish_iomap; } sector = dax_iomap_sector(&iomap, pos); @@ -1217,13 +1200,13 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, } if (error) - goto error_unlock_entry; + goto error_finish_iomap; __SetPageUptodate(vmf->cow_page); vmf_ret = finish_fault(vmf); if (!vmf_ret) vmf_ret = VM_FAULT_DONE_COW; - goto unlock_entry; + goto finish_iomap; } switch (iomap.type) { @@ -1243,7 +1226,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, case IOMAP_HOLE: if (!(vmf->flags & FAULT_FLAG_WRITE)) { vmf_ret = dax_load_hole(mapping, &entry, vmf); - goto unlock_entry; + goto finish_iomap; } /*FALLTHRU*/ default: @@ -1252,10 +1235,8 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, break; } - error_unlock_entry: + error_finish_iomap: vmf_ret = dax_fault_return(error) | major; - unlock_entry: - put_locked_mapping_entry(mapping, vmf->pgoff, entry); finish_iomap: if (ops->iomap_end) { int copied = PAGE_SIZE; @@ -1270,7 +1251,9 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, */ ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap); } -out: + unlock_entry: + put_locked_mapping_entry(mapping, vmf->pgoff, entry); + out: trace_dax_pte_fault_done(inode, vmf, vmf_ret); return vmf_ret; } @@ -1417,6 +1400,28 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, goto fallback; /* + * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX + * PMD or a HZP entry. If it can't (because a 4k page is already in + * the tree, for instance), it will return -EEXIST and we just fall + * back to 4k entries. + */ + entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD); + if (IS_ERR(entry)) + goto fallback; + + /* + * It is possible, particularly with mixed reads & writes to private + * mappings, that we have raced with a PTE fault that overlaps with + * the PMD we need to set up. If so just return and the fault will be + * retried. + */ + if (!pmd_none(*vmf->pmd) && !pmd_trans_huge(*vmf->pmd) && + !pmd_devmap(*vmf->pmd)) { + result = 0; + goto unlock_entry; + } + + /* * Note that we don't use iomap_apply here. We aren't doing I/O, only * setting up a mapping, so really we're using iomap_begin() as a way * to look up our filesystem block. @@ -1424,21 +1429,11 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, pos = (loff_t)pgoff << PAGE_SHIFT; error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap); if (error) - goto fallback; + goto unlock_entry; if (iomap.offset + iomap.length < pos + PMD_SIZE) goto finish_iomap; - /* - * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX - * PMD or a HZP entry. If it can't (because a 4k page is already in - * the tree, for instance), it will return -EEXIST and we just fall - * back to 4k entries. 
- */ - entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD); - if (IS_ERR(entry)) - goto finish_iomap; - switch (iomap.type) { case IOMAP_MAPPED: result = dax_pmd_insert_mapping(vmf, &iomap, pos, &entry); @@ -1446,7 +1441,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, case IOMAP_UNWRITTEN: case IOMAP_HOLE: if (WARN_ON_ONCE(write)) - goto unlock_entry; + break; result = dax_pmd_load_hole(vmf, &iomap, &entry); break; default: @@ -1454,8 +1449,6 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, break; } - unlock_entry: - put_locked_mapping_entry(mapping, pgoff, entry); finish_iomap: if (ops->iomap_end) { int copied = PMD_SIZE; @@ -1471,6 +1464,8 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, ops->iomap_end(inode, pos, PMD_SIZE, copied, iomap_flags, &iomap); } + unlock_entry: + put_locked_mapping_entry(mapping, pgoff, entry); fallback: if (result == VM_FAULT_FALLBACK) { split_huge_pmd(vma, vmf->pmd, vmf->address); diff --git a/fs/dcache.c b/fs/dcache.c index cddf39777835..a9f995f6859e 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1494,7 +1494,7 @@ static void check_and_drop(void *_data) { struct detach_data *data = _data; - if (!data->mountpoint && !data->select.found) + if (!data->mountpoint && list_empty(&data->select.dispose)) __d_drop(data->select.start); } @@ -1536,17 +1536,15 @@ void d_invalidate(struct dentry *dentry) d_walk(dentry, &data, detach_and_collect, check_and_drop); - if (data.select.found) + if (!list_empty(&data.select.dispose)) shrink_dentry_list(&data.select.dispose); + else if (!data.mountpoint) + return; if (data.mountpoint) { detach_mounts(data.mountpoint); dput(data.mountpoint); } - - if (!data.mountpoint && !data.select.found) - break; - cond_resched(); } } diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 354e2ab62031..6dabc4a10396 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -9,7 +9,7 @@ * 2 as published by the Free Software Foundation. * * debugfs is for people to use instead of /proc or /sys. - * See Documentation/DocBook/filesystems for more details. + * See Documentation/filesystems/ for more details. * */ diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index e892ae7d89f8..77440e4aa9d4 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -9,7 +9,7 @@ * 2 as published by the Free Software Foundation. * * debugfs is for people to use instead of /proc or /sys. - * See Documentation/DocBook/kernel-api for more details. + * See ./Documentation/core-api/kernel-api.rst for more details. * */ diff --git a/fs/direct-io.c b/fs/direct-io.c index a04ebea77de8..08cf27811e5a 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -294,7 +294,7 @@ static void dio_aio_complete_work(struct work_struct *work) dio_complete(dio, 0, true); } -static int dio_bio_complete(struct dio *dio, struct bio *bio); +static blk_status_t dio_bio_complete(struct dio *dio, struct bio *bio); /* * Asynchronous IO callback. @@ -348,13 +348,12 @@ static void dio_bio_end_io(struct bio *bio) /** * dio_end_io - handle the end io action for the given bio * @bio: The direct io bio thats being completed - * @error: Error if there was one * * This is meant to be called by any filesystem that uses their own dio_submit_t * so that the DIO specific endio actions are dealt with after the filesystem * has done it's completion work. 
*/ -void dio_end_io(struct bio *bio, int error) +void dio_end_io(struct bio *bio) { struct dio *dio = bio->bi_private; @@ -386,6 +385,8 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio, else bio->bi_end_io = dio_bio_end_io; + bio->bi_write_hint = dio->iocb->ki_hint; + sdio->bio = bio; sdio->logical_offset_in_bio = sdio->cur_page_fs_offset; } @@ -474,17 +475,20 @@ static struct bio *dio_await_one(struct dio *dio) /* * Process one completed BIO. No locks are held. */ -static int dio_bio_complete(struct dio *dio, struct bio *bio) +static blk_status_t dio_bio_complete(struct dio *dio, struct bio *bio) { struct bio_vec *bvec; unsigned i; - int err; + blk_status_t err = bio->bi_status; - if (bio->bi_error) - dio->io_error = -EIO; + if (err) { + if (err == BLK_STS_AGAIN && (bio->bi_opf & REQ_NOWAIT)) + dio->io_error = -EAGAIN; + else + dio->io_error = -EIO; + } if (dio->is_async && dio->op == REQ_OP_READ && dio->should_dirty) { - err = bio->bi_error; bio_check_pages_dirty(bio); /* transfers ownership */ } else { bio_for_each_segment_all(bvec, bio, i) { @@ -495,7 +499,6 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio) set_page_dirty_lock(page); put_page(page); } - err = bio->bi_error; bio_put(bio); } return err; @@ -539,7 +542,7 @@ static inline int dio_bio_reap(struct dio *dio, struct dio_submit *sdio) bio = dio->bio_list; dio->bio_list = bio->bi_private; spin_unlock_irqrestore(&dio->bio_lock, flags); - ret2 = dio_bio_complete(dio, bio); + ret2 = blk_status_to_errno(dio_bio_complete(dio, bio)); if (ret == 0) ret = ret2; } @@ -1197,6 +1200,8 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, if (iov_iter_rw(iter) == WRITE) { dio->op = REQ_OP_WRITE; dio->op_flags = REQ_SYNC | REQ_IDLE; + if (iocb->ki_flags & IOCB_NOWAIT) + dio->op_flags |= REQ_NOWAIT; } else { dio->op = REQ_OP_READ; } diff --git a/fs/eventfd.c b/fs/eventfd.c index 68b9fffcb2c8..2fb4eadaa118 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -191,7 +191,7 @@ static void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt) * This is used to atomically remove a wait queue entry from the eventfd wait * queue head, and read/reset the counter value. */ -int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_t *wait, +int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait, __u64 *cnt) { unsigned long flags; @@ -215,8 +215,8 @@ EXPORT_SYMBOL_GPL(eventfd_ctx_remove_wait_queue); * * Returns %0 if successful, or the following error codes: * - * -EAGAIN : The operation would have blocked but @no_wait was non-zero. - * -ERESTARTSYS : A signal interrupted the wait operation. + * - -EAGAIN : The operation would have blocked but @no_wait was non-zero. + * - -ERESTARTSYS : A signal interrupted the wait operation. * * If @no_wait is zero, the function might sleep until the eventfd internal * counter becomes greater than zero. diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 5420767c9b68..b1c8e23ddf65 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -244,7 +244,7 @@ struct eppoll_entry { * Wait queue item that will be linked to the target file wait * queue head. 
*/ - wait_queue_t wait; + wait_queue_entry_t wait; /* The wait queue head that linked the "wait" wait queue item */ wait_queue_head_t *whead; @@ -347,13 +347,13 @@ static inline int ep_is_linked(struct list_head *p) return !list_empty(p); } -static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_t *p) +static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_entry_t *p) { return container_of(p, struct eppoll_entry, wait); } /* Get the "struct epitem" from a wait queue pointer */ -static inline struct epitem *ep_item_from_wait(wait_queue_t *p) +static inline struct epitem *ep_item_from_wait(wait_queue_entry_t *p) { return container_of(p, struct eppoll_entry, wait)->base; } @@ -1078,7 +1078,7 @@ static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd) * mechanism. It is called by the stored file descriptors when they * have events to report. */ -static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key) +static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { int pwake = 0; unsigned long flags; @@ -1094,7 +1094,7 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k * can't use __remove_wait_queue(). whead->lock is held by * the caller. */ - list_del_init(&wait->task_list); + list_del_init(&wait->entry); } spin_lock_irqsave(&ep->lock, flags); @@ -1699,7 +1699,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, int res = 0, eavail, timed_out = 0; unsigned long flags; u64 slack = 0; - wait_queue_t wait; + wait_queue_entry_t wait; ktime_t expires, *to = NULL; if (timeout > 0) { diff --git a/fs/exec.c b/fs/exec.c index 72934df68471..904199086490 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -220,8 +220,26 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, if (write) { unsigned long size = bprm->vma->vm_end - bprm->vma->vm_start; + unsigned long ptr_size; struct rlimit *rlim; + /* + * Since the stack will hold pointers to the strings, we + * must account for them as well. + * + * The size calculation is the entire vma while each arg page is + * built, so each time we get here it's calculating how far it + * is currently (rather than each call being just the newly + * added size from the arg page). As a result, we need to + * always add the entire size of the pointers, so that on the + * last call to get_arg_page() we'll actually have the entire + * correct size. + */ + ptr_size = (bprm->argc + bprm->envc) * sizeof(void *); + if (ptr_size > ULONG_MAX - size) + goto fail; + size += ptr_size; + acct_arg_size(bprm, size / PAGE_SIZE); /* @@ -239,13 +257,15 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, * to work from. 
*/ rlim = current->signal->rlim; - if (size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur) / 4) { - put_page(page); - return NULL; - } + if (size > READ_ONCE(rlim[RLIMIT_STACK].rlim_cur) / 4) + goto fail; } return page; + +fail: + put_page(page); + return NULL; } static void put_arg_page(struct page *page) diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 26d77f9f8c12..2dcbd5698884 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -817,7 +817,7 @@ static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length, iomap->bdev = bdev; iomap->offset = (u64)first_block << blkbits; if (blk_queue_dax(bdev->bd_queue)) - iomap->dax_dev = dax_get_by_host(bdev->bd_disk->disk_name); + iomap->dax_dev = fs_dax_get_by_host(bdev->bd_disk->disk_name); else iomap->dax_dev = NULL; @@ -841,7 +841,7 @@ static int ext2_iomap_end(struct inode *inode, loff_t offset, loff_t length, ssize_t written, unsigned flags, struct iomap *iomap) { - put_dax(iomap->dax_dev); + fs_put_dax(iomap->dax_dev); if (iomap->type == IOMAP_MAPPED && written < length && (flags & IOMAP_WRITE)) diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 8ac673c71a36..9c2028b50e5c 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -32,6 +32,7 @@ #include <linux/log2.h> #include <linux/quotaops.h> #include <linux/uaccess.h> +#include <linux/dax.h> #include "ext2.h" #include "xattr.h" #include "acl.h" diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index fd389935ecd1..3ec0e46de95f 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -4,6 +4,7 @@ * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de> */ +#include <linux/quotaops.h> #include "ext4_jbd2.h" #include "ext4.h" #include "xattr.h" @@ -232,6 +233,9 @@ ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type) handle_t *handle; int error, retries = 0; + error = dquot_initialize(inode); + if (error) + return error; retry: handle = ext4_journal_start(inode, EXT4_HT_XATTR, ext4_jbd2_credits_xattr(inode)); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 8e8046104f4d..32191548abed 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2523,7 +2523,6 @@ extern int ext4_search_dir(struct buffer_head *bh, int buf_size, struct inode *dir, struct ext4_filename *fname, - const struct qstr *d_name, unsigned int offset, struct ext4_dir_entry_2 **res_dir); extern int ext4_generic_delete_entry(handle_t *handle, @@ -3007,7 +3006,6 @@ extern int htree_inlinedir_to_tree(struct file *dir_file, int *has_inline_data); extern struct buffer_head *ext4_find_inline_entry(struct inode *dir, struct ext4_filename *fname, - const struct qstr *d_name, struct ext4_dir_entry_2 **res_dir, int *has_inline_data); extern int ext4_delete_inline_entry(handle_t *handle, diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 2a97dff87b96..3e36508610b7 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3413,13 +3413,13 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, struct ext4_sb_info *sbi; struct ext4_extent_header *eh; struct ext4_map_blocks split_map; - struct ext4_extent zero_ex; + struct ext4_extent zero_ex1, zero_ex2; struct ext4_extent *ex, *abut_ex; ext4_lblk_t ee_block, eof_block; unsigned int ee_len, depth, map_len = map->m_len; int allocated = 0, max_zeroout = 0; int err = 0; - int split_flag = 0; + int split_flag = EXT4_EXT_DATA_VALID2; ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical" "block %llu, max_blocks %u\n", inode->i_ino, @@ -3436,7 +3436,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, ex = path[depth].p_ext; ee_block = 
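An aside on the get_arg_page() change above: one pointer slot per argv/envp entry is added to the running stack size, but only after checking that the addition cannot wrap around ULONG_MAX, and the total is then compared against RLIMIT_STACK/4. A minimal userspace sketch of the same overflow-safe accumulation pattern, with illustrative names:

#include <errno.h>
#include <limits.h>
#include <stdio.h>

/* Add "extra" to "*total" only if the sum cannot wrap around ULONG_MAX. */
static int accumulate_checked(unsigned long *total, unsigned long extra)
{
	if (extra > ULONG_MAX - *total)
		return -EOVERFLOW;	/* would wrap; reject, as get_arg_page() does */
	*total += extra;
	return 0;
}

int main(void)
{
	unsigned long size = 4096;				/* bytes of argument strings so far */
	unsigned long ptr_size = 1000 * sizeof(void *);		/* argv/envp pointer slots */

	if (accumulate_checked(&size, ptr_size))
		fprintf(stderr, "argument area would overflow\n");
	else
		printf("accounted size: %lu bytes\n", size);
	return 0;
}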
le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); - zero_ex.ee_len = 0; + zero_ex1.ee_len = 0; + zero_ex2.ee_len = 0; trace_ext4_ext_convert_to_initialized_enter(inode, map, ex); @@ -3576,62 +3577,52 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, if (ext4_encrypted_inode(inode)) max_zeroout = 0; - /* If extent is less than s_max_zeroout_kb, zeroout directly */ - if (max_zeroout && (ee_len <= max_zeroout)) { - err = ext4_ext_zeroout(inode, ex); - if (err) - goto out; - zero_ex.ee_block = ex->ee_block; - zero_ex.ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)); - ext4_ext_store_pblock(&zero_ex, ext4_ext_pblock(ex)); - - err = ext4_ext_get_access(handle, inode, path + depth); - if (err) - goto out; - ext4_ext_mark_initialized(ex); - ext4_ext_try_to_merge(handle, inode, path, ex); - err = ext4_ext_dirty(handle, inode, path + path->p_depth); - goto out; - } - /* - * four cases: + * five cases: * 1. split the extent into three extents. - * 2. split the extent into two extents, zeroout the first half. - * 3. split the extent into two extents, zeroout the second half. + * 2. split the extent into two extents, zeroout the head of the first + * extent. + * 3. split the extent into two extents, zeroout the tail of the second + * extent. * 4. split the extent into two extents with out zeroout. + * 5. no splitting needed, just possibly zeroout the head and / or the + * tail of the extent. */ split_map.m_lblk = map->m_lblk; split_map.m_len = map->m_len; - if (max_zeroout && (allocated > map->m_len)) { + if (max_zeroout && (allocated > split_map.m_len)) { if (allocated <= max_zeroout) { - /* case 3 */ - zero_ex.ee_block = - cpu_to_le32(map->m_lblk); - zero_ex.ee_len = cpu_to_le16(allocated); - ext4_ext_store_pblock(&zero_ex, - ext4_ext_pblock(ex) + map->m_lblk - ee_block); - err = ext4_ext_zeroout(inode, &zero_ex); + /* case 3 or 5 */ + zero_ex1.ee_block = + cpu_to_le32(split_map.m_lblk + + split_map.m_len); + zero_ex1.ee_len = + cpu_to_le16(allocated - split_map.m_len); + ext4_ext_store_pblock(&zero_ex1, + ext4_ext_pblock(ex) + split_map.m_lblk + + split_map.m_len - ee_block); + err = ext4_ext_zeroout(inode, &zero_ex1); if (err) goto out; - split_map.m_lblk = map->m_lblk; split_map.m_len = allocated; - } else if (map->m_lblk - ee_block + map->m_len < max_zeroout) { - /* case 2 */ - if (map->m_lblk != ee_block) { - zero_ex.ee_block = ex->ee_block; - zero_ex.ee_len = cpu_to_le16(map->m_lblk - + } + if (split_map.m_lblk - ee_block + split_map.m_len < + max_zeroout) { + /* case 2 or 5 */ + if (split_map.m_lblk != ee_block) { + zero_ex2.ee_block = ex->ee_block; + zero_ex2.ee_len = cpu_to_le16(split_map.m_lblk - ee_block); - ext4_ext_store_pblock(&zero_ex, + ext4_ext_store_pblock(&zero_ex2, ext4_ext_pblock(ex)); - err = ext4_ext_zeroout(inode, &zero_ex); + err = ext4_ext_zeroout(inode, &zero_ex2); if (err) goto out; } + split_map.m_len += split_map.m_lblk - ee_block; split_map.m_lblk = ee_block; - split_map.m_len = map->m_lblk - ee_block + map->m_len; allocated = map->m_len; } } @@ -3642,8 +3633,11 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, err = 0; out: /* If we have gotten a failure, don't zero out status tree */ - if (!err) - err = ext4_zeroout_es(inode, &zero_ex); + if (!err) { + err = ext4_zeroout_es(inode, &zero_ex1); + if (!err) + err = ext4_zeroout_es(inode, &zero_ex2); + } return err ? 
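A worked example of the new case 5 above, with made-up numbers: suppose the unwritten extent spans logical blocks 100-119 (ee_block = 100, ee_len = 20), the write covers blocks 104-109 (m_lblk = 104, m_len = 6), so allocated (the blocks from the write's start to the end of the extent) is 16, and max_zeroout is 32. Since allocated (16) exceeds m_len (6) and still fits within max_zeroout, zero_ex1 zeroes the tail 110-119 and split_map grows to cover 104-119; then, because 4 + 16 < 32, zero_ex2 zeroes the head 100-103 and split_map becomes 100-119. The whole extent can then be marked initialized without any split.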
err : allocated; } @@ -4883,6 +4877,8 @@ static long ext4_zero_range(struct file *file, loff_t offset, /* Zero out partial block at the edges of the range */ ret = ext4_zero_partial_blocks(handle, inode, offset, len); + if (ret >= 0) + ext4_update_inode_fsync_trans(handle, inode, 1); if (file->f_flags & O_SYNC) ext4_handle_sync(handle); @@ -5569,6 +5565,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) ext4_handle_sync(handle); inode->i_mtime = inode->i_ctime = current_time(inode); ext4_mark_inode_dirty(handle, inode); + ext4_update_inode_fsync_trans(handle, inode, 1); out_stop: ext4_journal_stop(handle); @@ -5742,6 +5739,8 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) up_write(&EXT4_I(inode)->i_data_sem); if (IS_SYNC(inode)) ext4_handle_sync(handle); + if (ret >= 0) + ext4_update_inode_fsync_trans(handle, inode, 1); out_stop: ext4_journal_stop(handle); diff --git a/fs/ext4/file.c b/fs/ext4/file.c index cefa9835f275..58e2eeaa0bc4 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -37,7 +37,11 @@ static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to) struct inode *inode = file_inode(iocb->ki_filp); ssize_t ret; - inode_lock_shared(inode); + if (!inode_trylock_shared(inode)) { + if (iocb->ki_flags & IOCB_NOWAIT) + return -EAGAIN; + inode_lock_shared(inode); + } /* * Recheck under inode lock - at this point we are sure it cannot * change anymore @@ -179,7 +183,11 @@ ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from) struct inode *inode = file_inode(iocb->ki_filp); ssize_t ret; - inode_lock(inode); + if (!inode_trylock(inode)) { + if (iocb->ki_flags & IOCB_NOWAIT) + return -EAGAIN; + inode_lock(inode); + } ret = ext4_write_checks(iocb, from); if (ret <= 0) goto out; @@ -216,7 +224,12 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) return ext4_dax_write_iter(iocb, from); #endif - inode_lock(inode); + if (!inode_trylock(inode)) { + if (iocb->ki_flags & IOCB_NOWAIT) + return -EAGAIN; + inode_lock(inode); + } + ret = ext4_write_checks(iocb, from); if (ret <= 0) goto out; @@ -235,9 +248,15 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) iocb->private = &overwrite; /* Check whether we do a DIO overwrite or not */ - if (o_direct && ext4_should_dioread_nolock(inode) && !unaligned_aio && - ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from))) - overwrite = 1; + if (o_direct && !unaligned_aio) { + if (ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from))) { + if (ext4_should_dioread_nolock(inode)) + overwrite = 1; + } else if (iocb->ki_flags & IOCB_NOWAIT) { + ret = -EAGAIN; + goto out; + } + } ret = __generic_file_write_iter(iocb, from); inode_unlock(inode); @@ -257,6 +276,7 @@ static int ext4_dax_huge_fault(struct vm_fault *vmf, enum page_entry_size pe_size) { int result; + handle_t *handle = NULL; struct inode *inode = file_inode(vmf->vma->vm_file); struct super_block *sb = inode->i_sb; bool write = vmf->flags & FAULT_FLAG_WRITE; @@ -264,12 +284,24 @@ static int ext4_dax_huge_fault(struct vm_fault *vmf, if (write) { sb_start_pagefault(sb); file_update_time(vmf->vma->vm_file); + down_read(&EXT4_I(inode)->i_mmap_sem); + handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE, + EXT4_DATA_TRANS_BLOCKS(sb)); + } else { + down_read(&EXT4_I(inode)->i_mmap_sem); } - down_read(&EXT4_I(inode)->i_mmap_sem); - result = dax_iomap_fault(vmf, pe_size, &ext4_iomap_ops); - up_read(&EXT4_I(inode)->i_mmap_sem); - if (write) + if (!IS_ERR(handle)) + result = dax_iomap_fault(vmf, 
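The inode_trylock()/IOCB_NOWAIT pattern added to the ext4 read and write paths above fails fast with -EAGAIN instead of sleeping when the caller asked for nowait I/O. A small userspace analogue using a POSIX mutex, purely illustrative and not the kernel API:

#include <errno.h>
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

/* Take "lock", but if nowait is set, return -EAGAIN rather than block. */
static int lock_or_eagain(bool nowait)
{
	if (pthread_mutex_trylock(&lock) != 0) {
		if (nowait)
			return -EAGAIN;
		pthread_mutex_lock(&lock);	/* fall back to blocking */
	}
	return 0;
}

int main(void)
{
	int ret = lock_or_eagain(true);

	printf("nowait attempt: %d\n", ret);	/* 0 means we got it without blocking */
	if (ret == 0)
		pthread_mutex_unlock(&lock);
	return 0;
}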
pe_size, &ext4_iomap_ops); + else + result = VM_FAULT_SIGBUS; + if (write) { + if (!IS_ERR(handle)) + ext4_journal_stop(handle); + up_read(&EXT4_I(inode)->i_mmap_sem); sb_end_pagefault(sb); + } else { + up_read(&EXT4_I(inode)->i_mmap_sem); + } return result; } @@ -422,6 +454,10 @@ static int ext4_file_open(struct inode * inode, struct file * filp) if (ret < 0) return ret; } + + /* Set the flags to support nowait AIO */ + filp->f_mode |= FMODE_AIO_NOWAIT; + return dquot_file_open(inode, filp); } @@ -461,57 +497,37 @@ static int ext4_find_unwritten_pgoff(struct inode *inode, endoff = (loff_t)end_blk << blkbits; index = startoff >> PAGE_SHIFT; - end = endoff >> PAGE_SHIFT; + end = (endoff - 1) >> PAGE_SHIFT; pagevec_init(&pvec, 0); do { int i, num; unsigned long nr_pages; - num = min_t(pgoff_t, end - index, PAGEVEC_SIZE); + num = min_t(pgoff_t, end - index, PAGEVEC_SIZE - 1) + 1; nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index, (pgoff_t)num); - if (nr_pages == 0) { - if (whence == SEEK_DATA) - break; - - BUG_ON(whence != SEEK_HOLE); - /* - * If this is the first time to go into the loop and - * offset is not beyond the end offset, it will be a - * hole at this offset - */ - if (lastoff == startoff || lastoff < endoff) - found = 1; + if (nr_pages == 0) break; - } - - /* - * If this is the first time to go into the loop and - * offset is smaller than the first page offset, it will be a - * hole at this offset. - */ - if (lastoff == startoff && whence == SEEK_HOLE && - lastoff < page_offset(pvec.pages[0])) { - found = 1; - break; - } for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; struct buffer_head *bh, *head; /* - * If the current offset is not beyond the end of given - * range, it will be a hole. + * If current offset is smaller than the page offset, + * there is a hole at this offset. */ - if (lastoff < endoff && whence == SEEK_HOLE && - page->index > end) { + if (whence == SEEK_HOLE && lastoff < endoff && + lastoff < page_offset(pvec.pages[i])) { found = 1; *offset = lastoff; goto out; } + if (page->index > end) + goto out; + lock_page(page); if (unlikely(page->mapping != inode->i_mapping)) { @@ -551,20 +567,18 @@ static int ext4_find_unwritten_pgoff(struct inode *inode, unlock_page(page); } - /* - * The no. of pages is less than our desired, that would be a - * hole in there. - */ - if (nr_pages < num && whence == SEEK_HOLE) { - found = 1; - *offset = lastoff; + /* The no. of pages is less than our desired, we are done. 
*/ + if (nr_pages < num) break; - } index = pvec.pages[i - 1]->index + 1; pagevec_release(&pvec); } while (index <= end); + if (whence == SEEK_HOLE && lastoff < endoff) { + found = 1; + *offset = lastoff; + } out: pagevec_release(&pvec); return found; diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index d5dea4c293ef..8d141c0c8ff9 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -1627,7 +1627,6 @@ out: struct buffer_head *ext4_find_inline_entry(struct inode *dir, struct ext4_filename *fname, - const struct qstr *d_name, struct ext4_dir_entry_2 **res_dir, int *has_inline_data) { @@ -1649,7 +1648,7 @@ struct buffer_head *ext4_find_inline_entry(struct inode *dir, EXT4_INLINE_DOTDOT_SIZE; inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE; ret = ext4_search_dir(iloc.bh, inline_start, inline_size, - dir, fname, d_name, 0, res_dir); + dir, fname, 0, res_dir); if (ret == 1) goto out_find; if (ret < 0) @@ -1662,7 +1661,7 @@ struct buffer_head *ext4_find_inline_entry(struct inode *dir, inline_size = ext4_get_inline_size(dir) - EXT4_MIN_INLINE_DATA_SIZE; ret = ext4_search_dir(iloc.bh, inline_start, inline_size, - dir, fname, d_name, 0, res_dir); + dir, fname, 0, res_dir); if (ret == 1) goto out_find; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 5834c4d76be8..5cf82d03968c 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2124,15 +2124,29 @@ static int ext4_writepage(struct page *page, static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page) { int len; - loff_t size = i_size_read(mpd->inode); + loff_t size; int err; BUG_ON(page->index != mpd->first_page); + clear_page_dirty_for_io(page); + /* + * We have to be very careful here! Nothing protects writeback path + * against i_size changes and the page can be writeably mapped into + * page tables. So an application can be growing i_size and writing + * data through mmap while writeback runs. clear_page_dirty_for_io() + * write-protects our page in page tables and the page cannot get + * written to again until we release page lock. So only after + * clear_page_dirty_for_io() we are safe to sample i_size for + * ext4_bio_write_page() to zero-out tail of the written page. We rely + * on the barrier provided by TestClearPageDirty in + * clear_page_dirty_for_io() to make sure i_size is really sampled only + * after page tables are updated. 
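For context, ext4_find_unwritten_pgoff(), reworked above, backs lseek()'s SEEK_DATA/SEEK_HOLE on unwritten ranges. A short userspace probe of that interface (error handling trimmed, file name illustrative):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	int fd = open(argc > 1 ? argv[1] : "testfile", O_RDONLY);
	off_t data, hole;

	if (fd < 0)
		return 1;
	/* First data at or after offset 0, then the hole that follows it. */
	data = lseek(fd, 0, SEEK_DATA);
	hole = lseek(fd, data < 0 ? 0 : data, SEEK_HOLE);
	printf("first data at %lld, next hole at %lld\n",
	       (long long)data, (long long)hole);
	close(fd);
	return 0;
}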
+ */ + size = i_size_read(mpd->inode); if (page->index == size >> PAGE_SHIFT) len = size & ~PAGE_MASK; else len = PAGE_SIZE; - clear_page_dirty_for_io(page); err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc, false); if (!err) mpd->wbc->nr_to_write--; @@ -3412,7 +3426,7 @@ retry: bdev = inode->i_sb->s_bdev; iomap->bdev = bdev; if (blk_queue_dax(bdev->bd_queue)) - iomap->dax_dev = dax_get_by_host(bdev->bd_disk->disk_name); + iomap->dax_dev = fs_dax_get_by_host(bdev->bd_disk->disk_name); else iomap->dax_dev = NULL; iomap->offset = first_block << blkbits; @@ -3447,7 +3461,7 @@ static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length, int blkbits = inode->i_blkbits; bool truncate = false; - put_dax(iomap->dax_dev); + fs_put_dax(iomap->dax_dev); if (!(flags & IOMAP_WRITE) || (flags & IOMAP_FAULT)) return 0; @@ -3629,9 +3643,6 @@ static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter) get_block_func = ext4_dio_get_block_unwritten_async; dio_flags = DIO_LOCKING; } -#ifdef CONFIG_EXT4_FS_ENCRYPTION - BUG_ON(ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode)); -#endif ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter, get_block_func, ext4_end_io_dio, NULL, dio_flags); @@ -3713,7 +3724,7 @@ static ssize_t ext4_direct_IO_read(struct kiocb *iocb, struct iov_iter *iter) */ inode_lock_shared(inode); ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, - iocb->ki_pos + count); + iocb->ki_pos + count - 1); if (ret) goto out_unlock; ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, @@ -4207,6 +4218,8 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) inode->i_mtime = inode->i_ctime = current_time(inode); ext4_mark_inode_dirty(handle, inode); + if (ret >= 0) + ext4_update_inode_fsync_trans(handle, inode, 1); out_stop: ext4_journal_stop(handle); out_dio: @@ -5637,8 +5650,9 @@ static int ext4_expand_extra_isize(struct inode *inode, /* No extended attributes present */ if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) || header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) { - memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0, - new_extra_isize); + memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE + + EXT4_I(inode)->i_extra_isize, 0, + new_extra_isize - EXT4_I(inode)->i_extra_isize); EXT4_I(inode)->i_extra_isize = new_extra_isize; return 0; } diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 5083bce20ac4..b7928cddd539 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3887,7 +3887,8 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, err = ext4_mb_load_buddy(sb, group, &e4b); if (err) { - ext4_error(sb, "Error loading buddy information for %u", group); + ext4_warning(sb, "Error %d loading buddy information for %u", + err, group); put_bh(bitmap_bh); return 0; } @@ -4044,10 +4045,11 @@ repeat: BUG_ON(pa->pa_type != MB_INODE_PA); group = ext4_get_group_number(sb, pa->pa_pstart); - err = ext4_mb_load_buddy(sb, group, &e4b); + err = ext4_mb_load_buddy_gfp(sb, group, &e4b, + GFP_NOFS|__GFP_NOFAIL); if (err) { - ext4_error(sb, "Error loading buddy information for %u", - group); + ext4_error(sb, "Error %d loading buddy information for %u", + err, group); continue; } @@ -4303,11 +4305,14 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb, spin_unlock(&lg->lg_prealloc_lock); list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) { + int err; group = ext4_get_group_number(sb, pa->pa_pstart); - if (ext4_mb_load_buddy(sb, group, &e4b)) { - ext4_error(sb, "Error 
loading buddy information for %u", - group); + err = ext4_mb_load_buddy_gfp(sb, group, &e4b, + GFP_NOFS|__GFP_NOFAIL); + if (err) { + ext4_error(sb, "Error %d loading buddy information for %u", + err, group); continue; } ext4_lock_group(sb, group); @@ -5127,8 +5132,8 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, ret = ext4_mb_load_buddy(sb, group, &e4b); if (ret) { - ext4_error(sb, "Error in loading buddy " - "information for %u", group); + ext4_warning(sb, "Error %d loading buddy information for %u", + ret, group); return ret; } bitmap = e4b.bd_bitmap; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index b81f7d46f344..404256caf9cf 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1155,12 +1155,11 @@ errout: static inline int search_dirblock(struct buffer_head *bh, struct inode *dir, struct ext4_filename *fname, - const struct qstr *d_name, unsigned int offset, struct ext4_dir_entry_2 **res_dir) { return ext4_search_dir(bh, bh->b_data, dir->i_sb->s_blocksize, dir, - fname, d_name, offset, res_dir); + fname, offset, res_dir); } /* @@ -1262,7 +1261,6 @@ static inline bool ext4_match(const struct ext4_filename *fname, */ int ext4_search_dir(struct buffer_head *bh, char *search_buf, int buf_size, struct inode *dir, struct ext4_filename *fname, - const struct qstr *d_name, unsigned int offset, struct ext4_dir_entry_2 **res_dir) { struct ext4_dir_entry_2 * de; @@ -1355,7 +1353,7 @@ static struct buffer_head * ext4_find_entry (struct inode *dir, if (ext4_has_inline_data(dir)) { int has_inline_data = 1; - ret = ext4_find_inline_entry(dir, &fname, d_name, res_dir, + ret = ext4_find_inline_entry(dir, &fname, res_dir, &has_inline_data); if (has_inline_data) { if (inlined) @@ -1447,7 +1445,7 @@ restart: goto next; } set_buffer_verified(bh); - i = search_dirblock(bh, dir, &fname, d_name, + i = search_dirblock(bh, dir, &fname, block << EXT4_BLOCK_SIZE_BITS(sb), res_dir); if (i == 1) { EXT4_I(dir)->i_dir_start_lookup = block; @@ -1488,7 +1486,6 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, { struct super_block * sb = dir->i_sb; struct dx_frame frames[2], *frame; - const struct qstr *d_name = fname->usr_fname; struct buffer_head *bh; ext4_lblk_t block; int retval; @@ -1505,7 +1502,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, if (IS_ERR(bh)) goto errout; - retval = search_dirblock(bh, dir, fname, d_name, + retval = search_dirblock(bh, dir, fname, block << EXT4_BLOCK_SIZE_BITS(sb), res_dir); if (retval == 1) @@ -1530,7 +1527,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, bh = NULL; errout: - dxtrace(printk(KERN_DEBUG "%s not found\n", d_name->name)); + dxtrace(printk(KERN_DEBUG "%s not found\n", fname->usr_fname->name)); success: dx_release(frames); return bh; diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 1a82138ba739..c2fce4478cca 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -85,7 +85,7 @@ static void ext4_finish_bio(struct bio *bio) } #endif - if (bio->bi_error) { + if (bio->bi_status) { SetPageError(page); mapping_set_error(page->mapping, -EIO); } @@ -104,7 +104,7 @@ static void ext4_finish_bio(struct bio *bio) continue; } clear_buffer_async_write(bh); - if (bio->bi_error) + if (bio->bi_status) buffer_io_error(bh); } while ((bh = bh->b_this_page) != head); bit_spin_unlock(BH_Uptodate_Lock, &head->b_state); @@ -303,24 +303,25 @@ static void ext4_end_bio(struct bio *bio) bdevname(bio->bi_bdev, b), (long long) bio->bi_iter.bi_sector, (unsigned) bio_sectors(bio), - bio->bi_error)) { + 
bio->bi_status)) { ext4_finish_bio(bio); bio_put(bio); return; } bio->bi_end_io = NULL; - if (bio->bi_error) { + if (bio->bi_status) { struct inode *inode = io_end->inode; ext4_warning(inode->i_sb, "I/O error %d writing to inode %lu " "(offset %llu size %ld starting block %llu)", - bio->bi_error, inode->i_ino, + bio->bi_status, inode->i_ino, (unsigned long long) io_end->offset, (long) io_end->size, (unsigned long long) bi_sector >> (inode->i_blkbits - 9)); - mapping_set_error(inode->i_mapping, bio->bi_error); + mapping_set_error(inode->i_mapping, + blk_status_to_errno(bio->bi_status)); } if (io_end->flag & EXT4_IO_END_UNWRITTEN) { @@ -349,6 +350,7 @@ void ext4_io_submit(struct ext4_io_submit *io) if (bio) { int io_op_flags = io->io_wbc->sync_mode == WB_SYNC_ALL ? REQ_SYNC : 0; + io->io_bio->bi_write_hint = io->io_end->inode->i_write_hint; bio_set_op_attrs(io->io_bio, REQ_OP_WRITE, io_op_flags); submit_bio(io->io_bio); } @@ -396,6 +398,7 @@ submit_and_retry: ret = io_submit_init_bio(io, bh); if (ret) return ret; + io->io_bio->bi_write_hint = inode->i_write_hint; } ret = bio_add_page(io->io_bio, page, bh->b_size, bh_offset(bh)); if (ret != bh->b_size) diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index a81b829d56de..40a5497b0f60 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -73,7 +73,7 @@ static void mpage_end_io(struct bio *bio) int i; if (ext4_bio_encrypted(bio)) { - if (bio->bi_error) { + if (bio->bi_status) { fscrypt_release_ctx(bio->bi_private); } else { fscrypt_decrypt_bio_pages(bio->bi_private, bio); @@ -83,7 +83,7 @@ static void mpage_end_io(struct bio *bio) bio_for_each_segment_all(bv, bio, i) { struct page *page = bv->bv_page; - if (!bio->bi_error) { + if (!bio->bi_status) { SetPageUptodate(page); } else { ClearPageUptodate(page); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index c90edf09b0c3..9006cb5857b8 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -37,6 +37,7 @@ #include <linux/ctype.h> #include <linux/log2.h> #include <linux/crc16.h> +#include <linux/dax.h> #include <linux/cleancache.h> #include <linux/uaccess.h> @@ -847,14 +848,9 @@ static inline void ext4_quota_off_umount(struct super_block *sb) { int type; - if (ext4_has_feature_quota(sb)) { - dquot_disable(sb, -1, - DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); - } else { - /* Use our quota_off function to clear inode flags etc. */ - for (type = 0; type < EXT4_MAXQUOTAS; type++) - ext4_quota_off(sb, type); - } + /* Use our quota_off function to clear inode flags etc. 
*/ + for (type = 0; type < EXT4_MAXQUOTAS; type++) + ext4_quota_off(sb, type); } #else static inline void ext4_quota_off_umount(struct super_block *sb) @@ -1178,6 +1174,9 @@ static int ext4_set_context(struct inode *inode, const void *ctx, size_t len, return res; } + res = dquot_initialize(inode); + if (res) + return res; retry: handle = ext4_journal_start(inode, EXT4_HT_MISC, ext4_jbd2_credits_xattr(inode)); @@ -3951,7 +3950,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sb->s_qcop = &ext4_qctl_operations; sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ; #endif - memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid)); + memcpy(&sb->s_uuid, es->s_uuid, sizeof(es->s_uuid)); INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ mutex_init(&sbi->s_orphan_lock); @@ -5484,7 +5483,7 @@ static int ext4_quota_off(struct super_block *sb, int type) goto out; err = dquot_quota_off(sb, type); - if (err) + if (err || ext4_has_feature_quota(sb)) goto out_put; inode_lock(inode); @@ -5504,6 +5503,7 @@ static int ext4_quota_off(struct super_block *sb, int type) out_unlock: inode_unlock(inode); out_put: + lockdep_set_quota_inode(inode, I_DATA_SEM_NORMAL); iput(inode); return err; out: diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 8fb7ce14e6eb..5d3c2536641c 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -888,6 +888,8 @@ inserted: else { u32 ref; + WARN_ON_ONCE(dquot_initialize_needed(inode)); + /* The old block is released after updating the inode. */ error = dquot_alloc_block(inode, @@ -954,6 +956,8 @@ inserted: /* We need to allocate a new block */ ext4_fsblk_t goal, block; + WARN_ON_ONCE(dquot_initialize_needed(inode)); + goal = ext4_group_first_block_no(sb, EXT4_I(inode)->i_block_group); @@ -1166,6 +1170,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, return -EINVAL; if (strlen(name) > 255) return -ERANGE; + ext4_write_lock_xattr(inode, &no_expand); error = ext4_reserve_inode_write(handle, inode, &is.iloc); @@ -1267,6 +1272,9 @@ ext4_xattr_set(struct inode *inode, int name_index, const char *name, int error, retries = 0; int credits = ext4_jbd2_credits_xattr(inode); + error = dquot_initialize(inode); + if (error) + return error; retry: handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits); if (IS_ERR(handle)) { diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 7c0f6bdf817d..36fe82012a33 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -58,12 +58,12 @@ static void f2fs_read_end_io(struct bio *bio) #ifdef CONFIG_F2FS_FAULT_INJECTION if (time_to_inject(F2FS_P_SB(bio->bi_io_vec->bv_page), FAULT_IO)) { f2fs_show_injection_info(FAULT_IO); - bio->bi_error = -EIO; + bio->bi_status = BLK_STS_IOERR; } #endif if (f2fs_bio_encrypted(bio)) { - if (bio->bi_error) { + if (bio->bi_status) { fscrypt_release_ctx(bio->bi_private); } else { fscrypt_decrypt_bio_pages(bio->bi_private, bio); @@ -74,7 +74,7 @@ static void f2fs_read_end_io(struct bio *bio) bio_for_each_segment_all(bvec, bio, i) { struct page *page = bvec->bv_page; - if (!bio->bi_error) { + if (!bio->bi_status) { if (!PageUptodate(page)) SetPageUptodate(page); } else { @@ -102,14 +102,14 @@ static void f2fs_write_end_io(struct bio *bio) unlock_page(page); mempool_free(page, sbi->write_io_dummy); - if (unlikely(bio->bi_error)) + if (unlikely(bio->bi_status)) f2fs_stop_checkpoint(sbi, true); continue; } fscrypt_pullback_bio_page(&page, true); - if (unlikely(bio->bi_error)) { + if (unlikely(bio->bi_status)) { mapping_set_error(page->mapping, -EIO); 
f2fs_stop_checkpoint(sbi, true); } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 2185c7a040a1..fd2e651bad6d 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1078,6 +1078,7 @@ static inline u32 f2fs_crc32(struct f2fs_sb_info *sbi, const void *address, { SHASH_DESC_ON_STACK(shash, sbi->s_chksum_driver); u32 *ctx = (u32 *)shash_desc_ctx(shash); + u32 retval; int err; shash->tfm = sbi->s_chksum_driver; @@ -1087,7 +1088,9 @@ static inline u32 f2fs_crc32(struct f2fs_sb_info *sbi, const void *address, err = crypto_shash_update(shash, address, length); BUG_ON(err); - return *ctx; + retval = *ctx; + barrier_data(ctx); + return retval; } static inline bool f2fs_crc_valid(struct f2fs_sb_info *sbi, __u32 blk_crc, diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 96845854e7ee..ea9f455d94ba 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -749,7 +749,7 @@ static void f2fs_submit_discard_endio(struct bio *bio) { struct discard_cmd *dc = (struct discard_cmd *)bio->bi_private; - dc->error = bio->bi_error; + dc->error = blk_status_to_errno(bio->bi_status); dc->state = D_DONE; complete(&dc->wait); bio_put(bio); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 83355ec4a92c..0b89b0b7b9f7 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1937,7 +1937,7 @@ try_onemore: sb->s_time_gran = 1; sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | (test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0); - memcpy(sb->s_uuid, raw_super->uuid, sizeof(raw_super->uuid)); + memcpy(&sb->s_uuid, raw_super->uuid, sizeof(raw_super->uuid)); /* init f2fs-specific super block info */ sbi->valid_super_block = valid_super_block; diff --git a/fs/fcntl.c b/fs/fcntl.c index f4e7267d117f..b6bd89628025 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -109,20 +109,34 @@ void __f_setown(struct file *filp, struct pid *pid, enum pid_type type, } EXPORT_SYMBOL(__f_setown); -void f_setown(struct file *filp, unsigned long arg, int force) +int f_setown(struct file *filp, unsigned long arg, int force) { enum pid_type type; - struct pid *pid; - int who = arg; + struct pid *pid = NULL; + int who = arg, ret = 0; + type = PIDTYPE_PID; if (who < 0) { + /* avoid overflow below */ + if (who == INT_MIN) + return -EINVAL; + type = PIDTYPE_PGID; who = -who; } + rcu_read_lock(); - pid = find_vpid(who); - __f_setown(filp, pid, type, force); + if (who) { + pid = find_vpid(who); + if (!pid) + ret = -ESRCH; + } + + if (!ret) + __f_setown(filp, pid, type, force); rcu_read_unlock(); + + return ret; } EXPORT_SYMBOL(f_setown); @@ -243,9 +257,72 @@ static int f_getowner_uids(struct file *filp, unsigned long arg) } #endif +static bool rw_hint_valid(enum rw_hint hint) +{ + switch (hint) { + case RWF_WRITE_LIFE_NOT_SET: + case RWH_WRITE_LIFE_NONE: + case RWH_WRITE_LIFE_SHORT: + case RWH_WRITE_LIFE_MEDIUM: + case RWH_WRITE_LIFE_LONG: + case RWH_WRITE_LIFE_EXTREME: + return true; + default: + return false; + } +} + +static long fcntl_rw_hint(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct inode *inode = file_inode(file); + u64 *argp = (u64 __user *)arg; + enum rw_hint hint; + u64 h; + + switch (cmd) { + case F_GET_FILE_RW_HINT: + h = file_write_hint(file); + if (copy_to_user(argp, &h, sizeof(*argp))) + return -EFAULT; + return 0; + case F_SET_FILE_RW_HINT: + if (copy_from_user(&h, argp, sizeof(h))) + return -EFAULT; + hint = (enum rw_hint) h; + if (!rw_hint_valid(hint)) + return -EINVAL; + + spin_lock(&file->f_lock); + file->f_write_hint = hint; + spin_unlock(&file->f_lock); + return 0; + case F_GET_RW_HINT: + h = inode->i_write_hint; + if 
(copy_to_user(argp, &h, sizeof(*argp))) + return -EFAULT; + return 0; + case F_SET_RW_HINT: + if (copy_from_user(&h, argp, sizeof(h))) + return -EFAULT; + hint = (enum rw_hint) h; + if (!rw_hint_valid(hint)) + return -EINVAL; + + inode_lock(inode); + inode->i_write_hint = hint; + inode_unlock(inode); + return 0; + default: + return -EINVAL; + } +} + static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, struct file *filp) { + void __user *argp = (void __user *)arg; + struct flock flock; long err = -EINVAL; switch (cmd) { @@ -273,7 +350,11 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, case F_OFD_GETLK: #endif case F_GETLK: - err = fcntl_getlk(filp, cmd, (struct flock __user *) arg); + if (copy_from_user(&flock, argp, sizeof(flock))) + return -EFAULT; + err = fcntl_getlk(filp, cmd, &flock); + if (!err && copy_to_user(argp, &flock, sizeof(flock))) + return -EFAULT; break; #if BITS_PER_LONG != 32 /* 32-bit arches must use fcntl64() */ @@ -283,7 +364,9 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, /* Fallthrough */ case F_SETLK: case F_SETLKW: - err = fcntl_setlk(fd, filp, cmd, (struct flock __user *) arg); + if (copy_from_user(&flock, argp, sizeof(flock))) + return -EFAULT; + err = fcntl_setlk(fd, filp, cmd, &flock); break; case F_GETOWN: /* @@ -297,8 +380,7 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, force_successful_syscall_return(); break; case F_SETOWN: - f_setown(filp, arg, 1); - err = 0; + err = f_setown(filp, arg, 1); break; case F_GETOWN_EX: err = f_getown_ex(filp, arg); @@ -337,6 +419,12 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, case F_GET_SEALS: err = shmem_fcntl(filp, cmd, arg); break; + case F_GET_RW_HINT: + case F_SET_RW_HINT: + case F_GET_FILE_RW_HINT: + case F_SET_FILE_RW_HINT: + err = fcntl_rw_hint(filp, cmd, arg); + break; default: break; } @@ -383,7 +471,9 @@ out: SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd, unsigned long, arg) { + void __user *argp = (void __user *)arg; struct fd f = fdget_raw(fd); + struct flock64 flock; long err = -EBADF; if (!f.file) @@ -401,14 +491,21 @@ SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd, switch (cmd) { case F_GETLK64: case F_OFD_GETLK: - err = fcntl_getlk64(f.file, cmd, (struct flock64 __user *) arg); + err = -EFAULT; + if (copy_from_user(&flock, argp, sizeof(flock))) + break; + err = fcntl_getlk64(f.file, cmd, &flock); + if (!err && copy_to_user(argp, &flock, sizeof(flock))) + err = -EFAULT; break; case F_SETLK64: case F_SETLKW64: case F_OFD_SETLK: case F_OFD_SETLKW: - err = fcntl_setlk64(fd, f.file, cmd, - (struct flock64 __user *) arg); + err = -EFAULT; + if (copy_from_user(&flock, argp, sizeof(flock))) + break; + err = fcntl_setlk64(fd, f.file, cmd, &flock); break; default: err = do_fcntl(fd, cmd, arg, f.file); @@ -422,57 +519,56 @@ out: #endif #ifdef CONFIG_COMPAT +/* careful - don't use anywhere else */ +#define copy_flock_fields(from, to) \ + (to).l_type = (from).l_type; \ + (to).l_whence = (from).l_whence; \ + (to).l_start = (from).l_start; \ + (to).l_len = (from).l_len; \ + (to).l_pid = (from).l_pid; + static int get_compat_flock(struct flock *kfl, struct compat_flock __user *ufl) { - if (!access_ok(VERIFY_READ, ufl, sizeof(*ufl)) || - __get_user(kfl->l_type, &ufl->l_type) || - __get_user(kfl->l_whence, &ufl->l_whence) || - __get_user(kfl->l_start, &ufl->l_start) || - __get_user(kfl->l_len, &ufl->l_len) || - __get_user(kfl->l_pid, &ufl->l_pid)) + struct compat_flock fl; + + if 
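The new F_GET_RW_HINT/F_SET_RW_HINT commands above exchange the hint as a 64-bit value passed by pointer. A hedged userspace sketch follows; the fallback constants mirror the uapi values introduced with this interface and are only needed when the C library headers do not define them:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

#ifndef F_SET_RW_HINT			/* fallback, values from uapi linux/fcntl.h */
#define F_GET_RW_HINT		(1024 + 11)
#define F_SET_RW_HINT		(1024 + 12)
#endif
#ifndef RWH_WRITE_LIFE_SHORT
#define RWH_WRITE_LIFE_SHORT	2
#endif

int main(void)
{
	uint64_t hint = RWH_WRITE_LIFE_SHORT;
	int fd = open("scratch.dat", O_WRONLY | O_CREAT, 0644);	/* illustrative file */

	if (fd < 0)
		return 1;
	/* Advise that data written to this inode is expected to be short-lived. */
	if (fcntl(fd, F_SET_RW_HINT, &hint) < 0)
		perror("F_SET_RW_HINT");

	hint = 0;
	if (fcntl(fd, F_GET_RW_HINT, &hint) == 0)
		printf("inode write hint: %llu\n", (unsigned long long)hint);
	close(fd);
	return 0;
}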
(copy_from_user(&fl, ufl, sizeof(struct compat_flock))) return -EFAULT; + copy_flock_fields(*kfl, fl); return 0; } -static int put_compat_flock(struct flock *kfl, struct compat_flock __user *ufl) +static int get_compat_flock64(struct flock *kfl, struct compat_flock64 __user *ufl) { - if (!access_ok(VERIFY_WRITE, ufl, sizeof(*ufl)) || - __put_user(kfl->l_type, &ufl->l_type) || - __put_user(kfl->l_whence, &ufl->l_whence) || - __put_user(kfl->l_start, &ufl->l_start) || - __put_user(kfl->l_len, &ufl->l_len) || - __put_user(kfl->l_pid, &ufl->l_pid)) + struct compat_flock64 fl; + + if (copy_from_user(&fl, ufl, sizeof(struct compat_flock64))) return -EFAULT; + copy_flock_fields(*kfl, fl); return 0; } -#ifndef HAVE_ARCH_GET_COMPAT_FLOCK64 -static int get_compat_flock64(struct flock *kfl, struct compat_flock64 __user *ufl) +static int put_compat_flock(struct flock *kfl, struct compat_flock __user *ufl) { - if (!access_ok(VERIFY_READ, ufl, sizeof(*ufl)) || - __get_user(kfl->l_type, &ufl->l_type) || - __get_user(kfl->l_whence, &ufl->l_whence) || - __get_user(kfl->l_start, &ufl->l_start) || - __get_user(kfl->l_len, &ufl->l_len) || - __get_user(kfl->l_pid, &ufl->l_pid)) + struct compat_flock fl; + + memset(&fl, 0, sizeof(struct compat_flock)); + copy_flock_fields(fl, *kfl); + if (copy_to_user(ufl, &fl, sizeof(struct compat_flock))) return -EFAULT; return 0; } -#endif -#ifndef HAVE_ARCH_PUT_COMPAT_FLOCK64 static int put_compat_flock64(struct flock *kfl, struct compat_flock64 __user *ufl) { - if (!access_ok(VERIFY_WRITE, ufl, sizeof(*ufl)) || - __put_user(kfl->l_type, &ufl->l_type) || - __put_user(kfl->l_whence, &ufl->l_whence) || - __put_user(kfl->l_start, &ufl->l_start) || - __put_user(kfl->l_len, &ufl->l_len) || - __put_user(kfl->l_pid, &ufl->l_pid)) + struct compat_flock64 fl; + + memset(&fl, 0, sizeof(struct compat_flock64)); + copy_flock_fields(fl, *kfl); + if (copy_to_user(ufl, &fl, sizeof(struct compat_flock64))) return -EFAULT; return 0; } -#endif +#undef copy_flock_fields static unsigned int convert_fcntl_cmd(unsigned int cmd) @@ -489,76 +585,92 @@ convert_fcntl_cmd(unsigned int cmd) return cmd; } +/* + * GETLK was successful and we need to return the data, but it needs to fit in + * the compat structure. + * l_start shouldn't be too big, unless the original start + end is greater than + * COMPAT_OFF_T_MAX, in which case the app was asking for trouble, so we return + * -EOVERFLOW in that case. 
l_len could be too big, in which case we just + * truncate it, and only allow the app to see that part of the conflicting lock + * that might make sense to it anyway + */ +static int fixup_compat_flock(struct flock *flock) +{ + if (flock->l_start > COMPAT_OFF_T_MAX) + return -EOVERFLOW; + if (flock->l_len > COMPAT_OFF_T_MAX) + flock->l_len = COMPAT_OFF_T_MAX; + return 0; +} + COMPAT_SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd, compat_ulong_t, arg) { - mm_segment_t old_fs; - struct flock f; - long ret; - unsigned int conv_cmd; + struct fd f = fdget_raw(fd); + struct flock flock; + long err = -EBADF; + + if (!f.file) + return err; + + if (unlikely(f.file->f_mode & FMODE_PATH)) { + if (!check_fcntl_cmd(cmd)) + goto out_put; + } + + err = security_file_fcntl(f.file, cmd, arg); + if (err) + goto out_put; switch (cmd) { case F_GETLK: + err = get_compat_flock(&flock, compat_ptr(arg)); + if (err) + break; + err = fcntl_getlk(f.file, convert_fcntl_cmd(cmd), &flock); + if (err) + break; + err = fixup_compat_flock(&flock); + if (err) + return err; + err = put_compat_flock(&flock, compat_ptr(arg)); + break; + case F_GETLK64: + case F_OFD_GETLK: + err = get_compat_flock64(&flock, compat_ptr(arg)); + if (err) + break; + err = fcntl_getlk(f.file, convert_fcntl_cmd(cmd), &flock); + if (err) + break; + err = fixup_compat_flock(&flock); + if (err) + return err; + err = put_compat_flock64(&flock, compat_ptr(arg)); + break; case F_SETLK: case F_SETLKW: - ret = get_compat_flock(&f, compat_ptr(arg)); - if (ret != 0) + err = get_compat_flock(&flock, compat_ptr(arg)); + if (err) break; - old_fs = get_fs(); - set_fs(KERNEL_DS); - ret = sys_fcntl(fd, cmd, (unsigned long)&f); - set_fs(old_fs); - if (cmd == F_GETLK && ret == 0) { - /* GETLK was successful and we need to return the data... - * but it needs to fit in the compat structure. - * l_start shouldn't be too big, unless the original - * start + end is greater than COMPAT_OFF_T_MAX, in which - * case the app was asking for trouble, so we return - * -EOVERFLOW in that case. 
- * l_len could be too big, in which case we just truncate it, - * and only allow the app to see that part of the conflicting - * lock that might make sense to it anyway - */ - - if (f.l_start > COMPAT_OFF_T_MAX) - ret = -EOVERFLOW; - if (f.l_len > COMPAT_OFF_T_MAX) - f.l_len = COMPAT_OFF_T_MAX; - if (ret == 0) - ret = put_compat_flock(&f, compat_ptr(arg)); - } + err = fcntl_setlk(fd, f.file, convert_fcntl_cmd(cmd), &flock); break; - - case F_GETLK64: case F_SETLK64: case F_SETLKW64: - case F_OFD_GETLK: case F_OFD_SETLK: case F_OFD_SETLKW: - ret = get_compat_flock64(&f, compat_ptr(arg)); - if (ret != 0) + err = get_compat_flock64(&flock, compat_ptr(arg)); + if (err) break; - old_fs = get_fs(); - set_fs(KERNEL_DS); - conv_cmd = convert_fcntl_cmd(cmd); - ret = sys_fcntl(fd, conv_cmd, (unsigned long)&f); - set_fs(old_fs); - if ((conv_cmd == F_GETLK || conv_cmd == F_OFD_GETLK) && ret == 0) { - /* need to return lock information - see above for commentary */ - if (f.l_start > COMPAT_LOFF_T_MAX) - ret = -EOVERFLOW; - if (f.l_len > COMPAT_LOFF_T_MAX) - f.l_len = COMPAT_LOFF_T_MAX; - if (ret == 0) - ret = put_compat_flock64(&f, compat_ptr(arg)); - } + err = fcntl_setlk(fd, f.file, convert_fcntl_cmd(cmd), &flock); break; - default: - ret = sys_fcntl(fd, cmd, arg); + err = do_fcntl(fd, cmd, arg, f.file); break; } - return ret; +out_put: + fdput(f); + return err; } COMPAT_SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 63ee2940775c..8b426f83909f 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -2052,11 +2052,13 @@ static noinline void block_dump___mark_inode_dirty(struct inode *inode) } /** - * __mark_inode_dirty - internal function - * @inode: inode to mark - * @flags: what kind of dirty (i.e. I_DIRTY_SYNC) - * Mark an inode as dirty. Callers should use mark_inode_dirty or - * mark_inode_dirty_sync. + * __mark_inode_dirty - internal function + * + * @inode: inode to mark + * @flags: what kind of dirty (i.e. I_DIRTY_SYNC) + * + * Mark an inode as dirty. Callers should use mark_inode_dirty or + * mark_inode_dirty_sync. * * Put the inode on the super block's dirty list. 
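Returning to the fcntl() locking paths reworked above: the F_OFD_* commands they serve take an ordinary struct flock from userspace, with l_pid set to 0 for the set operations. A brief, illustrative example (lock file name made up):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	int fd = open("lockfile", O_RDWR | O_CREAT, 0644);
	struct flock fl;

	if (fd < 0)
		return 1;
	memset(&fl, 0, sizeof(fl));
	fl.l_type = F_WRLCK;
	fl.l_whence = SEEK_SET;
	fl.l_start = 0;
	fl.l_len = 0;		/* 0 means "to end of file" */
	fl.l_pid = 0;		/* required to be 0 for OFD locks */

	/* Open file description lock: owned by the open file, not the process. */
	if (fcntl(fd, F_OFD_SETLK, &fl) < 0)
		perror("F_OFD_SETLK");
	else
		puts("got OFD write lock");
	close(fd);
	return 0;
}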
* diff --git a/fs/fs_pin.c b/fs/fs_pin.c index 611b5408f6ec..e747b3d720ee 100644 --- a/fs/fs_pin.c +++ b/fs/fs_pin.c @@ -34,7 +34,7 @@ void pin_insert(struct fs_pin *pin, struct vfsmount *m) void pin_kill(struct fs_pin *p) { - wait_queue_t wait; + wait_queue_entry_t wait; if (!p) { rcu_read_unlock(); @@ -61,7 +61,7 @@ void pin_kill(struct fs_pin *p) rcu_read_unlock(); schedule(); rcu_read_lock(); - if (likely(list_empty(&wait.task_list))) + if (likely(list_empty(&wait.entry))) break; /* OK, we know p couldn't have been freed yet */ spin_lock_irq(&p->wait.lock); diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index c2d7f3a92679..c16d00e53264 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -20,6 +20,7 @@ #include <linux/pipe_fs_i.h> #include <linux/swap.h> #include <linux/splice.h> +#include <linux/sched.h> MODULE_ALIAS_MISCDEV(FUSE_MINOR); MODULE_ALIAS("devname:fuse"); @@ -45,7 +46,7 @@ static void fuse_request_init(struct fuse_req *req, struct page **pages, INIT_LIST_HEAD(&req->list); INIT_LIST_HEAD(&req->intr_entry); init_waitqueue_head(&req->waitq); - atomic_set(&req->count, 1); + refcount_set(&req->count, 1); req->pages = pages; req->page_descs = page_descs; req->max_pages = npages; @@ -102,21 +103,20 @@ void fuse_request_free(struct fuse_req *req) void __fuse_get_request(struct fuse_req *req) { - atomic_inc(&req->count); + refcount_inc(&req->count); } /* Must be called with > 1 refcount */ static void __fuse_put_request(struct fuse_req *req) { - BUG_ON(atomic_read(&req->count) < 2); - atomic_dec(&req->count); + refcount_dec(&req->count); } -static void fuse_req_init_context(struct fuse_req *req) +static void fuse_req_init_context(struct fuse_conn *fc, struct fuse_req *req) { req->in.h.uid = from_kuid_munged(&init_user_ns, current_fsuid()); req->in.h.gid = from_kgid_munged(&init_user_ns, current_fsgid()); - req->in.h.pid = current->pid; + req->in.h.pid = pid_nr_ns(task_pid(current), fc->pid_ns); } void fuse_set_initialized(struct fuse_conn *fc) @@ -163,7 +163,7 @@ static struct fuse_req *__fuse_get_req(struct fuse_conn *fc, unsigned npages, goto out; } - fuse_req_init_context(req); + fuse_req_init_context(fc, req); __set_bit(FR_WAITING, &req->flags); if (for_background) __set_bit(FR_BACKGROUND, &req->flags); @@ -256,7 +256,7 @@ struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc, if (!req) req = get_reserved_req(fc, file); - fuse_req_init_context(req); + fuse_req_init_context(fc, req); __set_bit(FR_WAITING, &req->flags); __clear_bit(FR_BACKGROUND, &req->flags); return req; @@ -264,7 +264,7 @@ struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc, void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req) { - if (atomic_dec_and_test(&req->count)) { + if (refcount_dec_and_test(&req->count)) { if (test_bit(FR_BACKGROUND, &req->flags)) { /* * We get here in the unlikely case that a background @@ -1222,6 +1222,9 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, struct fuse_in *in; unsigned reqsize; + if (task_active_pid_ns(current) != fc->pid_ns) + return -EIO; + restart: spin_lock(&fiq->waitq.lock); err = -EAGAIN; @@ -1820,6 +1823,9 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud, struct fuse_req *req; struct fuse_out_header oh; + if (task_active_pid_ns(current) != fc->pid_ns) + return -EIO; + if (nbytes < sizeof(struct fuse_out_header)) return -EINVAL; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index ec238fb5a584..3ee4fdc3da9e 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -58,7 +58,7 @@ struct fuse_file 
*fuse_file_alloc(struct fuse_conn *fc) } INIT_LIST_HEAD(&ff->write_entry); - atomic_set(&ff->count, 1); + refcount_set(&ff->count, 1); RB_CLEAR_NODE(&ff->polled_node); init_waitqueue_head(&ff->poll_wait); @@ -77,7 +77,7 @@ void fuse_file_free(struct fuse_file *ff) static struct fuse_file *fuse_file_get(struct fuse_file *ff) { - atomic_inc(&ff->count); + refcount_inc(&ff->count); return ff; } @@ -88,7 +88,7 @@ static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req) static void fuse_file_put(struct fuse_file *ff, bool sync) { - if (atomic_dec_and_test(&ff->count)) { + if (refcount_dec_and_test(&ff->count)) { struct fuse_req *req = ff->reserved_req; if (ff->fc->no_open) { @@ -293,7 +293,7 @@ static int fuse_release(struct inode *inode, struct file *file) void fuse_sync_release(struct fuse_file *ff, int flags) { - WARN_ON(atomic_read(&ff->count) != 1); + WARN_ON(refcount_read(&ff->count) > 1); fuse_prepare_release(ff, flags, FUSE_RELEASE); /* * iput(NULL) is a no-op and since the refcount is 1 and everything's @@ -2083,7 +2083,8 @@ static int fuse_direct_mmap(struct file *file, struct vm_area_struct *vma) return generic_file_mmap(file, vma); } -static int convert_fuse_file_lock(const struct fuse_file_lock *ffl, +static int convert_fuse_file_lock(struct fuse_conn *fc, + const struct fuse_file_lock *ffl, struct file_lock *fl) { switch (ffl->type) { @@ -2098,7 +2099,14 @@ static int convert_fuse_file_lock(const struct fuse_file_lock *ffl, fl->fl_start = ffl->start; fl->fl_end = ffl->end; - fl->fl_pid = ffl->pid; + + /* + * Convert pid into the caller's pid namespace. If the pid + * does not map into the namespace fl_pid will get set to 0. + */ + rcu_read_lock(); + fl->fl_pid = pid_vnr(find_pid_ns(ffl->pid, fc->pid_ns)); + rcu_read_unlock(); break; default: @@ -2147,7 +2155,7 @@ static int fuse_getlk(struct file *file, struct file_lock *fl) args.out.args[0].value = &outarg; err = fuse_simple_request(fc, &args); if (!err) - err = convert_fuse_file_lock(&outarg.lk, fl); + err = convert_fuse_file_lock(fc, &outarg.lk, fl); return err; } @@ -2159,7 +2167,8 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock) FUSE_ARGS(args); struct fuse_lk_in inarg; int opcode = (fl->fl_flags & FL_SLEEP) ? FUSE_SETLKW : FUSE_SETLK; - pid_t pid = fl->fl_type != F_UNLCK ? current->tgid : 0; + struct pid *pid = fl->fl_type != F_UNLCK ? 
task_tgid(current) : NULL; + pid_t pid_nr = pid_nr_ns(pid, fc->pid_ns); int err; if (fl->fl_lmops && fl->fl_lmops->lm_grant) { @@ -2168,10 +2177,13 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock) } /* Unlock on close is handled by the flush method */ - if (fl->fl_flags & FL_CLOSE) + if ((fl->fl_flags & FL_CLOSE_POSIX) == FL_CLOSE_POSIX) return 0; - fuse_lk_fill(&args, file, fl, opcode, pid, flock, &inarg); + if (pid && pid_nr == 0) + return -EOVERFLOW; + + fuse_lk_fill(&args, file, fl, opcode, pid_nr, flock, &inarg); err = fuse_simple_request(fc, &args); /* locking is restartable */ diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index f33341d9501a..1bd7ffdad593 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -24,6 +24,8 @@ #include <linux/workqueue.h> #include <linux/kref.h> #include <linux/xattr.h> +#include <linux/pid_namespace.h> +#include <linux/refcount.h> /** Max number of pages that can be used in a single read request */ #define FUSE_MAX_PAGES_PER_REQ 32 @@ -137,7 +139,7 @@ struct fuse_file { u64 nodeid; /** Refcount */ - atomic_t count; + refcount_t count; /** FOPEN_* flags returned by open */ u32 open_flags; @@ -306,7 +308,7 @@ struct fuse_req { struct list_head intr_entry; /** refcount */ - atomic_t count; + refcount_t count; /** Unique ID for the interrupt request */ u64 intr_unique; @@ -448,7 +450,7 @@ struct fuse_conn { spinlock_t lock; /** Refcount */ - atomic_t count; + refcount_t count; /** Number of fuse_dev's */ atomic_t dev_count; @@ -461,6 +463,9 @@ struct fuse_conn { /** The group id for this mount */ kgid_t group_id; + /** The pid namespace for this mount */ + struct pid_namespace *pid_ns; + /** Maximum read size */ unsigned max_read; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 73cf05135252..65c88379a3a1 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -21,6 +21,7 @@ #include <linux/sched.h> #include <linux/exportfs.h> #include <linux/posix_acl.h> +#include <linux/pid_namespace.h> MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>"); MODULE_DESCRIPTION("Filesystem in Userspace"); @@ -601,7 +602,7 @@ void fuse_conn_init(struct fuse_conn *fc) memset(fc, 0, sizeof(*fc)); spin_lock_init(&fc->lock); init_rwsem(&fc->killsb); - atomic_set(&fc->count, 1); + refcount_set(&fc->count, 1); atomic_set(&fc->dev_count, 1); init_waitqueue_head(&fc->blocked_waitq); init_waitqueue_head(&fc->reserved_req_waitq); @@ -619,14 +620,16 @@ void fuse_conn_init(struct fuse_conn *fc) fc->connected = 1; fc->attr_version = 1; get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key)); + fc->pid_ns = get_pid_ns(task_active_pid_ns(current)); } EXPORT_SYMBOL_GPL(fuse_conn_init); void fuse_conn_put(struct fuse_conn *fc) { - if (atomic_dec_and_test(&fc->count)) { + if (refcount_dec_and_test(&fc->count)) { if (fc->destroy_req) fuse_request_free(fc->destroy_req); + put_pid_ns(fc->pid_ns); fc->release(fc); } } @@ -634,7 +637,7 @@ EXPORT_SYMBOL_GPL(fuse_conn_put); struct fuse_conn *fuse_conn_get(struct fuse_conn *fc) { - atomic_inc(&fc->count); + refcount_inc(&fc->count); return fc; } EXPORT_SYMBOL_GPL(fuse_conn_get); @@ -972,8 +975,15 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb) int err; char *suffix = ""; - if (sb->s_bdev) + if (sb->s_bdev) { suffix = "-fuseblk"; + /* + * sb->s_bdi points to blkdev's bdi however we want to redirect + * it to our private bdi... 
+ */ + bdi_put(sb->s_bdi); + sb->s_bdi = &noop_backing_dev_info; + } err = super_setup_bdi_name(sb, "%u:%u%s", MAJOR(fc->dev), MINOR(fc->dev), suffix); if (err) diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 4d810be532dd..9fa3aef9a5b3 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -970,7 +970,7 @@ more_rgrps: continue; bn = be64_to_cpu(*p); if (gfs2_holder_initialized(rd_gh)) { - rgd = (struct gfs2_rgrpd *)rd_gh->gh_gl->gl_object; + rgd = gfs2_glock2rgrp(rd_gh->gh_gl); gfs2_assert_withdraw(sdp, gfs2_glock_is_locked_by_me(rd_gh->gh_gl)); } else { diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 79113219be5f..db427658ccd9 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -1444,7 +1444,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx, "g.offset (%u)\n", (unsigned long long)bh->b_blocknr, entries2, g.offset); - + gfs2_consist_inode(ip); error = -EIO; goto out_free; } @@ -1612,6 +1612,7 @@ int gfs2_dir_read(struct inode *inode, struct dir_context *ctx, (unsigned long long)dip->i_no_addr, dip->i_entries, g.offset); + gfs2_consist_inode(dip); error = -EIO; goto out; } @@ -2031,8 +2032,8 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len, gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE); for (x = 0; x < rlist.rl_rgrps; x++) { - struct gfs2_rgrpd *rgd; - rgd = rlist.rl_ghs[x].gh_gl->gl_object; + struct gfs2_rgrpd *rgd = gfs2_glock2rgrp(rlist.rl_ghs[x].gh_gl); + rg_blocks += rgd->rd_length; } diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 959a19ced4d5..6cd71c50b8bd 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -152,20 +152,34 @@ static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl) spin_unlock(&lru_lock); } -/** - * gfs2_glock_put() - Decrement reference count on glock - * @gl: The glock to put - * +/* + * Enqueue the glock on the work queue. Passes one glock reference on to the + * work queue. */ +static void __gfs2_glock_queue_work(struct gfs2_glock *gl, unsigned long delay) { + if (!queue_delayed_work(glock_workqueue, &gl->gl_work, delay)) { + /* + * We are holding the lockref spinlock, and the work was still + * queued above. The queued work (glock_work_func) takes that + * spinlock before dropping its glock reference(s), so it + * cannot have dropped them in the meantime. 
+ */ + GLOCK_BUG_ON(gl, gl->gl_lockref.count < 2); + gl->gl_lockref.count--; + } +} -void gfs2_glock_put(struct gfs2_glock *gl) +static void gfs2_glock_queue_work(struct gfs2_glock *gl, unsigned long delay) { + spin_lock(&gl->gl_lockref.lock); + __gfs2_glock_queue_work(gl, delay); + spin_unlock(&gl->gl_lockref.lock); +} + +static void __gfs2_glock_put(struct gfs2_glock *gl) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; struct address_space *mapping = gfs2_glock2aspace(gl); - if (lockref_put_or_lock(&gl->gl_lockref)) - return; - lockref_mark_dead(&gl->gl_lockref); gfs2_glock_remove_from_lru(gl); @@ -178,6 +192,20 @@ void gfs2_glock_put(struct gfs2_glock *gl) } /** + * gfs2_glock_put() - Decrement reference count on glock + * @gl: The glock to put + * + */ + +void gfs2_glock_put(struct gfs2_glock *gl) +{ + if (lockref_put_or_lock(&gl->gl_lockref)) + return; + + __gfs2_glock_put(gl); +} + +/** * may_grant - check if its ok to grant a new lock * @gl: The glock * @gh: The lock request which we wish to grant @@ -482,8 +510,7 @@ __acquires(&gl->gl_lockref.lock) target == LM_ST_UNLOCKED && test_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags)) { finish_xmote(gl, target); - if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) - gfs2_glock_put(gl); + gfs2_glock_queue_work(gl, 0); } else if (ret) { pr_err("lm_lock ret %d\n", ret); @@ -492,8 +519,7 @@ __acquires(&gl->gl_lockref.lock) } } else { /* lock_nolock */ finish_xmote(gl, target); - if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) - gfs2_glock_put(gl); + gfs2_glock_queue_work(gl, 0); } spin_lock(&gl->gl_lockref.lock); @@ -565,8 +591,7 @@ out_sched: clear_bit(GLF_LOCK, &gl->gl_flags); smp_mb__after_atomic(); gl->gl_lockref.count++; - if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) - gl->gl_lockref.count--; + __gfs2_glock_queue_work(gl, 0); return; out_unlock: @@ -601,11 +626,11 @@ static void glock_work_func(struct work_struct *work) { unsigned long delay = 0; struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_work.work); - int drop_ref = 0; + unsigned int drop_refs = 1; if (test_and_clear_bit(GLF_REPLY_PENDING, &gl->gl_flags)) { finish_xmote(gl, gl->gl_reply); - drop_ref = 1; + drop_refs++; } spin_lock(&gl->gl_lockref.lock); if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && @@ -623,17 +648,25 @@ static void glock_work_func(struct work_struct *work) } } run_queue(gl, 0); - spin_unlock(&gl->gl_lockref.lock); - if (!delay) - gfs2_glock_put(gl); - else { + if (delay) { + /* Keep one glock reference for the work we requeue. */ + drop_refs--; if (gl->gl_name.ln_type != LM_TYPE_INODE) delay = 0; - if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0) - gfs2_glock_put(gl); + __gfs2_glock_queue_work(gl, delay); } - if (drop_ref) - gfs2_glock_put(gl); + + /* + * Drop the remaining glock references manually here. (Mind that + * __gfs2_glock_queue_work depends on the lockref spinlock begin held + * here as well.) 
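The reference-handoff convention documented above (the caller supplies one glock reference for the queued work, and __gfs2_glock_queue_work() hands it back when the work was already pending) can be sketched outside the kernel with plain counters. This is only an illustration of the invariant, assuming a single thread, not the gfs2 code:

#include <stdbool.h>
#include <stdio.h>

struct obj {
	int refcount;
	bool work_queued;
};

/* Take a reference for the work; if work is already pending it owns one, so drop ours. */
static void queue_work_with_ref(struct obj *o)
{
	o->refcount++;
	if (o->work_queued) {
		o->refcount--;
		return;
	}
	o->work_queued = true;
}

static void work_fn(struct obj *o)
{
	o->work_queued = false;
	/* ... do the deferred work ... */
	o->refcount--;		/* drop the reference the work owned */
}

int main(void)
{
	struct obj o = { .refcount = 1, .work_queued = false };

	queue_work_with_ref(&o);	/* queues the work, refcount goes to 2 */
	queue_work_with_ref(&o);	/* already queued, refcount stays at 2 */
	work_fn(&o);			/* work runs, refcount back to 1 */
	printf("refcount = %d, queued = %d\n", o.refcount, o.work_queued);
	return 0;
}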
+ */ + gl->gl_lockref.count -= drop_refs; + if (!gl->gl_lockref.count) { + __gfs2_glock_put(gl); + return; + } + spin_unlock(&gl->gl_lockref.lock); } /** @@ -986,8 +1019,7 @@ int gfs2_glock_nq(struct gfs2_holder *gh) test_and_clear_bit(GLF_FROZEN, &gl->gl_flags))) { set_bit(GLF_REPLY_PENDING, &gl->gl_flags); gl->gl_lockref.count++; - if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) - gl->gl_lockref.count--; + __gfs2_glock_queue_work(gl, 0); } run_queue(gl, 1); spin_unlock(&gl->gl_lockref.lock); @@ -1047,17 +1079,15 @@ void gfs2_glock_dq(struct gfs2_holder *gh) gfs2_glock_add_to_lru(gl); trace_gfs2_glock_queue(gh, 0); + if (unlikely(!fast_path)) { + gl->gl_lockref.count++; + if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && + !test_bit(GLF_DEMOTE, &gl->gl_flags) && + gl->gl_name.ln_type == LM_TYPE_INODE) + delay = gl->gl_hold_time; + __gfs2_glock_queue_work(gl, delay); + } spin_unlock(&gl->gl_lockref.lock); - if (likely(fast_path)) - return; - - gfs2_glock_hold(gl); - if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && - !test_bit(GLF_DEMOTE, &gl->gl_flags) && - gl->gl_name.ln_type == LM_TYPE_INODE) - delay = gl->gl_hold_time; - if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0) - gfs2_glock_put(gl); } void gfs2_glock_dq_wait(struct gfs2_holder *gh) @@ -1233,9 +1263,8 @@ void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state) spin_lock(&gl->gl_lockref.lock); handle_callback(gl, state, delay, true); + __gfs2_glock_queue_work(gl, delay); spin_unlock(&gl->gl_lockref.lock); - if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0) - gfs2_glock_put(gl); } /** @@ -1294,10 +1323,8 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret) gl->gl_lockref.count++; set_bit(GLF_REPLY_PENDING, &gl->gl_flags); + __gfs2_glock_queue_work(gl, 0); spin_unlock(&gl->gl_lockref.lock); - - if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) - gfs2_glock_put(gl); } static int glock_cmp(void *priv, struct list_head *a, struct list_head *b) @@ -1355,8 +1382,7 @@ add_back_to_lru: if (demote_ok(gl)) handle_callback(gl, LM_ST_UNLOCKED, 0, false); WARN_ON(!test_and_clear_bit(GLF_LOCK, &gl->gl_flags)); - if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) - gl->gl_lockref.count--; + __gfs2_glock_queue_work(gl, 0); spin_unlock(&gl->gl_lockref.lock); cond_resched_lock(&lru_lock); } @@ -1462,13 +1488,12 @@ static void glock_hash_walk(glock_examiner examiner, const struct gfs2_sbd *sdp) static void thaw_glock(struct gfs2_glock *gl) { - if (!test_and_clear_bit(GLF_FROZEN, &gl->gl_flags)) - goto out; - set_bit(GLF_REPLY_PENDING, &gl->gl_flags); - if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) { -out: + if (!test_and_clear_bit(GLF_FROZEN, &gl->gl_flags)) { gfs2_glock_put(gl); + return; } + set_bit(GLF_REPLY_PENDING, &gl->gl_flags); + gfs2_glock_queue_work(gl, 0); } /** @@ -1484,9 +1509,8 @@ static void clear_glock(struct gfs2_glock *gl) spin_lock(&gl->gl_lockref.lock); if (gl->gl_state != LM_ST_UNLOCKED) handle_callback(gl, LM_ST_UNLOCKED, 0, false); + __gfs2_glock_queue_work(gl, 0); spin_unlock(&gl->gl_lockref.lock); - if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) - gfs2_glock_put(gl); } /** diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index ab1ef322f7a5..9ad4a6ac6c84 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -257,4 +257,11 @@ static inline bool gfs2_holder_initialized(struct gfs2_holder *gh) return gh->gh_gl; } +static inline void glock_set_object(struct gfs2_glock *gl, void *object) +{ + 
spin_lock(&gl->gl_lockref.lock); + gl->gl_object = object; + spin_unlock(&gl->gl_lockref.lock); +} + #endif /* __GLOCK_DOT_H__ */ diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 5db59d444838..5e69636d4dd3 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -137,7 +137,7 @@ void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync) * * Called when demoting or unlocking an EX glock. We must flush * to disk all dirty buffers/pages relating to this glock, and must not - * not return to caller to demote/unlock the glock until I/O is complete. + * return to caller to demote/unlock the glock until I/O is complete. */ static void rgrp_go_sync(struct gfs2_glock *gl) @@ -184,7 +184,7 @@ static void rgrp_go_inval(struct gfs2_glock *gl, int flags) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; struct address_space *mapping = &sdp->sd_aspace; - struct gfs2_rgrpd *rgd = gl->gl_object; + struct gfs2_rgrpd *rgd = gfs2_glock2rgrp(gl); if (rgd) gfs2_rgrp_brelse(rgd); @@ -197,6 +197,38 @@ static void rgrp_go_inval(struct gfs2_glock *gl, int flags) rgd->rd_flags &= ~GFS2_RDF_UPTODATE; } +static struct gfs2_inode *gfs2_glock2inode(struct gfs2_glock *gl) +{ + struct gfs2_inode *ip; + + spin_lock(&gl->gl_lockref.lock); + ip = gl->gl_object; + if (ip) + set_bit(GIF_GLOP_PENDING, &ip->i_flags); + spin_unlock(&gl->gl_lockref.lock); + return ip; +} + +struct gfs2_rgrpd *gfs2_glock2rgrp(struct gfs2_glock *gl) +{ + struct gfs2_rgrpd *rgd; + + spin_lock(&gl->gl_lockref.lock); + rgd = gl->gl_object; + spin_unlock(&gl->gl_lockref.lock); + + return rgd; +} + +static void gfs2_clear_glop_pending(struct gfs2_inode *ip) +{ + if (!ip) + return; + + clear_bit_unlock(GIF_GLOP_PENDING, &ip->i_flags); + wake_up_bit(&ip->i_flags, GIF_GLOP_PENDING); +} + /** * inode_go_sync - Sync the dirty data and/or metadata for an inode glock * @gl: the glock protecting the inode @@ -205,25 +237,24 @@ static void rgrp_go_inval(struct gfs2_glock *gl, int flags) static void inode_go_sync(struct gfs2_glock *gl) { - struct gfs2_inode *ip = gl->gl_object; + struct gfs2_inode *ip = gfs2_glock2inode(gl); + int isreg = ip && S_ISREG(ip->i_inode.i_mode); struct address_space *metamapping = gfs2_glock2aspace(gl); int error; - if (ip && !S_ISREG(ip->i_inode.i_mode)) - ip = NULL; - if (ip) { + if (isreg) { if (test_and_clear_bit(GIF_SW_PAGED, &ip->i_flags)) unmap_shared_mapping_range(ip->i_inode.i_mapping, 0, 0); inode_dio_wait(&ip->i_inode); } if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) - return; + goto out; GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_EXCLUSIVE); gfs2_log_flush(gl->gl_name.ln_sbd, gl, NORMAL_FLUSH); filemap_fdatawrite(metamapping); - if (ip) { + if (isreg) { struct address_space *mapping = ip->i_inode.i_mapping; filemap_fdatawrite(mapping); error = filemap_fdatawait(mapping); @@ -238,6 +269,9 @@ static void inode_go_sync(struct gfs2_glock *gl) */ smp_mb__before_atomic(); clear_bit(GLF_DIRTY, &gl->gl_flags); + +out: + gfs2_clear_glop_pending(ip); } /** @@ -253,7 +287,7 @@ static void inode_go_sync(struct gfs2_glock *gl) static void inode_go_inval(struct gfs2_glock *gl, int flags) { - struct gfs2_inode *ip = gl->gl_object; + struct gfs2_inode *ip = gfs2_glock2inode(gl); gfs2_assert_withdraw(gl->gl_name.ln_sbd, !atomic_read(&gl->gl_ail_count)); @@ -274,6 +308,8 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags) } if (ip && S_ISREG(ip->i_inode.i_mode)) truncate_inode_pages(ip->i_inode.i_mapping, 0); + + gfs2_clear_glop_pending(ip); } /** @@ -541,7 +577,7 @@ static int freeze_go_demote_ok(const struct gfs2_glock *gl) */ static 
void iopen_go_callback(struct gfs2_glock *gl, bool remote) { - struct gfs2_inode *ip = (struct gfs2_inode *)gl->gl_object; + struct gfs2_inode *ip = gl->gl_object; struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; if (!remote || (sdp->sd_vfs->s_flags & MS_RDONLY)) diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index b7cf65d13561..790e73984288 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -336,7 +336,6 @@ enum { }; struct gfs2_glock { - struct hlist_bl_node gl_list; unsigned long gl_flags; /* GLF_... */ struct lm_lockname gl_name; @@ -386,6 +385,7 @@ enum { GIF_SW_PAGED = 3, GIF_ORDERED = 4, GIF_FREE_VFS_INODE = 5, + GIF_GLOP_PENDING = 6, }; struct gfs2_inode { @@ -815,13 +815,11 @@ struct gfs2_sbd { atomic_t sd_log_in_flight; struct bio *sd_log_bio; wait_queue_head_t sd_log_flush_wait; - int sd_log_error; atomic_t sd_reserving_log; wait_queue_head_t sd_reserving_log_wait; unsigned int sd_log_flush_head; - u64 sd_log_flush_wrapped; spinlock_t sd_ail_lock; struct list_head sd_ail1_list; @@ -858,5 +856,7 @@ static inline void gfs2_sbstats_inc(const struct gfs2_glock *gl, int which) preempt_enable(); } +extern struct gfs2_rgrpd *gfs2_glock2rgrp(struct gfs2_glock *gl); + #endif /* __INCORE_DOT_H__ */ diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 9f605ea4810c..acca501f8110 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -144,7 +144,8 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl); if (unlikely(error)) goto fail; - ip->i_gl->gl_object = ip; + flush_delayed_work(&ip->i_gl->gl_work); + glock_set_object(ip->i_gl, ip); error = gfs2_glock_get(sdp, no_addr, &gfs2_iopen_glops, CREATE, &io_gl); if (unlikely(error)) @@ -173,8 +174,8 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh); if (unlikely(error)) goto fail_put; - - ip->i_iopen_gh.gh_gl->gl_object = ip; + flush_delayed_work(&ip->i_iopen_gh.gh_gl->gl_work); + glock_set_object(ip->i_iopen_gh.gh_gl, ip); gfs2_glock_put(io_gl); io_gl = NULL; @@ -201,14 +202,14 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, fail_refresh: ip->i_iopen_gh.gh_flags |= GL_NOCACHE; - ip->i_iopen_gh.gh_gl->gl_object = NULL; + glock_set_object(ip->i_iopen_gh.gh_gl, NULL); gfs2_glock_dq_uninit(&ip->i_iopen_gh); fail_put: if (io_gl) gfs2_glock_put(io_gl); if (gfs2_holder_initialized(&i_gh)) gfs2_glock_dq_uninit(&i_gh); - ip->i_gl->gl_object = NULL; + glock_set_object(ip->i_gl, NULL); fail: iget_failed(inode); return ERR_PTR(error); @@ -607,6 +608,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, error = gfs2_glock_nq_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); if (error) goto fail; + gfs2_holder_mark_uninitialized(ghs + 1); error = create_ok(dip, name, mode); if (error) @@ -705,7 +707,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, if (error) goto fail_free_inode; - ip->i_gl->gl_object = ip; + glock_set_object(ip->i_gl, ip); error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, ghs + 1); if (error) goto fail_free_inode; @@ -731,7 +733,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, if (error) goto fail_gunlock2; - ip->i_iopen_gh.gh_gl->gl_object = ip; + glock_set_object(ip->i_iopen_gh.gh_gl, ip); gfs2_glock_put(io_gl); gfs2_set_iop(inode); insert_inode_hash(inode); @@ -778,7 +780,6 @@ fail_gunlock3: fail_gunlock2: if (io_gl) 
clear_bit(GLF_INODE_CREATING, &io_gl->gl_flags); - gfs2_glock_dq_uninit(ghs + 1); fail_free_inode: if (ip->i_gl) gfs2_glock_put(ip->i_gl); @@ -799,6 +800,8 @@ fail_gunlock: &GFS2_I(inode)->i_flags); iput(inode); } + if (gfs2_holder_initialized(ghs + 1)) + gfs2_glock_dq_uninit(ghs + 1); fail: return error; } diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index f865b96374df..9a624f694400 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -659,7 +659,7 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags) struct gfs2_log_header *lh; unsigned int tail; u32 hash; - int op_flags = REQ_PREFLUSH | REQ_FUA | REQ_META; + int op_flags = REQ_PREFLUSH | REQ_FUA | REQ_META | REQ_SYNC; struct page *page = mempool_alloc(gfs2_page_pool, GFP_NOIO); enum gfs2_freeze_state state = atomic_read(&sdp->sd_freeze_state); lh = page_address(page); @@ -722,7 +722,6 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); sdp->sd_log_flush_head = sdp->sd_log_head; - sdp->sd_log_flush_wrapped = 0; tr = sdp->sd_log_tr; if (tr) { sdp->sd_log_tr = NULL; @@ -775,7 +774,6 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, } atomic_dec(&sdp->sd_log_blks_free); /* Adjust for unreserved buffer */ trace_gfs2_log_blocks(sdp, -1); - sdp->sd_log_flush_wrapped = 0; log_write_header(sdp, 0); sdp->sd_log_head = sdp->sd_log_flush_head; } @@ -880,7 +878,6 @@ void gfs2_log_shutdown(struct gfs2_sbd *sdp) gfs2_assert_withdraw(sdp, list_empty(&sdp->sd_ail1_list)); sdp->sd_log_flush_head = sdp->sd_log_head; - sdp->sd_log_flush_wrapped = 0; log_write_header(sdp, GFS2_LOG_HEAD_UNMOUNT); diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index b1f9144b42c7..e5259cd92ea4 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -71,7 +71,7 @@ static void maybe_release_space(struct gfs2_bufdata *bd) { struct gfs2_glock *gl = bd->bd_gl; struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; - struct gfs2_rgrpd *rgd = gl->gl_object; + struct gfs2_rgrpd *rgd = gfs2_glock2rgrp(gl); unsigned int index = bd->bd_bh->b_blocknr - gl->gl_name.ln_number; struct gfs2_bitmap *bi = rgd->rd_bits + index; @@ -134,10 +134,8 @@ static void gfs2_log_incr_head(struct gfs2_sbd *sdp) BUG_ON((sdp->sd_log_flush_head == sdp->sd_log_tail) && (sdp->sd_log_flush_head != sdp->sd_log_head)); - if (++sdp->sd_log_flush_head == sdp->sd_jdesc->jd_blocks) { + if (++sdp->sd_log_flush_head == sdp->sd_jdesc->jd_blocks) sdp->sd_log_flush_head = 0; - sdp->sd_log_flush_wrapped = 1; - } } static u64 gfs2_log_bmap(struct gfs2_sbd *sdp) @@ -170,7 +168,7 @@ static u64 gfs2_log_bmap(struct gfs2_sbd *sdp) */ static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp, struct bio_vec *bvec, - int error) + blk_status_t error) { struct buffer_head *bh, *next; struct page *page = bvec->bv_page; @@ -209,15 +207,13 @@ static void gfs2_end_log_write(struct bio *bio) struct page *page; int i; - if (bio->bi_error) { - sdp->sd_log_error = bio->bi_error; - fs_err(sdp, "Error %d writing to log\n", bio->bi_error); - } + if (bio->bi_status) + fs_err(sdp, "Error %d writing to log\n", bio->bi_status); bio_for_each_segment_all(bvec, bio, i) { page = bvec->bv_page; if (page_has_buffers(page)) - gfs2_end_log_write_bh(sdp, bvec, bio->bi_error); + gfs2_end_log_write_bh(sdp, bvec, bio->bi_status); else mempool_free(page, gfs2_page_pool); } diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index 67d1fc4668f7..0a89e6f7a314 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -52,7 +52,6 @@ static void gfs2_init_glock_once(void *foo) { struct gfs2_glock *gl = foo; - 
INIT_HLIST_BL_NODE(&gl->gl_list); spin_lock_init(&gl->gl_lockref.lock); INIT_LIST_HEAD(&gl->gl_holders); INIT_LIST_HEAD(&gl->gl_lru); diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 663ffc135ef3..fabe1614f879 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -201,7 +201,7 @@ static void gfs2_meta_read_endio(struct bio *bio) do { struct buffer_head *next = bh->b_this_page; len -= bh->b_size; - bh->b_end_io(bh, !bio->bi_error); + bh->b_end_io(bh, !bio->bi_status); bh = next; } while (bh && len); } diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index ed67548b286c..e76058d34b74 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -176,10 +176,10 @@ static void end_bio_io_page(struct bio *bio) { struct page *page = bio->bi_private; - if (!bio->bi_error) + if (!bio->bi_status) SetPageUptodate(page); else - pr_warn("error %d reading superblock\n", bio->bi_error); + pr_warn("error %d reading superblock\n", bio->bi_status); unlock_page(page); } @@ -203,7 +203,7 @@ static void gfs2_sb_in(struct gfs2_sbd *sdp, const void *buf) memcpy(sb->sb_lockproto, str->sb_lockproto, GFS2_LOCKNAME_LEN); memcpy(sb->sb_locktable, str->sb_locktable, GFS2_LOCKNAME_LEN); - memcpy(s->s_uuid, str->sb_uuid, 16); + memcpy(&s->s_uuid, str->sb_uuid, 16); } /** diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 83c9909ff14a..836e38ba5d0a 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -705,9 +705,7 @@ void gfs2_clear_rgrpd(struct gfs2_sbd *sdp) rb_erase(n, &sdp->sd_rindex_tree); if (gl) { - spin_lock(&gl->gl_lockref.lock); - gl->gl_object = NULL; - spin_unlock(&gl->gl_lockref.lock); + glock_set_object(gl, NULL); gfs2_glock_add_to_lru(gl); gfs2_glock_put(gl); } @@ -917,7 +915,7 @@ static int read_rindex_entry(struct gfs2_inode *ip) error = rgd_insert(rgd); spin_unlock(&sdp->sd_rindex_spin); if (!error) { - rgd->rd_gl->gl_object = rgd; + glock_set_object(rgd->rd_gl, rgd); rgd->rd_gl->gl_vm.start = (rgd->rd_addr * bsize) & PAGE_MASK; rgd->rd_gl->gl_vm.end = PAGE_ALIGN((rgd->rd_addr + rgd->rd_length) * bsize) - 1; diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 29b0473f6e74..fdedec379b78 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1105,9 +1105,12 @@ static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host gfs2_holder_uninit(gh); error = err; } else { - if (!error) - error = statfs_slow_fill( - gh->gh_gl->gl_object, sc); + if (!error) { + struct gfs2_rgrpd *rgd = + gfs2_glock2rgrp(gh->gh_gl); + + error = statfs_slow_fill(rgd, sc); + } gfs2_glock_dq_uninit(gh); } } @@ -1535,6 +1538,12 @@ static void gfs2_evict_inode(struct inode *inode) if (inode->i_nlink || (sb->s_flags & MS_RDONLY)) goto out; + if (test_bit(GIF_ALLOC_FAILED, &ip->i_flags)) { + BUG_ON(!gfs2_glock_is_locked_by_me(ip->i_gl)); + gfs2_holder_mark_uninitialized(&gh); + goto alloc_failed; + } + /* Must not read inode block until block type has been verified */ error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, &gh); if (unlikely(error)) { @@ -1543,11 +1552,9 @@ static void gfs2_evict_inode(struct inode *inode) goto out; } - if (!test_bit(GIF_ALLOC_FAILED, &ip->i_flags)) { - error = gfs2_check_blk_type(sdp, ip->i_no_addr, GFS2_BLKST_UNLINKED); - if (error) - goto out_truncate; - } + error = gfs2_check_blk_type(sdp, ip->i_no_addr, GFS2_BLKST_UNLINKED); + if (error) + goto out_truncate; if (test_bit(GIF_INVALID, &ip->i_flags)) { error = gfs2_inode_refresh(ip); @@ -1555,6 +1562,7 @@ static void gfs2_evict_inode(struct inode *inode) goto out_truncate; } +alloc_failed: if 
(gfs2_holder_initialized(&ip->i_iopen_gh) && test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) { ip->i_iopen_gh.gh_flags |= GL_NOCACHE; @@ -1621,7 +1629,8 @@ out_unlock: } gfs2_holder_uninit(&ip->i_iopen_gh); } - gfs2_glock_dq_uninit(&gh); + if (gfs2_holder_initialized(&gh)) + gfs2_glock_dq_uninit(&gh); if (error && error != GLR_TRYFAILED && error != -EROFS) fs_warn(sdp, "gfs2_evict_inode: %d\n", error); out: @@ -1631,13 +1640,13 @@ out: gfs2_ordered_del_inode(ip); clear_inode(inode); gfs2_dir_hash_inval(ip); - ip->i_gl->gl_object = NULL; - flush_delayed_work(&ip->i_gl->gl_work); + glock_set_object(ip->i_gl, NULL); + wait_on_bit_io(&ip->i_flags, GIF_GLOP_PENDING, TASK_UNINTERRUPTIBLE); gfs2_glock_add_to_lru(ip->i_gl); gfs2_glock_put(ip->i_gl); ip->i_gl = NULL; if (gfs2_holder_initialized(&ip->i_iopen_gh)) { - ip->i_iopen_gh.gh_gl->gl_object = NULL; + glock_set_object(ip->i_iopen_gh.gh_gl, NULL); ip->i_iopen_gh.gh_flags |= GL_NOCACHE; gfs2_glock_dq_uninit(&ip->i_iopen_gh); } diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index 7a515345610c..ca1f97ff898c 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -71,25 +71,14 @@ static ssize_t fsname_show(struct gfs2_sbd *sdp, char *buf) return snprintf(buf, PAGE_SIZE, "%s\n", sdp->sd_fsname); } -static int gfs2_uuid_valid(const u8 *uuid) -{ - int i; - - for (i = 0; i < 16; i++) { - if (uuid[i]) - return 1; - } - return 0; -} - static ssize_t uuid_show(struct gfs2_sbd *sdp, char *buf) { struct super_block *s = sdp->sd_vfs; - const u8 *uuid = s->s_uuid; + buf[0] = '\0'; - if (!gfs2_uuid_valid(uuid)) + if (uuid_is_null(&s->s_uuid)) return 0; - return snprintf(buf, PAGE_SIZE, "%pUB\n", uuid); + return snprintf(buf, PAGE_SIZE, "%pUB\n", &s->s_uuid); } static ssize_t freeze_show(struct gfs2_sbd *sdp, char *buf) @@ -637,12 +626,12 @@ static struct attribute *tune_attrs[] = { NULL, }; -static struct attribute_group tune_group = { +static const struct attribute_group tune_group = { .name = "tune", .attrs = tune_attrs, }; -static struct attribute_group lock_module_group = { +static const struct attribute_group lock_module_group = { .name = "lock_module", .attrs = lock_module_attrs, }; @@ -712,14 +701,13 @@ static int gfs2_uevent(struct kset *kset, struct kobject *kobj, { struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj); struct super_block *s = sdp->sd_vfs; - const u8 *uuid = s->s_uuid; add_uevent_var(env, "LOCKTABLE=%s", sdp->sd_table_name); add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name); if (!test_bit(SDF_NOJOURNALID, &sdp->sd_flags)) add_uevent_var(env, "JOURNALID=%d", sdp->sd_lockstruct.ls_jid); - if (gfs2_uuid_valid(uuid)) - add_uevent_var(env, "UUID=%pUB", uuid); + if (!uuid_is_null(&s->s_uuid)) + add_uevent_var(env, "UUID=%pUB", &s->s_uuid); return 0; } diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index d87721aeb575..54179554c7d2 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -1327,8 +1327,8 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip) gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE); for (x = 0; x < rlist.rl_rgrps; x++) { - struct gfs2_rgrpd *rgd; - rgd = rlist.rl_ghs[x].gh_gl->gl_object; + struct gfs2_rgrpd *rgd = gfs2_glock2rgrp(rlist.rl_ghs[x].gh_gl); + rg_blocks += rgd->rd_length; } diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index dde861387a40..d44f5456eb9b 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -200,7 +200,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, addr = ALIGN(addr, huge_page_size(h)); vma = find_vma(mm, addr); if (TASK_SIZE - len >= addr && - 
(!vma || addr + len <= vma->vm_start)) + (!vma || addr + len <= vm_start_gap(vma))) return addr; } diff --git a/fs/inode.c b/fs/inode.c index db5914783a71..ab3b9a795c0b 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -146,6 +146,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode) i_gid_write(inode, 0); atomic_set(&inode->i_writecount, 0); inode->i_size = 0; + inode->i_write_hint = WRITE_LIFE_NOT_SET; inode->i_blocks = 0; inode->i_bytes = 0; inode->i_generation = 0; @@ -1891,11 +1892,11 @@ static void __wait_on_freeing_inode(struct inode *inode) wait_queue_head_t *wq; DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW); wq = bit_waitqueue(&inode->i_state, __I_NEW); - prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); + prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); spin_unlock(&inode->i_lock); spin_unlock(&inode_hash_lock); schedule(); - finish_wait(wq, &wait.wait); + finish_wait(wq, &wait.wq_entry); spin_lock(&inode_hash_lock); } @@ -2038,11 +2039,11 @@ static void __inode_dio_wait(struct inode *inode) DEFINE_WAIT_BIT(q, &inode->i_state, __I_DIO_WAKEUP); do { - prepare_to_wait(wq, &q.wait, TASK_UNINTERRUPTIBLE); + prepare_to_wait(wq, &q.wq_entry, TASK_UNINTERRUPTIBLE); if (atomic_read(&inode->i_dio_count)) schedule(); } while (atomic_read(&inode->i_dio_count)); - finish_wait(wq, &q.wait); + finish_wait(wq, &q.wq_entry); } /** diff --git a/fs/iomap.c b/fs/iomap.c index 4b10892967a5..fa6cd5b3f578 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -672,8 +672,8 @@ static void iomap_dio_bio_end_io(struct bio *bio) struct iomap_dio *dio = bio->bi_private; bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY); - if (bio->bi_error) - iomap_dio_set_error(dio, bio->bi_error); + if (bio->bi_status) + iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status)); if (atomic_dec_and_test(&dio->ref)) { if (is_sync_kiocb(dio->iocb)) { @@ -793,6 +793,7 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, bio->bi_bdev = iomap->bdev; bio->bi_iter.bi_sector = iomap->blkno + ((pos - iomap->offset) >> 9); + bio->bi_write_hint = dio->iocb->ki_hint; bio->bi_private = dio; bio->bi_end_io = iomap_dio_bio_end_io; @@ -881,6 +882,14 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, flags |= IOMAP_WRITE; } + if (iocb->ki_flags & IOCB_NOWAIT) { + if (filemap_range_has_page(mapping, start, end)) { + ret = -EAGAIN; + goto out_free_dio; + } + flags |= IOMAP_NOWAIT; + } + ret = filemap_write_and_wait_range(mapping, start, end); if (ret) goto out_free_dio; diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 5a0245e36240..7d5ef3bf3f3e 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -2363,7 +2363,7 @@ static int jbd2_journal_init_journal_head_cache(void) jbd2_journal_head_cache = kmem_cache_create("jbd2_journal_head", sizeof(struct journal_head), 0, /* offset */ - SLAB_TEMPORARY | SLAB_DESTROY_BY_RCU, + SLAB_TEMPORARY | SLAB_TYPESAFE_BY_RCU, NULL); /* ctor */ retval = 0; if (!jbd2_journal_head_cache) { @@ -2579,10 +2579,10 @@ restart: wait_queue_head_t *wq; DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING); wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING); - prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); + prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); spin_unlock(&journal->j_list_lock); schedule(); - finish_wait(wq, &wait.wait); + finish_wait(wq, &wait.wq_entry); goto restart; } diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 9ee4832b6f8b..8b08044b3120 100644 --- a/fs/jbd2/transaction.c +++ 
b/fs/jbd2/transaction.c @@ -409,25 +409,6 @@ static handle_t *new_handle(int nblocks) return handle; } -/** - * handle_t *jbd2_journal_start() - Obtain a new handle. - * @journal: Journal to start transaction on. - * @nblocks: number of block buffer we might modify - * - * We make sure that the transaction can guarantee at least nblocks of - * modified buffers in the log. We block until the log can guarantee - * that much space. Additionally, if rsv_blocks > 0, we also create another - * handle with rsv_blocks reserved blocks in the journal. This handle is - * is stored in h_rsv_handle. It is not attached to any particular transaction - * and thus doesn't block transaction commit. If the caller uses this reserved - * handle, it has to set h_rsv_handle to NULL as otherwise jbd2_journal_stop() - * on the parent handle will dispose the reserved one. Reserved handle has to - * be converted to a normal handle using jbd2_journal_start_reserved() before - * it can be used. - * - * Return a pointer to a newly allocated handle, or an ERR_PTR() value - * on failure. - */ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks, gfp_t gfp_mask, unsigned int type, unsigned int line_no) @@ -478,6 +459,25 @@ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks, EXPORT_SYMBOL(jbd2__journal_start); +/** + * handle_t *jbd2_journal_start() - Obtain a new handle. + * @journal: Journal to start transaction on. + * @nblocks: number of block buffer we might modify + * + * We make sure that the transaction can guarantee at least nblocks of + * modified buffers in the log. We block until the log can guarantee + * that much space. Additionally, if rsv_blocks > 0, we also create another + * handle with rsv_blocks reserved blocks in the journal. This handle is + * is stored in h_rsv_handle. It is not attached to any particular transaction + * and thus doesn't block transaction commit. If the caller uses this reserved + * handle, it has to set h_rsv_handle to NULL as otherwise jbd2_journal_stop() + * on the parent handle will dispose the reserved one. Reserved handle has to + * be converted to a normal handle using jbd2_journal_start_reserved() before + * it can be used. + * + * Return a pointer to a newly allocated handle, or an ERR_PTR() value + * on failure. + */ handle_t *jbd2_journal_start(journal_t *journal, int nblocks) { return jbd2__journal_start(journal, nblocks, 0, GFP_NOFS, 0, 0); @@ -680,6 +680,12 @@ int jbd2__journal_restart(handle_t *handle, int nblocks, gfp_t gfp_mask) rwsem_release(&journal->j_trans_commit_map, 1, _THIS_IP_); handle->h_buffer_credits = nblocks; + /* + * Restore the original nofs context because the journal restart + * is basically the same thing as journal stop and start. + * start_this_handle will start a new nofs context. + */ + memalloc_nofs_restore(handle->saved_alloc_context); ret = start_this_handle(journal, handle, gfp_mask); return ret; } @@ -1066,10 +1072,10 @@ out: * @handle: transaction to add buffer modifications to * @bh: bh to be used for metadata writes * - * Returns an error code or 0 on success. + * Returns: error code or 0 on success. * * In full data journalling mode the buffer may be of type BJ_AsyncData, - * because we're write()ing a buffer which is also part of a shared mapping. + * because we're ``write()ing`` a buffer which is also part of a shared mapping. 
*/ int jbd2_journal_get_write_access(handle_t *handle, struct buffer_head *bh) diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c index 06a71dbd4833..389ea53ea487 100644 --- a/fs/jffs2/readinode.c +++ b/fs/jffs2/readinode.c @@ -1366,7 +1366,7 @@ int jffs2_do_read_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, jffs2_add_ino_cache(c, f->inocache); } if (!f->inocache) { - JFFS2_ERROR("requestied to read an nonexistent ino %u\n", ino); + JFFS2_ERROR("requested to read a nonexistent ino %u\n", ino); return -ENOENT; } diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index bb1da1feafeb..a21f0e9eecd4 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -2205,7 +2205,7 @@ static void lbmIODone(struct bio *bio) bp->l_flag |= lbmDONE; - if (bio->bi_error) { + if (bio->bi_status) { bp->l_flag |= lbmERROR; jfs_err("lbmIODone: I/O error in JFS log"); diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index 489aaa1403e5..ce93db3aef3c 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c @@ -280,7 +280,7 @@ static void metapage_read_end_io(struct bio *bio) { struct page *page = bio->bi_private; - if (bio->bi_error) { + if (bio->bi_status) { printk(KERN_ERR "metapage_read_end_io: I/O error\n"); SetPageError(page); } @@ -337,7 +337,7 @@ static void metapage_write_end_io(struct bio *bio) BUG_ON(!PagePrivate(page)); - if (bio->bi_error) { + if (bio->bi_status) { printk(KERN_ERR "metapage_write_end_io: I/O error\n"); SetPageError(page); } diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c index 41e491b8e5d7..27d577dbe51a 100644 --- a/fs/lockd/clntlock.c +++ b/fs/lockd/clntlock.c @@ -69,6 +69,7 @@ struct nlm_host *nlmclnt_init(const struct nlmclnt_initdata *nlm_init) if (host->h_rpcclnt == NULL && nlm_bind_host(host) == NULL) goto out_nobind; + host->h_nlmclnt_ops = nlm_init->nlmclnt_ops; return host; out_nobind: nlmclnt_release_host(host); diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 112952037933..066ac313ae5c 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -150,17 +150,22 @@ static void nlmclnt_release_lockargs(struct nlm_rqst *req) * @host: address of a valid nlm_host context representing the NLM server * @cmd: fcntl-style file lock operation to perform * @fl: address of arguments for the lock operation + * @data: address of data to be sent to callback operations * */ -int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl) +int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl, void *data) { struct nlm_rqst *call; int status; + const struct nlmclnt_operations *nlmclnt_ops = host->h_nlmclnt_ops; call = nlm_alloc_call(host); if (call == NULL) return -ENOMEM; + if (nlmclnt_ops && nlmclnt_ops->nlmclnt_alloc_call) + nlmclnt_ops->nlmclnt_alloc_call(data); + nlmclnt_locks_init_private(fl, host); if (!fl->fl_u.nfs_fl.owner) { /* lockowner allocation has failed */ @@ -169,6 +174,7 @@ int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl) } /* Set up the argument struct */ nlmclnt_setlockargs(call, fl); + call->a_callback_data = data; if (IS_SETLK(cmd) || IS_SETLKW(cmd)) { if (fl->fl_type != F_UNLCK) { @@ -214,8 +220,12 @@ struct nlm_rqst *nlm_alloc_call(struct nlm_host *host) void nlmclnt_release_call(struct nlm_rqst *call) { + const struct nlmclnt_operations *nlmclnt_ops = call->a_host->h_nlmclnt_ops; + if (!atomic_dec_and_test(&call->a_count)) return; + if (nlmclnt_ops && nlmclnt_ops->nlmclnt_release_call) + nlmclnt_ops->nlmclnt_release_call(call->a_callback_data); 
nlmclnt_release_host(call->a_host); nlmclnt_release_lockargs(call); kfree(call); @@ -687,6 +697,19 @@ out: return status; } +static void nlmclnt_unlock_prepare(struct rpc_task *task, void *data) +{ + struct nlm_rqst *req = data; + const struct nlmclnt_operations *nlmclnt_ops = req->a_host->h_nlmclnt_ops; + bool defer_call = false; + + if (nlmclnt_ops && nlmclnt_ops->nlmclnt_unlock_prepare) + defer_call = nlmclnt_ops->nlmclnt_unlock_prepare(task, req->a_callback_data); + + if (!defer_call) + rpc_call_start(task); +} + static void nlmclnt_unlock_callback(struct rpc_task *task, void *data) { struct nlm_rqst *req = data; @@ -720,6 +743,7 @@ die: } static const struct rpc_call_ops nlmclnt_unlock_ops = { + .rpc_call_prepare = nlmclnt_unlock_prepare, .rpc_call_done = nlmclnt_unlock_callback, .rpc_release = nlmclnt_rpc_release, }; diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index e7c8b9c76e48..5d481e8a1b5d 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -132,6 +132,8 @@ lockd(void *vrqstp) { int err = 0; struct svc_rqst *rqstp = vrqstp; + struct net *net = &init_net; + struct lockd_net *ln = net_generic(net, lockd_net_id); /* try_to_freeze() is called from svc_recv() */ set_freezable(); @@ -176,6 +178,8 @@ lockd(void *vrqstp) if (nlmsvc_ops) nlmsvc_invalidate_all(); nlm_shutdown_hosts(); + cancel_delayed_work_sync(&ln->grace_period_end); + locks_end_grace(&ln->lockd_manager); return 0; } @@ -270,8 +274,6 @@ static void lockd_down_net(struct svc_serv *serv, struct net *net) if (ln->nlmsvc_users) { if (--ln->nlmsvc_users == 0) { nlm_shutdown_hosts_net(net); - cancel_delayed_work_sync(&ln->grace_period_end); - locks_end_grace(&ln->lockd_manager); svc_shutdown_net(serv, net); dprintk("lockd_down_net: per-net data destroyed; net=%p\n", net); } diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index 5581e020644b..3507c80d1d4b 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -870,15 +870,15 @@ nlmsvc_grant_reply(struct nlm_cookie *cookie, __be32 status) if (!(block = nlmsvc_find_block(cookie))) return; - if (block) { - if (status == nlm_lck_denied_grace_period) { - /* Try again in a couple of seconds */ - nlmsvc_insert_block(block, 10 * HZ); - } else { - /* Lock is now held by client, or has been rejected. - * In both cases, the block should be removed. */ - nlmsvc_unlink_block(block); - } + if (status == nlm_lck_denied_grace_period) { + /* Try again in a couple of seconds */ + nlmsvc_insert_block(block, 10 * HZ); + } else { + /* + * Lock is now held by client, or has been rejected. + * In both cases, the block should be removed. + */ + nlmsvc_unlink_block(block); } nlmsvc_release_block(block); } diff --git a/fs/locks.c b/fs/locks.c index 26811321d39b..afefeb4ad6de 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1858,8 +1858,8 @@ EXPORT_SYMBOL(generic_setlease); * * Call this to establish a lease on the file. The "lease" argument is not * used for F_UNLCK requests and may be NULL. For commands that set or alter - * an existing lease, the (*lease)->fl_lmops->lm_break operation must be set; - * if not, this function will return -ENOLCK (and generate a scary-looking + * an existing lease, the ``(*lease)->fl_lmops->lm_break`` operation must be + * set; if not, this function will return -ENOLCK (and generate a scary-looking * stack trace). * * The "priv" pointer is passed directly to the lm_setup function as-is. It @@ -1972,15 +1972,13 @@ EXPORT_SYMBOL(locks_lock_inode_wait); * @cmd: the type of lock to apply. * * Apply a %FL_FLOCK style lock to an open file descriptor. 
- * The @cmd can be one of + * The @cmd can be one of: * - * %LOCK_SH -- a shared lock. - * - * %LOCK_EX -- an exclusive lock. - * - * %LOCK_UN -- remove an existing lock. - * - * %LOCK_MAND -- a `mandatory' flock. This exists to emulate Windows Share Modes. + * - %LOCK_SH -- a shared lock. + * - %LOCK_EX -- an exclusive lock. + * - %LOCK_UN -- remove an existing lock. + * - %LOCK_MAND -- a 'mandatory' flock. + * This exists to emulate Windows Share Modes. * * %LOCK_MAND can be combined with %LOCK_READ or %LOCK_WRITE to allow other * processes read and write access respectively. @@ -2086,26 +2084,22 @@ static void posix_lock_to_flock64(struct flock64 *flock, struct file_lock *fl) /* Report the first existing lock that would conflict with l. * This implements the F_GETLK command of fcntl(). */ -int fcntl_getlk(struct file *filp, unsigned int cmd, struct flock __user *l) +int fcntl_getlk(struct file *filp, unsigned int cmd, struct flock *flock) { struct file_lock file_lock; - struct flock flock; int error; - error = -EFAULT; - if (copy_from_user(&flock, l, sizeof(flock))) - goto out; error = -EINVAL; - if ((flock.l_type != F_RDLCK) && (flock.l_type != F_WRLCK)) + if (flock->l_type != F_RDLCK && flock->l_type != F_WRLCK) goto out; - error = flock_to_posix_lock(filp, &file_lock, &flock); + error = flock_to_posix_lock(filp, &file_lock, flock); if (error) goto out; if (cmd == F_OFD_GETLK) { error = -EINVAL; - if (flock.l_pid != 0) + if (flock->l_pid != 0) goto out; cmd = F_GETLK; @@ -2117,15 +2111,12 @@ int fcntl_getlk(struct file *filp, unsigned int cmd, struct flock __user *l) if (error) goto out; - flock.l_type = file_lock.fl_type; + flock->l_type = file_lock.fl_type; if (file_lock.fl_type != F_UNLCK) { - error = posix_lock_to_flock(&flock, &file_lock); + error = posix_lock_to_flock(flock, &file_lock); if (error) goto rel_priv; } - error = -EFAULT; - if (!copy_to_user(l, &flock, sizeof(flock))) - error = 0; rel_priv: locks_release_private(&file_lock); out: @@ -2218,26 +2209,16 @@ check_fmode_for_setlk(struct file_lock *fl) * This implements both the F_SETLK and F_SETLKW commands of fcntl(). */ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd, - struct flock __user *l) + struct flock *flock) { struct file_lock *file_lock = locks_alloc_lock(); - struct flock flock; - struct inode *inode; + struct inode *inode = locks_inode(filp); struct file *f; int error; if (file_lock == NULL) return -ENOLCK; - inode = locks_inode(filp); - - /* - * This might block, so we do it before checking the inode. - */ - error = -EFAULT; - if (copy_from_user(&flock, l, sizeof(flock))) - goto out; - /* Don't allow mandatory locks on files that may be memory mapped * and shared. */ @@ -2246,7 +2227,7 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd, goto out; } - error = flock_to_posix_lock(filp, file_lock, &flock); + error = flock_to_posix_lock(filp, file_lock, flock); if (error) goto out; @@ -2261,7 +2242,7 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd, switch (cmd) { case F_OFD_SETLK: error = -EINVAL; - if (flock.l_pid != 0) + if (flock->l_pid != 0) goto out; cmd = F_SETLK; @@ -2270,7 +2251,7 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd, break; case F_OFD_SETLKW: error = -EINVAL; - if (flock.l_pid != 0) + if (flock->l_pid != 0) goto out; cmd = F_SETLKW; @@ -2315,26 +2296,22 @@ out: /* Report the first existing lock that would conflict with l. * This implements the F_GETLK command of fcntl(). 
*/ -int fcntl_getlk64(struct file *filp, unsigned int cmd, struct flock64 __user *l) +int fcntl_getlk64(struct file *filp, unsigned int cmd, struct flock64 *flock) { struct file_lock file_lock; - struct flock64 flock; int error; - error = -EFAULT; - if (copy_from_user(&flock, l, sizeof(flock))) - goto out; error = -EINVAL; - if ((flock.l_type != F_RDLCK) && (flock.l_type != F_WRLCK)) + if (flock->l_type != F_RDLCK && flock->l_type != F_WRLCK) goto out; - error = flock64_to_posix_lock(filp, &file_lock, &flock); + error = flock64_to_posix_lock(filp, &file_lock, flock); if (error) goto out; if (cmd == F_OFD_GETLK) { error = -EINVAL; - if (flock.l_pid != 0) + if (flock->l_pid != 0) goto out; cmd = F_GETLK64; @@ -2346,13 +2323,9 @@ int fcntl_getlk64(struct file *filp, unsigned int cmd, struct flock64 __user *l) if (error) goto out; - flock.l_type = file_lock.fl_type; + flock->l_type = file_lock.fl_type; if (file_lock.fl_type != F_UNLCK) - posix_lock_to_flock64(&flock, &file_lock); - - error = -EFAULT; - if (!copy_to_user(l, &flock, sizeof(flock))) - error = 0; + posix_lock_to_flock64(flock, &file_lock); locks_release_private(&file_lock); out: @@ -2363,26 +2336,16 @@ out: * This implements both the F_SETLK and F_SETLKW commands of fcntl(). */ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd, - struct flock64 __user *l) + struct flock64 *flock) { struct file_lock *file_lock = locks_alloc_lock(); - struct flock64 flock; - struct inode *inode; + struct inode *inode = locks_inode(filp); struct file *f; int error; if (file_lock == NULL) return -ENOLCK; - /* - * This might block, so we do it before checking the inode. - */ - error = -EFAULT; - if (copy_from_user(&flock, l, sizeof(flock))) - goto out; - - inode = locks_inode(filp); - /* Don't allow mandatory locks on files that may be memory mapped * and shared. 
*/ @@ -2391,7 +2354,7 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd, goto out; } - error = flock64_to_posix_lock(filp, file_lock, &flock); + error = flock64_to_posix_lock(filp, file_lock, flock); if (error) goto out; @@ -2406,7 +2369,7 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd, switch (cmd) { case F_OFD_SETLK: error = -EINVAL; - if (flock.l_pid != 0) + if (flock->l_pid != 0) goto out; cmd = F_SETLK64; @@ -2415,7 +2378,7 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd, break; case F_OFD_SETLKW: error = -EINVAL; - if (flock.l_pid != 0) + if (flock->l_pid != 0) goto out; cmd = F_SETLKW64; @@ -2504,7 +2467,7 @@ locks_remove_flock(struct file *filp, struct file_lock_context *flctx) .fl_owner = filp, .fl_pid = current->tgid, .fl_file = filp, - .fl_flags = FL_FLOCK, + .fl_flags = FL_FLOCK | FL_CLOSE, .fl_type = F_UNLCK, .fl_end = OFFSET_MAX, }; diff --git a/fs/mount.h b/fs/mount.h index bf1fda6eed8f..de45d9e76748 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -58,6 +58,7 @@ struct mount { struct mnt_namespace *mnt_ns; /* containing namespace */ struct mountpoint *mnt_mp; /* where is it mounted */ struct hlist_node mnt_mp_list; /* list mounts with the same mountpoint */ + struct list_head mnt_umounting; /* list entry for umount propagation */ #ifdef CONFIG_FSNOTIFY struct fsnotify_mark_connector __rcu *mnt_fsnotify_marks; __u32 mnt_fsnotify_mask; diff --git a/fs/mpage.c b/fs/mpage.c index baff8f820c29..2e4c41ccb5c9 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -50,7 +50,8 @@ static void mpage_end_io(struct bio *bio) bio_for_each_segment_all(bv, bio, i) { struct page *page = bv->bv_page; - page_endio(page, op_is_write(bio_op(bio)), bio->bi_error); + page_endio(page, op_is_write(bio_op(bio)), + blk_status_to_errno(bio->bi_status)); } bio_put(bio); @@ -344,6 +345,7 @@ confused: * * So an mpage read of the first 16 blocks of an ext2 file will cause I/O to be * submitted in the following order: + * * 12 0 1 2 3 4 5 6 7 8 9 10 11 13 14 15 16 * * because the indirect block has to be read to get the mappings of blocks @@ -614,6 +616,7 @@ alloc_new: goto confused; wbc_init_bio(wbc, bio); + bio->bi_write_hint = inode->i_write_hint; } /* diff --git a/fs/namei.c b/fs/namei.c index 7286f87ce863..8bacc390c51e 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2142,7 +2142,6 @@ OK: static const char *path_init(struct nameidata *nd, unsigned flags) { - int retval = 0; const char *s = nd->name->name; if (!*s) @@ -2154,13 +2153,8 @@ static const char *path_init(struct nameidata *nd, unsigned flags) if (flags & LOOKUP_ROOT) { struct dentry *root = nd->root.dentry; struct inode *inode = root->d_inode; - if (*s) { - if (!d_can_lookup(root)) - return ERR_PTR(-ENOTDIR); - retval = inode_permission(inode, MAY_EXEC); - if (retval) - return ERR_PTR(retval); - } + if (*s && unlikely(!d_can_lookup(root))) + return ERR_PTR(-ENOTDIR); nd->path = nd->root; nd->inode = inode; if (flags & LOOKUP_RCU) { @@ -2258,6 +2252,35 @@ static inline int lookup_last(struct nameidata *nd) return walk_component(nd, 0); } +static int handle_lookup_down(struct nameidata *nd) +{ + struct path path = nd->path; + struct inode *inode = nd->inode; + unsigned seq = nd->seq; + int err; + + if (nd->flags & LOOKUP_RCU) { + /* + * don't bother with unlazy_walk on failure - we are + * at the very beginning of walk, so we lose nothing + * if we simply redo everything in non-RCU mode + */ + if (unlikely(!__follow_mount_rcu(nd, &path, &inode, &seq))) + return -ECHILD; + } else { + 
dget(path.dentry); + err = follow_managed(&path, nd); + if (unlikely(err < 0)) + return err; + inode = d_backing_inode(path.dentry); + seq = 0; + } + path_to_nameidata(&path, nd); + nd->inode = inode; + nd->seq = seq; + return 0; +} + /* Returns 0 and nd will be valid on success; Retuns error, otherwise. */ static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path) { @@ -2266,6 +2289,15 @@ static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path if (IS_ERR(s)) return PTR_ERR(s); + + if (unlikely(flags & LOOKUP_DOWN)) { + err = handle_lookup_down(nd); + if (unlikely(err < 0)) { + terminate_walk(nd); + return err; + } + } + while (!(err = link_path_walk(s, nd)) && ((err = lookup_last(nd)) > 0)) { s = trailing_symlink(nd); @@ -4300,6 +4332,7 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname * The worst of all namespace operations - renaming directory. "Perverted" * doesn't even start to describe it. Somebody in UCB had a heck of a trip... * Problems: + * * a) we can get into loop creation. * b) race potential - two innocent renames can create a loop together. * That's where 4.4 screws up. Current fix: serialization on diff --git a/fs/namespace.c b/fs/namespace.c index b3b115bd4e1e..f70914a859a4 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -236,6 +236,7 @@ static struct mount *alloc_vfsmnt(const char *name) INIT_LIST_HEAD(&mnt->mnt_slave_list); INIT_LIST_HEAD(&mnt->mnt_slave); INIT_HLIST_NODE(&mnt->mnt_mp_list); + INIT_LIST_HEAD(&mnt->mnt_umounting); init_fs_pin(&mnt->mnt_umount, drop_mountpoint); } return mnt; @@ -3462,8 +3463,9 @@ static void mntns_put(struct ns_common *ns) static int mntns_install(struct nsproxy *nsproxy, struct ns_common *ns) { struct fs_struct *fs = current->fs; - struct mnt_namespace *mnt_ns = to_mnt_ns(ns); + struct mnt_namespace *mnt_ns = to_mnt_ns(ns), *old_mnt_ns; struct path root; + int err; if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) || !ns_capable(current_user_ns(), CAP_SYS_CHROOT) || @@ -3474,15 +3476,20 @@ static int mntns_install(struct nsproxy *nsproxy, struct ns_common *ns) return -EINVAL; get_mnt_ns(mnt_ns); - put_mnt_ns(nsproxy->mnt_ns); + old_mnt_ns = nsproxy->mnt_ns; nsproxy->mnt_ns = mnt_ns; /* Find the root */ - root.mnt = &mnt_ns->root->mnt; - root.dentry = mnt_ns->root->mnt.mnt_root; - path_get(&root); - while(d_mountpoint(root.dentry) && follow_down_one(&root)) - ; + err = vfs_path_lookup(mnt_ns->root->mnt.mnt_root, &mnt_ns->root->mnt, + "/", LOOKUP_DOWN, &root); + if (err) { + /* revert to old namespace */ + nsproxy->mnt_ns = old_mnt_ns; + put_mnt_ns(mnt_ns); + return err; + } + + put_mnt_ns(old_mnt_ns); /* Update the pwd and root */ set_fs_pwd(fs, &root); diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index f31fd0dd92c6..69d02cf8cf37 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -123,11 +123,6 @@ config PNFS_BLOCK depends on NFS_V4_1 && BLK_DEV_DM default NFS_V4 -config PNFS_OBJLAYOUT - tristate - depends on NFS_V4_1 && SCSI_OSD_ULD - default NFS_V4 - config PNFS_FLEXFILE_LAYOUT tristate depends on NFS_V4_1 && NFS_V3 diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile index 6abdda209642..98f4e5728a67 100644 --- a/fs/nfs/Makefile +++ b/fs/nfs/Makefile @@ -31,6 +31,5 @@ nfsv4-$(CONFIG_NFS_V4_1) += pnfs.o pnfs_dev.o pnfs_nfs.o nfsv4-$(CONFIG_NFS_V4_2) += nfs42proc.o obj-$(CONFIG_PNFS_FILE_LAYOUT) += filelayout/ -obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/ obj-$(CONFIG_PNFS_BLOCK) += blocklayout/ obj-$(CONFIG_PNFS_FLEXFILE_LAYOUT) += flexfilelayout/ diff --git 
a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 0ca370d23ddb..d8863a804b15 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -188,7 +188,7 @@ static void bl_end_io_read(struct bio *bio) { struct parallel_io *par = bio->bi_private; - if (bio->bi_error) { + if (bio->bi_status) { struct nfs_pgio_header *header = par->data; if (!header->pnfs_error) @@ -319,7 +319,7 @@ static void bl_end_io_write(struct bio *bio) struct parallel_io *par = bio->bi_private; struct nfs_pgio_header *header = par->data; - if (bio->bi_error) { + if (bio->bi_status) { if (!header->pnfs_error) header->pnfs_error = -EIO; pnfs_set_lo_fail(header->lseg); diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 773774531aff..73a1f928226c 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -76,7 +76,10 @@ nfs4_callback_svc(void *vrqstp) set_freezable(); - while (!kthread_should_stop()) { + while (!kthread_freezable_should_stop(NULL)) { + + if (signal_pending(current)) + flush_signals(current); /* * Listen for a request on the socket */ @@ -85,6 +88,8 @@ nfs4_callback_svc(void *vrqstp) continue; svc_process(rqstp); } + svc_exit_thread(rqstp); + module_put_and_exit(0); return 0; } @@ -103,9 +108,10 @@ nfs41_callback_svc(void *vrqstp) set_freezable(); - while (!kthread_should_stop()) { - if (try_to_freeze()) - continue; + while (!kthread_freezable_should_stop(NULL)) { + + if (signal_pending(current)) + flush_signals(current); prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE); spin_lock_bh(&serv->sv_cb_lock); @@ -121,11 +127,13 @@ nfs41_callback_svc(void *vrqstp) error); } else { spin_unlock_bh(&serv->sv_cb_lock); - schedule(); + if (!kthread_should_stop()) + schedule(); finish_wait(&serv->sv_cb_waitq, &wq); } - flush_signals(current); } + svc_exit_thread(rqstp); + module_put_and_exit(0); return 0; } @@ -221,14 +229,14 @@ err_bind: static struct svc_serv_ops nfs40_cb_sv_ops = { .svo_function = nfs4_callback_svc, .svo_enqueue_xprt = svc_xprt_do_enqueue, - .svo_setup = svc_set_num_threads, + .svo_setup = svc_set_num_threads_sync, .svo_module = THIS_MODULE, }; #if defined(CONFIG_NFS_V4_1) static struct svc_serv_ops nfs41_cb_sv_ops = { .svo_function = nfs41_callback_svc, .svo_enqueue_xprt = svc_xprt_do_enqueue, - .svo_setup = svc_set_num_threads, + .svo_setup = svc_set_num_threads_sync, .svo_module = THIS_MODULE, }; @@ -280,7 +288,7 @@ static struct svc_serv *nfs_callback_create_svc(int minorversion) printk(KERN_WARNING "nfs_callback_create_svc: no kthread, %d users??\n", cb_info->users); - serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, sv_ops); + serv = svc_create_pooled(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, sv_ops); if (!serv) { printk(KERN_ERR "nfs_callback_create_svc: create service failed\n"); return ERR_PTR(-ENOMEM); diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index f073a6d2c6a5..52479f180ea1 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -131,10 +131,11 @@ restart: if (!inode) continue; if (!nfs_sb_active(inode->i_sb)) { - rcu_read_lock(); + rcu_read_unlock(); spin_unlock(&clp->cl_lock); iput(inode); spin_lock(&clp->cl_lock); + rcu_read_lock(); goto restart; } return inode; @@ -170,10 +171,11 @@ restart: if (!inode) continue; if (!nfs_sb_active(inode->i_sb)) { - rcu_read_lock(); + rcu_read_unlock(); spin_unlock(&clp->cl_lock); iput(inode); spin_lock(&clp->cl_lock); + rcu_read_lock(); goto restart; } return inode; @@ -317,31 +319,18 @@ static u32 initiate_bulk_draining(struct 
nfs_client *clp, static u32 do_callback_layoutrecall(struct nfs_client *clp, struct cb_layoutrecallargs *args) { - u32 res; - - dprintk("%s enter, type=%i\n", __func__, args->cbl_recall_type); if (args->cbl_recall_type == RETURN_FILE) - res = initiate_file_draining(clp, args); - else - res = initiate_bulk_draining(clp, args); - dprintk("%s returning %i\n", __func__, res); - return res; - + return initiate_file_draining(clp, args); + return initiate_bulk_draining(clp, args); } __be32 nfs4_callback_layoutrecall(struct cb_layoutrecallargs *args, void *dummy, struct cb_process_state *cps) { - u32 res; - - dprintk("%s: -->\n", __func__); + u32 res = NFS4ERR_OP_NOT_IN_SESSION; if (cps->clp) res = do_callback_layoutrecall(cps->clp, args); - else - res = NFS4ERR_OP_NOT_IN_SESSION; - - dprintk("%s: exit with status = %d\n", __func__, res); return cpu_to_be32(res); } @@ -364,8 +353,6 @@ __be32 nfs4_callback_devicenotify(struct cb_devicenotifyargs *args, struct nfs_client *clp = cps->clp; struct nfs_server *server = NULL; - dprintk("%s: -->\n", __func__); - if (!clp) { res = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION); goto out; @@ -384,8 +371,6 @@ __be32 nfs4_callback_devicenotify(struct cb_devicenotifyargs *args, goto found; } rcu_read_unlock(); - dprintk("%s: layout type %u not found\n", - __func__, dev->cbd_layout_type); continue; } @@ -395,8 +380,6 @@ __be32 nfs4_callback_devicenotify(struct cb_devicenotifyargs *args, out: kfree(args->devs); - dprintk("%s: exit with status = %u\n", - __func__, be32_to_cpu(res)); return res; } @@ -417,16 +400,11 @@ static __be32 validate_seqid(const struct nfs4_slot_table *tbl, const struct nfs4_slot *slot, const struct cb_sequenceargs * args) { - dprintk("%s enter. slotid %u seqid %u, slot table seqid: %u\n", - __func__, args->csa_slotid, args->csa_sequenceid, slot->seq_nr); - if (args->csa_slotid > tbl->server_highest_slotid) return htonl(NFS4ERR_BADSLOT); /* Replay */ if (args->csa_sequenceid == slot->seq_nr) { - dprintk("%s seqid %u is a replay\n", - __func__, args->csa_sequenceid); if (nfs4_test_locked_slot(tbl, slot->slot_nr)) return htonl(NFS4ERR_DELAY); /* Signal process_op to set this error on next op */ @@ -480,15 +458,6 @@ static bool referring_call_exists(struct nfs_client *clp, for (j = 0; j < rclist->rcl_nrefcalls; j++) { ref = &rclist->rcl_refcalls[j]; - - dprintk("%s: sessionid %x:%x:%x:%x sequenceid %u " - "slotid %u\n", __func__, - ((u32 *)&rclist->rcl_sessionid.data)[0], - ((u32 *)&rclist->rcl_sessionid.data)[1], - ((u32 *)&rclist->rcl_sessionid.data)[2], - ((u32 *)&rclist->rcl_sessionid.data)[3], - ref->rc_sequenceid, ref->rc_slotid); - status = nfs4_slot_wait_on_seqid(tbl, ref->rc_slotid, ref->rc_sequenceid, HZ >> 1) < 0; if (status) @@ -593,8 +562,6 @@ out: res->csr_status = status; trace_nfs4_cb_sequence(args, res, status); - dprintk("%s: exit with status = %d res->csr_status %d\n", __func__, - ntohl(status), ntohl(res->csr_status)); return status; } diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index d051fc3583a9..390ac9c39c59 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -171,8 +171,6 @@ static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound return htonl(NFS4ERR_MINOR_VERS_MISMATCH); } hdr->nops = ntohl(*p); - dprintk("%s: minorversion %d nops %d\n", __func__, - hdr->minorversion, hdr->nops); return 0; } @@ -192,11 +190,8 @@ static __be32 decode_getattr_args(struct svc_rqst *rqstp, struct xdr_stream *xdr status = decode_fh(xdr, &args->fh); if (unlikely(status != 0)) - goto out; - 
status = decode_bitmap(xdr, args->bitmap); -out: - dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); - return status; + return status; + return decode_bitmap(xdr, args->bitmap); } static __be32 decode_recall_args(struct svc_rqst *rqstp, struct xdr_stream *xdr, struct cb_recallargs *args) @@ -206,17 +201,12 @@ static __be32 decode_recall_args(struct svc_rqst *rqstp, struct xdr_stream *xdr, status = decode_delegation_stateid(xdr, &args->stateid); if (unlikely(status != 0)) - goto out; + return status; p = read_buf(xdr, 4); - if (unlikely(p == NULL)) { - status = htonl(NFS4ERR_RESOURCE); - goto out; - } + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE); args->truncate = ntohl(*p); - status = decode_fh(xdr, &args->fh); -out: - dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); - return status; + return decode_fh(xdr, &args->fh); } #if defined(CONFIG_NFS_V4_1) @@ -235,10 +225,8 @@ static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp, uint32_t iomode; p = read_buf(xdr, 4 * sizeof(uint32_t)); - if (unlikely(p == NULL)) { - status = htonl(NFS4ERR_BADXDR); - goto out; - } + if (unlikely(p == NULL)) + return htonl(NFS4ERR_BADXDR); args->cbl_layout_type = ntohl(*p++); /* Depite the spec's xdr, iomode really belongs in the FILE switch, @@ -252,37 +240,23 @@ static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp, args->cbl_range.iomode = iomode; status = decode_fh(xdr, &args->cbl_fh); if (unlikely(status != 0)) - goto out; + return status; p = read_buf(xdr, 2 * sizeof(uint64_t)); - if (unlikely(p == NULL)) { - status = htonl(NFS4ERR_BADXDR); - goto out; - } + if (unlikely(p == NULL)) + return htonl(NFS4ERR_BADXDR); p = xdr_decode_hyper(p, &args->cbl_range.offset); p = xdr_decode_hyper(p, &args->cbl_range.length); - status = decode_layout_stateid(xdr, &args->cbl_stateid); - if (unlikely(status != 0)) - goto out; + return decode_layout_stateid(xdr, &args->cbl_stateid); } else if (args->cbl_recall_type == RETURN_FSID) { p = read_buf(xdr, 2 * sizeof(uint64_t)); - if (unlikely(p == NULL)) { - status = htonl(NFS4ERR_BADXDR); - goto out; - } + if (unlikely(p == NULL)) + return htonl(NFS4ERR_BADXDR); p = xdr_decode_hyper(p, &args->cbl_fsid.major); p = xdr_decode_hyper(p, &args->cbl_fsid.minor); - } else if (args->cbl_recall_type != RETURN_ALL) { - status = htonl(NFS4ERR_BADXDR); - goto out; - } - dprintk("%s: ltype 0x%x iomode %d changed %d recall_type %d\n", - __func__, - args->cbl_layout_type, iomode, - args->cbl_layoutchanged, args->cbl_recall_type); -out: - dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); - return status; + } else if (args->cbl_recall_type != RETURN_ALL) + return htonl(NFS4ERR_BADXDR); + return 0; } static @@ -437,12 +411,11 @@ static __be32 decode_cb_sequence_args(struct svc_rqst *rqstp, status = decode_sessionid(xdr, &args->csa_sessionid); if (status) - goto out; + return status; - status = htonl(NFS4ERR_RESOURCE); p = read_buf(xdr, 5 * sizeof(uint32_t)); if (unlikely(p == NULL)) - goto out; + return htonl(NFS4ERR_RESOURCE); args->csa_addr = svc_addr(rqstp); args->csa_sequenceid = ntohl(*p++); @@ -456,7 +429,7 @@ static __be32 decode_cb_sequence_args(struct svc_rqst *rqstp, sizeof(*args->csa_rclists), GFP_KERNEL); if (unlikely(args->csa_rclists == NULL)) - goto out; + return htonl(NFS4ERR_RESOURCE); for (i = 0; i < args->csa_nrclists; i++) { status = decode_rc_list(xdr, &args->csa_rclists[i]); @@ -466,27 +439,13 @@ static __be32 decode_cb_sequence_args(struct svc_rqst *rqstp, } } } - status = 0; - - dprintk("%s: 
sessionid %x:%x:%x:%x sequenceid %u slotid %u " - "highestslotid %u cachethis %d nrclists %u\n", - __func__, - ((u32 *)&args->csa_sessionid)[0], - ((u32 *)&args->csa_sessionid)[1], - ((u32 *)&args->csa_sessionid)[2], - ((u32 *)&args->csa_sessionid)[3], - args->csa_sequenceid, args->csa_slotid, - args->csa_highestslotid, args->csa_cachethis, - args->csa_nrclists); -out: - dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); - return status; + return 0; out_free: for (i = 0; i < args->csa_nrclists; i++) kfree(args->csa_rclists[i].rcl_refcalls); kfree(args->csa_rclists); - goto out; + return status; } static __be32 decode_recallany_args(struct svc_rqst *rqstp, @@ -557,11 +516,8 @@ static __be32 decode_notify_lock_args(struct svc_rqst *rqstp, struct xdr_stream status = decode_fh(xdr, &args->cbnl_fh); if (unlikely(status != 0)) - goto out; - status = decode_lockowner(xdr, args); -out: - dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); - return status; + return status; + return decode_lockowner(xdr, args); } #endif /* CONFIG_NFS_V4_1 */ @@ -707,7 +663,6 @@ static __be32 encode_getattr_res(struct svc_rqst *rqstp, struct xdr_stream *xdr, status = encode_attr_mtime(xdr, res->bitmap, &res->mtime); *savep = htonl((unsigned int)((char *)xdr->p - (char *)(savep+1))); out: - dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); return status; } @@ -734,11 +689,11 @@ static __be32 encode_cb_sequence_res(struct svc_rqst *rqstp, __be32 status = res->csr_status; if (unlikely(status != 0)) - goto out; + return status; status = encode_sessionid(xdr, &res->csr_sessionid); if (status) - goto out; + return status; p = xdr_reserve_space(xdr, 4 * sizeof(uint32_t)); if (unlikely(p == NULL)) @@ -748,9 +703,7 @@ static __be32 encode_cb_sequence_res(struct svc_rqst *rqstp, *p++ = htonl(res->csr_slotid); *p++ = htonl(res->csr_highestslotid); *p++ = htonl(res->csr_target_highestslotid); -out: - dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); - return status; + return 0; } static __be32 @@ -800,7 +753,6 @@ static void nfs4_callback_free_slot(struct nfs4_session *session, * A single slot, so highest used slotid is either 0 or -1 */ nfs4_free_slot(tbl, slot); - nfs4_slot_tbl_drain_complete(tbl); spin_unlock(&tbl->slot_tbl_lock); } @@ -871,14 +823,10 @@ static __be32 process_op(int nop, struct svc_rqst *rqstp, long maxlen; __be32 res; - dprintk("%s: start\n", __func__); status = decode_op_hdr(xdr_in, &op_nr); if (unlikely(status)) return status; - dprintk("%s: minorversion=%d nop=%d op_nr=%u\n", - __func__, cps->minorversion, nop, op_nr); - switch (cps->minorversion) { case 0: status = preprocess_nfs4_op(op_nr, &op); @@ -917,7 +865,6 @@ encode_hdr: return res; if (op->encode_res != NULL && status == 0) status = op->encode_res(rqstp, xdr_out, resp); - dprintk("%s: done, status = %d\n", __func__, ntohl(status)); return status; } @@ -937,8 +884,6 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r }; unsigned int nops = 0; - dprintk("%s: start\n", __func__); - xdr_init_decode(&xdr_in, &rqstp->rq_arg, rqstp->rq_arg.head[0].iov_base); p = (__be32*)((char *)rqstp->rq_res.head[0].iov_base + rqstp->rq_res.head[0].iov_len); @@ -977,7 +922,6 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r *hdr_res.nops = htonl(nops); nfs4_cb_free_slot(&cps); nfs_put_client(cps.clp); - dprintk("%s: done, status = %u\n", __func__, ntohl(status)); return rpc_success; out_invalidcred: diff --git a/fs/nfs/client.c b/fs/nfs/client.c 
index 04d15a0045e3..ee5ddbd36088 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -218,6 +218,7 @@ static void nfs_cb_idr_remove_locked(struct nfs_client *clp) static void pnfs_init_server(struct nfs_server *server) { rpc_init_wait_queue(&server->roc_rpcwaitq, "pNFS ROC"); + rpc_init_wait_queue(&server->uoc_rpcwaitq, "NFS UOC"); } #else @@ -240,8 +241,6 @@ static void pnfs_init_server(struct nfs_server *server) */ void nfs_free_client(struct nfs_client *clp) { - dprintk("--> nfs_free_client(%u)\n", clp->rpc_ops->version); - nfs_fscache_release_client_cookie(clp); /* -EIO all pending I/O */ @@ -256,8 +255,6 @@ void nfs_free_client(struct nfs_client *clp) kfree(clp->cl_hostname); kfree(clp->cl_acceptor); kfree(clp); - - dprintk("<-- nfs_free_client()\n"); } EXPORT_SYMBOL_GPL(nfs_free_client); @@ -271,7 +268,6 @@ void nfs_put_client(struct nfs_client *clp) if (!clp) return; - dprintk("--> nfs_put_client({%d})\n", atomic_read(&clp->cl_count)); nn = net_generic(clp->cl_net, nfs_net_id); if (atomic_dec_and_lock(&clp->cl_count, &nn->nfs_client_lock)) { @@ -382,9 +378,6 @@ nfs_found_client(const struct nfs_client_initdata *cl_init, } smp_rmb(); - - dprintk("<-- %s found nfs_client %p for %s\n", - __func__, clp, cl_init->hostname ?: ""); return clp; } @@ -403,9 +396,6 @@ struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_init) return NULL; } - dprintk("--> nfs_get_client(%s,v%u)\n", - cl_init->hostname, rpc_ops->version); - /* see if the client already exists */ do { spin_lock(&nn->nfs_client_lock); @@ -430,8 +420,6 @@ struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_init) new = rpc_ops->alloc_client(cl_init); } while (!IS_ERR(new)); - dprintk("<-- nfs_get_client() Failed to find %s (%ld)\n", - cl_init->hostname, PTR_ERR(new)); return new; } EXPORT_SYMBOL_GPL(nfs_get_client); @@ -558,6 +546,7 @@ static int nfs_start_lockd(struct nfs_server *server) .noresvport = server->flags & NFS_MOUNT_NORESVPORT ? 1 : 0, .net = clp->cl_net, + .nlmclnt_ops = clp->cl_nfs_mod->rpc_ops->nlmclnt_ops, }; if (nlm_init.nfs_version > 3) @@ -624,27 +613,21 @@ struct nfs_client *nfs_init_client(struct nfs_client *clp, { int error; - if (clp->cl_cons_state == NFS_CS_READY) { - /* the client is already initialised */ - dprintk("<-- nfs_init_client() = 0 [already %p]\n", clp); + /* the client is already initialised */ + if (clp->cl_cons_state == NFS_CS_READY) return clp; - } /* * Create a client RPC handle for doing FSSTAT with UNIX auth only * - RFC 2623, sec 2.3.2 */ error = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_UNIX); - if (error < 0) - goto error; - nfs_mark_client_ready(clp, NFS_CS_READY); + nfs_mark_client_ready(clp, error == 0 ? 
NFS_CS_READY : error); + if (error < 0) { + nfs_put_client(clp); + clp = ERR_PTR(error); + } return clp; - -error: - nfs_mark_client_ready(clp, error); - nfs_put_client(clp); - dprintk("<-- nfs_init_client() = xerror %d\n", error); - return ERR_PTR(error); } EXPORT_SYMBOL_GPL(nfs_init_client); @@ -668,8 +651,6 @@ static int nfs_init_server(struct nfs_server *server, struct nfs_client *clp; int error; - dprintk("--> nfs_init_server()\n"); - nfs_init_timeout_values(&timeparms, data->nfs_server.protocol, data->timeo, data->retrans); if (data->flags & NFS_MOUNT_NORESVPORT) @@ -677,10 +658,8 @@ static int nfs_init_server(struct nfs_server *server, /* Allocate or find a client reference we can use */ clp = nfs_get_client(&cl_init); - if (IS_ERR(clp)) { - dprintk("<-- nfs_init_server() = error %ld\n", PTR_ERR(clp)); + if (IS_ERR(clp)) return PTR_ERR(clp); - } server->nfs_client = clp; @@ -725,13 +704,11 @@ static int nfs_init_server(struct nfs_server *server, server->mountd_protocol = data->mount_server.protocol; server->namelen = data->namlen; - dprintk("<-- nfs_init_server() = 0 [new %p]\n", clp); return 0; error: server->nfs_client = NULL; nfs_put_client(clp); - dprintk("<-- nfs_init_server() = xerror %d\n", error); return error; } @@ -798,12 +775,10 @@ int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs struct nfs_client *clp = server->nfs_client; int error; - dprintk("--> nfs_probe_fsinfo()\n"); - if (clp->rpc_ops->set_capabilities != NULL) { error = clp->rpc_ops->set_capabilities(server, mntfh); if (error < 0) - goto out_error; + return error; } fsinfo.fattr = fattr; @@ -811,7 +786,7 @@ int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs memset(fsinfo.layouttype, 0, sizeof(fsinfo.layouttype)); error = clp->rpc_ops->fsinfo(server, mntfh, &fsinfo); if (error < 0) - goto out_error; + return error; nfs_server_set_fsinfo(server, &fsinfo); @@ -826,12 +801,7 @@ int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs server->namelen = pathinfo.max_namelen; } - dprintk("<-- nfs_probe_fsinfo() = 0\n"); return 0; - -out_error: - dprintk("nfs_probe_fsinfo: error = %d\n", -error); - return error; } EXPORT_SYMBOL_GPL(nfs_probe_fsinfo); @@ -927,8 +897,6 @@ EXPORT_SYMBOL_GPL(nfs_alloc_server); */ void nfs_free_server(struct nfs_server *server) { - dprintk("--> nfs_free_server()\n"); - nfs_server_remove_lists(server); if (server->destroy != NULL) @@ -946,7 +914,6 @@ void nfs_free_server(struct nfs_server *server) nfs_free_iostats(server->io_stats); kfree(server); nfs_release_automount_timer(); - dprintk("<-- nfs_free_server()\n"); } EXPORT_SYMBOL_GPL(nfs_free_server); @@ -1026,10 +993,6 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source, struct nfs_fattr *fattr_fsinfo; int error; - dprintk("--> nfs_clone_server(,%llx:%llx,)\n", - (unsigned long long) fattr->fsid.major, - (unsigned long long) fattr->fsid.minor); - server = nfs_alloc_server(); if (!server) return ERR_PTR(-ENOMEM); @@ -1061,10 +1024,6 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source, if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN) server->namelen = NFS4_MAXNAMLEN; - dprintk("Cloned FSID: %llx:%llx\n", - (unsigned long long) server->fsid.major, - (unsigned long long) server->fsid.minor); - error = nfs_start_lockd(server); if (error < 0) goto out_free_server; @@ -1073,13 +1032,11 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source, server->mount_time = jiffies; nfs_free_fattr(fattr_fsinfo); - dprintk("<-- 
nfs_clone_server() = %p\n", server); return server; out_free_server: nfs_free_fattr(fattr_fsinfo); nfs_free_server(server); - dprintk("<-- nfs_clone_server() = error %d\n", error); return ERR_PTR(error); } EXPORT_SYMBOL_GPL(nfs_clone_server); diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index f92ba8d6c556..2ac00bf4ecf1 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -57,7 +57,7 @@ static void nfs_readdir_clear_array(struct page*); const struct file_operations nfs_dir_operations = { .llseek = nfs_llseek_dir, .read = generic_read_dir, - .iterate_shared = nfs_readdir, + .iterate = nfs_readdir, .open = nfs_opendir, .release = nfs_closedir, .fsync = nfs_fsync_dir, @@ -145,7 +145,6 @@ struct nfs_cache_array_entry { }; struct nfs_cache_array { - atomic_t refcount; int size; int eof_index; u64 last_cookie; @@ -171,27 +170,6 @@ typedef struct { } nfs_readdir_descriptor_t; /* - * The caller is responsible for calling nfs_readdir_release_array(page) - */ -static -struct nfs_cache_array *nfs_readdir_get_array(struct page *page) -{ - void *ptr; - if (page == NULL) - return ERR_PTR(-EIO); - ptr = kmap(page); - if (ptr == NULL) - return ERR_PTR(-ENOMEM); - return ptr; -} - -static -void nfs_readdir_release_array(struct page *page) -{ - kunmap(page); -} - -/* * we are freeing strings created by nfs_add_to_readdir_array() */ static @@ -201,18 +179,9 @@ void nfs_readdir_clear_array(struct page *page) int i; array = kmap_atomic(page); - if (atomic_dec_and_test(&array->refcount)) - for (i = 0; i < array->size; i++) - kfree(array->array[i].string.name); - kunmap_atomic(array); -} - -static bool grab_page(struct page *page) -{ - struct nfs_cache_array *array = kmap_atomic(page); - bool res = atomic_inc_not_zero(&array->refcount); + for (i = 0; i < array->size; i++) + kfree(array->array[i].string.name); kunmap_atomic(array); - return res; } /* @@ -239,13 +208,10 @@ int nfs_readdir_make_qstr(struct qstr *string, const char *name, unsigned int le static int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page) { - struct nfs_cache_array *array = nfs_readdir_get_array(page); + struct nfs_cache_array *array = kmap(page); struct nfs_cache_array_entry *cache_entry; int ret; - if (IS_ERR(array)) - return PTR_ERR(array); - cache_entry = &array->array[array->size]; /* Check that this entry lies within the page bounds */ @@ -264,7 +230,7 @@ int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page) if (entry->eof != 0) array->eof_index = array->size; out: - nfs_readdir_release_array(page); + kunmap(page); return ret; } @@ -353,11 +319,7 @@ int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc) struct nfs_cache_array *array; int status; - array = nfs_readdir_get_array(desc->page); - if (IS_ERR(array)) { - status = PTR_ERR(array); - goto out; - } + array = kmap(desc->page); if (*desc->dir_cookie == 0) status = nfs_readdir_search_for_pos(array, desc); @@ -369,8 +331,7 @@ int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc) desc->current_index += array->size; desc->page_index++; } - nfs_readdir_release_array(desc->page); -out: + kunmap(desc->page); return status; } @@ -606,13 +567,10 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en out_nopages: if (count == 0 || (status == -EBADCOOKIE && entry->eof != 0)) { - array = nfs_readdir_get_array(page); - if (!IS_ERR(array)) { - array->eof_index = array->size; - status = 0; - nfs_readdir_release_array(page); - } else - status = PTR_ERR(array); + array = kmap(page); + array->eof_index = array->size; + status = 
0; + kunmap(page); } put_page(scratch); @@ -674,13 +632,8 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, goto out; } - array = nfs_readdir_get_array(page); - if (IS_ERR(array)) { - status = PTR_ERR(array); - goto out_label_free; - } + array = kmap(page); memset(array, 0, sizeof(struct nfs_cache_array)); - atomic_set(&array->refcount, 1); array->eof_index = -1; status = nfs_readdir_alloc_pages(pages, array_size); @@ -703,8 +656,7 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, nfs_readdir_free_pages(pages, array_size); out_release_array: - nfs_readdir_release_array(page); -out_label_free: + kunmap(page); nfs4_label_free(entry.label); out: nfs_free_fattr(entry.fattr); @@ -743,7 +695,8 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page) static void cache_page_release(nfs_readdir_descriptor_t *desc) { - nfs_readdir_clear_array(desc->page); + if (!desc->page->mapping) + nfs_readdir_clear_array(desc->page); put_page(desc->page); desc->page = NULL; } @@ -751,16 +704,8 @@ void cache_page_release(nfs_readdir_descriptor_t *desc) static struct page *get_cache_page(nfs_readdir_descriptor_t *desc) { - struct page *page; - - for (;;) { - page = read_cache_page(desc->file->f_mapping, + return read_cache_page(desc->file->f_mapping, desc->page_index, (filler_t *)nfs_readdir_filler, desc); - if (IS_ERR(page) || grab_page(page)) - break; - put_page(page); - } - return page; } /* @@ -809,12 +754,7 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc) struct nfs_cache_array *array = NULL; struct nfs_open_dir_context *ctx = file->private_data; - array = nfs_readdir_get_array(desc->page); - if (IS_ERR(array)) { - res = PTR_ERR(array); - goto out; - } - + array = kmap(desc->page); for (i = desc->cache_entry_index; i < array->size; i++) { struct nfs_cache_array_entry *ent; @@ -835,8 +775,7 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc) if (array->eof_index >= 0) desc->eof = 1; - nfs_readdir_release_array(desc->page); -out: + kunmap(desc->page); cache_page_release(desc); dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %Lu; returning = %d\n", (unsigned long long)*desc->dir_cookie, res); @@ -966,11 +905,13 @@ out: static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence) { + struct inode *inode = file_inode(filp); struct nfs_open_dir_context *dir_ctx = filp->private_data; dfprintk(FILE, "NFS: llseek dir(%pD2, %lld, %d)\n", filp, offset, whence); + inode_lock(inode); switch (whence) { case 1: offset += filp->f_pos; @@ -978,13 +919,16 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence) if (offset >= 0) break; default: - return -EINVAL; + offset = -EINVAL; + goto out; } if (offset != filp->f_pos) { filp->f_pos = offset; dir_ctx->dir_cookie = 0; dir_ctx->duped = 0; } +out: + inode_unlock(inode); return offset; } @@ -2002,29 +1946,6 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) } EXPORT_SYMBOL_GPL(nfs_link); -static void -nfs_complete_rename(struct rpc_task *task, struct nfs_renamedata *data) -{ - struct dentry *old_dentry = data->old_dentry; - struct dentry *new_dentry = data->new_dentry; - struct inode *old_inode = d_inode(old_dentry); - struct inode *new_inode = d_inode(new_dentry); - - nfs_mark_for_revalidate(old_inode); - - switch (task->tk_status) { - case 0: - if (new_inode != NULL) - nfs_drop_nlink(new_inode); - d_move(old_dentry, new_dentry); - nfs_set_verifier(new_dentry, - nfs_save_change_attribute(data->new_dir)); - 
break; - case -ENOENT: - nfs_dentry_handle_enoent(old_dentry); - } -} - /* * RENAME * FIXME: Some nfsds, like the Linux user space nfsd, may generate a @@ -2055,7 +1976,7 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, { struct inode *old_inode = d_inode(old_dentry); struct inode *new_inode = d_inode(new_dentry); - struct dentry *dentry = NULL; + struct dentry *dentry = NULL, *rehash = NULL; struct rpc_task *task; int error = -EBUSY; @@ -2078,8 +1999,10 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, * To prevent any new references to the target during the * rename, we unhash the dentry in advance. */ - if (!d_unhashed(new_dentry)) + if (!d_unhashed(new_dentry)) { d_drop(new_dentry); + rehash = new_dentry; + } if (d_count(new_dentry) > 2) { int err; @@ -2096,6 +2019,7 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out; new_dentry = dentry; + rehash = NULL; new_inode = NULL; } } @@ -2104,8 +2028,7 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (new_inode != NULL) NFS_PROTO(new_inode)->return_delegation(new_inode); - task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry, - nfs_complete_rename); + task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry, NULL); if (IS_ERR(task)) { error = PTR_ERR(task); goto out; @@ -2115,9 +2038,27 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (error == 0) error = task->tk_status; rpc_put_task(task); + nfs_mark_for_revalidate(old_inode); out: + if (rehash) + d_rehash(rehash); trace_nfs_rename_exit(old_dir, old_dentry, new_dir, new_dentry, error); + if (!error) { + if (new_inode != NULL) + nfs_drop_nlink(new_inode); + /* + * The d_move() should be here instead of in an async RPC completion + * handler because we need the proper locks to move the dentry. If + * we're interrupted by a signal, the async RPC completion handler + * should mark the directories for revalidation. + */ + d_move(old_dentry, new_dentry); + nfs_set_verifier(new_dentry, + nfs_save_change_attribute(new_dir)); + } else if (error == -ENOENT) + nfs_dentry_handle_enoent(old_dentry); + /* new dentry created? 
*/ if (dentry) dput(dentry); diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index c1b5fed7c863..6fb9fad2d1e6 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -392,16 +392,6 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq) nfs_direct_req_release(dreq); } -static void nfs_direct_readpage_release(struct nfs_page *req) -{ - dprintk("NFS: direct read done (%s/%llu %d@%lld)\n", - req->wb_context->dentry->d_sb->s_id, - (unsigned long long)NFS_FILEID(d_inode(req->wb_context->dentry)), - req->wb_bytes, - (long long)req_offset(req)); - nfs_release_request(req); -} - static void nfs_direct_read_completion(struct nfs_pgio_header *hdr) { unsigned long bytes = 0; @@ -426,7 +416,7 @@ static void nfs_direct_read_completion(struct nfs_pgio_header *hdr) set_page_dirty(page); bytes += req->wb_bytes; nfs_list_remove_request(req); - nfs_direct_readpage_release(req); + nfs_release_request(req); } out_put: if (put_dreq(dreq)) @@ -700,16 +690,9 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data) int status = data->task.tk_status; nfs_init_cinfo_from_dreq(&cinfo, dreq); - if (status < 0) { - dprintk("NFS: %5u commit failed with error %d.\n", - data->task.tk_pid, status); - dreq->flags = NFS_ODIRECT_RESCHED_WRITES; - } else if (nfs_direct_cmp_commit_data_verf(dreq, data)) { - dprintk("NFS: %5u commit verify failed\n", data->task.tk_pid); + if (status < 0 || nfs_direct_cmp_commit_data_verf(dreq, data)) dreq->flags = NFS_ODIRECT_RESCHED_WRITES; - } - dprintk("NFS: %5u commit returned %d\n", data->task.tk_pid, status); while (!list_empty(&data->pages)) { req = nfs_list_entry(data->pages.next); nfs_list_remove_request(req); diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 668213984d68..5713eb32a45e 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -482,7 +482,7 @@ static int nfs_launder_page(struct page *page) inode->i_ino, (long long)page_offset(page)); nfs_fscache_wait_on_page_write(nfsi, page); - return nfs_wb_launder_page(inode, page); + return nfs_wb_page(inode, page); } static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file, @@ -697,14 +697,14 @@ do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) if (!IS_ERR(l_ctx)) { status = nfs_iocounter_wait(l_ctx); nfs_put_lock_context(l_ctx); - if (status < 0) + /* NOTE: special case + * If we're signalled while cleaning up locks on process exit, we + * still need to complete the unlock. + */ + if (status < 0 && !(fl->fl_flags & FL_CLOSE)) return status; } - /* NOTE: special case - * If we're signalled while cleaning up locks on process exit, we - * still need to complete the unlock. - */ /* * Use local locking if mounted with "-onolock" or with appropriate * "-olocal_lock=" @@ -820,9 +820,23 @@ int nfs_flock(struct file *filp, int cmd, struct file_lock *fl) if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FLOCK) is_local = 1; - /* We're simulating flock() locks using posix locks on the server */ - if (fl->fl_type == F_UNLCK) + /* + * VFS doesn't require the open mode to match a flock() lock's type. + * NFS, however, may simulate flock() locking with posix locking which + * requires the open mode to match the lock type. 
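The comment above captures the key difference: a plain flock() never checks the descriptor's open mode, but once the NFS client maps it onto a POSIX lock the mode has to match the lock type. A minimal user-space illustration of the case the new switch rejects; the NFS mount path is hypothetical:

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/file.h>
#include <unistd.h>

int main(void)
{
	/* hypothetical file on an NFS mount that simulates flock() with POSIX locks */
	int fd = open("/mnt/nfs/testfile", O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/*
	 * LOCK_SH is a read lock, but the descriptor was opened write-only.
	 * On a local filesystem this succeeds; with the check above the NFS
	 * client refuses it with EBADF rather than asking the server for a
	 * POSIX read lock that this open mode could not back.
	 */
	if (flock(fd, LOCK_SH) < 0)
		fprintf(stderr, "flock: %s\n", strerror(errno));
	close(fd);
	return 0;
}
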
+ */ + switch (fl->fl_type) { + case F_UNLCK: return do_unlk(filp, cmd, fl, is_local); + case F_RDLCK: + if (!(filp->f_mode & FMODE_READ)) + return -EBADF; + break; + case F_WRLCK: + if (!(filp->f_mode & FMODE_WRITE)) + return -EBADF; + } + return do_setlk(filp, cmd, fl, is_local); } EXPORT_SYMBOL_GPL(nfs_flock); diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index acd30baca461..1cf85d65b748 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -921,11 +921,11 @@ fl_pnfs_update_layout(struct inode *ino, fl = FILELAYOUT_LSEG(lseg); status = filelayout_check_deviceid(lo, fl, gfp_flags); - if (status) + if (status) { + pnfs_put_lseg(lseg); lseg = ERR_PTR(status); + } out: - if (IS_ERR(lseg)) - pnfs_put_lseg(lseg); return lseg; } @@ -933,6 +933,7 @@ static void filelayout_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) { + pnfs_generic_pg_check_layout(pgio); if (!pgio->pg_lseg) { pgio->pg_lseg = fl_pnfs_update_layout(pgio->pg_inode, req->wb_context, @@ -959,6 +960,7 @@ filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_commit_info cinfo; int status; + pnfs_generic_pg_check_layout(pgio); if (!pgio->pg_lseg) { pgio->pg_lseg = fl_pnfs_update_layout(pgio->pg_inode, req->wb_context, diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 42dedf2d625f..23542dc44a25 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -454,6 +454,7 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, goto out_err_free; /* fh */ + rc = -EIO; p = xdr_inline_decode(&stream, 4); if (!p) goto out_err_free; @@ -846,6 +847,7 @@ ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio, int ds_idx; retry: + pnfs_generic_pg_check_layout(pgio); /* Use full layout for now */ if (!pgio->pg_lseg) ff_layout_pg_get_read(pgio, req, false); @@ -894,6 +896,7 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio, int status; retry: + pnfs_generic_pg_check_layout(pgio); if (!pgio->pg_lseg) { pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, req->wb_context, @@ -1800,16 +1803,16 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync) ds = nfs4_ff_layout_prepare_ds(lseg, idx, true); if (!ds) - return PNFS_NOT_ATTEMPTED; + goto out_failed; ds_clnt = nfs4_ff_find_or_create_ds_client(lseg, idx, ds->ds_clp, hdr->inode); if (IS_ERR(ds_clnt)) - return PNFS_NOT_ATTEMPTED; + goto out_failed; ds_cred = ff_layout_get_ds_cred(lseg, idx, hdr->cred); if (!ds_cred) - return PNFS_NOT_ATTEMPTED; + goto out_failed; vers = nfs4_ff_layout_ds_version(lseg, idx); @@ -1839,6 +1842,11 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync) sync, RPC_TASK_SOFTCONN); put_rpccred(ds_cred); return PNFS_ATTEMPTED; + +out_failed: + if (ff_layout_avoid_mds_available_ds(lseg)) + return PNFS_TRY_AGAIN; + return PNFS_NOT_ATTEMPTED; } static u32 calc_ds_index_from_commit(struct pnfs_layout_segment *lseg, u32 i) @@ -2354,10 +2362,21 @@ ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args) return 0; } +static int +ff_layout_set_layoutdriver(struct nfs_server *server, + const struct nfs_fh *dummy) +{ +#if IS_ENABLED(CONFIG_NFS_V4_2) + server->caps |= NFS_CAP_LAYOUTSTATS; +#endif + return 0; +} + static struct pnfs_layoutdriver_type flexfilelayout_type = { .id = LAYOUT_FLEX_FILES, .name = "LAYOUT_FLEX_FILES", .owner = THIS_MODULE, + .set_layoutdriver = ff_layout_set_layoutdriver, .alloc_layout_hdr = ff_layout_alloc_layout_hdr, .free_layout_hdr 
= ff_layout_free_layout_hdr, .alloc_lseg = ff_layout_alloc_lseg, diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c index 457cfeb1d5c1..6df7a0cf5660 100644 --- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c +++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c @@ -119,7 +119,13 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, if (ds_versions[i].wsize > NFS_MAX_FILE_IO_SIZE) ds_versions[i].wsize = NFS_MAX_FILE_IO_SIZE; - if (ds_versions[i].version != 3 || ds_versions[i].minor_version != 0) { + /* + * check for valid major/minor combination. + * currently we support dataserver which talk: + * v3, v4.0, v4.1, v4.2 + */ + if (!((ds_versions[i].version == 3 && ds_versions[i].minor_version == 0) || + (ds_versions[i].version == 4 && ds_versions[i].minor_version < 3))) { dprintk("%s: [%d] unsupported ds version %d-%d\n", __func__, i, ds_versions[i].version, ds_versions[i].minor_version); @@ -415,7 +421,7 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, mirror->mirror_ds->ds_versions[0].minor_version); /* connect success, check rsize/wsize limit */ - if (ds->ds_clp) { + if (!status) { max_payload = nfs_block_size(rpc_max_payload(ds->ds_clp->cl_rpcclient), NULL); diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index f489a5a71bd5..1de93ba78dc9 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -734,7 +734,10 @@ int nfs_getattr(const struct path *path, struct kstat *stat, if (need_atime || nfs_need_revalidate_inode(inode)) { struct nfs_server *server = NFS_SERVER(inode); - nfs_readdirplus_parent_cache_miss(path->dentry); + if (!(server->flags & NFS_MOUNT_NOAC)) + nfs_readdirplus_parent_cache_miss(path->dentry); + else + nfs_readdirplus_parent_cache_hit(path->dentry); err = __nfs_revalidate_inode(server, inode); } else nfs_readdirplus_parent_cache_hit(path->dentry); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 7b38fedb7e03..8701d7617964 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -7,6 +7,7 @@ #include <linux/security.h> #include <linux/crc32.h> #include <linux/nfs_page.h> +#include <linux/wait_bit.h> #define NFS_MS_MASK (MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_SYNCHRONOUS) @@ -398,7 +399,6 @@ extern struct file_system_type nfs4_referral_fs_type; bool nfs_auth_info_match(const struct nfs_auth_info *, rpc_authflavor_t); struct dentry *nfs_try_mount(int, const char *, struct nfs_mount_info *, struct nfs_subversion *); -void nfs_initialise_sb(struct super_block *); int nfs_set_sb_security(struct super_block *, struct dentry *, struct nfs_mount_info *); int nfs_clone_sb_security(struct super_block *, struct dentry *, struct nfs_mount_info *); struct dentry *nfs_fs_mount_common(struct nfs_server *, int, const char *, @@ -458,7 +458,6 @@ extern void nfs_read_prepare(struct rpc_task *task, void *calldata); extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio); /* super.c */ -void nfs_clone_super(struct super_block *, struct nfs_mount_info *); void nfs_umount_begin(struct super_block *); int nfs_statfs(struct dentry *, struct kstatfs *); int nfs_show_options(struct seq_file *, struct dentry *); @@ -495,7 +494,6 @@ void nfs_mark_request_commit(struct nfs_page *req, u32 ds_commit_idx); int nfs_write_need_commit(struct nfs_pgio_header *); void nfs_writeback_update_inode(struct nfs_pgio_header *hdr); -int nfs_commit_file(struct file *file, struct nfs_write_verifier *verf); int nfs_generic_commit_list(struct inode *inode, struct list_head *head, int how, struct 
nfs_commit_info *cinfo); void nfs_retry_commit(struct list_head *page_list, @@ -756,9 +754,13 @@ static inline bool nfs_error_is_fatal(int err) { switch (err) { case -ERESTARTSYS: + case -EACCES: + case -EDQUOT: + case -EFBIG: case -EIO: case -ENOSPC: case -EROFS: + case -ESTALE: case -E2BIG: return true; default: diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index 786f17580582..e5686be67be8 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -143,11 +143,8 @@ struct vfsmount *nfs_d_automount(struct path *path) struct nfs_fh *fh = NULL; struct nfs_fattr *fattr = NULL; - dprintk("--> nfs_d_automount()\n"); - - mnt = ERR_PTR(-ESTALE); if (IS_ROOT(path->dentry)) - goto out_nofree; + return ERR_PTR(-ESTALE); mnt = ERR_PTR(-ENOMEM); fh = nfs_alloc_fhandle(); @@ -155,13 +152,10 @@ struct vfsmount *nfs_d_automount(struct path *path) if (fh == NULL || fattr == NULL) goto out; - dprintk("%s: enter\n", __func__); - mnt = server->nfs_client->rpc_ops->submount(server, path->dentry, fh, fattr); if (IS_ERR(mnt)) goto out; - dprintk("%s: done, success\n", __func__); mntget(mnt); /* prevent immediate expiration */ mnt_set_expiry(mnt, &nfs_automount_list); schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout); @@ -169,11 +163,6 @@ struct vfsmount *nfs_d_automount(struct path *path) out: nfs_free_fattr(fattr); nfs_free_fhandle(fh); -out_nofree: - if (IS_ERR(mnt)) - dprintk("<-- %s(): error %ld\n", __func__, PTR_ERR(mnt)); - else - dprintk("<-- %s() = %p\n", __func__, mnt); return mnt; } @@ -248,27 +237,20 @@ struct vfsmount *nfs_do_submount(struct dentry *dentry, struct nfs_fh *fh, .fattr = fattr, .authflavor = authflavor, }; - struct vfsmount *mnt = ERR_PTR(-ENOMEM); + struct vfsmount *mnt; char *page = (char *) __get_free_page(GFP_USER); char *devname; - dprintk("--> nfs_do_submount()\n"); - - dprintk("%s: submounting on %pd2\n", __func__, - dentry); if (page == NULL) - goto out; + return ERR_PTR(-ENOMEM); + devname = nfs_devname(dentry, page, PAGE_SIZE); - mnt = (struct vfsmount *)devname; if (IS_ERR(devname)) - goto free_page; - mnt = nfs_do_clone_mount(NFS_SB(dentry->d_sb), devname, &mountdata); -free_page: - free_page((unsigned long)page); -out: - dprintk("%s: done\n", __func__); + mnt = ERR_CAST(devname); + else + mnt = nfs_do_clone_mount(NFS_SB(dentry->d_sb), devname, &mountdata); - dprintk("<-- nfs_do_submount() = %p\n", mnt); + free_page((unsigned long)page); return mnt; } EXPORT_SYMBOL_GPL(nfs_do_submount); diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index dc925b531f32..0c07b567118d 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -865,12 +865,63 @@ static void nfs3_proc_commit_setup(struct nfs_commit_data *data, struct rpc_mess msg->rpc_proc = &nfs3_procedures[NFS3PROC_COMMIT]; } +static void nfs3_nlm_alloc_call(void *data) +{ + struct nfs_lock_context *l_ctx = data; + if (l_ctx && test_bit(NFS_CONTEXT_UNLOCK, &l_ctx->open_context->flags)) { + get_nfs_open_context(l_ctx->open_context); + nfs_get_lock_context(l_ctx->open_context); + } +} + +static bool nfs3_nlm_unlock_prepare(struct rpc_task *task, void *data) +{ + struct nfs_lock_context *l_ctx = data; + if (l_ctx && test_bit(NFS_CONTEXT_UNLOCK, &l_ctx->open_context->flags)) + return nfs_async_iocounter_wait(task, l_ctx); + return false; + +} + +static void nfs3_nlm_release_call(void *data) +{ + struct nfs_lock_context *l_ctx = data; + struct nfs_open_context *ctx; + if (l_ctx && test_bit(NFS_CONTEXT_UNLOCK, &l_ctx->open_context->flags)) { + ctx = l_ctx->open_context; + 
nfs_put_lock_context(l_ctx); + put_nfs_open_context(ctx); + } +} + +const struct nlmclnt_operations nlmclnt_fl_close_lock_ops = { + .nlmclnt_alloc_call = nfs3_nlm_alloc_call, + .nlmclnt_unlock_prepare = nfs3_nlm_unlock_prepare, + .nlmclnt_release_call = nfs3_nlm_release_call, +}; + static int nfs3_proc_lock(struct file *filp, int cmd, struct file_lock *fl) { struct inode *inode = file_inode(filp); + struct nfs_lock_context *l_ctx = NULL; + struct nfs_open_context *ctx = nfs_file_open_context(filp); + int status; - return nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl); + if (fl->fl_flags & FL_CLOSE) { + l_ctx = nfs_get_lock_context(ctx); + if (IS_ERR(l_ctx)) + l_ctx = NULL; + else + set_bit(NFS_CONTEXT_UNLOCK, &ctx->flags); + } + + status = nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl, l_ctx); + + if (l_ctx) + nfs_put_lock_context(l_ctx); + + return status; } static int nfs3_have_delegation(struct inode *inode, fmode_t flags) @@ -921,6 +972,7 @@ const struct nfs_rpc_ops nfs_v3_clientops = { .dir_inode_ops = &nfs3_dir_inode_operations, .file_inode_ops = &nfs3_file_inode_operations, .file_ops = &nfs_file_operations, + .nlmclnt_ops = &nlmclnt_fl_close_lock_ops, .getroot = nfs3_proc_get_root, .submount = nfs_submount, .try_mount = nfs_try_mount, diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 1e486c73ec94..319a47db218d 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -167,23 +167,29 @@ static ssize_t _nfs42_proc_copy(struct file *src, if (status) return status; + res->commit_res.verf = kzalloc(sizeof(struct nfs_writeverf), GFP_NOFS); + if (!res->commit_res.verf) + return -ENOMEM; status = nfs4_call_sync(server->client, server, &msg, &args->seq_args, &res->seq_res, 0); if (status == -ENOTSUPP) server->caps &= ~NFS_CAP_COPY; if (status) - return status; + goto out; - if (res->write_res.verifier.committed != NFS_FILE_SYNC) { - status = nfs_commit_file(dst, &res->write_res.verifier.verifier); - if (status) - return status; + if (nfs_write_verifier_cmp(&res->write_res.verifier.verifier, + &res->commit_res.verf->verifier)) { + status = -EAGAIN; + goto out; } truncate_pagecache_range(dst_inode, pos_dst, pos_dst + res->write_res.count); - return res->write_res.count; + status = res->write_res.count; +out: + kfree(res->commit_res.verf); + return status; } ssize_t nfs42_proc_copy(struct file *src, loff_t pos_src, @@ -240,6 +246,9 @@ ssize_t nfs42_proc_copy(struct file *src, loff_t pos_src, if (err == -ENOTSUPP) { err = -EOPNOTSUPP; break; + } if (err == -EAGAIN) { + dst_exception.retry = 1; + continue; } err2 = nfs4_handle_exception(server, err, &src_exception); @@ -379,6 +388,7 @@ nfs42_layoutstat_done(struct rpc_task *task, void *calldata) pnfs_mark_layout_stateid_invalid(lo, &head); spin_unlock(&inode->i_lock); pnfs_free_lseg_list(&head); + nfs_commit_inode(inode, 0); } else spin_unlock(&inode->i_lock); break; @@ -400,8 +410,6 @@ nfs42_layoutstat_done(struct rpc_task *task, void *calldata) case -EOPNOTSUPP: NFS_SERVER(inode)->caps &= ~NFS_CAP_LAYOUTSTATS; } - - dprintk("%s server returns %d\n", __func__, task->tk_status); } static void diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index 6c7296454bbc..528362f69cc1 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -66,12 +66,14 @@ encode_putfh_maxsz + \ encode_savefh_maxsz + \ encode_putfh_maxsz + \ - encode_copy_maxsz) + encode_copy_maxsz + \ + encode_commit_maxsz) #define NFS4_dec_copy_sz (compound_decode_hdr_maxsz + \ decode_putfh_maxsz + \ decode_savefh_maxsz + \ decode_putfh_maxsz + \ - decode_copy_maxsz) 
+ decode_copy_maxsz + \ + decode_commit_maxsz) #define NFS4_enc_deallocate_sz (compound_encode_hdr_maxsz + \ encode_putfh_maxsz + \ encode_deallocate_maxsz + \ @@ -222,6 +224,18 @@ static void nfs4_xdr_enc_allocate(struct rpc_rqst *req, encode_nops(&hdr); } +static void encode_copy_commit(struct xdr_stream *xdr, + struct nfs42_copy_args *args, + struct compound_hdr *hdr) +{ + __be32 *p; + + encode_op_hdr(xdr, OP_COMMIT, decode_commit_maxsz, hdr); + p = reserve_space(xdr, 12); + p = xdr_encode_hyper(p, args->dst_pos); + *p = cpu_to_be32(args->count); +} + /* * Encode COPY request */ @@ -239,6 +253,7 @@ static void nfs4_xdr_enc_copy(struct rpc_rqst *req, encode_savefh(xdr, &hdr); encode_putfh(xdr, args->dst_fh, &hdr); encode_copy(xdr, args, &hdr); + encode_copy_commit(xdr, args, &hdr); encode_nops(&hdr); } @@ -481,6 +496,9 @@ static int nfs4_xdr_dec_copy(struct rpc_rqst *rqstp, if (status) goto out; status = decode_copy(xdr, res); + if (status) + goto out; + status = decode_commit(xdr, &res->commit_res); out: return status; } diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 8346ccbf2d52..66776f022111 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -359,11 +359,9 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp, struct nfs_client *old; int error; - if (clp->cl_cons_state == NFS_CS_READY) { + if (clp->cl_cons_state == NFS_CS_READY) /* the client is initialised already */ - dprintk("<-- nfs4_init_client() = 0 [already %p]\n", clp); return clp; - } /* Check NFS protocol revision and initialize RPC op vector */ clp->rpc_ops = &nfs_v4_clientops; @@ -421,7 +419,6 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp, error: nfs_mark_client_ready(clp, error); nfs_put_client(clp); - dprintk("<-- nfs4_init_client() = xerror %d\n", error); return ERR_PTR(error); } @@ -469,6 +466,50 @@ static bool nfs4_same_verifier(nfs4_verifier *v1, nfs4_verifier *v2) return memcmp(v1->data, v2->data, sizeof(v1->data)) == 0; } +static int nfs4_match_client(struct nfs_client *pos, struct nfs_client *new, + struct nfs_client **prev, struct nfs_net *nn) +{ + int status; + + if (pos->rpc_ops != new->rpc_ops) + return 1; + + if (pos->cl_minorversion != new->cl_minorversion) + return 1; + + /* If "pos" isn't marked ready, we can't trust the + * remaining fields in "pos", especially the client + * ID and serverowner fields. Wait for CREATE_SESSION + * to finish. */ + if (pos->cl_cons_state > NFS_CS_READY) { + atomic_inc(&pos->cl_count); + spin_unlock(&nn->nfs_client_lock); + + nfs_put_client(*prev); + *prev = pos; + + status = nfs_wait_client_init_complete(pos); + spin_lock(&nn->nfs_client_lock); + + if (status < 0) + return status; + } + + if (pos->cl_cons_state != NFS_CS_READY) + return 1; + + if (pos->cl_clientid != new->cl_clientid) + return 1; + + /* NFSv4.1 always uses the uniform string, however someone + * might switch the uniquifier string on us. 
+ */ + if (!nfs4_match_client_owner_id(pos, new)) + return 1; + + return 0; +} + /** * nfs40_walk_client_list - Find server that recognizes a client ID * @@ -497,34 +538,10 @@ int nfs40_walk_client_list(struct nfs_client *new, spin_lock(&nn->nfs_client_lock); list_for_each_entry(pos, &nn->nfs_client_list, cl_share_link) { - if (pos->rpc_ops != new->rpc_ops) - continue; - - if (pos->cl_minorversion != new->cl_minorversion) - continue; - - /* If "pos" isn't marked ready, we can't trust the - * remaining fields in "pos" */ - if (pos->cl_cons_state > NFS_CS_READY) { - atomic_inc(&pos->cl_count); - spin_unlock(&nn->nfs_client_lock); - - nfs_put_client(prev); - prev = pos; - - status = nfs_wait_client_init_complete(pos); - if (status < 0) - goto out; - status = -NFS4ERR_STALE_CLIENTID; - spin_lock(&nn->nfs_client_lock); - } - if (pos->cl_cons_state != NFS_CS_READY) - continue; - - if (pos->cl_clientid != new->cl_clientid) - continue; - - if (!nfs4_match_client_owner_id(pos, new)) + status = nfs4_match_client(pos, new, &prev, nn); + if (status < 0) + goto out_unlock; + if (status != 0) continue; /* * We just sent a new SETCLIENTID, which should have @@ -557,8 +574,6 @@ int nfs40_walk_client_list(struct nfs_client *new, prev = NULL; *result = pos; - dprintk("NFS: <-- %s using nfs_client = %p ({%d})\n", - __func__, pos, atomic_read(&pos->cl_count)); goto out; case -ERESTARTSYS: case -ETIMEDOUT: @@ -572,32 +587,17 @@ int nfs40_walk_client_list(struct nfs_client *new, spin_lock(&nn->nfs_client_lock); } +out_unlock: spin_unlock(&nn->nfs_client_lock); /* No match found. The server lost our clientid */ out: nfs_put_client(prev); - dprintk("NFS: <-- %s status = %d\n", __func__, status); return status; } #ifdef CONFIG_NFS_V4_1 /* - * Returns true if the client IDs match - */ -static bool nfs4_match_clientids(u64 a, u64 b) -{ - if (a != b) { - dprintk("NFS: --> %s client ID %llx does not match %llx\n", - __func__, a, b); - return false; - } - dprintk("NFS: --> %s client ID %llx matches %llx\n", - __func__, a, b); - return true; -} - -/* * Returns true if the server major ids match */ static bool @@ -605,36 +605,8 @@ nfs4_check_serverowner_major_id(struct nfs41_server_owner *o1, struct nfs41_server_owner *o2) { if (o1->major_id_sz != o2->major_id_sz) - goto out_major_mismatch; - if (memcmp(o1->major_id, o2->major_id, o1->major_id_sz) != 0) - goto out_major_mismatch; - - dprintk("NFS: --> %s server owner major IDs match\n", __func__); - return true; - -out_major_mismatch: - dprintk("NFS: --> %s server owner major IDs do not match\n", - __func__); - return false; -} - -/* - * Returns true if server minor ids match - */ -static bool -nfs4_check_serverowner_minor_id(struct nfs41_server_owner *o1, - struct nfs41_server_owner *o2) -{ - /* Check eir_server_owner so_minor_id */ - if (o1->minor_id != o2->minor_id) - goto out_minor_mismatch; - - dprintk("NFS: --> %s server owner minor IDs match\n", __func__); - return true; - -out_minor_mismatch: - dprintk("NFS: --> %s server owner minor IDs do not match\n", __func__); - return false; + return false; + return memcmp(o1->major_id, o2->major_id, o1->major_id_sz) == 0; } /* @@ -645,18 +617,9 @@ nfs4_check_server_scope(struct nfs41_server_scope *s1, struct nfs41_server_scope *s2) { if (s1->server_scope_sz != s2->server_scope_sz) - goto out_scope_mismatch; - if (memcmp(s1->server_scope, s2->server_scope, - s1->server_scope_sz) != 0) - goto out_scope_mismatch; - - dprintk("NFS: --> %s server scopes match\n", __func__); - return true; - -out_scope_mismatch: - dprintk("NFS: 
--> %s server scopes do not match\n", - __func__); - return false; + return false; + return memcmp(s1->server_scope, s2->server_scope, + s1->server_scope_sz) == 0; } /** @@ -680,7 +643,7 @@ int nfs4_detect_session_trunking(struct nfs_client *clp, struct rpc_xprt *xprt) { /* Check eir_clientid */ - if (!nfs4_match_clientids(clp->cl_clientid, res->clientid)) + if (clp->cl_clientid != res->clientid) goto out_err; /* Check eir_server_owner so_major_id */ @@ -689,8 +652,7 @@ int nfs4_detect_session_trunking(struct nfs_client *clp, goto out_err; /* Check eir_server_owner so_minor_id */ - if (!nfs4_check_serverowner_minor_id(clp->cl_serverowner, - res->server_owner)) + if (clp->cl_serverowner->minor_id != res->server_owner->minor_id) goto out_err; /* Check eir_server_scope */ @@ -739,33 +701,10 @@ int nfs41_walk_client_list(struct nfs_client *new, if (pos == new) goto found; - if (pos->rpc_ops != new->rpc_ops) - continue; - - if (pos->cl_minorversion != new->cl_minorversion) - continue; - - /* If "pos" isn't marked ready, we can't trust the - * remaining fields in "pos", especially the client - * ID and serverowner fields. Wait for CREATE_SESSION - * to finish. */ - if (pos->cl_cons_state > NFS_CS_READY) { - atomic_inc(&pos->cl_count); - spin_unlock(&nn->nfs_client_lock); - - nfs_put_client(prev); - prev = pos; - - status = nfs_wait_client_init_complete(pos); - spin_lock(&nn->nfs_client_lock); - if (status < 0) - break; - status = -NFS4ERR_STALE_CLIENTID; - } - if (pos->cl_cons_state != NFS_CS_READY) - continue; - - if (!nfs4_match_clientids(pos->cl_clientid, new->cl_clientid)) + status = nfs4_match_client(pos, new, &prev, nn); + if (status < 0) + goto out; + if (status != 0) continue; /* @@ -777,23 +716,15 @@ int nfs41_walk_client_list(struct nfs_client *new, new->cl_serverowner)) continue; - /* Unlike NFSv4.0, we know that NFSv4.1 always uses the - * uniform string, however someone might switch the - * uniquifier string on us. 
- */ - if (!nfs4_match_client_owner_id(pos, new)) - continue; found: atomic_inc(&pos->cl_count); *result = pos; status = 0; - dprintk("NFS: <-- %s using nfs_client = %p ({%d})\n", - __func__, pos, atomic_read(&pos->cl_count)); break; } +out: spin_unlock(&nn->nfs_client_lock); - dprintk("NFS: <-- %s status = %d\n", __func__, status); nfs_put_client(prev); return status; } @@ -916,9 +847,6 @@ static int nfs4_set_client(struct nfs_server *server, .timeparms = timeparms, }; struct nfs_client *clp; - int error; - - dprintk("--> nfs4_set_client()\n"); if (server->flags & NFS_MOUNT_NORESVPORT) set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); @@ -927,15 +855,11 @@ static int nfs4_set_client(struct nfs_server *server, /* Allocate or find a client reference we can use */ clp = nfs_get_client(&cl_init); - if (IS_ERR(clp)) { - error = PTR_ERR(clp); - goto error; - } + if (IS_ERR(clp)) + return PTR_ERR(clp); - if (server->nfs_client == clp) { - error = -ELOOP; - goto error; - } + if (server->nfs_client == clp) + return -ELOOP; /* * Query for the lease time on clientid setup or renewal @@ -947,11 +871,7 @@ static int nfs4_set_client(struct nfs_server *server, set_bit(NFS_CS_CHECK_LEASE_TIME, &clp->cl_res_state); server->nfs_client = clp; - dprintk("<-- nfs4_set_client() = 0 [new %p]\n", clp); return 0; -error: - dprintk("<-- nfs4_set_client() = xerror %d\n", error); - return error; } /* @@ -982,7 +902,6 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv, .net = mds_clp->cl_net, .timeparms = &ds_timeout, }; - struct nfs_client *clp; char buf[INET6_ADDRSTRLEN + 1]; if (rpc_ntop(ds_addr, buf, sizeof(buf)) <= 0) @@ -998,10 +917,7 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv, * (section 13.1 RFC 5661). */ nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans); - clp = nfs_get_client(&cl_init); - - dprintk("<-- %s %p\n", __func__, clp); - return clp; + return nfs_get_client(&cl_init); } EXPORT_SYMBOL_GPL(nfs4_set_ds_client); @@ -1098,8 +1014,6 @@ static int nfs4_init_server(struct nfs_server *server, struct rpc_timeout timeparms; int error; - dprintk("--> nfs4_init_server()\n"); - nfs_init_timeout_values(&timeparms, data->nfs_server.protocol, data->timeo, data->retrans); @@ -1127,7 +1041,7 @@ static int nfs4_init_server(struct nfs_server *server, data->minorversion, data->net); if (error < 0) - goto error; + return error; if (data->rsize) server->rsize = nfs_block_size(data->rsize, NULL); @@ -1138,16 +1052,10 @@ static int nfs4_init_server(struct nfs_server *server, server->acregmax = data->acregmax * HZ; server->acdirmin = data->acdirmin * HZ; server->acdirmax = data->acdirmax * HZ; + server->port = data->nfs_server.port; - server->port = data->nfs_server.port; - - error = nfs_init_server_rpcclient(server, &timeparms, - data->selected_flavor); - -error: - /* Done */ - dprintk("<-- nfs4_init_server() = %d\n", error); - return error; + return nfs_init_server_rpcclient(server, &timeparms, + data->selected_flavor); } /* @@ -1163,8 +1071,6 @@ struct nfs_server *nfs4_create_server(struct nfs_mount_info *mount_info, bool auth_probe; int error; - dprintk("--> nfs4_create_server()\n"); - server = nfs_alloc_server(); if (!server) return ERR_PTR(-ENOMEM); @@ -1180,12 +1086,10 @@ struct nfs_server *nfs4_create_server(struct nfs_mount_info *mount_info, if (error < 0) goto error; - dprintk("<-- nfs4_create_server() = %p\n", server); return server; error: nfs_free_server(server); - dprintk("<-- nfs4_create_server() = error %d\n", error); return ERR_PTR(error); } @@ 
-1200,8 +1104,6 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data, bool auth_probe; int error; - dprintk("--> nfs4_create_referral_server()\n"); - server = nfs_alloc_server(); if (!server) return ERR_PTR(-ENOMEM); @@ -1235,12 +1137,10 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data, if (error < 0) goto error; - dprintk("<-- nfs_create_referral_server() = %p\n", server); return server; error: nfs_free_server(server); - dprintk("<-- nfs4_create_referral_server() = error %d\n", error); return ERR_PTR(error); } @@ -1300,31 +1200,16 @@ int nfs4_update_server(struct nfs_server *server, const char *hostname, struct sockaddr *localaddr = (struct sockaddr *)&address; int error; - dprintk("--> %s: move FSID %llx:%llx to \"%s\")\n", __func__, - (unsigned long long)server->fsid.major, - (unsigned long long)server->fsid.minor, - hostname); - error = rpc_switch_client_transport(clnt, &xargs, clnt->cl_timeout); - if (error != 0) { - dprintk("<-- %s(): rpc_switch_client_transport returned %d\n", - __func__, error); - goto out; - } + if (error != 0) + return error; error = rpc_localaddr(clnt, localaddr, sizeof(address)); - if (error != 0) { - dprintk("<-- %s(): rpc_localaddr returned %d\n", - __func__, error); - goto out; - } + if (error != 0) + return error; - error = -EAFNOSUPPORT; - if (rpc_ntop(localaddr, buf, sizeof(buf)) == 0) { - dprintk("<-- %s(): rpc_ntop returned %d\n", - __func__, error); - goto out; - } + if (rpc_ntop(localaddr, buf, sizeof(buf)) == 0) + return -EAFNOSUPPORT; nfs_server_remove_lists(server); error = nfs4_set_client(server, hostname, sap, salen, buf, @@ -1333,21 +1218,12 @@ int nfs4_update_server(struct nfs_server *server, const char *hostname, nfs_put_client(clp); if (error != 0) { nfs_server_insert_lists(server); - dprintk("<-- %s(): nfs4_set_client returned %d\n", - __func__, error); - goto out; + return error; } if (server->nfs_client->cl_hostname == NULL) server->nfs_client->cl_hostname = kstrdup(hostname, GFP_KERNEL); nfs_server_insert_lists(server); - error = nfs_probe_destination(server); - if (error < 0) - goto out; - - dprintk("<-- %s() succeeded\n", __func__); - -out: - return error; + return nfs_probe_destination(server); } diff --git a/fs/nfs/nfs4getroot.c b/fs/nfs/nfs4getroot.c index 039b3eb6d834..ac8406018962 100644 --- a/fs/nfs/nfs4getroot.c +++ b/fs/nfs/nfs4getroot.c @@ -14,8 +14,6 @@ int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh, bool auth_p struct nfs_fsinfo fsinfo; int ret = -ENOMEM; - dprintk("--> nfs4_get_rootfh()\n"); - fsinfo.fattr = nfs_alloc_fattr(); if (fsinfo.fattr == NULL) goto out; @@ -38,6 +36,5 @@ int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh, bool auth_p memcpy(&server->fsid, &fsinfo.fattr->fsid, sizeof(server->fsid)); out: nfs_free_fattr(fsinfo.fattr); - dprintk("<-- nfs4_get_rootfh() = %d\n", ret); return ret; } diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index d8b040bd9814..7d531da1bae3 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -340,7 +340,6 @@ static struct vfsmount *nfs_follow_referral(struct dentry *dentry, out: free_page((unsigned long) page); free_page((unsigned long) page2); - dprintk("%s: done\n", __func__); return mnt; } @@ -358,11 +357,9 @@ static struct vfsmount *nfs_do_refmount(struct rpc_clnt *client, struct dentry * int err; /* BUG_ON(IS_ROOT(dentry)); */ - dprintk("%s: enter\n", __func__); - page = alloc_page(GFP_KERNEL); if (page == NULL) - goto out; + return mnt; fs_locations = 
kmalloc(sizeof(struct nfs4_fs_locations), GFP_KERNEL); if (fs_locations == NULL) @@ -386,8 +383,6 @@ static struct vfsmount *nfs_do_refmount(struct rpc_clnt *client, struct dentry * out_free: __free_page(page); kfree(fs_locations); -out: - dprintk("%s: done\n", __func__); return mnt; } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 201ca3f2c4ba..98b0b662af09 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -698,7 +698,8 @@ static int nfs41_sequence_process(struct rpc_task *task, session = slot->table->session; if (slot->interrupted) { - slot->interrupted = 0; + if (res->sr_status != -NFS4ERR_DELAY) + slot->interrupted = 0; interrupted = true; } @@ -2300,8 +2301,10 @@ static int _nfs4_proc_open(struct nfs4_opendata *data) if (status != 0) return status; } - if (!(o_res->f_attr->valid & NFS_ATTR_FATTR)) + if (!(o_res->f_attr->valid & NFS_ATTR_FATTR)) { + nfs4_sequence_free_slot(&o_res->seq_res); nfs4_proc_getattr(server, &o_res->fh, o_res->f_attr, o_res->f_label); + } return 0; } @@ -2586,7 +2589,8 @@ static inline void nfs4_exclusive_attrset(struct nfs4_opendata *opendata, /* Except MODE, it seems harmless of setting twice. */ if (opendata->o_arg.createmode != NFS4_CREATE_EXCLUSIVE && - attrset[1] & FATTR4_WORD1_MODE) + (attrset[1] & FATTR4_WORD1_MODE || + attrset[2] & FATTR4_WORD2_MODE_UMASK)) sattr->ia_valid &= ~ATTR_MODE; if (attrset[2] & FATTR4_WORD2_SECURITY_LABEL) @@ -3265,6 +3269,7 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f .rpc_resp = &res, }; int status; + int i; bitmask[0] = FATTR4_WORD0_SUPPORTED_ATTRS | FATTR4_WORD0_FH_EXPIRE_TYPE | @@ -3330,8 +3335,13 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE; server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY; server->cache_consistency_bitmask[2] = 0; + + /* Avoid a regression due to buggy server */ + for (i = 0; i < ARRAY_SIZE(res.exclcreat_bitmask); i++) + res.exclcreat_bitmask[i] &= res.attr_bitmask[i]; memcpy(server->exclcreat_bitmask, res.exclcreat_bitmask, sizeof(server->exclcreat_bitmask)); + server->acl_bitmask = res.acl_bitmask; server->fh_expire_type = res.fh_expire_type; } @@ -4610,7 +4620,7 @@ static int nfs4_proc_pgio_rpc_prepare(struct rpc_task *task, return 0; if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context, hdr->args.lock_context, - hdr->rw_ops->rw_mode) == -EIO) + hdr->rw_mode) == -EIO) return -EIO; if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) return -EIO; @@ -4804,8 +4814,10 @@ static int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred, if (!atomic_inc_not_zero(&clp->cl_count)) return -EIO; data = kmalloc(sizeof(*data), GFP_NOFS); - if (data == NULL) + if (data == NULL) { + nfs_put_client(clp); return -ENOMEM; + } data->client = clp; data->timestamp = jiffies; return rpc_call_async(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT, @@ -5782,6 +5794,7 @@ struct nfs4_unlockdata { struct nfs_locku_res res; struct nfs4_lock_state *lsp; struct nfs_open_context *ctx; + struct nfs_lock_context *l_ctx; struct file_lock fl; struct nfs_server *server; unsigned long timestamp; @@ -5806,6 +5819,7 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl, atomic_inc(&lsp->ls_count); /* Ensure we don't close file until we're done freeing locks! 
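The unlock data now also pins the lock context, so a LOCKU issued on close can be held back until I/O done under the lock has drained (see the nfs_async_iocounter_wait() call in the prepare step below). A toy user-space model of that "drain the I/O counter before unlocking" idea, using an atomic counter and a condition variable in place of the client's io_count and RPC task wakeups:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

/* Toy model of a lock context: outstanding I/O count plus a waiter. */
struct lock_ctx {
	atomic_int io_count;
	pthread_mutex_t lock;
	pthread_cond_t drained;
};

static void io_start(struct lock_ctx *l) { atomic_fetch_add(&l->io_count, 1); }

static void io_end(struct lock_ctx *l)
{
	if (atomic_fetch_sub(&l->io_count, 1) == 1) {
		pthread_mutex_lock(&l->lock);
		pthread_cond_broadcast(&l->drained);
		pthread_mutex_unlock(&l->lock);
	}
}

/* The unlock path: do not issue the UNLOCK until in-flight I/O drains. */
static void unlock_after_io(struct lock_ctx *l)
{
	pthread_mutex_lock(&l->lock);
	while (atomic_load(&l->io_count) != 0)
		pthread_cond_wait(&l->drained, &l->lock);
	pthread_mutex_unlock(&l->lock);
	puts("all I/O under the lock finished; UNLOCK can be sent");
}

static void *worker(void *arg)
{
	struct lock_ctx *l = arg;

	sleep(1);	/* pretend to finish a write issued under the lock */
	io_end(l);
	return NULL;
}

int main(void)
{
	struct lock_ctx l;
	pthread_t t;

	atomic_init(&l.io_count, 0);
	pthread_mutex_init(&l.lock, NULL);
	pthread_cond_init(&l.drained, NULL);

	io_start(&l);
	pthread_create(&t, NULL, worker, &l);
	unlock_after_io(&l);
	pthread_join(&t, NULL);
	return 0;
}
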
*/ p->ctx = get_nfs_open_context(ctx); + p->l_ctx = nfs_get_lock_context(ctx); memcpy(&p->fl, fl, sizeof(p->fl)); p->server = NFS_SERVER(inode); return p; @@ -5816,6 +5830,7 @@ static void nfs4_locku_release_calldata(void *data) struct nfs4_unlockdata *calldata = data; nfs_free_seqid(calldata->arg.seqid); nfs4_put_lock_state(calldata->lsp); + nfs_put_lock_context(calldata->l_ctx); put_nfs_open_context(calldata->ctx); kfree(calldata); } @@ -5857,6 +5872,10 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data) { struct nfs4_unlockdata *calldata = data; + if (test_bit(NFS_CONTEXT_UNLOCK, &calldata->l_ctx->open_context->flags) && + nfs_async_iocounter_wait(task, calldata->l_ctx)) + return; + if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0) goto out_wait; nfs4_stateid_copy(&calldata->arg.stateid, &calldata->lsp->ls_stateid); @@ -5908,6 +5927,8 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl, * canceled lock is passed in, and it won't be an unlock. */ fl->fl_type = F_UNLCK; + if (fl->fl_flags & FL_CLOSE) + set_bit(NFS_CONTEXT_UNLOCK, &ctx->flags); data = nfs4_alloc_unlockdata(fl, ctx, lsp, seqid); if (data == NULL) { @@ -6352,7 +6373,7 @@ struct nfs4_lock_waiter { }; static int -nfs4_wake_lock_waiter(wait_queue_t *wait, unsigned int mode, int flags, void *key) +nfs4_wake_lock_waiter(wait_queue_entry_t *wait, unsigned int mode, int flags, void *key) { int ret; struct cb_notify_lock_args *cbnl = key; @@ -6395,7 +6416,7 @@ nfs4_retry_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) .inode = state->inode, .owner = &owner, .notified = false }; - wait_queue_t wait; + wait_queue_entry_t wait; /* Don't bother with waitqueue if we don't expect a callback */ if (!test_bit(NFS_STATE_MAY_NOTIFY_LOCK, &state->flags)) @@ -6445,9 +6466,6 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request) ctx = nfs_file_open_context(filp); state = ctx->state; - if (request->fl_start < 0 || request->fl_end < 0) - return -EINVAL; - if (IS_GETLK(cmd)) { if (state != NULL) return nfs4_proc_getlk(state, F_GETLK, request); @@ -6470,20 +6488,6 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request) !test_bit(NFS_STATE_POSIX_LOCKS, &state->flags)) return -ENOLCK; - /* - * Don't rely on the VFS having checked the file open mode, - * since it won't do this for flock() locks. 
- */ - switch (request->fl_type) { - case F_RDLCK: - if (!(filp->f_mode & FMODE_READ)) - return -EBADF; - break; - case F_WRLCK: - if (!(filp->f_mode & FMODE_WRITE)) - return -EBADF; - } - status = nfs4_set_lock_state(state, request); if (status != 0) return status; @@ -7155,8 +7159,6 @@ int nfs4_proc_bind_one_conn_to_session(struct rpc_clnt *clnt, }; struct rpc_task *task; - dprintk("--> %s\n", __func__); - nfs4_copy_sessionid(&args.sessionid, &clp->cl_session->sess_id); if (!(clp->cl_session->flags & SESSION4_BACK_CHAN)) args.dir = NFS4_CDFC4_FORE; @@ -7176,24 +7178,20 @@ int nfs4_proc_bind_one_conn_to_session(struct rpc_clnt *clnt, if (memcmp(res.sessionid.data, clp->cl_session->sess_id.data, NFS4_MAX_SESSIONID_LEN)) { dprintk("NFS: %s: Session ID mismatch\n", __func__); - status = -EIO; - goto out; + return -EIO; } if ((res.dir & args.dir) != res.dir || res.dir == 0) { dprintk("NFS: %s: Unexpected direction from server\n", __func__); - status = -EIO; - goto out; + return -EIO; } if (res.use_conn_in_rdma_mode != args.use_conn_in_rdma_mode) { dprintk("NFS: %s: Server returned RDMA mode = true\n", __func__); - status = -EIO; - goto out; + return -EIO; } } -out: - dprintk("<-- %s status= %d\n", __func__, status); + return status; } @@ -7459,15 +7457,16 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, }; struct nfs41_exchange_id_data *calldata; struct rpc_task *task; - int status = -EIO; + int status; if (!atomic_inc_not_zero(&clp->cl_count)) - goto out; + return -EIO; - status = -ENOMEM; calldata = kzalloc(sizeof(*calldata), GFP_NOFS); - if (!calldata) - goto out; + if (!calldata) { + nfs_put_client(clp); + return -ENOMEM; + } if (!xprt) nfs4_init_boot_verifier(clp, &verifier); @@ -7476,10 +7475,6 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, if (status) goto out_calldata; - dprintk("NFS call exchange_id auth=%s, '%s'\n", - clp->cl_rpcclient->cl_auth->au_ops->au_name, - clp->cl_owner_id); - calldata->res.server_owner = kzalloc(sizeof(struct nfs41_server_owner), GFP_NOFS); status = -ENOMEM; @@ -7545,13 +7540,6 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, rpc_put_task(task); out: - if (clp->cl_implid != NULL) - dprintk("NFS reply exchange_id: Server Implementation ID: " - "domain: %s, name: %s, date: %llu,%u\n", - clp->cl_implid->domain, clp->cl_implid->name, - clp->cl_implid->date.seconds, - clp->cl_implid->date.nseconds); - dprintk("NFS reply exchange_id: %d\n", status); return status; out_impl_id: @@ -7769,17 +7757,13 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo) nfs4_init_sequence(&args.la_seq_args, &res.lr_seq_res, 0); nfs4_set_sequence_privileged(&args.la_seq_args); - dprintk("--> %s\n", __func__); task = rpc_run_task(&task_setup); if (IS_ERR(task)) - status = PTR_ERR(task); - else { - status = task->tk_status; - rpc_put_task(task); - } - dprintk("<-- %s return %d\n", __func__, status); + return PTR_ERR(task); + status = task->tk_status; + rpc_put_task(task); return status; } @@ -8180,6 +8164,12 @@ static int nfs41_reclaim_complete_handle_errors(struct rpc_task *task, struct nf /* fall through */ case -NFS4ERR_RETRY_UNCACHED_REP: return -EAGAIN; + case -NFS4ERR_BADSESSION: + case -NFS4ERR_DEADSESSION: + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + nfs4_schedule_session_recovery(clp->cl_session, + task->tk_status); + break; default: nfs4_schedule_lease_recovery(clp); } @@ -8258,7 +8248,6 @@ static int nfs41_proc_reclaim_complete(struct 
nfs_client *clp, if (status == 0) status = task->tk_status; rpc_put_task(task); - return 0; out: dprintk("<-- %s status=%d\n", __func__, status); return status; @@ -8357,6 +8346,7 @@ nfs4_layoutget_handle_exception(struct rpc_task *task, */ pnfs_mark_layout_stateid_invalid(lo, &head); spin_unlock(&inode->i_lock); + nfs_commit_inode(inode, 0); pnfs_free_lseg_list(&head); status = -EAGAIN; goto out; @@ -8427,6 +8417,7 @@ static void nfs4_layoutget_release(void *calldata) size_t max_pages = max_response_pages(server); dprintk("--> %s\n", __func__); + nfs4_sequence_free_slot(&lgp->res.seq_res); nfs4_free_pages(lgp->args.layout.pages, max_pages); pnfs_put_layout_hdr(NFS_I(inode)->layout); put_nfs_open_context(lgp->args.ctx); @@ -8501,7 +8492,6 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout, gfp_t gfp_flags) /* if layoutp->len is 0, nfs4_layoutget_prepare called rpc_exit */ if (status == 0 && lgp->res.layoutp->len) lseg = pnfs_layout_process(lgp); - nfs4_sequence_free_slot(&lgp->res.seq_res); rpc_put_task(task); dprintk("<-- %s status=%d\n", __func__, status); if (status) diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 8156bad6b441..cbf82b0d4467 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1649,13 +1649,14 @@ static void nfs4_state_start_reclaim_reboot(struct nfs_client *clp) nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_reboot); } -static void nfs4_reclaim_complete(struct nfs_client *clp, +static int nfs4_reclaim_complete(struct nfs_client *clp, const struct nfs4_state_recovery_ops *ops, struct rpc_cred *cred) { /* Notify the server we're done reclaiming our state */ if (ops->reclaim_complete) - (void)ops->reclaim_complete(clp, cred); + return ops->reclaim_complete(clp, cred); + return 0; } static void nfs4_clear_reclaim_server(struct nfs_server *server) @@ -1702,13 +1703,16 @@ static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp) { const struct nfs4_state_recovery_ops *ops; struct rpc_cred *cred; + int err; if (!nfs4_state_clear_reclaim_reboot(clp)) return; ops = clp->cl_mvops->reboot_recovery_ops; cred = nfs4_get_clid_cred(clp); - nfs4_reclaim_complete(clp, ops, cred); + err = nfs4_reclaim_complete(clp, ops, cred); put_rpccred(cred); + if (err == -NFS4ERR_CONN_NOT_BOUND_TO_SESSION) + set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state); } static void nfs4_state_start_reclaim_nograce(struct nfs_client *clp) @@ -2130,6 +2134,8 @@ again: put_rpccred(cred); switch (status) { case 0: + case -EINTR: + case -ERESTARTSYS: break; case -ETIMEDOUT: if (clnt->cl_softrtry) diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 80ce289eea05..3aebfdc82b30 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1000,8 +1000,9 @@ static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *ve static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const struct nfs4_label *label, + const umode_t *umask, const struct nfs_server *server, - bool excl_check, const umode_t *umask) + const uint32_t attrmask[]) { char owner_name[IDMAP_NAMESZ]; char owner_group[IDMAP_NAMESZ]; @@ -1016,22 +1017,20 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, /* * We reserve enough space to write the entire attribute buffer at once. 
*/ - if (iap->ia_valid & ATTR_SIZE) { + if ((iap->ia_valid & ATTR_SIZE) && (attrmask[0] & FATTR4_WORD0_SIZE)) { bmval[0] |= FATTR4_WORD0_SIZE; len += 8; } - if (!(server->attr_bitmask[2] & FATTR4_WORD2_MODE_UMASK)) - umask = NULL; if (iap->ia_valid & ATTR_MODE) { - if (umask) { + if (umask && (attrmask[2] & FATTR4_WORD2_MODE_UMASK)) { bmval[2] |= FATTR4_WORD2_MODE_UMASK; len += 8; - } else { + } else if (attrmask[1] & FATTR4_WORD1_MODE) { bmval[1] |= FATTR4_WORD1_MODE; len += 4; } } - if (iap->ia_valid & ATTR_UID) { + if ((iap->ia_valid & ATTR_UID) && (attrmask[1] & FATTR4_WORD1_OWNER)) { owner_namelen = nfs_map_uid_to_name(server, iap->ia_uid, owner_name, IDMAP_NAMESZ); if (owner_namelen < 0) { dprintk("nfs: couldn't resolve uid %d to string\n", @@ -1044,7 +1043,8 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, bmval[1] |= FATTR4_WORD1_OWNER; len += 4 + (XDR_QUADLEN(owner_namelen) << 2); } - if (iap->ia_valid & ATTR_GID) { + if ((iap->ia_valid & ATTR_GID) && + (attrmask[1] & FATTR4_WORD1_OWNER_GROUP)) { owner_grouplen = nfs_map_gid_to_group(server, iap->ia_gid, owner_group, IDMAP_NAMESZ); if (owner_grouplen < 0) { dprintk("nfs: couldn't resolve gid %d to string\n", @@ -1056,32 +1056,26 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, bmval[1] |= FATTR4_WORD1_OWNER_GROUP; len += 4 + (XDR_QUADLEN(owner_grouplen) << 2); } - if (iap->ia_valid & ATTR_ATIME_SET) { - bmval[1] |= FATTR4_WORD1_TIME_ACCESS_SET; - len += 16; - } else if (iap->ia_valid & ATTR_ATIME) { - bmval[1] |= FATTR4_WORD1_TIME_ACCESS_SET; - len += 4; - } - if (iap->ia_valid & ATTR_MTIME_SET) { - bmval[1] |= FATTR4_WORD1_TIME_MODIFY_SET; - len += 16; - } else if (iap->ia_valid & ATTR_MTIME) { - bmval[1] |= FATTR4_WORD1_TIME_MODIFY_SET; - len += 4; + if (attrmask[1] & FATTR4_WORD1_TIME_ACCESS_SET) { + if (iap->ia_valid & ATTR_ATIME_SET) { + bmval[1] |= FATTR4_WORD1_TIME_ACCESS_SET; + len += 16; + } else if (iap->ia_valid & ATTR_ATIME) { + bmval[1] |= FATTR4_WORD1_TIME_ACCESS_SET; + len += 4; + } } - - if (excl_check) { - const u32 *excl_bmval = server->exclcreat_bitmask; - bmval[0] &= excl_bmval[0]; - bmval[1] &= excl_bmval[1]; - bmval[2] &= excl_bmval[2]; - - if (!(excl_bmval[2] & FATTR4_WORD2_SECURITY_LABEL)) - label = NULL; + if (attrmask[1] & FATTR4_WORD1_TIME_MODIFY_SET) { + if (iap->ia_valid & ATTR_MTIME_SET) { + bmval[1] |= FATTR4_WORD1_TIME_MODIFY_SET; + len += 16; + } else if (iap->ia_valid & ATTR_MTIME) { + bmval[1] |= FATTR4_WORD1_TIME_MODIFY_SET; + len += 4; + } } - if (label) { + if (label && (attrmask[2] & FATTR4_WORD2_SECURITY_LABEL)) { len += 4 + 4 + 4 + (XDR_QUADLEN(label->len) << 2); bmval[2] |= FATTR4_WORD2_SECURITY_LABEL; } @@ -1188,8 +1182,8 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg * } encode_string(xdr, create->name->len, create->name->name); - encode_attrs(xdr, create->attrs, create->label, create->server, false, - &create->umask); + encode_attrs(xdr, create->attrs, create->label, &create->umask, + create->server, create->server->attr_bitmask); } static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct compound_hdr *hdr) @@ -1409,13 +1403,13 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op switch(arg->createmode) { case NFS4_CREATE_UNCHECKED: *p = cpu_to_be32(NFS4_CREATE_UNCHECKED); - encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, false, - &arg->umask); + encode_attrs(xdr, arg->u.attrs, arg->label, &arg->umask, + arg->server, 
arg->server->attr_bitmask); break; case NFS4_CREATE_GUARDED: *p = cpu_to_be32(NFS4_CREATE_GUARDED); - encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, false, - &arg->umask); + encode_attrs(xdr, arg->u.attrs, arg->label, &arg->umask, + arg->server, arg->server->attr_bitmask); break; case NFS4_CREATE_EXCLUSIVE: *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE); @@ -1424,8 +1418,8 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op case NFS4_CREATE_EXCLUSIVE4_1: *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE4_1); encode_nfs4_verifier(xdr, &arg->u.verifier); - encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, true, - &arg->umask); + encode_attrs(xdr, arg->u.attrs, arg->label, &arg->umask, + arg->server, arg->server->exclcreat_bitmask); } } @@ -1681,7 +1675,8 @@ static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs { encode_op_hdr(xdr, OP_SETATTR, decode_setattr_maxsz, hdr); encode_nfs4_stateid(xdr, &arg->stateid); - encode_attrs(xdr, arg->iap, arg->label, server, false, NULL); + encode_attrs(xdr, arg->iap, arg->label, NULL, server, + server->attr_bitmask); } static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclientid *setclientid, struct compound_hdr *hdr) @@ -2005,16 +2000,10 @@ encode_layoutcommit(struct xdr_stream *xdr, *p++ = cpu_to_be32(0); /* Never send time_modify_changed */ *p++ = cpu_to_be32(NFS_SERVER(args->inode)->pnfs_curr_ld->id);/* type */ - if (NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit) { - NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit( - NFS_I(inode)->layout, xdr, args); - } else { - encode_uint32(xdr, args->layoutupdate_len); - if (args->layoutupdate_pages) { - xdr_write_pages(xdr, args->layoutupdate_pages, 0, - args->layoutupdate_len); - } - } + encode_uint32(xdr, args->layoutupdate_len); + if (args->layoutupdate_pages) + xdr_write_pages(xdr, args->layoutupdate_pages, 0, + args->layoutupdate_len); return 0; } @@ -2024,7 +2013,6 @@ encode_layoutreturn(struct xdr_stream *xdr, const struct nfs4_layoutreturn_args *args, struct compound_hdr *hdr) { - const struct pnfs_layoutdriver_type *lr_ops = NFS_SERVER(args->inode)->pnfs_curr_ld; __be32 *p; encode_op_hdr(xdr, OP_LAYOUTRETURN, decode_layoutreturn_maxsz, hdr); @@ -2041,8 +2029,6 @@ encode_layoutreturn(struct xdr_stream *xdr, spin_unlock(&args->inode->i_lock); if (args->ld_private->ops && args->ld_private->ops->encode) args->ld_private->ops->encode(xdr, args, args->ld_private); - else if (lr_ops->encode_layoutreturn) - lr_ops->encode_layoutreturn(xdr, args); else encode_uint32(xdr, 0); } @@ -5579,6 +5565,8 @@ static int decode_op_map(struct xdr_stream *xdr, struct nfs4_op_map *op_map) unsigned int i; p = xdr_inline_decode(xdr, 4); + if (!p) + return -EIO; bitmap_words = be32_to_cpup(p++); if (bitmap_words > NFS4_OP_MAP_NUM_WORDS) return -EIO; diff --git a/fs/nfs/objlayout/Kbuild b/fs/nfs/objlayout/Kbuild deleted file mode 100644 index ed30ea072bb8..000000000000 --- a/fs/nfs/objlayout/Kbuild +++ /dev/null @@ -1,5 +0,0 @@ -# -# Makefile for the pNFS Objects Layout Driver kernel module -# -objlayoutdriver-y := objio_osd.o pnfs_osd_xdr_cli.o objlayout.o -obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayoutdriver.o diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c deleted file mode 100644 index 049c1b1f2932..000000000000 --- a/fs/nfs/objlayout/objio_osd.c +++ /dev/null @@ -1,675 +0,0 @@ -/* - * pNFS Objects layout implementation over open-osd initiator library - * - * Copyright (C) 2009 Panasas Inc. 
[year of first publication] - * All rights reserved. - * - * Benny Halevy <bhalevy@panasas.com> - * Boaz Harrosh <ooo@electrozaur.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 - * See the file COPYING included with this distribution for more details. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of the Panasas company nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR - * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - -#include <linux/module.h> -#include <scsi/osd_ore.h> - -#include "objlayout.h" -#include "../internal.h" - -#define NFSDBG_FACILITY NFSDBG_PNFS_LD - -struct objio_dev_ent { - struct nfs4_deviceid_node id_node; - struct ore_dev od; -}; - -static void -objio_free_deviceid_node(struct nfs4_deviceid_node *d) -{ - struct objio_dev_ent *de = container_of(d, struct objio_dev_ent, id_node); - - dprintk("%s: free od=%p\n", __func__, de->od.od); - osduld_put_device(de->od.od); - kfree_rcu(d, rcu); -} - -struct objio_segment { - struct pnfs_layout_segment lseg; - - struct ore_layout layout; - struct ore_components oc; -}; - -static inline struct objio_segment * -OBJIO_LSEG(struct pnfs_layout_segment *lseg) -{ - return container_of(lseg, struct objio_segment, lseg); -} - -struct objio_state { - /* Generic layer */ - struct objlayout_io_res oir; - - bool sync; - /*FIXME: Support for extra_bytes at ore_get_rw_state() */ - struct ore_io_state *ios; -}; - -/* Send and wait for a get_device_info of devices in the layout, - then look them up with the osd_initiator library */ -struct nfs4_deviceid_node * -objio_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, - gfp_t gfp_flags) -{ - struct pnfs_osd_deviceaddr *deviceaddr; - struct objio_dev_ent *ode = NULL; - struct osd_dev *od; - struct osd_dev_info odi; - bool retry_flag = true; - __be32 *p; - int err; - - deviceaddr = kzalloc(sizeof(*deviceaddr), gfp_flags); - if (!deviceaddr) - return NULL; - - p = page_address(pdev->pages[0]); - pnfs_osd_xdr_decode_deviceaddr(deviceaddr, p); - - odi.systemid_len = deviceaddr->oda_systemid.len; - if (odi.systemid_len > sizeof(odi.systemid)) { - dprintk("%s: odi.systemid_len > sizeof(systemid=%zd)\n", - __func__, sizeof(odi.systemid)); - err = -EINVAL; - goto out; - } else if (odi.systemid_len) - memcpy(odi.systemid, deviceaddr->oda_systemid.data, - odi.systemid_len); - odi.osdname_len = deviceaddr->oda_osdname.len; - odi.osdname = (u8 *)deviceaddr->oda_osdname.data; - - if (!odi.osdname_len && !odi.systemid_len) { - dprintk("%s: !odi.osdname_len && !odi.systemid_len\n", - __func__); - err = -ENODEV; - goto out; - } - -retry_lookup: - od = osduld_info_lookup(&odi); - if (IS_ERR(od)) { - err = PTR_ERR(od); - dprintk("%s: osduld_info_lookup => %d\n", __func__, err); - if (err == -ENODEV && retry_flag) { - err = objlayout_autologin(deviceaddr); - if (likely(!err)) { - retry_flag = false; - goto retry_lookup; - } - } - goto out; - } - - dprintk("Adding new dev_id(%llx:%llx)\n", - _DEVID_LO(&pdev->dev_id), _DEVID_HI(&pdev->dev_id)); - - ode = kzalloc(sizeof(*ode), gfp_flags); - if (!ode) { - dprintk("%s: -ENOMEM od=%p\n", __func__, od); - goto out; - } - - nfs4_init_deviceid_node(&ode->id_node, server, &pdev->dev_id); - kfree(deviceaddr); - - ode->od.od = od; - return &ode->id_node; - -out: - kfree(deviceaddr); - return NULL; -} - -static void copy_single_comp(struct ore_components *oc, unsigned c, - struct pnfs_osd_object_cred *src_comp) -{ - struct ore_comp *ocomp = &oc->comps[c]; - - WARN_ON(src_comp->oc_cap_key.cred_len > 0); /* libosd is NO_SEC only */ - WARN_ON(src_comp->oc_cap.cred_len > sizeof(ocomp->cred)); - - ocomp->obj.partition = src_comp->oc_object_id.oid_partition_id; - ocomp->obj.id = src_comp->oc_object_id.oid_object_id; - - memcpy(ocomp->cred, src_comp->oc_cap.cred, sizeof(ocomp->cred)); -} - -static int __alloc_objio_seg(unsigned numdevs, gfp_t gfp_flags, - struct objio_segment **pseg) -{ -/* This is the in memory structure of the objio_segment - * - * struct 
__alloc_objio_segment { - * struct objio_segment olseg; - * struct ore_dev *ods[numdevs]; - * struct ore_comp comps[numdevs]; - * } *aolseg; - * NOTE: The code as above compiles and runs perfectly. It is elegant, - * type safe and compact. At some Past time Linus has decided he does not - * like variable length arrays, For the sake of this principal we uglify - * the code as below. - */ - struct objio_segment *lseg; - size_t lseg_size = sizeof(*lseg) + - numdevs * sizeof(lseg->oc.ods[0]) + - numdevs * sizeof(*lseg->oc.comps); - - lseg = kzalloc(lseg_size, gfp_flags); - if (unlikely(!lseg)) { - dprintk("%s: Failed allocation numdevs=%d size=%zd\n", __func__, - numdevs, lseg_size); - return -ENOMEM; - } - - lseg->oc.numdevs = numdevs; - lseg->oc.single_comp = EC_MULTPLE_COMPS; - lseg->oc.ods = (void *)(lseg + 1); - lseg->oc.comps = (void *)(lseg->oc.ods + numdevs); - - *pseg = lseg; - return 0; -} - -int objio_alloc_lseg(struct pnfs_layout_segment **outp, - struct pnfs_layout_hdr *pnfslay, - struct pnfs_layout_range *range, - struct xdr_stream *xdr, - gfp_t gfp_flags) -{ - struct nfs_server *server = NFS_SERVER(pnfslay->plh_inode); - struct objio_segment *objio_seg; - struct pnfs_osd_xdr_decode_layout_iter iter; - struct pnfs_osd_layout layout; - struct pnfs_osd_object_cred src_comp; - unsigned cur_comp; - int err; - - err = pnfs_osd_xdr_decode_layout_map(&layout, &iter, xdr); - if (unlikely(err)) - return err; - - err = __alloc_objio_seg(layout.olo_num_comps, gfp_flags, &objio_seg); - if (unlikely(err)) - return err; - - objio_seg->layout.stripe_unit = layout.olo_map.odm_stripe_unit; - objio_seg->layout.group_width = layout.olo_map.odm_group_width; - objio_seg->layout.group_depth = layout.olo_map.odm_group_depth; - objio_seg->layout.mirrors_p1 = layout.olo_map.odm_mirror_cnt + 1; - objio_seg->layout.raid_algorithm = layout.olo_map.odm_raid_algorithm; - - err = ore_verify_layout(layout.olo_map.odm_num_comps, - &objio_seg->layout); - if (unlikely(err)) - goto err; - - objio_seg->oc.first_dev = layout.olo_comps_index; - cur_comp = 0; - while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err)) { - struct nfs4_deviceid_node *d; - struct objio_dev_ent *ode; - - copy_single_comp(&objio_seg->oc, cur_comp, &src_comp); - - d = nfs4_find_get_deviceid(server, - &src_comp.oc_object_id.oid_device_id, - pnfslay->plh_lc_cred, gfp_flags); - if (!d) { - err = -ENXIO; - goto err; - } - - ode = container_of(d, struct objio_dev_ent, id_node); - objio_seg->oc.ods[cur_comp++] = &ode->od; - } - /* pnfs_osd_xdr_decode_layout_comp returns false on error */ - if (unlikely(err)) - goto err; - - *outp = &objio_seg->lseg; - return 0; - -err: - kfree(objio_seg); - dprintk("%s: Error: return %d\n", __func__, err); - *outp = NULL; - return err; -} - -void objio_free_lseg(struct pnfs_layout_segment *lseg) -{ - int i; - struct objio_segment *objio_seg = OBJIO_LSEG(lseg); - - for (i = 0; i < objio_seg->oc.numdevs; i++) { - struct ore_dev *od = objio_seg->oc.ods[i]; - struct objio_dev_ent *ode; - - if (!od) - break; - ode = container_of(od, typeof(*ode), od); - nfs4_put_deviceid_node(&ode->id_node); - } - kfree(objio_seg); -} - -static int -objio_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type, bool is_reading, - struct pnfs_layout_segment *lseg, struct page **pages, unsigned pgbase, - loff_t offset, size_t count, void *rpcdata, gfp_t gfp_flags, - struct objio_state **outp) -{ - struct objio_segment *objio_seg = OBJIO_LSEG(lseg); - struct ore_io_state *ios; - int ret; - struct __alloc_objio_state { - struct 
objio_state objios; - struct pnfs_osd_ioerr ioerrs[objio_seg->oc.numdevs]; - } *aos; - - aos = kzalloc(sizeof(*aos), gfp_flags); - if (unlikely(!aos)) - return -ENOMEM; - - objlayout_init_ioerrs(&aos->objios.oir, objio_seg->oc.numdevs, - aos->ioerrs, rpcdata, pnfs_layout_type); - - ret = ore_get_rw_state(&objio_seg->layout, &objio_seg->oc, is_reading, - offset, count, &ios); - if (unlikely(ret)) { - kfree(aos); - return ret; - } - - ios->pages = pages; - ios->pgbase = pgbase; - ios->private = aos; - BUG_ON(ios->nr_pages > (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT); - - aos->objios.sync = 0; - aos->objios.ios = ios; - *outp = &aos->objios; - return 0; -} - -void objio_free_result(struct objlayout_io_res *oir) -{ - struct objio_state *objios = container_of(oir, struct objio_state, oir); - - ore_put_io_state(objios->ios); - kfree(objios); -} - -static enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep) -{ - switch (oep) { - case OSD_ERR_PRI_NO_ERROR: - return (enum pnfs_osd_errno)0; - - case OSD_ERR_PRI_CLEAR_PAGES: - BUG_ON(1); - return 0; - - case OSD_ERR_PRI_RESOURCE: - return PNFS_OSD_ERR_RESOURCE; - case OSD_ERR_PRI_BAD_CRED: - return PNFS_OSD_ERR_BAD_CRED; - case OSD_ERR_PRI_NO_ACCESS: - return PNFS_OSD_ERR_NO_ACCESS; - case OSD_ERR_PRI_UNREACHABLE: - return PNFS_OSD_ERR_UNREACHABLE; - case OSD_ERR_PRI_NOT_FOUND: - return PNFS_OSD_ERR_NOT_FOUND; - case OSD_ERR_PRI_NO_SPACE: - return PNFS_OSD_ERR_NO_SPACE; - default: - WARN_ON(1); - /* fallthrough */ - case OSD_ERR_PRI_EIO: - return PNFS_OSD_ERR_EIO; - } -} - -static void __on_dev_error(struct ore_io_state *ios, - struct ore_dev *od, unsigned dev_index, enum osd_err_priority oep, - u64 dev_offset, u64 dev_len) -{ - struct objio_state *objios = ios->private; - struct pnfs_osd_objid pooid; - struct objio_dev_ent *ode = container_of(od, typeof(*ode), od); - /* FIXME: what to do with more-then-one-group layouts. 
We need to - * translate from ore_io_state index to oc->comps index - */ - unsigned comp = dev_index; - - pooid.oid_device_id = ode->id_node.deviceid; - pooid.oid_partition_id = ios->oc->comps[comp].obj.partition; - pooid.oid_object_id = ios->oc->comps[comp].obj.id; - - objlayout_io_set_result(&objios->oir, comp, - &pooid, osd_pri_2_pnfs_err(oep), - dev_offset, dev_len, !ios->reading); -} - -/* - * read - */ -static void _read_done(struct ore_io_state *ios, void *private) -{ - struct objio_state *objios = private; - ssize_t status; - int ret = ore_check_io(ios, &__on_dev_error); - - /* FIXME: _io_free(ios) can we dealocate the libosd resources; */ - - if (likely(!ret)) - status = ios->length; - else - status = ret; - - objlayout_read_done(&objios->oir, status, objios->sync); -} - -int objio_read_pagelist(struct nfs_pgio_header *hdr) -{ - struct objio_state *objios; - int ret; - - ret = objio_alloc_io_state(NFS_I(hdr->inode)->layout, true, - hdr->lseg, hdr->args.pages, hdr->args.pgbase, - hdr->args.offset, hdr->args.count, hdr, - GFP_KERNEL, &objios); - if (unlikely(ret)) - return ret; - - objios->ios->done = _read_done; - dprintk("%s: offset=0x%llx length=0x%x\n", __func__, - hdr->args.offset, hdr->args.count); - ret = ore_read(objios->ios); - if (unlikely(ret)) - objio_free_result(&objios->oir); - return ret; -} - -/* - * write - */ -static void _write_done(struct ore_io_state *ios, void *private) -{ - struct objio_state *objios = private; - ssize_t status; - int ret = ore_check_io(ios, &__on_dev_error); - - /* FIXME: _io_free(ios) can we dealocate the libosd resources; */ - - if (likely(!ret)) { - /* FIXME: should be based on the OSD's persistence model - * See OSD2r05 Section 4.13 Data persistence model */ - objios->oir.committed = NFS_FILE_SYNC; - status = ios->length; - } else { - status = ret; - } - - objlayout_write_done(&objios->oir, status, objios->sync); -} - -static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate) -{ - struct objio_state *objios = priv; - struct nfs_pgio_header *hdr = objios->oir.rpcdata; - struct address_space *mapping = hdr->inode->i_mapping; - pgoff_t index = offset / PAGE_SIZE; - struct page *page; - loff_t i_size = i_size_read(hdr->inode); - - if (offset >= i_size) { - *uptodate = true; - dprintk("%s: g_zero_page index=0x%lx\n", __func__, index); - return ZERO_PAGE(0); - } - - page = find_get_page(mapping, index); - if (!page) { - page = find_or_create_page(mapping, index, GFP_NOFS); - if (unlikely(!page)) { - dprintk("%s: grab_cache_page Failed index=0x%lx\n", - __func__, index); - return NULL; - } - unlock_page(page); - } - *uptodate = PageUptodate(page); - dprintk("%s: index=0x%lx uptodate=%d\n", __func__, index, *uptodate); - return page; -} - -static void __r4w_put_page(void *priv, struct page *page) -{ - dprintk("%s: index=0x%lx\n", __func__, - (page == ZERO_PAGE(0)) ? 
-1UL : page->index); - if (ZERO_PAGE(0) != page) - put_page(page); - return; -} - -static const struct _ore_r4w_op _r4w_op = { - .get_page = &__r4w_get_page, - .put_page = &__r4w_put_page, -}; - -int objio_write_pagelist(struct nfs_pgio_header *hdr, int how) -{ - struct objio_state *objios; - int ret; - - ret = objio_alloc_io_state(NFS_I(hdr->inode)->layout, false, - hdr->lseg, hdr->args.pages, hdr->args.pgbase, - hdr->args.offset, hdr->args.count, hdr, GFP_NOFS, - &objios); - if (unlikely(ret)) - return ret; - - objios->sync = 0 != (how & FLUSH_SYNC); - objios->ios->r4w = &_r4w_op; - - if (!objios->sync) - objios->ios->done = _write_done; - - dprintk("%s: offset=0x%llx length=0x%x\n", __func__, - hdr->args.offset, hdr->args.count); - ret = ore_write(objios->ios); - if (unlikely(ret)) { - objio_free_result(&objios->oir); - return ret; - } - - if (objios->sync) - _write_done(objios->ios, objios); - - return 0; -} - -/* - * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number - * of bytes (maximum @req->wb_bytes) that can be coalesced. - */ -static size_t objio_pg_test(struct nfs_pageio_descriptor *pgio, - struct nfs_page *prev, struct nfs_page *req) -{ - struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(pgio); - unsigned int size; - - size = pnfs_generic_pg_test(pgio, prev, req); - - if (!size || mirror->pg_count + req->wb_bytes > - (unsigned long)pgio->pg_layout_private) - return 0; - - return min(size, req->wb_bytes); -} - -static void objio_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) -{ - pnfs_generic_pg_init_read(pgio, req); - if (unlikely(pgio->pg_lseg == NULL)) - return; /* Not pNFS */ - - pgio->pg_layout_private = (void *) - OBJIO_LSEG(pgio->pg_lseg)->layout.max_io_length; -} - -static bool aligned_on_raid_stripe(u64 offset, struct ore_layout *layout, - unsigned long *stripe_end) -{ - u32 stripe_off; - unsigned stripe_size; - - if (layout->raid_algorithm == PNFS_OSD_RAID_0) - return true; - - stripe_size = layout->stripe_unit * - (layout->group_width - layout->parity); - - div_u64_rem(offset, stripe_size, &stripe_off); - if (!stripe_off) - return true; - - *stripe_end = stripe_size - stripe_off; - return false; -} - -static void objio_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) -{ - unsigned long stripe_end = 0; - u64 wb_size; - - if (pgio->pg_dreq == NULL) - wb_size = i_size_read(pgio->pg_inode) - req_offset(req); - else - wb_size = nfs_dreq_bytes_left(pgio->pg_dreq); - - pnfs_generic_pg_init_write(pgio, req, wb_size); - if (unlikely(pgio->pg_lseg == NULL)) - return; /* Not pNFS */ - - if (req->wb_offset || - !aligned_on_raid_stripe(req->wb_index * PAGE_SIZE, - &OBJIO_LSEG(pgio->pg_lseg)->layout, - &stripe_end)) { - pgio->pg_layout_private = (void *)stripe_end; - } else { - pgio->pg_layout_private = (void *) - OBJIO_LSEG(pgio->pg_lseg)->layout.max_io_length; - } -} - -static const struct nfs_pageio_ops objio_pg_read_ops = { - .pg_init = objio_init_read, - .pg_test = objio_pg_test, - .pg_doio = pnfs_generic_pg_readpages, - .pg_cleanup = pnfs_generic_pg_cleanup, -}; - -static const struct nfs_pageio_ops objio_pg_write_ops = { - .pg_init = objio_init_write, - .pg_test = objio_pg_test, - .pg_doio = pnfs_generic_pg_writepages, - .pg_cleanup = pnfs_generic_pg_cleanup, -}; - -static struct pnfs_layoutdriver_type objlayout_type = { - .id = LAYOUT_OSD2_OBJECTS, - .name = "LAYOUT_OSD2_OBJECTS", - .flags = PNFS_LAYOUTRET_ON_SETATTR | - PNFS_LAYOUTRET_ON_ERROR, - - .max_deviceinfo_size = PAGE_SIZE, - .owner = 
THIS_MODULE, - .alloc_layout_hdr = objlayout_alloc_layout_hdr, - .free_layout_hdr = objlayout_free_layout_hdr, - - .alloc_lseg = objlayout_alloc_lseg, - .free_lseg = objlayout_free_lseg, - - .read_pagelist = objlayout_read_pagelist, - .write_pagelist = objlayout_write_pagelist, - .pg_read_ops = &objio_pg_read_ops, - .pg_write_ops = &objio_pg_write_ops, - - .sync = pnfs_generic_sync, - - .free_deviceid_node = objio_free_deviceid_node, - - .encode_layoutcommit = objlayout_encode_layoutcommit, - .encode_layoutreturn = objlayout_encode_layoutreturn, -}; - -MODULE_DESCRIPTION("pNFS Layout Driver for OSD2 objects"); -MODULE_AUTHOR("Benny Halevy <bhalevy@panasas.com>"); -MODULE_LICENSE("GPL"); - -static int __init -objlayout_init(void) -{ - int ret = pnfs_register_layoutdriver(&objlayout_type); - - if (ret) - printk(KERN_INFO - "NFS: %s: Registering OSD pNFS Layout Driver failed: error=%d\n", - __func__, ret); - else - printk(KERN_INFO "NFS: %s: Registered OSD pNFS Layout Driver\n", - __func__); - return ret; -} - -static void __exit -objlayout_exit(void) -{ - pnfs_unregister_layoutdriver(&objlayout_type); - printk(KERN_INFO "NFS: %s: Unregistered OSD pNFS Layout Driver\n", - __func__); -} - -MODULE_ALIAS("nfs-layouttype4-2"); - -module_init(objlayout_init); -module_exit(objlayout_exit); diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c deleted file mode 100644 index 8f3d2acb81c3..000000000000 --- a/fs/nfs/objlayout/objlayout.c +++ /dev/null @@ -1,706 +0,0 @@ -/* - * pNFS Objects layout driver high level definitions - * - * Copyright (C) 2007 Panasas Inc. [year of first publication] - * All rights reserved. - * - * Benny Halevy <bhalevy@panasas.com> - * Boaz Harrosh <ooo@electrozaur.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 - * See the file COPYING included with this distribution for more details. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of the Panasas company nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR - * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - -#include <linux/kmod.h> -#include <linux/moduleparam.h> -#include <linux/ratelimit.h> -#include <scsi/osd_initiator.h> -#include "objlayout.h" - -#define NFSDBG_FACILITY NFSDBG_PNFS_LD -/* - * Create a objlayout layout structure for the given inode and return it. - */ -struct pnfs_layout_hdr * -objlayout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags) -{ - struct objlayout *objlay; - - objlay = kzalloc(sizeof(struct objlayout), gfp_flags); - if (!objlay) - return NULL; - spin_lock_init(&objlay->lock); - INIT_LIST_HEAD(&objlay->err_list); - dprintk("%s: Return %p\n", __func__, objlay); - return &objlay->pnfs_layout; -} - -/* - * Free an objlayout layout structure - */ -void -objlayout_free_layout_hdr(struct pnfs_layout_hdr *lo) -{ - struct objlayout *objlay = OBJLAYOUT(lo); - - dprintk("%s: objlay %p\n", __func__, objlay); - - WARN_ON(!list_empty(&objlay->err_list)); - kfree(objlay); -} - -/* - * Unmarshall layout and store it in pnfslay. - */ -struct pnfs_layout_segment * -objlayout_alloc_lseg(struct pnfs_layout_hdr *pnfslay, - struct nfs4_layoutget_res *lgr, - gfp_t gfp_flags) -{ - int status = -ENOMEM; - struct xdr_stream stream; - struct xdr_buf buf = { - .pages = lgr->layoutp->pages, - .page_len = lgr->layoutp->len, - .buflen = lgr->layoutp->len, - .len = lgr->layoutp->len, - }; - struct page *scratch; - struct pnfs_layout_segment *lseg; - - dprintk("%s: Begin pnfslay %p\n", __func__, pnfslay); - - scratch = alloc_page(gfp_flags); - if (!scratch) - goto err_nofree; - - xdr_init_decode(&stream, &buf, NULL); - xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); - - status = objio_alloc_lseg(&lseg, pnfslay, &lgr->range, &stream, gfp_flags); - if (unlikely(status)) { - dprintk("%s: objio_alloc_lseg Return err %d\n", __func__, - status); - goto err; - } - - __free_page(scratch); - - dprintk("%s: Return %p\n", __func__, lseg); - return lseg; - -err: - __free_page(scratch); -err_nofree: - dprintk("%s: Err Return=>%d\n", __func__, status); - return ERR_PTR(status); -} - -/* - * Free a layout segement - */ -void -objlayout_free_lseg(struct pnfs_layout_segment *lseg) -{ - dprintk("%s: freeing layout segment %p\n", __func__, lseg); - - if (unlikely(!lseg)) - return; - - objio_free_lseg(lseg); -} - -/* - * I/O Operations - */ -static inline u64 -end_offset(u64 start, u64 len) -{ - u64 end; - - end = start + len; - return end >= start ? end : NFS4_MAX_UINT64; -} - -static void _fix_verify_io_params(struct pnfs_layout_segment *lseg, - struct page ***p_pages, unsigned *p_pgbase, - u64 offset, unsigned long count) -{ - u64 lseg_end_offset; - - BUG_ON(offset < lseg->pls_range.offset); - lseg_end_offset = end_offset(lseg->pls_range.offset, - lseg->pls_range.length); - BUG_ON(offset >= lseg_end_offset); - WARN_ON(offset + count > lseg_end_offset); - - if (*p_pgbase > PAGE_SIZE) { - dprintk("%s: pgbase(0x%x) > PAGE_SIZE\n", __func__, *p_pgbase); - *p_pages += *p_pgbase >> PAGE_SHIFT; - *p_pgbase &= ~PAGE_MASK; - } -} - -/* - * I/O done common code - */ -static void -objlayout_iodone(struct objlayout_io_res *oir) -{ - if (likely(oir->status >= 0)) { - objio_free_result(oir); - } else { - struct objlayout *objlay = oir->objlay; - - spin_lock(&objlay->lock); - objlay->delta_space_valid = OBJ_DSU_INVALID; - list_add(&objlay->err_list, &oir->err_list); - spin_unlock(&objlay->lock); - } -} - -/* - * objlayout_io_set_result - Set an osd_error code on a specific osd comp. - * - * The @index component IO failed (error returned from target). 
Register - * the error for later reporting at layout-return. - */ -void -objlayout_io_set_result(struct objlayout_io_res *oir, unsigned index, - struct pnfs_osd_objid *pooid, int osd_error, - u64 offset, u64 length, bool is_write) -{ - struct pnfs_osd_ioerr *ioerr = &oir->ioerrs[index]; - - BUG_ON(index >= oir->num_comps); - if (osd_error) { - ioerr->oer_component = *pooid; - ioerr->oer_comp_offset = offset; - ioerr->oer_comp_length = length; - ioerr->oer_iswrite = is_write; - ioerr->oer_errno = osd_error; - - dprintk("%s: err[%d]: errno=%d is_write=%d dev(%llx:%llx) " - "par=0x%llx obj=0x%llx offset=0x%llx length=0x%llx\n", - __func__, index, ioerr->oer_errno, - ioerr->oer_iswrite, - _DEVID_LO(&ioerr->oer_component.oid_device_id), - _DEVID_HI(&ioerr->oer_component.oid_device_id), - ioerr->oer_component.oid_partition_id, - ioerr->oer_component.oid_object_id, - ioerr->oer_comp_offset, - ioerr->oer_comp_length); - } else { - /* User need not call if no error is reported */ - ioerr->oer_errno = 0; - } -} - -/* Function scheduled on rpc workqueue to call ->nfs_readlist_complete(). - * This is because the osd completion is called with ints-off from - * the block layer - */ -static void _rpc_read_complete(struct work_struct *work) -{ - struct rpc_task *task; - struct nfs_pgio_header *hdr; - - dprintk("%s enter\n", __func__); - task = container_of(work, struct rpc_task, u.tk_work); - hdr = container_of(task, struct nfs_pgio_header, task); - - pnfs_ld_read_done(hdr); -} - -void -objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync) -{ - struct nfs_pgio_header *hdr = oir->rpcdata; - - oir->status = hdr->task.tk_status = status; - if (status >= 0) - hdr->res.count = status; - else - hdr->pnfs_error = status; - objlayout_iodone(oir); - /* must not use oir after this point */ - - dprintk("%s: Return status=%zd eof=%d sync=%d\n", __func__, - status, hdr->res.eof, sync); - - if (sync) - pnfs_ld_read_done(hdr); - else { - INIT_WORK(&hdr->task.u.tk_work, _rpc_read_complete); - schedule_work(&hdr->task.u.tk_work); - } -} - -/* - * Perform sync or async reads. - */ -enum pnfs_try_status -objlayout_read_pagelist(struct nfs_pgio_header *hdr) -{ - struct inode *inode = hdr->inode; - loff_t offset = hdr->args.offset; - size_t count = hdr->args.count; - int err; - loff_t eof; - - eof = i_size_read(inode); - if (unlikely(offset + count > eof)) { - if (offset >= eof) { - err = 0; - hdr->res.count = 0; - hdr->res.eof = 1; - /*FIXME: do we need to call pnfs_ld_read_done() */ - goto out; - } - count = eof - offset; - } - - hdr->res.eof = (offset + count) >= eof; - _fix_verify_io_params(hdr->lseg, &hdr->args.pages, - &hdr->args.pgbase, - hdr->args.offset, hdr->args.count); - - dprintk("%s: inode(%lx) offset 0x%llx count 0x%zx eof=%d\n", - __func__, inode->i_ino, offset, count, hdr->res.eof); - - err = objio_read_pagelist(hdr); - out: - if (unlikely(err)) { - hdr->pnfs_error = err; - dprintk("%s: Returned Error %d\n", __func__, err); - return PNFS_NOT_ATTEMPTED; - } - return PNFS_ATTEMPTED; -} - -/* Function scheduled on rpc workqueue to call ->nfs_writelist_complete(). 
- * This is because the osd completion is called with ints-off from - * the block layer - */ -static void _rpc_write_complete(struct work_struct *work) -{ - struct rpc_task *task; - struct nfs_pgio_header *hdr; - - dprintk("%s enter\n", __func__); - task = container_of(work, struct rpc_task, u.tk_work); - hdr = container_of(task, struct nfs_pgio_header, task); - - pnfs_ld_write_done(hdr); -} - -void -objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync) -{ - struct nfs_pgio_header *hdr = oir->rpcdata; - - oir->status = hdr->task.tk_status = status; - if (status >= 0) { - hdr->res.count = status; - hdr->verf.committed = oir->committed; - } else { - hdr->pnfs_error = status; - } - objlayout_iodone(oir); - /* must not use oir after this point */ - - dprintk("%s: Return status %zd committed %d sync=%d\n", __func__, - status, hdr->verf.committed, sync); - - if (sync) - pnfs_ld_write_done(hdr); - else { - INIT_WORK(&hdr->task.u.tk_work, _rpc_write_complete); - schedule_work(&hdr->task.u.tk_work); - } -} - -/* - * Perform sync or async writes. - */ -enum pnfs_try_status -objlayout_write_pagelist(struct nfs_pgio_header *hdr, int how) -{ - int err; - - _fix_verify_io_params(hdr->lseg, &hdr->args.pages, - &hdr->args.pgbase, - hdr->args.offset, hdr->args.count); - - err = objio_write_pagelist(hdr, how); - if (unlikely(err)) { - hdr->pnfs_error = err; - dprintk("%s: Returned Error %d\n", __func__, err); - return PNFS_NOT_ATTEMPTED; - } - return PNFS_ATTEMPTED; -} - -void -objlayout_encode_layoutcommit(struct pnfs_layout_hdr *pnfslay, - struct xdr_stream *xdr, - const struct nfs4_layoutcommit_args *args) -{ - struct objlayout *objlay = OBJLAYOUT(pnfslay); - struct pnfs_osd_layoutupdate lou; - __be32 *start; - - dprintk("%s: Begin\n", __func__); - - spin_lock(&objlay->lock); - lou.dsu_valid = (objlay->delta_space_valid == OBJ_DSU_VALID); - lou.dsu_delta = objlay->delta_space_used; - objlay->delta_space_used = 0; - objlay->delta_space_valid = OBJ_DSU_INIT; - lou.olu_ioerr_flag = !list_empty(&objlay->err_list); - spin_unlock(&objlay->lock); - - start = xdr_reserve_space(xdr, 4); - - BUG_ON(pnfs_osd_xdr_encode_layoutupdate(xdr, &lou)); - - *start = cpu_to_be32((xdr->p - start - 1) * 4); - - dprintk("%s: Return delta_space_used %lld err %d\n", __func__, - lou.dsu_delta, lou.olu_ioerr_flag); -} - -static int -err_prio(u32 oer_errno) -{ - switch (oer_errno) { - case 0: - return 0; - - case PNFS_OSD_ERR_RESOURCE: - return OSD_ERR_PRI_RESOURCE; - case PNFS_OSD_ERR_BAD_CRED: - return OSD_ERR_PRI_BAD_CRED; - case PNFS_OSD_ERR_NO_ACCESS: - return OSD_ERR_PRI_NO_ACCESS; - case PNFS_OSD_ERR_UNREACHABLE: - return OSD_ERR_PRI_UNREACHABLE; - case PNFS_OSD_ERR_NOT_FOUND: - return OSD_ERR_PRI_NOT_FOUND; - case PNFS_OSD_ERR_NO_SPACE: - return OSD_ERR_PRI_NO_SPACE; - default: - WARN_ON(1); - /* fallthrough */ - case PNFS_OSD_ERR_EIO: - return OSD_ERR_PRI_EIO; - } -} - -static void -merge_ioerr(struct pnfs_osd_ioerr *dest_err, - const struct pnfs_osd_ioerr *src_err) -{ - u64 dest_end, src_end; - - if (!dest_err->oer_errno) { - *dest_err = *src_err; - /* accumulated device must be blank */ - memset(&dest_err->oer_component.oid_device_id, 0, - sizeof(dest_err->oer_component.oid_device_id)); - - return; - } - - if (dest_err->oer_component.oid_partition_id != - src_err->oer_component.oid_partition_id) - dest_err->oer_component.oid_partition_id = 0; - - if (dest_err->oer_component.oid_object_id != - src_err->oer_component.oid_object_id) - dest_err->oer_component.oid_object_id = 0; - - if 
(dest_err->oer_comp_offset > src_err->oer_comp_offset) - dest_err->oer_comp_offset = src_err->oer_comp_offset; - - dest_end = end_offset(dest_err->oer_comp_offset, - dest_err->oer_comp_length); - src_end = end_offset(src_err->oer_comp_offset, - src_err->oer_comp_length); - if (dest_end < src_end) - dest_end = src_end; - - dest_err->oer_comp_length = dest_end - dest_err->oer_comp_offset; - - if ((src_err->oer_iswrite == dest_err->oer_iswrite) && - (err_prio(src_err->oer_errno) > err_prio(dest_err->oer_errno))) { - dest_err->oer_errno = src_err->oer_errno; - } else if (src_err->oer_iswrite) { - dest_err->oer_iswrite = true; - dest_err->oer_errno = src_err->oer_errno; - } -} - -static void -encode_accumulated_error(struct objlayout *objlay, __be32 *p) -{ - struct objlayout_io_res *oir, *tmp; - struct pnfs_osd_ioerr accumulated_err = {.oer_errno = 0}; - - list_for_each_entry_safe(oir, tmp, &objlay->err_list, err_list) { - unsigned i; - - for (i = 0; i < oir->num_comps; i++) { - struct pnfs_osd_ioerr *ioerr = &oir->ioerrs[i]; - - if (!ioerr->oer_errno) - continue; - - printk(KERN_ERR "NFS: %s: err[%d]: errno=%d " - "is_write=%d dev(%llx:%llx) par=0x%llx " - "obj=0x%llx offset=0x%llx length=0x%llx\n", - __func__, i, ioerr->oer_errno, - ioerr->oer_iswrite, - _DEVID_LO(&ioerr->oer_component.oid_device_id), - _DEVID_HI(&ioerr->oer_component.oid_device_id), - ioerr->oer_component.oid_partition_id, - ioerr->oer_component.oid_object_id, - ioerr->oer_comp_offset, - ioerr->oer_comp_length); - - merge_ioerr(&accumulated_err, ioerr); - } - list_del(&oir->err_list); - objio_free_result(oir); - } - - pnfs_osd_xdr_encode_ioerr(p, &accumulated_err); -} - -void -objlayout_encode_layoutreturn(struct xdr_stream *xdr, - const struct nfs4_layoutreturn_args *args) -{ - struct pnfs_layout_hdr *pnfslay = args->layout; - struct objlayout *objlay = OBJLAYOUT(pnfslay); - struct objlayout_io_res *oir, *tmp; - __be32 *start; - - dprintk("%s: Begin\n", __func__); - start = xdr_reserve_space(xdr, 4); - BUG_ON(!start); - - spin_lock(&objlay->lock); - - list_for_each_entry_safe(oir, tmp, &objlay->err_list, err_list) { - __be32 *last_xdr = NULL, *p; - unsigned i; - int res = 0; - - for (i = 0; i < oir->num_comps; i++) { - struct pnfs_osd_ioerr *ioerr = &oir->ioerrs[i]; - - if (!ioerr->oer_errno) - continue; - - dprintk("%s: err[%d]: errno=%d is_write=%d " - "dev(%llx:%llx) par=0x%llx obj=0x%llx " - "offset=0x%llx length=0x%llx\n", - __func__, i, ioerr->oer_errno, - ioerr->oer_iswrite, - _DEVID_LO(&ioerr->oer_component.oid_device_id), - _DEVID_HI(&ioerr->oer_component.oid_device_id), - ioerr->oer_component.oid_partition_id, - ioerr->oer_component.oid_object_id, - ioerr->oer_comp_offset, - ioerr->oer_comp_length); - - p = pnfs_osd_xdr_ioerr_reserve_space(xdr); - if (unlikely(!p)) { - res = -E2BIG; - break; /* accumulated_error */ - } - - last_xdr = p; - pnfs_osd_xdr_encode_ioerr(p, &oir->ioerrs[i]); - } - - /* TODO: use xdr_write_pages */ - if (unlikely(res)) { - /* no space for even one error descriptor */ - BUG_ON(!last_xdr); - - /* we've encountered a situation with lots and lots of - * errors and no space to encode them all. Use the last - * available slot to report the union of all the - * remaining errors. 
- */ - encode_accumulated_error(objlay, last_xdr); - goto loop_done; - } - list_del(&oir->err_list); - objio_free_result(oir); - } -loop_done: - spin_unlock(&objlay->lock); - - *start = cpu_to_be32((xdr->p - start - 1) * 4); - dprintk("%s: Return\n", __func__); -} - -enum { - OBJLAYOUT_MAX_URI_LEN = 256, OBJLAYOUT_MAX_OSDNAME_LEN = 64, - OBJLAYOUT_MAX_SYSID_HEX_LEN = OSD_SYSTEMID_LEN * 2 + 1, - OSD_LOGIN_UPCALL_PATHLEN = 256 -}; - -static char osd_login_prog[OSD_LOGIN_UPCALL_PATHLEN] = "/sbin/osd_login"; - -module_param_string(osd_login_prog, osd_login_prog, sizeof(osd_login_prog), - 0600); -MODULE_PARM_DESC(osd_login_prog, "Path to the osd_login upcall program"); - -struct __auto_login { - char uri[OBJLAYOUT_MAX_URI_LEN]; - char osdname[OBJLAYOUT_MAX_OSDNAME_LEN]; - char systemid_hex[OBJLAYOUT_MAX_SYSID_HEX_LEN]; -}; - -static int __objlayout_upcall(struct __auto_login *login) -{ - static char *envp[] = { "HOME=/", - "TERM=linux", - "PATH=/sbin:/usr/sbin:/bin:/usr/bin", - NULL - }; - char *argv[8]; - int ret; - - if (unlikely(!osd_login_prog[0])) { - dprintk("%s: osd_login_prog is disabled\n", __func__); - return -EACCES; - } - - dprintk("%s uri: %s\n", __func__, login->uri); - dprintk("%s osdname %s\n", __func__, login->osdname); - dprintk("%s systemid_hex %s\n", __func__, login->systemid_hex); - - argv[0] = (char *)osd_login_prog; - argv[1] = "-u"; - argv[2] = login->uri; - argv[3] = "-o"; - argv[4] = login->osdname; - argv[5] = "-s"; - argv[6] = login->systemid_hex; - argv[7] = NULL; - - ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); - /* - * Disable the upcall mechanism if we're getting an ENOENT or - * EACCES error. The admin can re-enable it on the fly by using - * sysfs to set the objlayoutdriver.osd_login_prog module parameter once - * the problem has been fixed. 
- */ - if (ret == -ENOENT || ret == -EACCES) { - printk(KERN_ERR "PNFS-OBJ: %s was not found please set " - "objlayoutdriver.osd_login_prog kernel parameter!\n", - osd_login_prog); - osd_login_prog[0] = '\0'; - } - dprintk("%s %s return value: %d\n", __func__, osd_login_prog, ret); - - return ret; -} - -/* Assume dest is all zeros */ -static void __copy_nfsS_and_zero_terminate(struct nfs4_string s, - char *dest, int max_len, - const char *var_name) -{ - if (!s.len) - return; - - if (s.len >= max_len) { - pr_warn_ratelimited( - "objlayout_autologin: %s: s.len(%d) >= max_len(%d)", - var_name, s.len, max_len); - s.len = max_len - 1; /* space for null terminator */ - } - - memcpy(dest, s.data, s.len); -} - -/* Assume sysid is all zeros */ -static void _sysid_2_hex(struct nfs4_string s, - char sysid[OBJLAYOUT_MAX_SYSID_HEX_LEN]) -{ - int i; - char *cur; - - if (!s.len) - return; - - if (s.len != OSD_SYSTEMID_LEN) { - pr_warn_ratelimited( - "objlayout_autologin: systemid_len(%d) != OSD_SYSTEMID_LEN", - s.len); - if (s.len > OSD_SYSTEMID_LEN) - s.len = OSD_SYSTEMID_LEN; - } - - cur = sysid; - for (i = 0; i < s.len; i++) - cur = hex_byte_pack(cur, s.data[i]); -} - -int objlayout_autologin(struct pnfs_osd_deviceaddr *deviceaddr) -{ - int rc; - struct __auto_login login; - - if (!deviceaddr->oda_targetaddr.ota_netaddr.r_addr.len) - return -ENODEV; - - memset(&login, 0, sizeof(login)); - __copy_nfsS_and_zero_terminate( - deviceaddr->oda_targetaddr.ota_netaddr.r_addr, - login.uri, sizeof(login.uri), "URI"); - - __copy_nfsS_and_zero_terminate( - deviceaddr->oda_osdname, - login.osdname, sizeof(login.osdname), "OSDNAME"); - - _sysid_2_hex(deviceaddr->oda_systemid, login.systemid_hex); - - rc = __objlayout_upcall(&login); - if (rc > 0) /* script returns positive values */ - rc = -ENODEV; - - return rc; -} diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h deleted file mode 100644 index fc94a5872ed4..000000000000 --- a/fs/nfs/objlayout/objlayout.h +++ /dev/null @@ -1,183 +0,0 @@ -/* - * Data types and function declerations for interfacing with the - * pNFS standard object layout driver. - * - * Copyright (C) 2007 Panasas Inc. [year of first publication] - * All rights reserved. - * - * Benny Halevy <bhalevy@panasas.com> - * Boaz Harrosh <ooo@electrozaur.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 - * See the file COPYING included with this distribution for more details. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of the Panasas company nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR - * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef _OBJLAYOUT_H -#define _OBJLAYOUT_H - -#include <linux/nfs_fs.h> -#include <linux/pnfs_osd_xdr.h> -#include "../pnfs.h" - -/* - * per-inode layout - */ -struct objlayout { - struct pnfs_layout_hdr pnfs_layout; - - /* for layout_commit */ - enum osd_delta_space_valid_enum { - OBJ_DSU_INIT = 0, - OBJ_DSU_VALID, - OBJ_DSU_INVALID, - } delta_space_valid; - s64 delta_space_used; /* consumed by write ops */ - - /* for layout_return */ - spinlock_t lock; - struct list_head err_list; -}; - -static inline struct objlayout * -OBJLAYOUT(struct pnfs_layout_hdr *lo) -{ - return container_of(lo, struct objlayout, pnfs_layout); -} - -/* - * per-I/O operation state - * embedded in objects provider io_state data structure - */ -struct objlayout_io_res { - struct objlayout *objlay; - - void *rpcdata; - int status; /* res */ - int committed; /* res */ - - /* Error reporting (layout_return) */ - struct list_head err_list; - unsigned num_comps; - /* Pointer to array of error descriptors of size num_comps. - * It should contain as many entries as devices in the osd_layout - * that participate in the I/O. It is up to the io_engine to allocate - * needed space and set num_comps. - */ - struct pnfs_osd_ioerr *ioerrs; -}; - -static inline -void objlayout_init_ioerrs(struct objlayout_io_res *oir, unsigned num_comps, - struct pnfs_osd_ioerr *ioerrs, void *rpcdata, - struct pnfs_layout_hdr *pnfs_layout_type) -{ - oir->objlay = OBJLAYOUT(pnfs_layout_type); - oir->rpcdata = rpcdata; - INIT_LIST_HEAD(&oir->err_list); - oir->num_comps = num_comps; - oir->ioerrs = ioerrs; -} - -/* - * Raid engine I/O API - */ -extern int objio_alloc_lseg(struct pnfs_layout_segment **outp, - struct pnfs_layout_hdr *pnfslay, - struct pnfs_layout_range *range, - struct xdr_stream *xdr, - gfp_t gfp_flags); -extern void objio_free_lseg(struct pnfs_layout_segment *lseg); - -/* objio_free_result will free these @oir structs received from - * objlayout_{read,write}_done - */ -extern void objio_free_result(struct objlayout_io_res *oir); - -extern int objio_read_pagelist(struct nfs_pgio_header *rdata); -extern int objio_write_pagelist(struct nfs_pgio_header *wdata, int how); - -/* - * callback API - */ -extern void objlayout_io_set_result(struct objlayout_io_res *oir, - unsigned index, struct pnfs_osd_objid *pooid, - int osd_error, u64 offset, u64 length, bool is_write); - -static inline void -objlayout_add_delta_space_used(struct objlayout *objlay, s64 space_used) -{ - /* If one of the I/Os errored out and the delta_space_used was - * invalid we render the complete report as invalid. Protocol mandate - * the DSU be accurate or not reported. 
- */ - spin_lock(&objlay->lock); - if (objlay->delta_space_valid != OBJ_DSU_INVALID) { - objlay->delta_space_valid = OBJ_DSU_VALID; - objlay->delta_space_used += space_used; - } - spin_unlock(&objlay->lock); -} - -extern void objlayout_read_done(struct objlayout_io_res *oir, - ssize_t status, bool sync); -extern void objlayout_write_done(struct objlayout_io_res *oir, - ssize_t status, bool sync); - -/* - * exported generic objects function vectors - */ - -extern struct pnfs_layout_hdr *objlayout_alloc_layout_hdr(struct inode *, gfp_t gfp_flags); -extern void objlayout_free_layout_hdr(struct pnfs_layout_hdr *); - -extern struct pnfs_layout_segment *objlayout_alloc_lseg( - struct pnfs_layout_hdr *, - struct nfs4_layoutget_res *, - gfp_t gfp_flags); -extern void objlayout_free_lseg(struct pnfs_layout_segment *); - -extern enum pnfs_try_status objlayout_read_pagelist( - struct nfs_pgio_header *); - -extern enum pnfs_try_status objlayout_write_pagelist( - struct nfs_pgio_header *, - int how); - -extern void objlayout_encode_layoutcommit( - struct pnfs_layout_hdr *, - struct xdr_stream *, - const struct nfs4_layoutcommit_args *); - -extern void objlayout_encode_layoutreturn( - struct xdr_stream *, - const struct nfs4_layoutreturn_args *); - -extern int objlayout_autologin(struct pnfs_osd_deviceaddr *deviceaddr); - -#endif /* _OBJLAYOUT_H */ diff --git a/fs/nfs/objlayout/pnfs_osd_xdr_cli.c b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c deleted file mode 100644 index f093c7ec983b..000000000000 --- a/fs/nfs/objlayout/pnfs_osd_xdr_cli.c +++ /dev/null @@ -1,415 +0,0 @@ -/* - * Object-Based pNFS Layout XDR layer - * - * Copyright (C) 2007 Panasas Inc. [year of first publication] - * All rights reserved. - * - * Benny Halevy <bhalevy@panasas.com> - * Boaz Harrosh <ooo@electrozaur.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 - * See the file COPYING included with this distribution for more details. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of the Panasas company nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR - * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - -#include <linux/pnfs_osd_xdr.h> - -#define NFSDBG_FACILITY NFSDBG_PNFS_LD - -/* - * The following implementation is based on RFC5664 - */ - -/* - * struct pnfs_osd_objid { - * struct nfs4_deviceid oid_device_id; - * u64 oid_partition_id; - * u64 oid_object_id; - * }; // xdr size 32 bytes - */ -static __be32 * -_osd_xdr_decode_objid(__be32 *p, struct pnfs_osd_objid *objid) -{ - p = xdr_decode_opaque_fixed(p, objid->oid_device_id.data, - sizeof(objid->oid_device_id.data)); - - p = xdr_decode_hyper(p, &objid->oid_partition_id); - p = xdr_decode_hyper(p, &objid->oid_object_id); - return p; -} -/* - * struct pnfs_osd_opaque_cred { - * u32 cred_len; - * void *cred; - * }; // xdr size [variable] - * The return pointers are from the xdr buffer - */ -static int -_osd_xdr_decode_opaque_cred(struct pnfs_osd_opaque_cred *opaque_cred, - struct xdr_stream *xdr) -{ - __be32 *p = xdr_inline_decode(xdr, 1); - - if (!p) - return -EINVAL; - - opaque_cred->cred_len = be32_to_cpu(*p++); - - p = xdr_inline_decode(xdr, opaque_cred->cred_len); - if (!p) - return -EINVAL; - - opaque_cred->cred = p; - return 0; -} - -/* - * struct pnfs_osd_object_cred { - * struct pnfs_osd_objid oc_object_id; - * u32 oc_osd_version; - * u32 oc_cap_key_sec; - * struct pnfs_osd_opaque_cred oc_cap_key - * struct pnfs_osd_opaque_cred oc_cap; - * }; // xdr size 32 + 4 + 4 + [variable] + [variable] - */ -static int -_osd_xdr_decode_object_cred(struct pnfs_osd_object_cred *comp, - struct xdr_stream *xdr) -{ - __be32 *p = xdr_inline_decode(xdr, 32 + 4 + 4); - int ret; - - if (!p) - return -EIO; - - p = _osd_xdr_decode_objid(p, &comp->oc_object_id); - comp->oc_osd_version = be32_to_cpup(p++); - comp->oc_cap_key_sec = be32_to_cpup(p); - - ret = _osd_xdr_decode_opaque_cred(&comp->oc_cap_key, xdr); - if (unlikely(ret)) - return ret; - - ret = _osd_xdr_decode_opaque_cred(&comp->oc_cap, xdr); - return ret; -} - -/* - * struct pnfs_osd_data_map { - * u32 odm_num_comps; - * u64 odm_stripe_unit; - * u32 odm_group_width; - * u32 odm_group_depth; - * u32 odm_mirror_cnt; - * u32 odm_raid_algorithm; - * }; // xdr size 4 + 8 + 4 + 4 + 4 + 4 - */ -static inline int -_osd_data_map_xdr_sz(void) -{ - return 4 + 8 + 4 + 4 + 4 + 4; -} - -static __be32 * -_osd_xdr_decode_data_map(__be32 *p, struct pnfs_osd_data_map *data_map) -{ - data_map->odm_num_comps = be32_to_cpup(p++); - p = xdr_decode_hyper(p, &data_map->odm_stripe_unit); - data_map->odm_group_width = be32_to_cpup(p++); - data_map->odm_group_depth = be32_to_cpup(p++); - data_map->odm_mirror_cnt = be32_to_cpup(p++); - data_map->odm_raid_algorithm = be32_to_cpup(p++); - dprintk("%s: odm_num_comps=%u odm_stripe_unit=%llu odm_group_width=%u " - "odm_group_depth=%u odm_mirror_cnt=%u odm_raid_algorithm=%u\n", - __func__, - data_map->odm_num_comps, - (unsigned long long)data_map->odm_stripe_unit, - data_map->odm_group_width, - data_map->odm_group_depth, - data_map->odm_mirror_cnt, - data_map->odm_raid_algorithm); - return p; -} - -int pnfs_osd_xdr_decode_layout_map(struct pnfs_osd_layout *layout, - struct pnfs_osd_xdr_decode_layout_iter *iter, struct xdr_stream *xdr) -{ - __be32 *p; - - memset(iter, 0, sizeof(*iter)); - - p = xdr_inline_decode(xdr, _osd_data_map_xdr_sz() + 4 + 4); - if (unlikely(!p)) - return -EINVAL; - - p = _osd_xdr_decode_data_map(p, &layout->olo_map); - layout->olo_comps_index = be32_to_cpup(p++); - layout->olo_num_comps = be32_to_cpup(p++); - dprintk("%s: olo_comps_index=%d olo_num_comps=%d\n", __func__, - layout->olo_comps_index, layout->olo_num_comps); - - iter->total_comps = 
layout->olo_num_comps; - return 0; -} - -bool pnfs_osd_xdr_decode_layout_comp(struct pnfs_osd_object_cred *comp, - struct pnfs_osd_xdr_decode_layout_iter *iter, struct xdr_stream *xdr, - int *err) -{ - BUG_ON(iter->decoded_comps > iter->total_comps); - if (iter->decoded_comps == iter->total_comps) - return false; - - *err = _osd_xdr_decode_object_cred(comp, xdr); - if (unlikely(*err)) { - dprintk("%s: _osd_xdr_decode_object_cred=>%d decoded_comps=%d " - "total_comps=%d\n", __func__, *err, - iter->decoded_comps, iter->total_comps); - return false; /* stop the loop */ - } - dprintk("%s: dev(%llx:%llx) par=0x%llx obj=0x%llx " - "key_len=%u cap_len=%u\n", - __func__, - _DEVID_LO(&comp->oc_object_id.oid_device_id), - _DEVID_HI(&comp->oc_object_id.oid_device_id), - comp->oc_object_id.oid_partition_id, - comp->oc_object_id.oid_object_id, - comp->oc_cap_key.cred_len, comp->oc_cap.cred_len); - - iter->decoded_comps++; - return true; -} - -/* - * Get Device Information Decoding - * - * Note: since Device Information is currently done synchronously, all - * variable strings fields are left inside the rpc buffer and are only - * pointed to by the pnfs_osd_deviceaddr members. So the read buffer - * should not be freed while the returned information is in use. - */ -/* - *struct nfs4_string { - * unsigned int len; - * char *data; - *}; // size [variable] - * NOTE: Returned string points to inside the XDR buffer - */ -static __be32 * -__read_u8_opaque(__be32 *p, struct nfs4_string *str) -{ - str->len = be32_to_cpup(p++); - str->data = (char *)p; - - p += XDR_QUADLEN(str->len); - return p; -} - -/* - * struct pnfs_osd_targetid { - * u32 oti_type; - * struct nfs4_string oti_scsi_device_id; - * };// size 4 + [variable] - */ -static __be32 * -__read_targetid(__be32 *p, struct pnfs_osd_targetid* targetid) -{ - u32 oti_type; - - oti_type = be32_to_cpup(p++); - targetid->oti_type = oti_type; - - switch (oti_type) { - case OBJ_TARGET_SCSI_NAME: - case OBJ_TARGET_SCSI_DEVICE_ID: - p = __read_u8_opaque(p, &targetid->oti_scsi_device_id); - } - - return p; -} - -/* - * struct pnfs_osd_net_addr { - * struct nfs4_string r_netid; - * struct nfs4_string r_addr; - * }; - */ -static __be32 * -__read_net_addr(__be32 *p, struct pnfs_osd_net_addr* netaddr) -{ - p = __read_u8_opaque(p, &netaddr->r_netid); - p = __read_u8_opaque(p, &netaddr->r_addr); - - return p; -} - -/* - * struct pnfs_osd_targetaddr { - * u32 ota_available; - * struct pnfs_osd_net_addr ota_netaddr; - * }; - */ -static __be32 * -__read_targetaddr(__be32 *p, struct pnfs_osd_targetaddr *targetaddr) -{ - u32 ota_available; - - ota_available = be32_to_cpup(p++); - targetaddr->ota_available = ota_available; - - if (ota_available) - p = __read_net_addr(p, &targetaddr->ota_netaddr); - - - return p; -} - -/* - * struct pnfs_osd_deviceaddr { - * struct pnfs_osd_targetid oda_targetid; - * struct pnfs_osd_targetaddr oda_targetaddr; - * u8 oda_lun[8]; - * struct nfs4_string oda_systemid; - * struct pnfs_osd_object_cred oda_root_obj_cred; - * struct nfs4_string oda_osdname; - * }; - */ - -/* We need this version for the pnfs_osd_xdr_decode_deviceaddr which does - * not have an xdr_stream - */ -static __be32 * -__read_opaque_cred(__be32 *p, - struct pnfs_osd_opaque_cred *opaque_cred) -{ - opaque_cred->cred_len = be32_to_cpu(*p++); - opaque_cred->cred = p; - return p + XDR_QUADLEN(opaque_cred->cred_len); -} - -static __be32 * -__read_object_cred(__be32 *p, struct pnfs_osd_object_cred *comp) -{ - p = _osd_xdr_decode_objid(p, &comp->oc_object_id); - comp->oc_osd_version = 
be32_to_cpup(p++); - comp->oc_cap_key_sec = be32_to_cpup(p++); - - p = __read_opaque_cred(p, &comp->oc_cap_key); - p = __read_opaque_cred(p, &comp->oc_cap); - return p; -} - -void pnfs_osd_xdr_decode_deviceaddr( - struct pnfs_osd_deviceaddr *deviceaddr, __be32 *p) -{ - p = __read_targetid(p, &deviceaddr->oda_targetid); - - p = __read_targetaddr(p, &deviceaddr->oda_targetaddr); - - p = xdr_decode_opaque_fixed(p, deviceaddr->oda_lun, - sizeof(deviceaddr->oda_lun)); - - p = __read_u8_opaque(p, &deviceaddr->oda_systemid); - - p = __read_object_cred(p, &deviceaddr->oda_root_obj_cred); - - p = __read_u8_opaque(p, &deviceaddr->oda_osdname); - - /* libosd likes this terminated in dbg. It's last, so no problems */ - deviceaddr->oda_osdname.data[deviceaddr->oda_osdname.len] = 0; -} - -/* - * struct pnfs_osd_layoutupdate { - * u32 dsu_valid; - * s64 dsu_delta; - * u32 olu_ioerr_flag; - * }; xdr size 4 + 8 + 4 - */ -int -pnfs_osd_xdr_encode_layoutupdate(struct xdr_stream *xdr, - struct pnfs_osd_layoutupdate *lou) -{ - __be32 *p = xdr_reserve_space(xdr, 4 + 8 + 4); - - if (!p) - return -E2BIG; - - *p++ = cpu_to_be32(lou->dsu_valid); - if (lou->dsu_valid) - p = xdr_encode_hyper(p, lou->dsu_delta); - *p++ = cpu_to_be32(lou->olu_ioerr_flag); - return 0; -} - -/* - * struct pnfs_osd_objid { - * struct nfs4_deviceid oid_device_id; - * u64 oid_partition_id; - * u64 oid_object_id; - * }; // xdr size 32 bytes - */ -static inline __be32 * -pnfs_osd_xdr_encode_objid(__be32 *p, struct pnfs_osd_objid *object_id) -{ - p = xdr_encode_opaque_fixed(p, &object_id->oid_device_id.data, - sizeof(object_id->oid_device_id.data)); - p = xdr_encode_hyper(p, object_id->oid_partition_id); - p = xdr_encode_hyper(p, object_id->oid_object_id); - - return p; -} - -/* - * struct pnfs_osd_ioerr { - * struct pnfs_osd_objid oer_component; - * u64 oer_comp_offset; - * u64 oer_comp_length; - * u32 oer_iswrite; - * u32 oer_errno; - * }; // xdr size 32 + 24 bytes - */ -void pnfs_osd_xdr_encode_ioerr(__be32 *p, struct pnfs_osd_ioerr *ioerr) -{ - p = pnfs_osd_xdr_encode_objid(p, &ioerr->oer_component); - p = xdr_encode_hyper(p, ioerr->oer_comp_offset); - p = xdr_encode_hyper(p, ioerr->oer_comp_length); - *p++ = cpu_to_be32(ioerr->oer_iswrite); - *p = cpu_to_be32(ioerr->oer_errno); -} - -__be32 *pnfs_osd_xdr_ioerr_reserve_space(struct xdr_stream *xdr) -{ - __be32 *p; - - p = xdr_reserve_space(xdr, 32 + 24); - if (unlikely(!p)) - dprintk("%s: out of xdr space\n", __func__); - - return p; -} diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 6e629b856a00..ad92b401326c 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -29,19 +29,6 @@ static struct kmem_cache *nfs_page_cachep; static const struct rpc_call_ops nfs_pgio_common_ops; -static bool nfs_pgarray_set(struct nfs_page_array *p, unsigned int pagecount) -{ - p->npages = pagecount; - if (pagecount <= ARRAY_SIZE(p->page_array)) - p->pagevec = p->page_array; - else { - p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_KERNEL); - if (!p->pagevec) - p->npages = 0; - } - return p->pagevec != NULL; -} - struct nfs_pgio_mirror * nfs_pgio_current_mirror(struct nfs_pageio_descriptor *desc) { @@ -115,6 +102,35 @@ nfs_iocounter_wait(struct nfs_lock_context *l_ctx) TASK_KILLABLE); } +/** + * nfs_async_iocounter_wait - wait on a rpc_waitqueue for I/O + * to complete + * @task: the rpc_task that should wait + * @l_ctx: nfs_lock_context with io_counter to check + * + * Returns true if there is outstanding I/O to wait on and the + * task has been put to sleep. 
+ */ +bool +nfs_async_iocounter_wait(struct rpc_task *task, struct nfs_lock_context *l_ctx) +{ + struct inode *inode = d_inode(l_ctx->open_context->dentry); + bool ret = false; + + if (atomic_read(&l_ctx->io_count) > 0) { + rpc_sleep_on(&NFS_SERVER(inode)->uoc_rpcwaitq, task, NULL); + ret = true; + } + + if (atomic_read(&l_ctx->io_count) == 0) { + rpc_wake_up_queued_task(&NFS_SERVER(inode)->uoc_rpcwaitq, task); + ret = false; + } + + return ret; +} +EXPORT_SYMBOL_GPL(nfs_async_iocounter_wait); + /* * nfs_page_group_lock - lock the head of the page group * @req - request in group that is to be locked @@ -398,8 +414,11 @@ static void nfs_clear_request(struct nfs_page *req) req->wb_page = NULL; } if (l_ctx != NULL) { - if (atomic_dec_and_test(&l_ctx->io_count)) + if (atomic_dec_and_test(&l_ctx->io_count)) { wake_up_atomic_t(&l_ctx->io_count); + if (test_bit(NFS_CONTEXT_UNLOCK, &ctx->flags)) + rpc_wake_up(&NFS_SERVER(d_inode(ctx->dentry))->uoc_rpcwaitq); + } nfs_put_lock_context(l_ctx); req->wb_lock_context = NULL; } @@ -677,7 +696,8 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc, const struct nfs_pgio_completion_ops *compl_ops, const struct nfs_rw_ops *rw_ops, size_t bsize, - int io_flags) + int io_flags, + gfp_t gfp_flags) { struct nfs_pgio_mirror *new; int i; @@ -701,7 +721,7 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc, /* until we have a request, we don't have an lseg and no * idea how many mirrors there will be */ new = kcalloc(NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX, - sizeof(struct nfs_pgio_mirror), GFP_KERNEL); + sizeof(struct nfs_pgio_mirror), gfp_flags); desc->pg_mirrors_dynamic = new; desc->pg_mirrors = new; @@ -754,13 +774,24 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc, *last_page; struct list_head *head = &mirror->pg_list; struct nfs_commit_info cinfo; + struct nfs_page_array *pg_array = &hdr->page_array; unsigned int pagecount, pageused; + gfp_t gfp_flags = GFP_KERNEL; pagecount = nfs_page_array_len(mirror->pg_base, mirror->pg_count); - if (!nfs_pgarray_set(&hdr->page_array, pagecount)) { - nfs_pgio_error(hdr); - desc->pg_error = -ENOMEM; - return desc->pg_error; + + if (pagecount <= ARRAY_SIZE(pg_array->page_array)) + pg_array->pagevec = pg_array->page_array; + else { + if (hdr->rw_mode == FMODE_WRITE) + gfp_flags = GFP_NOIO; + pg_array->pagevec = kcalloc(pagecount, sizeof(struct page *), gfp_flags); + if (!pg_array->pagevec) { + pg_array->npages = 0; + nfs_pgio_error(hdr); + desc->pg_error = -ENOMEM; + return desc->pg_error; + } } nfs_init_cinfo(&cinfo, desc->pg_inode, desc->pg_dreq); @@ -1256,8 +1287,10 @@ void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *desc, pgoff_t index) mirror = &desc->pg_mirrors[midx]; if (!list_empty(&mirror->pg_list)) { prev = nfs_list_entry(mirror->pg_list.prev); - if (index != prev->wb_index + 1) - nfs_pageio_complete_mirror(desc, midx); + if (index != prev->wb_index + 1) { + nfs_pageio_complete(desc); + break; + } } } } diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index dd042498ce7c..c383d0913b54 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -322,9 +322,15 @@ pnfs_set_plh_return_info(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode, static void pnfs_clear_layoutreturn_info(struct pnfs_layout_hdr *lo) { + struct pnfs_layout_segment *lseg; lo->plh_return_iomode = 0; lo->plh_return_seq = 0; clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags); + list_for_each_entry(lseg, &lo->plh_segs, pls_list) { + if (!test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags)) + continue; + pnfs_set_plh_return_info(lo, 
lseg->pls_range.iomode, 0); + } } static void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo) @@ -367,9 +373,9 @@ pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg, *next; set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); - pnfs_clear_layoutreturn_info(lo); list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) pnfs_clear_lseg_state(lseg, lseg_list); + pnfs_clear_layoutreturn_info(lo); pnfs_free_returned_lsegs(lo, lseg_list, &range, 0); if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags) && !test_and_set_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags)) @@ -563,7 +569,6 @@ pnfs_put_lseg_locked(struct pnfs_layout_segment *lseg) } } } -EXPORT_SYMBOL_GPL(pnfs_put_lseg_locked); /* * is l2 fully contained in l1? @@ -728,6 +733,7 @@ pnfs_destroy_layout(struct nfs_inode *nfsi) pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RW_FAILED); spin_unlock(&nfsi->vfs_inode.i_lock); pnfs_free_lseg_list(&tmp_list); + nfs_commit_inode(&nfsi->vfs_inode, 0); pnfs_put_layout_hdr(lo); } else spin_unlock(&nfsi->vfs_inode.i_lock); @@ -1209,7 +1215,6 @@ out: dprintk("<-- %s status: %d\n", __func__, status); return status; } -EXPORT_SYMBOL_GPL(_pnfs_return_layout); int pnfs_commit_and_return_layout(struct inode *inode) @@ -1991,6 +1996,8 @@ out_forget: spin_unlock(&ino->i_lock); lseg->pls_layout = lo; NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg); + if (!pnfs_layout_is_valid(lo)) + nfs_commit_inode(ino, 0); return ERR_PTR(-EAGAIN); } @@ -2051,9 +2058,11 @@ void pnfs_error_mark_layout_for_return(struct inode *inode, bool return_now = false; spin_lock(&inode->i_lock); + if (!pnfs_layout_is_valid(lo)) { + spin_unlock(&inode->i_lock); + return; + } pnfs_set_plh_return_info(lo, range.iomode, 0); - /* Block LAYOUTGET */ - set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags); /* * mark all matching lsegs so that we are sure to have no live * segments at hand when sending layoutreturn. See pnfs_put_lseg() @@ -2075,10 +2084,36 @@ void pnfs_error_mark_layout_for_return(struct inode *inode, EXPORT_SYMBOL_GPL(pnfs_error_mark_layout_for_return); void +pnfs_generic_pg_check_layout(struct nfs_pageio_descriptor *pgio) +{ + if (pgio->pg_lseg == NULL || + test_bit(NFS_LSEG_VALID, &pgio->pg_lseg->pls_flags)) + return; + pnfs_put_lseg(pgio->pg_lseg); + pgio->pg_lseg = NULL; +} +EXPORT_SYMBOL_GPL(pnfs_generic_pg_check_layout); + +/* + * Check for any intersection between the request and the pgio->pg_lseg, + * and if none, put this pgio->pg_lseg away. 
+ */ +static void +pnfs_generic_pg_check_range(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) +{ + if (pgio->pg_lseg && !pnfs_lseg_request_intersecting(pgio->pg_lseg, req)) { + pnfs_put_lseg(pgio->pg_lseg); + pgio->pg_lseg = NULL; + } +} + +void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) { u64 rd_size = req->wb_bytes; + pnfs_generic_pg_check_layout(pgio); + pnfs_generic_pg_check_range(pgio, req); if (pgio->pg_lseg == NULL) { if (pgio->pg_dreq == NULL) rd_size = i_size_read(pgio->pg_inode) - req_offset(req); @@ -2109,6 +2144,8 @@ void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req, u64 wb_size) { + pnfs_generic_pg_check_layout(pgio); + pnfs_generic_pg_check_range(pgio, req); if (pgio->pg_lseg == NULL) { pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, req->wb_context, @@ -2169,16 +2206,10 @@ pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, seg_end = pnfs_end_offset(pgio->pg_lseg->pls_range.offset, pgio->pg_lseg->pls_range.length); req_start = req_offset(req); - WARN_ON_ONCE(req_start >= seg_end); + /* start of request is past the last byte of this segment */ - if (req_start >= seg_end) { - /* reference the new lseg */ - if (pgio->pg_ops->pg_cleanup) - pgio->pg_ops->pg_cleanup(pgio); - if (pgio->pg_ops->pg_init) - pgio->pg_ops->pg_init(pgio, req); + if (req_start >= seg_end) return 0; - } /* adjust 'size' iff there are fewer bytes left in the * segment than what nfs_generic_pg_test returned */ @@ -2277,8 +2308,20 @@ pnfs_do_write(struct nfs_pageio_descriptor *desc, enum pnfs_try_status trypnfs; trypnfs = pnfs_try_to_write_data(hdr, call_ops, lseg, how); - if (trypnfs == PNFS_NOT_ATTEMPTED) + switch (trypnfs) { + case PNFS_NOT_ATTEMPTED: pnfs_write_through_mds(desc, hdr); + case PNFS_ATTEMPTED: + break; + case PNFS_TRY_AGAIN: + /* cleanup hdr and prepare to redo pnfs */ + if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { + struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc); + list_splice_init(&hdr->pages, &mirror->pg_list); + mirror->pg_recoalesce = 1; + } + hdr->mds_ops->rpc_release(hdr); + } } static void pnfs_writehdr_free(struct nfs_pgio_header *hdr) @@ -2408,10 +2451,20 @@ pnfs_do_read(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr) enum pnfs_try_status trypnfs; trypnfs = pnfs_try_to_read_data(hdr, call_ops, lseg); - if (trypnfs == PNFS_TRY_AGAIN) - pnfs_read_resend_pnfs(hdr); - if (trypnfs == PNFS_NOT_ATTEMPTED || hdr->task.tk_status) + switch (trypnfs) { + case PNFS_NOT_ATTEMPTED: pnfs_read_through_mds(desc, hdr); + case PNFS_ATTEMPTED: + break; + case PNFS_TRY_AGAIN: + /* cleanup hdr and prepare to redo pnfs */ + if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { + struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc); + list_splice_init(&hdr->pages, &mirror->pg_list); + mirror->pg_recoalesce = 1; + } + hdr->mds_ops->rpc_release(hdr); + } } static void pnfs_readhdr_free(struct nfs_pgio_header *hdr) diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 590e1e35781f..99731e3e332f 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -173,14 +173,9 @@ struct pnfs_layoutdriver_type { gfp_t gfp_flags); int (*prepare_layoutreturn) (struct nfs4_layoutreturn_args *); - void (*encode_layoutreturn) (struct xdr_stream *xdr, - const struct nfs4_layoutreturn_args *args); void (*cleanup_layoutcommit) (struct nfs4_layoutcommit_data *data); int (*prepare_layoutcommit) (struct nfs4_layoutcommit_args *args); - void (*encode_layoutcommit) (struct pnfs_layout_hdr *lo, - 
struct xdr_stream *xdr, - const struct nfs4_layoutcommit_args *args); int (*prepare_layoutstats) (struct nfs42_layoutstat_args *args); }; @@ -239,6 +234,7 @@ void pnfs_put_lseg_locked(struct pnfs_layout_segment *lseg); void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, struct nfs_fsinfo *); void unset_pnfs_layoutdriver(struct nfs_server *); +void pnfs_generic_pg_check_layout(struct nfs_pageio_descriptor *pgio); void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *, struct nfs_page *); int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc); void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, @@ -597,6 +593,16 @@ pnfs_lseg_range_intersecting(const struct pnfs_layout_range *l1, return pnfs_is_range_intersecting(l1->offset, end1, l2->offset, end2); } +static inline bool +pnfs_lseg_request_intersecting(struct pnfs_layout_segment *lseg, struct nfs_page *req) +{ + u64 seg_last = pnfs_end_offset(lseg->pls_range.offset, lseg->pls_range.length); + u64 req_last = req_offset(req) + req->wb_bytes; + + return pnfs_is_range_intersecting(lseg->pls_range.offset, seg_last, + req_offset(req), req_last); +} + extern unsigned int layoutstats_timer; #ifdef NFS_DEBUG diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 7250b95549ec..d40755a0984b 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -217,7 +217,14 @@ pnfs_generic_alloc_ds_commits(struct nfs_commit_info *cinfo, for (i = 0; i < fl_cinfo->nbuckets; i++, bucket++) { if (list_empty(&bucket->committing)) continue; - data = nfs_commitdata_alloc(); + /* + * If the layout segment is invalid, then let + * pnfs_generic_retry_commit() clean up the bucket. + */ + if (bucket->clseg && !pnfs_is_valid_lseg(bucket->clseg) && + !test_bit(NFS_LSEG_LAYOUTRETURN, &bucket->clseg->pls_flags)) + break; + data = nfs_commitdata_alloc(false); if (!data) break; data->ds_commit_index = i; @@ -283,16 +290,10 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages, unsigned int nreq = 0; if (!list_empty(mds_pages)) { - data = nfs_commitdata_alloc(); - if (data != NULL) { - data->ds_commit_index = -1; - list_add(&data->pages, &list); - nreq++; - } else { - nfs_retry_commit(mds_pages, NULL, cinfo, 0); - pnfs_generic_retry_commit(cinfo, 0); - return -ENOMEM; - } + data = nfs_commitdata_alloc(true); + data->ds_commit_index = -1; + list_add(&data->pages, &list); + nreq++; } nreq += pnfs_generic_alloc_ds_commits(cinfo, &list); @@ -619,7 +620,6 @@ void nfs4_pnfs_v3_ds_connect_unload(void) get_v3_ds_connect = NULL; } } -EXPORT_SYMBOL_GPL(nfs4_pnfs_v3_ds_connect_unload); static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds, diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index b7bca8303989..9872cf676a50 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -638,7 +638,7 @@ nfs_proc_lock(struct file *filp, int cmd, struct file_lock *fl) { struct inode *inode = file_inode(filp); - return nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl); + return nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl, NULL); } /* Helper functions for NFS lock bounds checking */ diff --git a/fs/nfs/read.c b/fs/nfs/read.c index defc9233e985..a8421d9dab6a 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -35,7 +35,11 @@ static struct kmem_cache *nfs_rdata_cachep; static struct nfs_pgio_header *nfs_readhdr_alloc(void) { - return kmem_cache_zalloc(nfs_rdata_cachep, GFP_KERNEL); + struct nfs_pgio_header *p = kmem_cache_zalloc(nfs_rdata_cachep, GFP_KERNEL); + + if (p) + p->rw_mode = FMODE_READ; + return 
p; } static void nfs_readhdr_free(struct nfs_pgio_header *rhdr) @@ -64,7 +68,7 @@ void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, pg_ops = server->pnfs_curr_ld->pg_read_ops; #endif nfs_pageio_init(pgio, inode, pg_ops, compl_ops, &nfs_rw_read_ops, - server->rsize, 0); + server->rsize, 0, GFP_KERNEL); } EXPORT_SYMBOL_GPL(nfs_pageio_init_read); @@ -451,7 +455,6 @@ void nfs_destroy_readpagecache(void) } static const struct nfs_rw_ops nfs_rw_read_ops = { - .rw_mode = FMODE_READ, .rw_alloc_header = nfs_readhdr_alloc, .rw_free_header = nfs_readhdr_free, .rw_done = nfs_readpage_done, diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 2f3822a4a7d5..c5334c0e23a1 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -2301,7 +2301,7 @@ EXPORT_SYMBOL_GPL(nfs_remount); /* * Initialise the common bits of the superblock */ -inline void nfs_initialise_sb(struct super_block *sb) +static void nfs_initialise_sb(struct super_block *sb) { struct nfs_server *server = NFS_SB(sb); @@ -2348,7 +2348,8 @@ EXPORT_SYMBOL_GPL(nfs_fill_super); /* * Finish setting up a cloned NFS2/3/4 superblock */ -void nfs_clone_super(struct super_block *sb, struct nfs_mount_info *mount_info) +static void nfs_clone_super(struct super_block *sb, + struct nfs_mount_info *mount_info) { const struct super_block *old_sb = mount_info->cloned->sb; struct nfs_server *server = NFS_SB(sb); @@ -2544,10 +2545,25 @@ EXPORT_SYMBOL_GPL(nfs_set_sb_security); int nfs_clone_sb_security(struct super_block *s, struct dentry *mntroot, struct nfs_mount_info *mount_info) { + int error; + unsigned long kflags = 0, kflags_out = 0; + /* clone any lsm security options from the parent to the new sb */ if (d_inode(mntroot)->i_op != NFS_SB(s)->nfs_client->rpc_ops->dir_inode_ops) return -ESTALE; - return security_sb_clone_mnt_opts(mount_info->cloned->sb, s); + + if (NFS_SB(s)->caps & NFS_CAP_SECURITY_LABEL) + kflags |= SECURITY_LSM_NATIVE_LABELS; + + error = security_sb_clone_mnt_opts(mount_info->cloned->sb, s, kflags, + &kflags_out); + if (error) + return error; + + if (NFS_SB(s)->caps & NFS_CAP_SECURITY_LABEL && + !(kflags_out & SECURITY_LSM_NATIVE_LABELS)) + NFS_SB(s)->caps &= ~NFS_CAP_SECURITY_LABEL; + return 0; } EXPORT_SYMBOL_GPL(nfs_clone_sb_security); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index cc341fc7fd44..db7ba542559e 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -60,14 +60,28 @@ static mempool_t *nfs_wdata_mempool; static struct kmem_cache *nfs_cdata_cachep; static mempool_t *nfs_commit_mempool; -struct nfs_commit_data *nfs_commitdata_alloc(void) +struct nfs_commit_data *nfs_commitdata_alloc(bool never_fail) { - struct nfs_commit_data *p = mempool_alloc(nfs_commit_mempool, GFP_NOIO); + struct nfs_commit_data *p; - if (p) { - memset(p, 0, sizeof(*p)); - INIT_LIST_HEAD(&p->pages); + if (never_fail) + p = mempool_alloc(nfs_commit_mempool, GFP_NOIO); + else { + /* It is OK to do some reclaim, not no safe to wait + * for anything to be returned to the pool. + * mempool_alloc() cannot handle that particular combination, + * so we need two separate attempts. 
+ */ + p = mempool_alloc(nfs_commit_mempool, GFP_NOWAIT); + if (!p) + p = kmem_cache_alloc(nfs_cdata_cachep, GFP_NOIO | + __GFP_NOWARN | __GFP_NORETRY); + if (!p) + return NULL; } + + memset(p, 0, sizeof(*p)); + INIT_LIST_HEAD(&p->pages); return p; } EXPORT_SYMBOL_GPL(nfs_commitdata_alloc); @@ -82,8 +96,10 @@ static struct nfs_pgio_header *nfs_writehdr_alloc(void) { struct nfs_pgio_header *p = mempool_alloc(nfs_wdata_mempool, GFP_NOIO); - if (p) + if (p) { memset(p, 0, sizeof(*p)); + p->rw_mode = FMODE_WRITE; + } return p; } @@ -547,9 +563,21 @@ static void nfs_write_error_remove_page(struct nfs_page *req) { nfs_unlock_request(req); nfs_end_page_writeback(req); - nfs_release_request(req); generic_error_remove_page(page_file_mapping(req->wb_page), req->wb_page); + nfs_release_request(req); +} + +static bool +nfs_error_is_fatal_on_server(int err) +{ + switch (err) { + case 0: + case -ERESTARTSYS: + case -EINTR: + return false; + } + return nfs_error_is_fatal(err); } /* @@ -557,8 +585,7 @@ static void nfs_write_error_remove_page(struct nfs_page *req) * May return an error if the user signalled nfs_wait_on_request(). */ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, - struct page *page, bool nonblock, - bool launder) + struct page *page, bool nonblock) { struct nfs_page *req; int ret = 0; @@ -574,19 +601,19 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, WARN_ON_ONCE(test_bit(PG_CLEAN, &req->wb_flags)); ret = 0; + /* If there is a fatal error that covers this write, just exit */ + if (nfs_error_is_fatal_on_server(req->wb_context->error)) + goto out_launder; + if (!nfs_pageio_add_request(pgio, req)) { ret = pgio->pg_error; /* - * Remove the problematic req upon fatal errors - * in launder case, while other dirty pages can - * still be around until they get flushed. + * Remove the problematic req upon fatal errors on the server */ if (nfs_error_is_fatal(ret)) { nfs_context_set_write_error(req->wb_context, ret); - if (launder) { - nfs_write_error_remove_page(req); - goto out; - } + if (nfs_error_is_fatal_on_server(ret)) + goto out_launder; } nfs_redirty_request(req); ret = -EAGAIN; @@ -595,16 +622,18 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, NFSIOS_WRITEPAGES, 1); out: return ret; +out_launder: + nfs_write_error_remove_page(req); + return ret; } static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, - struct nfs_pageio_descriptor *pgio, bool launder) + struct nfs_pageio_descriptor *pgio) { int ret; nfs_pageio_cond_complete(pgio, page_index(page)); - ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE, - launder); + ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE); if (ret == -EAGAIN) { redirty_page_for_writepage(wbc, page); ret = 0; @@ -616,8 +645,7 @@ static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, * Write an mmapped page to the server. 
*/ static int nfs_writepage_locked(struct page *page, - struct writeback_control *wbc, - bool launder) + struct writeback_control *wbc) { struct nfs_pageio_descriptor pgio; struct inode *inode = page_file_mapping(page)->host; @@ -626,7 +654,7 @@ static int nfs_writepage_locked(struct page *page, nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE); nfs_pageio_init_write(&pgio, inode, 0, false, &nfs_async_write_completion_ops); - err = nfs_do_writepage(page, wbc, &pgio, launder); + err = nfs_do_writepage(page, wbc, &pgio); nfs_pageio_complete(&pgio); if (err < 0) return err; @@ -639,7 +667,7 @@ int nfs_writepage(struct page *page, struct writeback_control *wbc) { int ret; - ret = nfs_writepage_locked(page, wbc, false); + ret = nfs_writepage_locked(page, wbc); unlock_page(page); return ret; } @@ -648,7 +676,7 @@ static int nfs_writepages_callback(struct page *page, struct writeback_control * { int ret; - ret = nfs_do_writepage(page, wbc, data, false); + ret = nfs_do_writepage(page, wbc, data); unlock_page(page); return ret; } @@ -1367,7 +1395,7 @@ void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, pg_ops = server->pnfs_curr_ld->pg_write_ops; #endif nfs_pageio_init(pgio, inode, pg_ops, compl_ops, &nfs_rw_write_ops, - server->wsize, ioflags); + server->wsize, ioflags, GFP_NOIO); } EXPORT_SYMBOL_GPL(nfs_pageio_init_write); @@ -1704,50 +1732,14 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how, if (list_empty(head)) return 0; - data = nfs_commitdata_alloc(); - - if (!data) - goto out_bad; + data = nfs_commitdata_alloc(true); /* Set up the argument struct */ nfs_init_commit(data, head, NULL, cinfo); atomic_inc(&cinfo->mds->rpcs_out); return nfs_initiate_commit(NFS_CLIENT(inode), data, NFS_PROTO(inode), data->mds_ops, how, 0); - out_bad: - nfs_retry_commit(head, NULL, cinfo, 0); - return -ENOMEM; -} - -int nfs_commit_file(struct file *file, struct nfs_write_verifier *verf) -{ - struct inode *inode = file_inode(file); - struct nfs_open_context *open; - struct nfs_commit_info cinfo; - struct nfs_page *req; - int ret; - - open = get_nfs_open_context(nfs_file_open_context(file)); - req = nfs_create_request(open, NULL, NULL, 0, i_size_read(inode)); - if (IS_ERR(req)) { - ret = PTR_ERR(req); - goto out_put; - } - - nfs_init_cinfo_from_inode(&cinfo, inode); - - memcpy(&req->wb_verf, verf, sizeof(struct nfs_write_verifier)); - nfs_request_add_commit_list(req, &cinfo); - ret = nfs_commit_inode(inode, FLUSH_SYNC); - if (ret > 0) - ret = 0; - - nfs_free_request(req); -out_put: - put_nfs_open_context(open); - return ret; } -EXPORT_SYMBOL_GPL(nfs_commit_file); /* * COMMIT call returned @@ -1985,7 +1977,7 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page) /* * Write back all requests on one page - we do this before reading it. 
*/ -int nfs_wb_single_page(struct inode *inode, struct page *page, bool launder) +int nfs_wb_page(struct inode *inode, struct page *page) { loff_t range_start = page_file_offset(page); loff_t range_end = range_start + (loff_t)(PAGE_SIZE - 1); @@ -2002,7 +1994,7 @@ int nfs_wb_single_page(struct inode *inode, struct page *page, bool launder) for (;;) { wait_on_page_writeback(page); if (clear_page_dirty_for_io(page)) { - ret = nfs_writepage_locked(page, &wbc, launder); + ret = nfs_writepage_locked(page, &wbc); if (ret < 0) goto out_error; continue; @@ -2107,7 +2099,6 @@ void nfs_destroy_writepagecache(void) } static const struct nfs_rw_ops nfs_rw_write_ops = { - .rw_mode = FMODE_WRITE, .rw_alloc_header = nfs_writehdr_alloc, .rw_free_header = nfs_writehdr_free, .rw_done = nfs_writeback_done, diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index fb5213afc854..c862c2489df0 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -219,6 +219,9 @@ static int nfsd4_scsi_identify_device(struct block_device *bdev, u8 *buf, *d, type, assoc; int error; + if (WARN_ON_ONCE(!blk_queue_scsi_passthrough(q))) + return -EINVAL; + buf = kzalloc(bufflen, GFP_KERNEL); if (!buf) return -ENOMEM; @@ -229,7 +232,6 @@ static int nfsd4_scsi_identify_device(struct block_device *bdev, goto out_free_buf; } req = scsi_req(rq); - scsi_req_init(rq); error = blk_rq_map_kern(q, rq, buf, bufflen, GFP_KERNEL); if (error) diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index e71f11b1a180..3bc08c394a3f 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -486,7 +486,7 @@ secinfo_parse(char **mesg, char *buf, struct svc_export *exp) { return 0; } #endif static inline int -uuid_parse(char **mesg, char *buf, unsigned char **puuid) +nfsd_uuid_parse(char **mesg, char *buf, unsigned char **puuid) { int len; @@ -586,7 +586,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) if (strcmp(buf, "fsloc") == 0) err = fsloc_parse(&mesg, buf, &exp.ex_fslocs); else if (strcmp(buf, "uuid") == 0) - err = uuid_parse(&mesg, buf, &exp.ex_uuid); + err = nfsd_uuid_parse(&mesg, buf, &exp.ex_uuid); else if (strcmp(buf, "secinfo") == 0) err = secinfo_parse(&mesg, buf, &exp); else diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index d86031b6ad79..dadb3bf305b2 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1259,7 +1259,8 @@ nfsd4_layout_verify(struct svc_export *exp, unsigned int layout_type) return NULL; } - if (!(exp->ex_layout_types & (1 << layout_type))) { + if (layout_type >= LAYOUT_TYPE_MAX || + !(exp->ex_layout_types & (1 << layout_type))) { dprintk("%s: layout type %d not supported\n", __func__, layout_type); return NULL; @@ -1768,6 +1769,12 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, opdesc->op_get_currentstateid(cstate, &op->u); op->status = opdesc->op_func(rqstp, cstate, &op->u); + /* Only from SEQUENCE */ + if (cstate->status == nfserr_replay_cache) { + dprintk("%s NFS4.1 replay from cache\n", __func__); + status = op->status; + goto out; + } if (!op->status) { if (opdesc->op_set_currentstateid) opdesc->op_set_currentstateid(cstate, &op->u); @@ -1778,14 +1785,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, if (need_wrongsec_check(rqstp)) op->status = check_nfsd_access(current_fh->fh_export, rqstp); } - encode_op: - /* Only from SEQUENCE */ - if (cstate->status == nfserr_replay_cache) { - dprintk("%s NFS4.1 replay from cache\n", __func__); - status = op->status; - goto out; - } if (op->status == nfserr_replay_me) { op->replay = &cstate->replay_owner->so_replay; 
nfsd4_encode_replay(&resp->xdr, op); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index e9ef50addddb..22002fb75a18 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1912,28 +1912,15 @@ static void copy_clid(struct nfs4_client *target, struct nfs4_client *source) target->cl_clientid.cl_id = source->cl_clientid.cl_id; } -int strdup_if_nonnull(char **target, char *source) -{ - if (source) { - *target = kstrdup(source, GFP_KERNEL); - if (!*target) - return -ENOMEM; - } else - *target = NULL; - return 0; -} - static int copy_cred(struct svc_cred *target, struct svc_cred *source) { - int ret; + target->cr_principal = kstrdup(source->cr_principal, GFP_KERNEL); + target->cr_raw_principal = kstrdup(source->cr_raw_principal, + GFP_KERNEL); + if ((source->cr_principal && ! target->cr_principal) || + (source->cr_raw_principal && ! target->cr_raw_principal)) + return -ENOMEM; - ret = strdup_if_nonnull(&target->cr_principal, source->cr_principal); - if (ret) - return ret; - ret = strdup_if_nonnull(&target->cr_raw_principal, - source->cr_raw_principal); - if (ret) - return ret; target->cr_flavor = source->cr_flavor; target->cr_uid = source->cr_uid; target->cr_gid = source->cr_gid; diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 33017d652b1d..26780d53a6f9 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2831,9 +2831,14 @@ out_acl: } #endif /* CONFIG_NFSD_PNFS */ if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) { - status = nfsd4_encode_bitmap(xdr, NFSD_SUPPATTR_EXCLCREAT_WORD0, - NFSD_SUPPATTR_EXCLCREAT_WORD1, - NFSD_SUPPATTR_EXCLCREAT_WORD2); + u32 supp[3]; + + memcpy(supp, nfsd_suppattrs[minorversion], sizeof(supp)); + supp[0] &= NFSD_SUPPATTR_EXCLCREAT_WORD0; + supp[1] &= NFSD_SUPPATTR_EXCLCREAT_WORD1; + supp[2] &= NFSD_SUPPATTR_EXCLCREAT_WORD2; + + status = nfsd4_encode_bitmap(xdr, supp[0], supp[1], supp[2]); if (status) goto out; } @@ -4119,8 +4124,7 @@ nfsd4_encode_getdeviceinfo(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_getdeviceinfo *gdev) { struct xdr_stream *xdr = &resp->xdr; - const struct nfsd4_layout_ops *ops = - nfsd4_layout_ops[gdev->gd_layout_type]; + const struct nfsd4_layout_ops *ops; u32 starting_len = xdr->buf->len, needed_len; __be32 *p; @@ -4137,6 +4141,7 @@ nfsd4_encode_getdeviceinfo(struct nfsd4_compoundres *resp, __be32 nfserr, /* If maxcount is 0 then just update notifications */ if (gdev->gd_maxcount != 0) { + ops = nfsd4_layout_ops[gdev->gd_layout_type]; nfserr = ops->encode_getdeviceinfo(xdr, gdev); if (nfserr) { /* @@ -4189,8 +4194,7 @@ nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_layoutget *lgp) { struct xdr_stream *xdr = &resp->xdr; - const struct nfsd4_layout_ops *ops = - nfsd4_layout_ops[lgp->lg_layout_type]; + const struct nfsd4_layout_ops *ops; __be32 *p; dprintk("%s: err %d\n", __func__, nfserr); @@ -4213,6 +4217,7 @@ nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, __be32 nfserr, *p++ = cpu_to_be32(lgp->lg_seg.iomode); *p++ = cpu_to_be32(lgp->lg_layout_type); + ops = nfsd4_layout_ops[lgp->lg_layout_type]; nfserr = ops->encode_layoutget(xdr, lgp); out: kfree(lgp->lg_content); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 9aaf6ca77569..38d0383dc7f9 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -94,6 +94,12 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp, err = follow_down(&path); if (err < 0) goto out; + if (path.mnt == exp->ex_path.mnt && path.dentry == dentry && + nfsd_mountpoint(dentry, exp) == 2) { + /* This is only a mountpoint in some other 
namespace */ + path_put(&path); + goto out; + } exp2 = rqst_exp_get_by_name(rqstp, &path); if (IS_ERR(exp2)) { @@ -167,16 +173,26 @@ static int nfsd_lookup_parent(struct svc_rqst *rqstp, struct dentry *dparent, st /* * For nfsd purposes, we treat V4ROOT exports as though there was an * export at *every* directory. + * We return: + * '1' if this dentry *must* be an export point, + * '2' if it might be, if there is really a mount here, and + * '0' if there is no chance of an export point here. */ int nfsd_mountpoint(struct dentry *dentry, struct svc_export *exp) { - if (d_mountpoint(dentry)) + if (!d_inode(dentry)) + return 0; + if (exp->ex_flags & NFSEXP_V4ROOT) return 1; if (nfsd4_is_junction(dentry)) return 1; - if (!(exp->ex_flags & NFSEXP_V4ROOT)) - return 0; - return d_inode(dentry) != NULL; + if (d_mountpoint(dentry)) + /* + * Might only be a mountpoint in a different namespace, + * but we need to check. + */ + return 2; + return 0; } __be32 @@ -895,24 +911,13 @@ __be32 nfsd_splice_read(struct svc_rqst *rqstp, __be32 nfsd_readv(struct file *file, loff_t offset, struct kvec *vec, int vlen, unsigned long *count) { - mm_segment_t oldfs; + struct iov_iter iter; int host_err; - oldfs = get_fs(); - set_fs(KERNEL_DS); - host_err = vfs_readv(file, (struct iovec __user *)vec, vlen, &offset, 0); - set_fs(oldfs); - return nfsd_finish_read(file, count, host_err); -} + iov_iter_kvec(&iter, READ | ITER_KVEC, vec, vlen, *count); + host_err = vfs_iter_read(file, &iter, &offset, 0); -static __be32 -nfsd_vfs_read(struct svc_rqst *rqstp, struct file *file, - loff_t offset, struct kvec *vec, int vlen, unsigned long *count) -{ - if (file->f_op->splice_read && test_bit(RQ_SPLICE_OK, &rqstp->rq_flags)) - return nfsd_splice_read(rqstp, file, offset, count); - else - return nfsd_readv(file, offset, vec, vlen, count); + return nfsd_finish_read(file, count, host_err); } /* @@ -958,7 +963,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, unsigned long *cnt, int stable) { struct svc_export *exp; - mm_segment_t oldfs; + struct iov_iter iter; __be32 err = 0; int host_err; int use_wgather; @@ -984,10 +989,8 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, if (stable && !use_wgather) flags |= RWF_SYNC; - /* Write the data. 
*/ - oldfs = get_fs(); set_fs(KERNEL_DS); - host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &pos, flags); - set_fs(oldfs); + iov_iter_kvec(&iter, WRITE | ITER_KVEC, vec, vlen, *cnt); + host_err = vfs_iter_write(file, &iter, &pos, flags); if (host_err < 0) goto out_nfserr; *cnt = host_err; @@ -1028,7 +1031,12 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, ra = nfsd_init_raparms(file); trace_read_opened(rqstp, fhp, offset, vlen); - err = nfsd_vfs_read(rqstp, file, offset, vec, vlen, count); + + if (file->f_op->splice_read && test_bit(RQ_SPLICE_OK, &rqstp->rq_flags)) + err = nfsd_splice_read(rqstp, file, offset, count); + else + err = nfsd_readv(file, offset, vec, vlen, count); + trace_read_io_done(rqstp, fhp, offset, vlen); if (ra) @@ -1448,41 +1456,34 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, __be32 nfsd_readlink(struct svc_rqst *rqstp, struct svc_fh *fhp, char *buf, int *lenp) { - mm_segment_t oldfs; __be32 err; - int host_err; + const char *link; struct path path; + DEFINE_DELAYED_CALL(done); + int len; err = fh_verify(rqstp, fhp, S_IFLNK, NFSD_MAY_NOP); - if (err) - goto out; + if (unlikely(err)) + return err; path.mnt = fhp->fh_export->ex_path.mnt; path.dentry = fhp->fh_dentry; - err = nfserr_inval; - if (!d_is_symlink(path.dentry)) - goto out; + if (unlikely(!d_is_symlink(path.dentry))) + return nfserr_inval; touch_atime(&path); - /* N.B. Why does this call need a get_fs()?? - * Remove the set_fs and watch the fireworks:-) --okir - */ - oldfs = get_fs(); set_fs(KERNEL_DS); - host_err = vfs_readlink(path.dentry, (char __user *)buf, *lenp); - set_fs(oldfs); + link = vfs_get_link(path.dentry, &done); + if (IS_ERR(link)) + return nfserrno(PTR_ERR(link)); - if (host_err < 0) - goto out_nfserr; - *lenp = host_err; - err = 0; -out: - return err; - -out_nfserr: - err = nfserrno(host_err); - goto out; + len = strlen(link); + if (len < *lenp) + *lenp = len; + memcpy(buf, link, *lenp); + do_delayed_call(&done); + return 0; } /* diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index 6f87b2ac1aeb..e73c86d9855c 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -338,7 +338,7 @@ static void nilfs_end_bio_write(struct bio *bio) { struct nilfs_segment_buffer *segbuf = bio->bi_private; - if (bio->bi_error) + if (bio->bi_status) atomic_inc(&segbuf->sb_err); bio_put(bio); diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index febed1217b3f..70ded52dc1dd 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -2161,7 +2161,7 @@ void nilfs_flush_segment(struct super_block *sb, ino_t ino) } struct nilfs_segctor_wait_request { - wait_queue_t wq; + wait_queue_entry_t wq; __u32 seq; int err; atomic_t done; @@ -2206,8 +2206,7 @@ static void nilfs_segctor_wakeup(struct nilfs_sc_info *sci, int err) unsigned long flags; spin_lock_irqsave(&sci->sc_wait_request.lock, flags); - list_for_each_entry_safe(wrq, n, &sci->sc_wait_request.task_list, - wq.task_list) { + list_for_each_entry_safe(wrq, n, &sci->sc_wait_request.head, wq.entry) { if (!atomic_read(&wrq->done) && nilfs_cnt32_ge(sci->sc_seq_done, wrq->seq)) { wrq->err = err; diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c index 358258364616..4690cd75d8d7 100644 --- a/fs/ntfs/namei.c +++ b/fs/ntfs/namei.c @@ -159,7 +159,7 @@ static struct dentry *ntfs_lookup(struct inode *dir_ino, struct dentry *dent, PTR_ERR(dent_inode)); kfree(name); /* Return the error code. 
*/ - return (struct dentry *)dent_inode; + return ERR_CAST(dent_inode); } /* It is guaranteed that @name is no longer allocated at this point. */ if (MREF_ERR(mref) == -ENOENT) { diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 0da0332725aa..ffe003982d95 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -516,9 +516,9 @@ static void o2hb_bio_end_io(struct bio *bio) { struct o2hb_bio_wait_ctxt *wc = bio->bi_private; - if (bio->bi_error) { - mlog(ML_ERROR, "IO Error %d\n", bio->bi_error); - wc->wc_error = bio->bi_error; + if (bio->bi_status) { + mlog(ML_ERROR, "IO Error %d\n", bio->bi_status); + wc->wc_error = blk_status_to_errno(bio->bi_status); } o2hb_bio_wait_dec(wc, 1); diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 3b7c937a36b5..4689940a953c 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -2591,6 +2591,10 @@ void ocfs2_inode_unlock_tracker(struct inode *inode, struct ocfs2_lock_res *lockres; lockres = &OCFS2_I(inode)->ip_inode_lockres; + /* had_lock means that the currect process already takes the cluster + * lock previously. If had_lock is 1, we have nothing to do here, and + * it will get unlocked where we got the lock. + */ if (!had_lock) { ocfs2_remove_holder(lockres, oh); ocfs2_inode_unlock(inode, ex); diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c index 827fc9809bc2..9f88188060db 100644 --- a/fs/ocfs2/export.c +++ b/fs/ocfs2/export.c @@ -119,7 +119,7 @@ check_err: if (IS_ERR(inode)) { mlog_errno(PTR_ERR(inode)); - result = (void *)inode; + result = ERR_CAST(inode); goto bail; } diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index ca1646fbcaef..83005f486451 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -2062,7 +2062,7 @@ static int ocfs2_initialize_super(struct super_block *sb, cbits = le32_to_cpu(di->id2.i_super.s_clustersize_bits); bbits = le32_to_cpu(di->id2.i_super.s_blocksize_bits); sb->s_maxbytes = ocfs2_max_file_offset(bbits, cbits); - memcpy(sb->s_uuid, di->id2.i_super.s_uuid, + memcpy(&sb->s_uuid, di->id2.i_super.s_uuid, sizeof(di->id2.i_super.s_uuid)); osb->osb_dx_mask = (1 << (cbits - bbits)) - 1; diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 3c5384d9b3a5..f70c3778d600 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -1328,20 +1328,21 @@ static int ocfs2_xattr_get(struct inode *inode, void *buffer, size_t buffer_size) { - int ret; + int ret, had_lock; struct buffer_head *di_bh = NULL; + struct ocfs2_lock_holder oh; - ret = ocfs2_inode_lock(inode, &di_bh, 0); - if (ret < 0) { - mlog_errno(ret); - return ret; + had_lock = ocfs2_inode_lock_tracker(inode, &di_bh, 0, &oh); + if (had_lock < 0) { + mlog_errno(had_lock); + return had_lock; } down_read(&OCFS2_I(inode)->ip_xattr_sem); ret = ocfs2_xattr_get_nolock(inode, di_bh, name_index, name, buffer, buffer_size); up_read(&OCFS2_I(inode)->ip_xattr_sem); - ocfs2_inode_unlock(inode, 0); + ocfs2_inode_unlock_tracker(inode, 0, &oh, had_lock); brelse(di_bh); @@ -3537,11 +3538,12 @@ int ocfs2_xattr_set(struct inode *inode, { struct buffer_head *di_bh = NULL; struct ocfs2_dinode *di; - int ret, credits, ref_meta = 0, ref_credits = 0; + int ret, credits, had_lock, ref_meta = 0, ref_credits = 0; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct inode *tl_inode = osb->osb_tl_inode; struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, NULL, }; struct ocfs2_refcount_tree *ref_tree = NULL; + struct ocfs2_lock_holder oh; struct ocfs2_xattr_info xi = { .xi_name_index = name_index, @@ -3572,8 +3574,9 @@ int ocfs2_xattr_set(struct 
inode *inode, return -ENOMEM; } - ret = ocfs2_inode_lock(inode, &di_bh, 1); - if (ret < 0) { + had_lock = ocfs2_inode_lock_tracker(inode, &di_bh, 1, &oh); + if (had_lock < 0) { + ret = had_lock; mlog_errno(ret); goto cleanup_nolock; } @@ -3670,7 +3673,7 @@ cleanup: if (ret) mlog_errno(ret); } - ocfs2_inode_unlock(inode, 1); + ocfs2_inode_unlock_tracker(inode, 1, &oh, had_lock); cleanup_nolock: brelse(di_bh); brelse(xbs.xattr_bh); diff --git a/fs/open.c b/fs/open.c index 373787afd638..3fe0c4aa7d27 100644 --- a/fs/open.c +++ b/fs/open.c @@ -193,7 +193,8 @@ static long do_sys_ftruncate(unsigned int fd, loff_t length, int small) goto out_putf; error = -EPERM; - if (IS_APPEND(inode)) + /* Check IS_APPEND on real upper inode */ + if (IS_APPEND(file_inode(f.file))) goto out_putf; sb_start_write(inode->i_sb); @@ -459,20 +460,17 @@ out: SYSCALL_DEFINE1(fchdir, unsigned int, fd) { struct fd f = fdget_raw(fd); - struct inode *inode; - int error = -EBADF; + int error; error = -EBADF; if (!f.file) goto out; - inode = file_inode(f.file); - error = -ENOTDIR; - if (!S_ISDIR(inode->i_mode)) + if (!d_can_lookup(f.file->f_path.dentry)) goto out_putf; - error = inode_permission(inode, MAY_EXEC | MAY_CHDIR); + error = inode_permission(file_inode(f.file), MAY_EXEC | MAY_CHDIR); if (!error) set_fs_pwd(current->fs, &f.file->f_path); out_putf: @@ -761,6 +759,7 @@ static int do_dentry_open(struct file *f, likely(f->f_op->write || f->f_op->write_iter)) f->f_mode |= FMODE_CAN_WRITE; + f->f_write_hint = WRITE_LIFE_NOT_SET; f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping); diff --git a/fs/orangefs/orangefs-bufmap.c b/fs/orangefs/orangefs-bufmap.c index 83b506020718..038d67545d9f 100644 --- a/fs/orangefs/orangefs-bufmap.c +++ b/fs/orangefs/orangefs-bufmap.c @@ -46,8 +46,8 @@ static void run_down(struct slot_map *m) spin_lock(&m->q.lock); if (m->c != -1) { for (;;) { - if (likely(list_empty(&wait.task_list))) - __add_wait_queue_tail(&m->q, &wait); + if (likely(list_empty(&wait.entry))) + __add_wait_queue_entry_tail(&m->q, &wait); set_current_state(TASK_UNINTERRUPTIBLE); if (m->c == -1) @@ -84,8 +84,8 @@ static int wait_for_free(struct slot_map *m) do { long n = left, t; - if (likely(list_empty(&wait.task_list))) - __add_wait_queue_tail_exclusive(&m->q, &wait); + if (likely(list_empty(&wait.entry))) + __add_wait_queue_entry_tail_exclusive(&m->q, &wait); set_current_state(TASK_INTERRUPTIBLE); if (m->c > 0) @@ -108,8 +108,8 @@ static int wait_for_free(struct slot_map *m) left = -EINTR; } while (left > 0); - if (!list_empty(&wait.task_list)) - list_del(&wait.task_list); + if (!list_empty(&wait.entry)) + list_del(&wait.entry); else if (left <= 0 && waitqueue_active(&m->q)) __wake_up_locked_key(&m->q, TASK_INTERRUPTIBLE, NULL); __set_current_state(TASK_RUNNING); diff --git a/fs/overlayfs/Kconfig b/fs/overlayfs/Kconfig index 0daac5112f7a..c0c9683934b7 100644 --- a/fs/overlayfs/Kconfig +++ b/fs/overlayfs/Kconfig @@ -1,5 +1,6 @@ config OVERLAY_FS tristate "Overlay filesystem support" + select EXPORTFS help An overlay filesystem combines two filesystems - an 'upper' filesystem and a 'lower' filesystem. 
When a name exists in both filesystems, the diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 906ea6c93260..e5869f91b3ab 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -20,6 +20,7 @@ #include <linux/namei.h> #include <linux/fdtable.h> #include <linux/ratelimit.h> +#include <linux/exportfs.h> #include "overlayfs.h" #include "ovl_entry.h" @@ -232,6 +233,82 @@ int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat) return err; } +static struct ovl_fh *ovl_encode_fh(struct dentry *lower, uuid_t *uuid) +{ + struct ovl_fh *fh; + int fh_type, fh_len, dwords; + void *buf; + int buflen = MAX_HANDLE_SZ; + + buf = kmalloc(buflen, GFP_TEMPORARY); + if (!buf) + return ERR_PTR(-ENOMEM); + + /* + * We encode a non-connectable file handle for non-dir, because we + * only need to find the lower inode number and we don't want to pay + * the price of reconnecting the dentry. + */ + dwords = buflen >> 2; + fh_type = exportfs_encode_fh(lower, buf, &dwords, 0); + buflen = (dwords << 2); + + fh = ERR_PTR(-EIO); + if (WARN_ON(fh_type < 0) || + WARN_ON(buflen > MAX_HANDLE_SZ) || + WARN_ON(fh_type == FILEID_INVALID)) + goto out; + + BUILD_BUG_ON(MAX_HANDLE_SZ + offsetof(struct ovl_fh, fid) > 255); + fh_len = offsetof(struct ovl_fh, fid) + buflen; + fh = kmalloc(fh_len, GFP_KERNEL); + if (!fh) { + fh = ERR_PTR(-ENOMEM); + goto out; + } + + fh->version = OVL_FH_VERSION; + fh->magic = OVL_FH_MAGIC; + fh->type = fh_type; + fh->flags = OVL_FH_FLAG_CPU_ENDIAN; + fh->len = fh_len; + fh->uuid = *uuid; + memcpy(fh->fid, buf, buflen); + +out: + kfree(buf); + return fh; +} + +static int ovl_set_origin(struct dentry *dentry, struct dentry *lower, + struct dentry *upper) +{ + struct super_block *sb = lower->d_sb; + const struct ovl_fh *fh = NULL; + int err; + + /* + * When the lower layer doesn't support export operations, store a 'null' fh, + * so we can use the overlay.origin xattr to distinguish between a copy + * up and a pure upper inode. + */ + if (sb->s_export_op && sb->s_export_op->fh_to_dentry && + !uuid_is_null(&sb->s_uuid)) { + fh = ovl_encode_fh(lower, &sb->s_uuid); + if (IS_ERR(fh)) + return PTR_ERR(fh); + } + + /* + * Do not fail when upper doesn't support xattrs. + */ + err = ovl_check_setxattr(dentry, upper, OVL_XATTR_ORIGIN, fh, + fh ?
fh->len : 0, 0); + kfree(fh); + + return err; +} + static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir, struct dentry *dentry, struct path *lowerpath, struct kstat *stat, const char *link, @@ -252,15 +329,9 @@ static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir, .link = link }; - upper = lookup_one_len(dentry->d_name.name, upperdir, - dentry->d_name.len); - err = PTR_ERR(upper); - if (IS_ERR(upper)) - goto out; - err = security_inode_copy_up(dentry, &new_creds); if (err < 0) - goto out1; + goto out; if (new_creds) old_creds = override_creds(new_creds); @@ -268,13 +339,14 @@ static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir, if (tmpfile) temp = ovl_do_tmpfile(upperdir, stat->mode); else - temp = ovl_lookup_temp(workdir, dentry); - err = PTR_ERR(temp); - if (IS_ERR(temp)) - goto out1; - + temp = ovl_lookup_temp(workdir); err = 0; - if (!tmpfile) + if (IS_ERR(temp)) { + err = PTR_ERR(temp); + temp = NULL; + } + + if (!err && !tmpfile) err = ovl_create_real(wdir, temp, &cattr, NULL, true); if (new_creds) { @@ -283,7 +355,7 @@ static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir, } if (err) - goto out2; + goto out; if (S_ISREG(stat->mode)) { struct path upperpath; @@ -316,6 +388,27 @@ static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir, if (err) goto out_cleanup; + /* + * Store identifier of lower inode in upper inode xattr to + * allow lookup of the copy up origin inode. + * + * Don't set origin when we are breaking the association with a lower + * hard link. + */ + if (S_ISDIR(stat->mode) || stat->nlink == 1) { + err = ovl_set_origin(dentry, lowerpath->dentry, temp); + if (err) + goto out_cleanup; + } + + upper = lookup_one_len(dentry->d_name.name, upperdir, + dentry->d_name.len); + if (IS_ERR(upper)) { + err = PTR_ERR(upper); + upper = NULL; + goto out_cleanup; + } + if (tmpfile) err = ovl_do_link(temp, udir, upper, true); else @@ -329,17 +422,15 @@ static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir, /* Restore timestamps on parent (best effort) */ ovl_set_timestamps(upperdir, pstat); -out2: +out: dput(temp); -out1: dput(upper); -out: return err; out_cleanup: if (!tmpfile) ovl_cleanup(wdir, temp); - goto out2; + goto out; } /* @@ -372,6 +463,11 @@ static int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry, ovl_path_upper(parent, &parentpath); upperdir = parentpath.dentry; + /* Mark parent "impure" because it may now contain non-pure upper */ + err = ovl_set_impure(parent, upperdir); + if (err) + return err; + err = vfs_getattr(&parentpath, &pstat, STATX_ATIME | STATX_MTIME, AT_STATX_SYNC_AS_STAT); if (err) diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 6515796460df..a63a71656e9b 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -41,7 +41,7 @@ void ovl_cleanup(struct inode *wdir, struct dentry *wdentry) } } -struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry) +struct dentry *ovl_lookup_temp(struct dentry *workdir) { struct dentry *temp; char name[20]; @@ -68,7 +68,7 @@ static struct dentry *ovl_whiteout(struct dentry *workdir, struct dentry *whiteout; struct inode *wdir = workdir->d_inode; - whiteout = ovl_lookup_temp(workdir, dentry); + whiteout = ovl_lookup_temp(workdir); if (IS_ERR(whiteout)) return whiteout; @@ -127,45 +127,26 @@ int ovl_create_real(struct inode *dir, struct dentry *newdentry, return err; } -static int ovl_set_opaque(struct dentry *dentry, struct dentry 
*upperdentry) +static int ovl_set_opaque_xerr(struct dentry *dentry, struct dentry *upper, + int xerr) { int err; - err = ovl_do_setxattr(upperdentry, OVL_XATTR_OPAQUE, "y", 1, 0); + err = ovl_check_setxattr(dentry, upper, OVL_XATTR_OPAQUE, "y", 1, xerr); if (!err) ovl_dentry_set_opaque(dentry); return err; } -static int ovl_dir_getattr(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int flags) +static int ovl_set_opaque(struct dentry *dentry, struct dentry *upperdentry) { - struct dentry *dentry = path->dentry; - int err; - enum ovl_path_type type; - struct path realpath; - const struct cred *old_cred; - - type = ovl_path_real(dentry, &realpath); - old_cred = ovl_override_creds(dentry->d_sb); - err = vfs_getattr(&realpath, stat, request_mask, flags); - revert_creds(old_cred); - if (err) - return err; - - stat->dev = dentry->d_sb->s_dev; - stat->ino = dentry->d_inode->i_ino; - /* - * It's probably not worth it to count subdirs to get the - * correct link count. nlink=1 seems to pacify 'find' and - * other utilities. + * Fail with -EIO when trying to create opaque dir and upper doesn't + * support xattrs. ovl_rename() calls ovl_set_opaque_xerr(-EXDEV) to + * return a specific error for noxattr case. */ - if (OVL_TYPE_MERGE(type)) - stat->nlink = 1; - - return 0; + return ovl_set_opaque_xerr(dentry, upperdentry, -EIO); } /* Common operations required to be done after creation of file on upper */ @@ -182,6 +163,9 @@ static void ovl_instantiate(struct dentry *dentry, struct inode *inode, inc_nlink(inode); } d_instantiate(dentry, inode); + /* Force lookup of new upper hardlink to find its lower */ + if (hardlink) + d_drop(dentry); } static bool ovl_type_merge(struct dentry *dentry) @@ -189,6 +173,11 @@ static bool ovl_type_merge(struct dentry *dentry) return OVL_TYPE_MERGE(ovl_path_type(dentry)); } +static bool ovl_type_origin(struct dentry *dentry) +{ + return OVL_TYPE_ORIGIN(ovl_path_type(dentry)); +} + static int ovl_create_upper(struct dentry *dentry, struct inode *inode, struct cattr *attr, struct dentry *hardlink) { @@ -210,7 +199,7 @@ static int ovl_create_upper(struct dentry *dentry, struct inode *inode, if (err) goto out_dput; - if (ovl_type_merge(dentry->d_parent)) { + if (ovl_type_merge(dentry->d_parent) && d_is_dir(newdentry)) { /* Setting opaque here is just an optimization, allow to fail */ ovl_set_opaque(dentry, newdentry); } @@ -277,7 +266,7 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry, if (upper->d_parent->d_inode != udir) goto out_unlock; - opaquedir = ovl_lookup_temp(workdir, dentry); + opaquedir = ovl_lookup_temp(workdir); err = PTR_ERR(opaquedir); if (IS_ERR(opaquedir)) goto out_unlock; @@ -409,7 +398,7 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, if (err) goto out; - newdentry = ovl_lookup_temp(workdir, dentry); + newdentry = ovl_lookup_temp(workdir); err = PTR_ERR(newdentry); if (IS_ERR(newdentry)) goto out_unlock; @@ -873,18 +862,16 @@ static int ovl_set_redirect(struct dentry *dentry, bool samedir) if (IS_ERR(redirect)) return PTR_ERR(redirect); - err = ovl_do_setxattr(ovl_dentry_upper(dentry), OVL_XATTR_REDIRECT, - redirect, strlen(redirect), 0); + err = ovl_check_setxattr(dentry, ovl_dentry_upper(dentry), + OVL_XATTR_REDIRECT, + redirect, strlen(redirect), -EXDEV); if (!err) { spin_lock(&dentry->d_lock); ovl_dentry_set_redirect(dentry, redirect); spin_unlock(&dentry->d_lock); } else { kfree(redirect); - if (err == -EOPNOTSUPP) - ovl_clear_redirect_dir(dentry->d_sb); - else - 
pr_warn_ratelimited("overlay: failed to set redirect (%i)\n", err); + pr_warn_ratelimited("overlay: failed to set redirect (%i)\n", err); /* Fall back to userspace copy-up */ err = -EXDEV; } @@ -970,6 +957,25 @@ static int ovl_rename(struct inode *olddir, struct dentry *old, old_upperdir = ovl_dentry_upper(old->d_parent); new_upperdir = ovl_dentry_upper(new->d_parent); + if (!samedir) { + /* + * When moving a merge dir or non-dir with copy up origin into + * a new parent, we are marking the new parent dir "impure". + * When ovl_iterate() iterates an "impure" upper dir, it will + * lookup the origin inodes of the entries to fill d_ino. + */ + if (ovl_type_origin(old)) { + err = ovl_set_impure(new->d_parent, new_upperdir); + if (err) + goto out_revert_creds; + } + if (!overwrite && ovl_type_origin(new)) { + err = ovl_set_impure(old->d_parent, old_upperdir); + if (err) + goto out_revert_creds; + } + } + trap = lock_rename(new_upperdir, old_upperdir); olddentry = lookup_one_len(old->d_name.name, old_upperdir, @@ -1019,7 +1025,7 @@ static int ovl_rename(struct inode *olddir, struct dentry *old, if (ovl_type_merge_or_lower(old)) err = ovl_set_redirect(old, samedir); else if (!old_opaque && ovl_type_merge(new->d_parent)) - err = ovl_set_opaque(old, olddentry); + err = ovl_set_opaque_xerr(old, olddentry, -EXDEV); if (err) goto out_dput; } @@ -1027,7 +1033,7 @@ static int ovl_rename(struct inode *olddir, struct dentry *old, if (ovl_type_merge_or_lower(new)) err = ovl_set_redirect(new, samedir); else if (!new_opaque && ovl_type_merge(old->d_parent)) - err = ovl_set_opaque(new, newdentry); + err = ovl_set_opaque_xerr(new, newdentry, -EXDEV); if (err) goto out_dput; } @@ -1070,7 +1076,7 @@ const struct inode_operations ovl_dir_inode_operations = { .create = ovl_create, .mknod = ovl_mknod, .permission = ovl_permission, - .getattr = ovl_dir_getattr, + .getattr = ovl_getattr, .listxattr = ovl_listxattr, .get_acl = ovl_get_acl, .update_time = ovl_update_time, diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index f8fe6bf2036d..d613e2c41242 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -57,18 +57,78 @@ out: return err; } -static int ovl_getattr(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int flags) +int ovl_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags) { struct dentry *dentry = path->dentry; + enum ovl_path_type type; struct path realpath; const struct cred *old_cred; + bool is_dir = S_ISDIR(dentry->d_inode->i_mode); int err; - ovl_path_real(dentry, &realpath); + type = ovl_path_real(dentry, &realpath); old_cred = ovl_override_creds(dentry->d_sb); err = vfs_getattr(&realpath, stat, request_mask, flags); + if (err) + goto out; + + /* + * When all layers are on the same fs, all real inode number are + * unique, so we use the overlay st_dev, which is friendly to du -x. + * + * We also use st_ino of the copy up origin, if we know it. + * This guaranties constant st_dev/st_ino across copy up. + * + * If filesystem supports NFS export ops, this also guaranties + * persistent st_ino across mount cycle. + */ + if (ovl_same_sb(dentry->d_sb)) { + if (OVL_TYPE_ORIGIN(type)) { + struct kstat lowerstat; + u32 lowermask = STATX_INO | (!is_dir ? 
STATX_NLINK : 0); + + ovl_path_lower(dentry, &realpath); + err = vfs_getattr(&realpath, &lowerstat, + lowermask, flags); + if (err) + goto out; + + WARN_ON_ONCE(stat->dev != lowerstat.dev); + /* + * Lower hardlinks are broken on copy up to different + * upper files, so we cannot use the lower origin st_ino + * for those different files, even for the same fs case. + */ + if (is_dir || lowerstat.nlink == 1) + stat->ino = lowerstat.ino; + } + stat->dev = dentry->d_sb->s_dev; + } else if (is_dir) { + /* + * If not all layers are on the same fs the pair {real st_ino; + * overlay st_dev} is not unique, so use the non persistent + * overlay st_ino. + * + * Always use the overlay st_dev for directories, so 'find + * -xdev' will scan the entire overlay mount and won't cross the + * overlay mount boundaries. + */ + stat->dev = dentry->d_sb->s_dev; + stat->ino = dentry->d_inode->i_ino; + } + + /* + * It's probably not worth it to count subdirs to get the + * correct link count. nlink=1 seems to pacify 'find' and + * other utilities. + */ + if (is_dir && OVL_TYPE_MERGE(type)) + stat->nlink = 1; + +out: revert_creds(old_cred); + return err; } @@ -180,6 +240,16 @@ int ovl_xattr_get(struct dentry *dentry, const char *name, return res; } +static bool ovl_can_list(const char *s) +{ + /* List all non-trusted xatts */ + if (strncmp(s, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) != 0) + return true; + + /* Never list trusted.overlay, list other trusted for superuser only */ + return !ovl_is_private_xattr(s) && capable(CAP_SYS_ADMIN); +} + ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size) { struct dentry *realdentry = ovl_dentry_real(dentry); @@ -203,7 +273,7 @@ ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size) return -EIO; len -= slen; - if (ovl_is_private_xattr(s)) { + if (!ovl_can_list(s)) { res -= slen; memmove(s, s + slen, len); } else { @@ -303,6 +373,41 @@ static const struct inode_operations ovl_symlink_inode_operations = { .update_time = ovl_update_time, }; +/* + * It is possible to stack overlayfs instance on top of another + * overlayfs instance as lower layer. We need to annonate the + * stackable i_mutex locks according to stack level of the super + * block instance. An overlayfs instance can never be in stack + * depth 0 (there is always a real fs below it). An overlayfs + * inode lock will use the lockdep annotaion ovl_i_mutex_key[depth]. + * + * For example, here is a snip from /proc/lockdep_chains after + * dir_iterate of nested overlayfs: + * + * [...] &ovl_i_mutex_dir_key[depth] (stack_depth=2) + * [...] &ovl_i_mutex_dir_key[depth]#2 (stack_depth=1) + * [...] 
&type->i_mutex_dir_key (stack_depth=0) + */ +#define OVL_MAX_NESTING FILESYSTEM_MAX_STACK_DEPTH + +static inline void ovl_lockdep_annotate_inode_mutex_key(struct inode *inode) +{ +#ifdef CONFIG_LOCKDEP + static struct lock_class_key ovl_i_mutex_key[OVL_MAX_NESTING]; + static struct lock_class_key ovl_i_mutex_dir_key[OVL_MAX_NESTING]; + + int depth = inode->i_sb->s_stack_depth - 1; + + if (WARN_ON_ONCE(depth < 0 || depth >= OVL_MAX_NESTING)) + depth = 0; + + if (S_ISDIR(inode->i_mode)) + lockdep_set_class(&inode->i_rwsem, &ovl_i_mutex_dir_key[depth]); + else + lockdep_set_class(&inode->i_rwsem, &ovl_i_mutex_key[depth]); +#endif +} + static void ovl_fill_inode(struct inode *inode, umode_t mode, dev_t rdev) { inode->i_ino = get_next_ino(); @@ -312,6 +417,8 @@ static void ovl_fill_inode(struct inode *inode, umode_t mode, dev_t rdev) inode->i_acl = inode->i_default_acl = ACL_DONT_CACHE; #endif + ovl_lockdep_annotate_inode_mutex_key(inode); + switch (mode & S_IFMT) { case S_IFREG: inode->i_op = &ovl_file_inode_operations; diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index b8b077821fb0..de0d4f742f36 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -12,6 +12,8 @@ #include <linux/namei.h> #include <linux/xattr.h> #include <linux/ratelimit.h> +#include <linux/mount.h> +#include <linux/exportfs.h> #include "overlayfs.h" #include "ovl_entry.h" @@ -81,19 +83,93 @@ invalid: goto err_free; } -static bool ovl_is_opaquedir(struct dentry *dentry) +static int ovl_acceptable(void *ctx, struct dentry *dentry) +{ + return 1; +} + +static struct dentry *ovl_get_origin(struct dentry *dentry, + struct vfsmount *mnt) { int res; - char val; + struct ovl_fh *fh = NULL; + struct dentry *origin = NULL; + int bytes; - if (!d_is_dir(dentry)) - return false; + res = vfs_getxattr(dentry, OVL_XATTR_ORIGIN, NULL, 0); + if (res < 0) { + if (res == -ENODATA || res == -EOPNOTSUPP) + return NULL; + goto fail; + } + /* Zero size value means "copied up but origin unknown" */ + if (res == 0) + return NULL; - res = vfs_getxattr(dentry, OVL_XATTR_OPAQUE, &val, 1); - if (res == 1 && val == 'y') - return true; + fh = kzalloc(res, GFP_TEMPORARY); + if (!fh) + return ERR_PTR(-ENOMEM); + + res = vfs_getxattr(dentry, OVL_XATTR_ORIGIN, fh, res); + if (res < 0) + goto fail; + + if (res < sizeof(struct ovl_fh) || res < fh->len) + goto invalid; + + if (fh->magic != OVL_FH_MAGIC) + goto invalid; + + /* Treat larger version and unknown flags as "origin unknown" */ + if (fh->version > OVL_FH_VERSION || fh->flags & ~OVL_FH_FLAG_ALL) + goto out; + + /* Treat endianness mismatch as "origin unknown" */ + if (!(fh->flags & OVL_FH_FLAG_ANY_ENDIAN) && + (fh->flags & OVL_FH_FLAG_BIG_ENDIAN) != OVL_FH_FLAG_CPU_ENDIAN) + goto out; + + bytes = (fh->len - offsetof(struct ovl_fh, fid)); + + /* + * Make sure that the stored uuid matches the uuid of the lower + * layer where file handle will be decoded. 
+ */ + if (!uuid_equal(&fh->uuid, &mnt->mnt_sb->s_uuid)) + goto out; - return false; + origin = exportfs_decode_fh(mnt, (struct fid *)fh->fid, + bytes >> 2, (int)fh->type, + ovl_acceptable, NULL); + if (IS_ERR(origin)) { + /* Treat stale file handle as "origin unknown" */ + if (origin == ERR_PTR(-ESTALE)) + origin = NULL; + goto out; + } + + if (ovl_dentry_weird(origin) || + ((d_inode(origin)->i_mode ^ d_inode(dentry)->i_mode) & S_IFMT)) { + dput(origin); + origin = NULL; + goto invalid; + } + +out: + kfree(fh); + return origin; + +fail: + pr_warn_ratelimited("overlayfs: failed to get origin (%i)\n", res); + goto out; +invalid: + pr_warn_ratelimited("overlayfs: invalid origin (%*phN)\n", res, fh); + goto out; +} + +static bool ovl_is_opaquedir(struct dentry *dentry) +{ + return ovl_check_dir_xattr(dentry, OVL_XATTR_OPAQUE); } static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d, @@ -192,6 +268,45 @@ static int ovl_lookup_layer(struct dentry *base, struct ovl_lookup_data *d, return 0; } + +static int ovl_check_origin(struct dentry *dentry, struct dentry *upperdentry, + struct path **stackp, unsigned int *ctrp) +{ + struct super_block *same_sb = ovl_same_sb(dentry->d_sb); + struct ovl_entry *roe = dentry->d_sb->s_root->d_fsdata; + struct vfsmount *mnt; + struct dentry *origin; + + if (!same_sb || !roe->numlower) + return 0; + + /* + * Since all layers are on the same fs, we use the first layer for + * decoding the file handle. We may get a disconnected dentry, + * which is fine, because we only need to hold the origin inode in + * cache and use its inode number. We may even get a connected dentry, + * that is not under the first layer's root. That is also fine for + * using it's inode number - it's the same as if we held a reference + * to a dentry in first layer that was moved under us. + */ + mnt = roe->lowerstack[0].mnt; + + origin = ovl_get_origin(upperdentry, mnt); + if (IS_ERR_OR_NULL(origin)) + return PTR_ERR(origin); + + BUG_ON(*stackp || *ctrp); + *stackp = kmalloc(sizeof(struct path), GFP_TEMPORARY); + if (!*stackp) { + dput(origin); + return -ENOMEM; + } + **stackp = (struct path) { .dentry = origin, .mnt = mnt }; + *ctrp = 1; + + return 0; +} + /* * Returns next layer in stack starting from top. * Returns -1 if this is the last layer. 
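[Editor's note on the origin blob parsed by ovl_get_origin() above: the value stored in trusted.overlay.origin is a small header (struct ovl_fh, defined further down in overlayfs.h) followed by an opaque file handle, and the kernel decodes it with exportfs_decode_fh(). The sketch below is a userspace approximation only; it assumes name_to_handle_at() as the userspace counterpart of the exportfs encode helper, and leaves the uuid field zeroed because sb->s_uuid of the lower filesystem is not directly visible from userspace.]

#define _GNU_SOURCE
#include <endian.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define OVL_FH_VERSION		0
#define OVL_FH_MAGIC		0xfb
#define OVL_FH_FLAG_BIG_ENDIAN	(1 << 0)

/* Mirrors the layout of struct ovl_fh for illustration only */
struct ovl_fh_hdr {
	uint8_t version;	/* 0 */
	uint8_t magic;		/* 0xfb */
	uint8_t len;		/* header bytes + fid bytes */
	uint8_t flags;		/* endianness of the encoded fid */
	uint8_t type;		/* handle_type from name_to_handle_at() */
	uint8_t uuid[16];	/* kernel fills sb->s_uuid; zeroed here */
} __attribute__((packed));

int main(int argc, char **argv)
{
	struct file_handle *fhp;
	struct ovl_fh_hdr hdr;
	int mnt_id;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <path>\n", argv[0]);
		return 1;
	}

	fhp = calloc(1, sizeof(*fhp) + MAX_HANDLE_SZ);
	if (!fhp)
		return 1;
	fhp->handle_bytes = MAX_HANDLE_SZ;

	if (name_to_handle_at(AT_FDCWD, argv[1], fhp, &mnt_id, 0) < 0) {
		perror("name_to_handle_at");
		return 1;
	}

	memset(&hdr, 0, sizeof(hdr));
	hdr.version = OVL_FH_VERSION;
	hdr.magic = OVL_FH_MAGIC;
	hdr.len = sizeof(hdr) + fhp->handle_bytes;
	hdr.flags = (__BYTE_ORDER == __BIG_ENDIAN) ? OVL_FH_FLAG_BIG_ENDIAN : 0;
	hdr.type = fhp->handle_type;

	printf("origin blob: %zu header bytes + %u fid bytes, fid type %d\n",
	       sizeof(hdr), fhp->handle_bytes, fhp->handle_type);
	free(fhp);
	return 0;
}
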
@@ -220,11 +335,13 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, const struct cred *old_cred; struct ovl_fs *ofs = dentry->d_sb->s_fs_info; struct ovl_entry *poe = dentry->d_parent->d_fsdata; + struct ovl_entry *roe = dentry->d_sb->s_root->d_fsdata; struct path *stack = NULL; struct dentry *upperdir, *upperdentry = NULL; unsigned int ctr = 0; struct inode *inode = NULL; bool upperopaque = false; + bool upperimpure = false; char *upperredirect = NULL; struct dentry *this; unsigned int i; @@ -253,15 +370,24 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, err = -EREMOTE; goto out; } + if (upperdentry && !d.is_dir) { + BUG_ON(!d.stop || d.redirect); + err = ovl_check_origin(dentry, upperdentry, + &stack, &ctr); + if (err) + goto out; + } if (d.redirect) { upperredirect = kstrdup(d.redirect, GFP_KERNEL); if (!upperredirect) goto out_put_upper; if (d.redirect[0] == '/') - poe = dentry->d_sb->s_root->d_fsdata; + poe = roe; } upperopaque = d.opaque; + if (upperdentry && d.is_dir) + upperimpure = ovl_is_impuredir(upperdentry); } if (!d.stop && poe->numlower) { @@ -290,10 +416,8 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, if (d.stop) break; - if (d.redirect && - d.redirect[0] == '/' && - poe != dentry->d_sb->s_root->d_fsdata) { - poe = dentry->d_sb->s_root->d_fsdata; + if (d.redirect && d.redirect[0] == '/' && poe != roe) { + poe = roe; /* Find the current layer on the root dentry */ for (i = 0; i < poe->numlower; i++) @@ -332,6 +456,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, revert_creds(old_cred); oe->opaque = upperopaque; + oe->impure = upperimpure; oe->redirect = upperredirect; oe->__upperdentry = upperdentry; memcpy(oe->lowerstack, stack, sizeof(struct path) * ctr); diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 741dc0b6931f..10863b4105fa 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -8,18 +8,57 @@ */ #include <linux/kernel.h> +#include <linux/uuid.h> enum ovl_path_type { __OVL_PATH_UPPER = (1 << 0), __OVL_PATH_MERGE = (1 << 1), + __OVL_PATH_ORIGIN = (1 << 2), }; #define OVL_TYPE_UPPER(type) ((type) & __OVL_PATH_UPPER) #define OVL_TYPE_MERGE(type) ((type) & __OVL_PATH_MERGE) +#define OVL_TYPE_ORIGIN(type) ((type) & __OVL_PATH_ORIGIN) #define OVL_XATTR_PREFIX XATTR_TRUSTED_PREFIX "overlay." 
#define OVL_XATTR_OPAQUE OVL_XATTR_PREFIX "opaque" #define OVL_XATTR_REDIRECT OVL_XATTR_PREFIX "redirect" +#define OVL_XATTR_ORIGIN OVL_XATTR_PREFIX "origin" +#define OVL_XATTR_IMPURE OVL_XATTR_PREFIX "impure" + +/* + * The tuple (fh,uuid) is a universal unique identifier for a copy up origin, + * where: + * origin.fh - exported file handle of the lower file + * origin.uuid - uuid of the lower filesystem + */ +#define OVL_FH_VERSION 0 +#define OVL_FH_MAGIC 0xfb + +/* CPU byte order required for fid decoding: */ +#define OVL_FH_FLAG_BIG_ENDIAN (1 << 0) +#define OVL_FH_FLAG_ANY_ENDIAN (1 << 1) + +#define OVL_FH_FLAG_ALL (OVL_FH_FLAG_BIG_ENDIAN | OVL_FH_FLAG_ANY_ENDIAN) + +#if defined(__LITTLE_ENDIAN) +#define OVL_FH_FLAG_CPU_ENDIAN 0 +#elif defined(__BIG_ENDIAN) +#define OVL_FH_FLAG_CPU_ENDIAN OVL_FH_FLAG_BIG_ENDIAN +#else +#error Endianness not defined +#endif + +/* On-disk and in-memeory format for redirect by file handle */ +struct ovl_fh { + u8 version; /* 0 */ + u8 magic; /* 0xfb */ + u8 len; /* size of this header + size of fid */ + u8 flags; /* OVL_FH_FLAG_* */ + u8 type; /* fid_type of fid */ + uuid_t uuid; /* uuid of filesystem */ + u8 fid[0]; /* file identifier */ +} __packed; #define OVL_ISUPPER_MASK 1UL @@ -151,6 +190,7 @@ int ovl_want_write(struct dentry *dentry); void ovl_drop_write(struct dentry *dentry); struct dentry *ovl_workdir(struct dentry *dentry); const struct cred *ovl_override_creds(struct super_block *sb); +struct super_block *ovl_same_sb(struct super_block *sb); struct ovl_entry *ovl_alloc_entry(unsigned int numlower); bool ovl_dentry_remote(struct dentry *dentry); bool ovl_dentry_weird(struct dentry *dentry); @@ -164,10 +204,10 @@ struct dentry *ovl_dentry_real(struct dentry *dentry); struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry); void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache); bool ovl_dentry_is_opaque(struct dentry *dentry); +bool ovl_dentry_is_impure(struct dentry *dentry); bool ovl_dentry_is_whiteout(struct dentry *dentry); void ovl_dentry_set_opaque(struct dentry *dentry); bool ovl_redirect_dir(struct super_block *sb); -void ovl_clear_redirect_dir(struct super_block *sb); const char *ovl_dentry_get_redirect(struct dentry *dentry); void ovl_dentry_set_redirect(struct dentry *dentry, const char *redirect); void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry); @@ -180,6 +220,17 @@ bool ovl_is_whiteout(struct dentry *dentry); struct file *ovl_path_open(struct path *path, int flags); int ovl_copy_up_start(struct dentry *dentry); void ovl_copy_up_end(struct dentry *dentry); +bool ovl_check_dir_xattr(struct dentry *dentry, const char *name); +int ovl_check_setxattr(struct dentry *dentry, struct dentry *upperdentry, + const char *name, const void *value, size_t size, + int xerr); +int ovl_set_impure(struct dentry *dentry, struct dentry *upperdentry); + +static inline bool ovl_is_impuredir(struct dentry *dentry) +{ + return ovl_check_dir_xattr(dentry, OVL_XATTR_IMPURE); +} + /* namei.c */ int ovl_path_next(int idx, struct dentry *dentry, struct path *path); @@ -197,6 +248,8 @@ void ovl_workdir_cleanup(struct inode *dir, struct vfsmount *mnt, /* inode.c */ int ovl_setattr(struct dentry *dentry, struct iattr *attr); +int ovl_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags); int ovl_permission(struct inode *inode, int mask); int ovl_xattr_set(struct dentry *dentry, const char *name, const void *value, size_t size, int flags); @@ -222,7 +275,7 @@ static inline void 
ovl_copyattr(struct inode *from, struct inode *to) /* dir.c */ extern const struct inode_operations ovl_dir_inode_operations; -struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry); +struct dentry *ovl_lookup_temp(struct dentry *workdir); struct cattr { dev_t rdev; umode_t mode; diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h index 59614faa14c3..34bc4a9f5c61 100644 --- a/fs/overlayfs/ovl_entry.h +++ b/fs/overlayfs/ovl_entry.h @@ -28,7 +28,10 @@ struct ovl_fs { /* creds of process who forced instantiation of super block */ const struct cred *creator_cred; bool tmpfile; + bool noxattr; wait_queue_head_t copyup_wq; + /* sb common to all layers */ + struct super_block *same_sb; }; /* private information held for every overlayfs dentry */ @@ -40,6 +43,7 @@ struct ovl_entry { u64 version; const char *redirect; bool opaque; + bool impure; bool copying; }; struct rcu_head rcu; diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index c9e70d39c1ea..4882ffb37bae 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -49,11 +49,28 @@ static void ovl_dentry_release(struct dentry *dentry) } } +static int ovl_check_append_only(struct inode *inode, int flag) +{ + /* + * This test was moot in vfs may_open() because overlay inode does + * not have the S_APPEND flag, so re-check on real upper inode + */ + if (IS_APPEND(inode)) { + if ((flag & O_ACCMODE) != O_RDONLY && !(flag & O_APPEND)) + return -EPERM; + if (flag & O_TRUNC) + return -EPERM; + } + + return 0; +} + static struct dentry *ovl_d_real(struct dentry *dentry, const struct inode *inode, unsigned int open_flags) { struct dentry *real; + int err; if (!d_is_reg(dentry)) { if (!inode || inode == d_inode(dentry)) @@ -65,15 +82,20 @@ static struct dentry *ovl_d_real(struct dentry *dentry, return dentry; if (open_flags) { - int err = ovl_open_maybe_copy_up(dentry, open_flags); - + err = ovl_open_maybe_copy_up(dentry, open_flags); if (err) return ERR_PTR(err); } real = ovl_dentry_upper(dentry); - if (real && (!inode || inode == d_inode(real))) + if (real && (!inode || inode == d_inode(real))) { + if (!inode) { + err = ovl_check_append_only(d_inode(real), open_flags); + if (err) + return ERR_PTR(err); + } return real; + } real = ovl_dentry_lower(dentry); if (!real) @@ -709,8 +731,8 @@ static const struct xattr_handler *ovl_xattr_handlers[] = { static int ovl_fill_super(struct super_block *sb, void *data, int silent) { - struct path upperpath = { NULL, NULL }; - struct path workpath = { NULL, NULL }; + struct path upperpath = { }; + struct path workpath = { }; struct dentry *root_dentry; struct inode *realinode; struct ovl_entry *oe; @@ -869,6 +891,19 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) dput(temp); else pr_warn("overlayfs: upper fs does not support tmpfile.\n"); + + /* + * Check if upper/work fs supports trusted.overlay.* + * xattr + */ + err = ovl_do_setxattr(ufs->workdir, OVL_XATTR_OPAQUE, + "0", 1, 0); + if (err) { + ufs->noxattr = true; + pr_warn("overlayfs: upper fs does not support xattr.\n"); + } else { + vfs_removexattr(ufs->workdir, OVL_XATTR_OPAQUE); + } } } @@ -892,11 +927,19 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) ufs->lower_mnt[ufs->numlower] = mnt; ufs->numlower++; + + /* Check if all lower layers are on same sb */ + if (i == 0) + ufs->same_sb = mnt->mnt_sb; + else if (ufs->same_sb != mnt->mnt_sb) + ufs->same_sb = NULL; } /* If the upper fs is nonexistent, we mark overlayfs r/o too */ if (!ufs->upper_mnt) 
sb->s_flags |= MS_RDONLY; + else if (ufs->upper_mnt->mnt_sb != ufs->same_sb) + ufs->same_sb = NULL; if (remote) sb->s_d_op = &ovl_reval_dentry_operations; @@ -931,7 +974,10 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) path_put(&workpath); kfree(lowertmp); - oe->__upperdentry = upperpath.dentry; + if (upperpath.dentry) { + oe->__upperdentry = upperpath.dentry; + oe->impure = ovl_is_impuredir(upperpath.dentry); + } for (i = 0; i < numlower; i++) { oe->lowerstack[i].dentry = stack[i].dentry; oe->lowerstack[i].mnt = ufs->lower_mnt[i]; diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index 6e610a205e15..809048913889 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -40,6 +40,13 @@ const struct cred *ovl_override_creds(struct super_block *sb) return override_creds(ofs->creator_cred); } +struct super_block *ovl_same_sb(struct super_block *sb) +{ + struct ovl_fs *ofs = sb->s_fs_info; + + return ofs->same_sb; +} + struct ovl_entry *ovl_alloc_entry(unsigned int numlower) { size_t size = offsetof(struct ovl_entry, lowerstack[numlower]); @@ -75,11 +82,13 @@ enum ovl_path_type ovl_path_type(struct dentry *dentry) type = __OVL_PATH_UPPER; /* - * Non-dir dentry can hold lower dentry from previous - * location. + * Non-dir dentry can hold lower dentry of its copy up origin. */ - if (oe->numlower && d_is_dir(dentry)) - type |= __OVL_PATH_MERGE; + if (oe->numlower) { + type |= __OVL_PATH_ORIGIN; + if (d_is_dir(dentry)) + type |= __OVL_PATH_MERGE; + } } else { if (oe->numlower > 1) type |= __OVL_PATH_MERGE; @@ -100,7 +109,7 @@ void ovl_path_lower(struct dentry *dentry, struct path *path) { struct ovl_entry *oe = dentry->d_fsdata; - *path = oe->numlower ? oe->lowerstack[0] : (struct path) { NULL, NULL }; + *path = oe->numlower ? 
oe->lowerstack[0] : (struct path) { }; } enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path) @@ -166,6 +175,13 @@ bool ovl_dentry_is_opaque(struct dentry *dentry) return oe->opaque; } +bool ovl_dentry_is_impure(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + return oe->impure; +} + bool ovl_dentry_is_whiteout(struct dentry *dentry) { return !dentry->d_inode && ovl_dentry_is_opaque(dentry); @@ -182,14 +198,7 @@ bool ovl_redirect_dir(struct super_block *sb) { struct ovl_fs *ofs = sb->s_fs_info; - return ofs->config.redirect_dir; -} - -void ovl_clear_redirect_dir(struct super_block *sb) -{ - struct ovl_fs *ofs = sb->s_fs_info; - - ofs->config.redirect_dir = false; + return ofs->config.redirect_dir && !ofs->noxattr; } const char *ovl_dentry_get_redirect(struct dentry *dentry) @@ -294,3 +303,59 @@ void ovl_copy_up_end(struct dentry *dentry) wake_up_locked(&ofs->copyup_wq); spin_unlock(&ofs->copyup_wq.lock); } + +bool ovl_check_dir_xattr(struct dentry *dentry, const char *name) +{ + int res; + char val; + + if (!d_is_dir(dentry)) + return false; + + res = vfs_getxattr(dentry, name, &val, 1); + if (res == 1 && val == 'y') + return true; + + return false; +} + +int ovl_check_setxattr(struct dentry *dentry, struct dentry *upperdentry, + const char *name, const void *value, size_t size, + int xerr) +{ + int err; + struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + + if (ofs->noxattr) + return xerr; + + err = ovl_do_setxattr(upperdentry, name, value, size, 0); + + if (err == -EOPNOTSUPP) { + pr_warn("overlayfs: cannot set %s xattr on upper\n", name); + ofs->noxattr = true; + return xerr; + } + + return err; +} + +int ovl_set_impure(struct dentry *dentry, struct dentry *upperdentry) +{ + int err; + struct ovl_entry *oe = dentry->d_fsdata; + + if (oe->impure) + return 0; + + /* + * Do not fail when upper doesn't support xattrs. + * Upper inodes won't have origin nor redirect xattr anyway. + */ + err = ovl_check_setxattr(dentry, upperdentry, OVL_XATTR_IMPURE, + "y", 1, 0); + if (!err) + oe->impure = true; + + return err; +} diff --git a/fs/pnode.c b/fs/pnode.c index 5bc7896d122a..53d411a371ce 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -24,6 +24,11 @@ static inline struct mount *first_slave(struct mount *p) return list_entry(p->mnt_slave_list.next, struct mount, mnt_slave); } +static inline struct mount *last_slave(struct mount *p) +{ + return list_entry(p->mnt_slave_list.prev, struct mount, mnt_slave); +} + static inline struct mount *next_slave(struct mount *p) { return list_entry(p->mnt_slave.next, struct mount, mnt_slave); @@ -162,6 +167,19 @@ static struct mount *propagation_next(struct mount *m, } } +static struct mount *skip_propagation_subtree(struct mount *m, + struct mount *origin) +{ + /* + * Advance m such that propagation_next will not return + * the slaves of m. + */ + if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list)) + m = last_slave(m); + + return m; +} + static struct mount *next_group(struct mount *m, struct mount *origin) { while (1) { @@ -413,65 +431,104 @@ void propagate_mount_unlock(struct mount *mnt) } } -/* - * Mark all mounts that the MNT_LOCKED logic will allow to be unmounted. 
- */ -static void mark_umount_candidates(struct mount *mnt) +static void umount_one(struct mount *mnt, struct list_head *to_umount) { - struct mount *parent = mnt->mnt_parent; - struct mount *m; - - BUG_ON(parent == mnt); - - for (m = propagation_next(parent, parent); m; - m = propagation_next(m, parent)) { - struct mount *child = __lookup_mnt(&m->mnt, - mnt->mnt_mountpoint); - if (!child || (child->mnt.mnt_flags & MNT_UMOUNT)) - continue; - if (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m)) { - SET_MNT_MARK(child); - } - } + CLEAR_MNT_MARK(mnt); + mnt->mnt.mnt_flags |= MNT_UMOUNT; + list_del_init(&mnt->mnt_child); + list_del_init(&mnt->mnt_umounting); + list_move_tail(&mnt->mnt_list, to_umount); } /* * NOTE: unmounting 'mnt' naturally propagates to all other mounts its * parent propagates to. */ -static void __propagate_umount(struct mount *mnt) +static bool __propagate_umount(struct mount *mnt, + struct list_head *to_umount, + struct list_head *to_restore) { - struct mount *parent = mnt->mnt_parent; - struct mount *m; + bool progress = false; + struct mount *child; - BUG_ON(parent == mnt); + /* + * The state of the parent won't change if this mount is + * already unmounted or marked as without children. + */ + if (mnt->mnt.mnt_flags & (MNT_UMOUNT | MNT_MARKED)) + goto out; - for (m = propagation_next(parent, parent); m; - m = propagation_next(m, parent)) { - struct mount *topper; - struct mount *child = __lookup_mnt(&m->mnt, - mnt->mnt_mountpoint); - /* - * umount the child only if the child has no children - * and the child is marked safe to unmount. - */ - if (!child || !IS_MNT_MARKED(child)) + /* Verify topper is the only grandchild that has not been + * speculatively unmounted. + */ + list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) { + if (child->mnt_mountpoint == mnt->mnt.mnt_root) continue; - CLEAR_MNT_MARK(child); + if (!list_empty(&child->mnt_umounting) && IS_MNT_MARKED(child)) + continue; + /* Found a mounted child */ + goto children; + } - /* If there is exactly one mount covering all of child - * replace child with that mount. - */ - topper = find_topper(child); - if (topper) - mnt_change_mountpoint(child->mnt_parent, child->mnt_mp, - topper); + /* Mark mounts that can be unmounted if not locked */ + SET_MNT_MARK(mnt); + progress = true; + + /* If a mount is without children and not locked umount it. */ + if (!IS_MNT_LOCKED(mnt)) { + umount_one(mnt, to_umount); + } else { +children: + list_move_tail(&mnt->mnt_umounting, to_restore); + } +out: + return progress; +} + +static void umount_list(struct list_head *to_umount, + struct list_head *to_restore) +{ + struct mount *mnt, *child, *tmp; + list_for_each_entry(mnt, to_umount, mnt_list) { + list_for_each_entry_safe(child, tmp, &mnt->mnt_mounts, mnt_child) { + /* topper? */ + if (child->mnt_mountpoint == mnt->mnt.mnt_root) + list_move_tail(&child->mnt_umounting, to_restore); + else + umount_one(child, to_umount); + } + } +} - if (list_empty(&child->mnt_mounts)) { - list_del_init(&child->mnt_child); - child->mnt.mnt_flags |= MNT_UMOUNT; - list_move_tail(&child->mnt_list, &mnt->mnt_list); +static void restore_mounts(struct list_head *to_restore) +{ + /* Restore mounts to a clean working state */ + while (!list_empty(to_restore)) { + struct mount *mnt, *parent; + struct mountpoint *mp; + + mnt = list_first_entry(to_restore, struct mount, mnt_umounting); + CLEAR_MNT_MARK(mnt); + list_del_init(&mnt->mnt_umounting); + + /* Should this mount be reparented? 
*/ + mp = mnt->mnt_mp; + parent = mnt->mnt_parent; + while (parent->mnt.mnt_flags & MNT_UMOUNT) { + mp = parent->mnt_mp; + parent = parent->mnt_parent; } + if (parent != mnt->mnt_parent) + mnt_change_mountpoint(parent, mp, mnt); + } +} + +static void cleanup_umount_visitations(struct list_head *visited) +{ + while (!list_empty(visited)) { + struct mount *mnt = + list_first_entry(visited, struct mount, mnt_umounting); + list_del_init(&mnt->mnt_umounting); } } @@ -485,11 +542,68 @@ static void __propagate_umount(struct mount *mnt) int propagate_umount(struct list_head *list) { struct mount *mnt; + LIST_HEAD(to_restore); + LIST_HEAD(to_umount); + LIST_HEAD(visited); + + /* Find candidates for unmounting */ + list_for_each_entry_reverse(mnt, list, mnt_list) { + struct mount *parent = mnt->mnt_parent; + struct mount *m; + + /* + * If this mount has already been visited it is known that it's + * entire peer group and all of their slaves in the propagation + * tree for the mountpoint has already been visited and there is + * no need to visit them again. + */ + if (!list_empty(&mnt->mnt_umounting)) + continue; + + list_add_tail(&mnt->mnt_umounting, &visited); + for (m = propagation_next(parent, parent); m; + m = propagation_next(m, parent)) { + struct mount *child = __lookup_mnt(&m->mnt, + mnt->mnt_mountpoint); + if (!child) + continue; + + if (!list_empty(&child->mnt_umounting)) { + /* + * If the child has already been visited it is + * know that it's entire peer group and all of + * their slaves in the propgation tree for the + * mountpoint has already been visited and there + * is no need to visit this subtree again. + */ + m = skip_propagation_subtree(m, parent); + continue; + } else if (child->mnt.mnt_flags & MNT_UMOUNT) { + /* + * We have come accross an partially unmounted + * mount in list that has not been visited yet. + * Remember it has been visited and continue + * about our merry way. + */ + list_add_tail(&child->mnt_umounting, &visited); + continue; + } + + /* Check the child and parents while progress is made */ + while (__propagate_umount(child, + &to_umount, &to_restore)) { + /* Is the parent a umount candidate? */ + child = child->mnt_parent; + if (list_empty(&child->mnt_umounting)) + break; + } + } + } - list_for_each_entry_reverse(mnt, list, mnt_list) - mark_umount_candidates(mnt); + umount_list(&to_umount, &to_restore); + restore_mounts(&to_restore); + cleanup_umount_visitations(&visited); + list_splice_tail(&to_umount, list); - list_for_each_entry(mnt, list, mnt_list) - __propagate_umount(mnt); return 0; } diff --git a/fs/proc/base.c b/fs/proc/base.c index 45f6bf68fff3..f1e1927ccd48 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -821,7 +821,7 @@ static ssize_t mem_rw(struct file *file, char __user *buf, if (!mmget_not_zero(mm)) goto free; - flags = write ? FOLL_WRITE : 0; + flags = FOLL_FORCE | (write ? 
FOLL_WRITE : 0); while (count > 0) { int this_len = min_t(int, count, PAGE_SIZE); diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 4ee55274f155..45629f4b5402 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -504,7 +504,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) if (&m->list == &kclist_head) { if (clear_user(buffer, tsz)) return -EFAULT; - } else if (is_vmalloc_or_module_addr((void *)start)) { + } else if (m->type == KCORE_VMALLOC) { vread(buf, (char *)start, tsz); /* we have to zero-fill user buffer even if no read */ if (copy_to_user(buffer, buf, tsz)) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index f0c8b33d99b1..520802da059c 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -300,11 +300,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid) /* We don't show the stack guard page in /proc/maps */ start = vma->vm_start; - if (stack_guard_page_start(vma, start)) - start += PAGE_SIZE; end = vma->vm_end; - if (stack_guard_page_end(vma, end)) - end -= PAGE_SIZE; seq_setwidth(m, 25 + sizeof(void *) * 6 - 1); seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ", diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 792a4e5f9226..4d02c3b65061 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -349,48 +349,48 @@ int pstore_mkfile(struct dentry *root, struct pstore_record *record) switch (record->type) { case PSTORE_TYPE_DMESG: - scnprintf(name, sizeof(name), "dmesg-%s-%lld%s", + scnprintf(name, sizeof(name), "dmesg-%s-%llu%s", record->psi->name, record->id, record->compressed ? ".enc.z" : ""); break; case PSTORE_TYPE_CONSOLE: - scnprintf(name, sizeof(name), "console-%s-%lld", + scnprintf(name, sizeof(name), "console-%s-%llu", record->psi->name, record->id); break; case PSTORE_TYPE_FTRACE: - scnprintf(name, sizeof(name), "ftrace-%s-%lld", + scnprintf(name, sizeof(name), "ftrace-%s-%llu", record->psi->name, record->id); break; case PSTORE_TYPE_MCE: - scnprintf(name, sizeof(name), "mce-%s-%lld", + scnprintf(name, sizeof(name), "mce-%s-%llu", record->psi->name, record->id); break; case PSTORE_TYPE_PPC_RTAS: - scnprintf(name, sizeof(name), "rtas-%s-%lld", + scnprintf(name, sizeof(name), "rtas-%s-%llu", record->psi->name, record->id); break; case PSTORE_TYPE_PPC_OF: - scnprintf(name, sizeof(name), "powerpc-ofw-%s-%lld", + scnprintf(name, sizeof(name), "powerpc-ofw-%s-%llu", record->psi->name, record->id); break; case PSTORE_TYPE_PPC_COMMON: - scnprintf(name, sizeof(name), "powerpc-common-%s-%lld", + scnprintf(name, sizeof(name), "powerpc-common-%s-%llu", record->psi->name, record->id); break; case PSTORE_TYPE_PMSG: - scnprintf(name, sizeof(name), "pmsg-%s-%lld", + scnprintf(name, sizeof(name), "pmsg-%s-%llu", record->psi->name, record->id); break; case PSTORE_TYPE_PPC_OPAL: - scnprintf(name, sizeof(name), "powerpc-opal-%s-%lld", + scnprintf(name, sizeof(name), "powerpc-opal-%s-%llu", record->psi->name, record->id); break; case PSTORE_TYPE_UNKNOWN: - scnprintf(name, sizeof(name), "unknown-%s-%lld", + scnprintf(name, sizeof(name), "unknown-%s-%llu", record->psi->name, record->id); break; default: - scnprintf(name, sizeof(name), "type%d-%s-%lld", + scnprintf(name, sizeof(name), "type%d-%s-%llu", record->type, record->psi->name, record->id); break; } diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h index c416e653dc4f..58051265626f 100644 --- a/fs/pstore/internal.h +++ b/fs/pstore/internal.h @@ -30,5 +30,7 @@ extern void pstore_get_backend_records(struct pstore_info *psi, extern int 
pstore_mkfile(struct dentry *root, struct pstore_record *record); extern bool pstore_is_mounted(void); +extern void pstore_record_init(struct pstore_record *record, + struct pstore_info *psi); #endif diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index d468eec9b8a6..1b6e0ff6bff5 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -474,6 +474,20 @@ static size_t copy_kmsg_to_buffer(int hsize, size_t len) return total_len; } +void pstore_record_init(struct pstore_record *record, + struct pstore_info *psinfo) +{ + memset(record, 0, sizeof(*record)); + + record->psi = psinfo; + + /* Report zeroed timestamp if called before timekeeping has resumed. */ + if (__getnstimeofday(&record->time)) { + record->time.tv_sec = 0; + record->time.tv_nsec = 0; + } +} + /* * callback from kmsg_dump. (s2,l2) has the most recently * written bytes, older bytes are in (s1,l1). Save as much @@ -509,15 +523,14 @@ static void pstore_dump(struct kmsg_dumper *dumper, int header_size; int zipped_len = -1; size_t dump_size; - struct pstore_record record = { - .type = PSTORE_TYPE_DMESG, - .count = oopscount, - .reason = reason, - .part = part, - .compressed = false, - .buf = psinfo->buf, - .psi = psinfo, - }; + struct pstore_record record; + + pstore_record_init(&record, psinfo); + record.type = PSTORE_TYPE_DMESG; + record.count = oopscount; + record.reason = reason; + record.part = part; + record.buf = psinfo->buf; if (big_oops_buf && is_locked) { dst = big_oops_buf; @@ -587,12 +600,12 @@ static void pstore_console_write(struct console *con, const char *s, unsigned c) const char *e = s + c; while (s < e) { - struct pstore_record record = { - .type = PSTORE_TYPE_CONSOLE, - .psi = psinfo, - }; + struct pstore_record record; unsigned long flags; + pstore_record_init(&record, psinfo); + record.type = PSTORE_TYPE_CONSOLE; + if (c > psinfo->bufsize) c = psinfo->bufsize; @@ -640,19 +653,16 @@ static int pstore_write_user_compat(struct pstore_record *record, if (record->buf) return -EINVAL; - record->buf = kmalloc(record->size, GFP_KERNEL); - if (!record->buf) - return -ENOMEM; - - if (unlikely(copy_from_user(record->buf, buf, record->size))) { - ret = -EFAULT; + record->buf = memdup_user(buf, record->size); + if (unlikely(IS_ERR(record->buf))) { + ret = PTR_ERR(record->buf); goto out; } ret = record->psi->write(record); -out: kfree(record->buf); +out: record->buf = NULL; return unlikely(ret < 0) ? ret : record->size; @@ -770,8 +780,11 @@ static void decompress_record(struct pstore_record *record) int unzipped_len; char *decompressed; + if (!record->compressed) + return; + /* Only PSTORE_TYPE_DMESG support compression. */ - if (!record->compressed || record->type != PSTORE_TYPE_DMESG) { + if (record->type != PSTORE_TYPE_DMESG) { pr_warn("ignored compressed record type %d\n", record->type); return; } @@ -819,6 +832,7 @@ void pstore_get_backend_records(struct pstore_info *psi, struct dentry *root, int quiet) { int failed = 0; + unsigned int stop_loop = 65536; if (!psi || !root) return; @@ -832,7 +846,7 @@ void pstore_get_backend_records(struct pstore_info *psi, * may reallocate record.buf. On success, pstore_mkfile() will keep * the record.buf, so free it only on failure. 
*/ - for (;;) { + for (; stop_loop; stop_loop--) { struct pstore_record *record; int rc; @@ -841,13 +855,15 @@ void pstore_get_backend_records(struct pstore_info *psi, pr_err("out of memory creating record\n"); break; } - record->psi = psi; + pstore_record_init(record, psi); record->size = psi->read(record); /* No more records left in backend? */ - if (record->size <= 0) + if (record->size <= 0) { + kfree(record); break; + } decompress_record(record); rc = pstore_mkfile(root, record); @@ -865,8 +881,11 @@ out: mutex_unlock(&psi->read_mutex); if (failed) - pr_warn("failed to load %d record(s) from '%s'\n", + pr_warn("failed to create %d record(s) from '%s'\n", failed, psi->name); + if (!stop_loop) + pr_err("looping? Too many records seen from '%s'\n", + psi->name); } static void pstore_dowork(struct work_struct *work) diff --git a/fs/pstore/pmsg.c b/fs/pstore/pmsg.c index 209755e0d7c8..24db02de1787 100644 --- a/fs/pstore/pmsg.c +++ b/fs/pstore/pmsg.c @@ -22,16 +22,16 @@ static DEFINE_MUTEX(pmsg_lock); static ssize_t write_pmsg(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - struct pstore_record record = { - .type = PSTORE_TYPE_PMSG, - .size = count, - .psi = psinfo, - }; + struct pstore_record record; int ret; if (!count) return 0; + pstore_record_init(&record, psinfo); + record.type = PSTORE_TYPE_PMSG; + record.size = count; + /* check outside lock, page in any data. write_user also checks */ if (!access_ok(VERIFY_READ, buf, count)) return -EFAULT; diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 5523df7f17ef..7125b398d312 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -27,7 +27,6 @@ #include <linux/module.h> #include <linux/version.h> #include <linux/pstore.h> -#include <linux/time.h> #include <linux/io.h> #include <linux/ioport.h> #include <linux/platform_device.h> @@ -58,7 +57,7 @@ module_param_named(pmsg_size, ramoops_pmsg_size, ulong, 0400); MODULE_PARM_DESC(pmsg_size, "size of user space message log"); static unsigned long long mem_address; -module_param(mem_address, ullong, 0400); +module_param_hw(mem_address, ullong, other, 0400); MODULE_PARM_DESC(mem_address, "start of reserved RAM used to store oops/panic logs"); @@ -356,20 +355,15 @@ out: } static size_t ramoops_write_kmsg_hdr(struct persistent_ram_zone *prz, - bool compressed) + struct pstore_record *record) { char *hdr; - struct timespec timestamp; size_t len; - /* Report zeroed timestamp if called before timekeeping has resumed. */ - if (__getnstimeofday(×tamp)) { - timestamp.tv_sec = 0; - timestamp.tv_nsec = 0; - } hdr = kasprintf(GFP_ATOMIC, RAMOOPS_KERNMSG_HDR "%lu.%lu-%c\n", - (long)timestamp.tv_sec, (long)(timestamp.tv_nsec / 1000), - compressed ? 'C' : 'D'); + record->time.tv_sec, + record->time.tv_nsec / 1000, + record->compressed ? 'C' : 'D'); WARN_ON_ONCE(!hdr); len = hdr ? strlen(hdr) : 0; persistent_ram_write(prz, hdr, len); @@ -440,7 +434,7 @@ static int notrace ramoops_pstore_write(struct pstore_record *record) prz = cxt->dprzs[cxt->dump_write_cnt]; /* Build header and append record contents. 
*/ - hlen = ramoops_write_kmsg_hdr(prz, record->compressed); + hlen = ramoops_write_kmsg_hdr(prz, record); size = record->size; if (size + hlen > prz->buffer_size) size = prz->buffer_size - hlen; diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index ebf80c7739e1..48813aeaab80 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -1512,6 +1512,22 @@ int dquot_initialize(struct inode *inode) } EXPORT_SYMBOL(dquot_initialize); +bool dquot_initialize_needed(struct inode *inode) +{ + struct dquot **dquots; + int i; + + if (!dquot_active(inode)) + return false; + + dquots = i_dquot(inode); + for (i = 0; i < MAXQUOTAS; i++) + if (!dquots[i] && sb_has_quota_active(inode->i_sb, i)) + return true; + return false; +} +EXPORT_SYMBOL(dquot_initialize_needed); + /* * Release all quotas referenced by inode. * diff --git a/fs/read_write.c b/fs/read_write.c index 47c1d4484df9..a2cbc8303dae 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -356,46 +356,6 @@ out_putf: } #endif -ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos) -{ - struct kiocb kiocb; - ssize_t ret; - - if (!file->f_op->read_iter) - return -EINVAL; - - init_sync_kiocb(&kiocb, file); - kiocb.ki_pos = *ppos; - - iter->type |= READ; - ret = call_read_iter(file, &kiocb, iter); - BUG_ON(ret == -EIOCBQUEUED); - if (ret > 0) - *ppos = kiocb.ki_pos; - return ret; -} -EXPORT_SYMBOL(vfs_iter_read); - -ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos) -{ - struct kiocb kiocb; - ssize_t ret; - - if (!file->f_op->write_iter) - return -EINVAL; - - init_sync_kiocb(&kiocb, file); - kiocb.ki_pos = *ppos; - - iter->type |= WRITE; - ret = call_write_iter(file, &kiocb, iter); - BUG_ON(ret == -EIOCBQUEUED); - if (ret > 0) - *ppos = kiocb.ki_pos; - return ret; -} -EXPORT_SYMBOL(vfs_iter_write); - int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count) { struct inode *inode; @@ -678,16 +638,10 @@ static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter, struct kiocb kiocb; ssize_t ret; - if (flags & ~(RWF_HIPRI | RWF_DSYNC | RWF_SYNC)) - return -EOPNOTSUPP; - init_sync_kiocb(&kiocb, filp); - if (flags & RWF_HIPRI) - kiocb.ki_flags |= IOCB_HIPRI; - if (flags & RWF_DSYNC) - kiocb.ki_flags |= IOCB_DSYNC; - if (flags & RWF_SYNC) - kiocb.ki_flags |= (IOCB_DSYNC | IOCB_SYNC); + ret = kiocb_set_rw_flags(&kiocb, flags); + if (ret) + return ret; kiocb.ki_pos = *ppos; if (type == READ) @@ -916,86 +870,114 @@ out: } #endif -static ssize_t __do_readv_writev(int type, struct file *file, - struct iov_iter *iter, loff_t *pos, int flags) +static ssize_t do_iter_read(struct file *file, struct iov_iter *iter, + loff_t *pos, int flags) { size_t tot_len; ssize_t ret = 0; + if (!(file->f_mode & FMODE_READ)) + return -EBADF; + if (!(file->f_mode & FMODE_CAN_READ)) + return -EINVAL; + tot_len = iov_iter_count(iter); if (!tot_len) goto out; - ret = rw_verify_area(type, file, pos, tot_len); + ret = rw_verify_area(READ, file, pos, tot_len); if (ret < 0) - goto out; - - if (type != READ) - file_start_write(file); + return ret; - if ((type == READ && file->f_op->read_iter) || - (type == WRITE && file->f_op->write_iter)) - ret = do_iter_readv_writev(file, iter, pos, type, flags); + if (file->f_op->read_iter) + ret = do_iter_readv_writev(file, iter, pos, READ, flags); else - ret = do_loop_readv_writev(file, iter, pos, type, flags); - - if (type != READ) - file_end_write(file); - + ret = do_loop_readv_writev(file, iter, pos, READ, flags); out: - if ((ret + (type == READ)) > 0) { - 
if (type == READ) - fsnotify_access(file); - else - fsnotify_modify(file); - } + if (ret >= 0) + fsnotify_access(file); return ret; } -static ssize_t do_readv_writev(int type, struct file *file, - const struct iovec __user *uvector, - unsigned long nr_segs, loff_t *pos, - int flags) +ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos, + int flags) { - struct iovec iovstack[UIO_FASTIOV]; - struct iovec *iov = iovstack; - struct iov_iter iter; - ssize_t ret; + if (!file->f_op->read_iter) + return -EINVAL; + return do_iter_read(file, iter, ppos, flags); +} +EXPORT_SYMBOL(vfs_iter_read); + +static ssize_t do_iter_write(struct file *file, struct iov_iter *iter, + loff_t *pos, int flags) +{ + size_t tot_len; + ssize_t ret = 0; - ret = import_iovec(type, uvector, nr_segs, - ARRAY_SIZE(iovstack), &iov, &iter); + if (!(file->f_mode & FMODE_WRITE)) + return -EBADF; + if (!(file->f_mode & FMODE_CAN_WRITE)) + return -EINVAL; + + tot_len = iov_iter_count(iter); + if (!tot_len) + return 0; + ret = rw_verify_area(WRITE, file, pos, tot_len); if (ret < 0) return ret; - ret = __do_readv_writev(type, file, &iter, pos, flags); - kfree(iov); - + file_start_write(file); + if (file->f_op->write_iter) + ret = do_iter_readv_writev(file, iter, pos, WRITE, flags); + else + ret = do_loop_readv_writev(file, iter, pos, WRITE, flags); + file_end_write(file); + if (ret > 0) + fsnotify_modify(file); return ret; } +ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos, + int flags) +{ + if (!file->f_op->write_iter) + return -EINVAL; + return do_iter_write(file, iter, ppos, flags); +} +EXPORT_SYMBOL(vfs_iter_write); + ssize_t vfs_readv(struct file *file, const struct iovec __user *vec, unsigned long vlen, loff_t *pos, int flags) { - if (!(file->f_mode & FMODE_READ)) - return -EBADF; - if (!(file->f_mode & FMODE_CAN_READ)) - return -EINVAL; + struct iovec iovstack[UIO_FASTIOV]; + struct iovec *iov = iovstack; + struct iov_iter iter; + ssize_t ret; - return do_readv_writev(READ, file, vec, vlen, pos, flags); -} + ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); + if (ret >= 0) { + ret = do_iter_read(file, &iter, pos, flags); + kfree(iov); + } + return ret; +} EXPORT_SYMBOL(vfs_readv); ssize_t vfs_writev(struct file *file, const struct iovec __user *vec, unsigned long vlen, loff_t *pos, int flags) { - if (!(file->f_mode & FMODE_WRITE)) - return -EBADF; - if (!(file->f_mode & FMODE_CAN_WRITE)) - return -EINVAL; + struct iovec iovstack[UIO_FASTIOV]; + struct iovec *iov = iovstack; + struct iov_iter iter; + ssize_t ret; - return do_readv_writev(WRITE, file, vec, vlen, pos, flags); + ret = import_iovec(WRITE, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); + if (ret >= 0) { + ret = do_iter_write(file, &iter, pos, flags); + kfree(iov); + } + return ret; } - EXPORT_SYMBOL(vfs_writev); static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec, @@ -1143,44 +1125,20 @@ SYSCALL_DEFINE6(pwritev2, unsigned long, fd, const struct iovec __user *, vec, } #ifdef CONFIG_COMPAT - -static ssize_t compat_do_readv_writev(int type, struct file *file, - const struct compat_iovec __user *uvector, - unsigned long nr_segs, loff_t *pos, - int flags) +static size_t compat_readv(struct file *file, + const struct compat_iovec __user *vec, + unsigned long vlen, loff_t *pos, int flags) { struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; struct iov_iter iter; ssize_t ret; - ret = compat_import_iovec(type, uvector, nr_segs, - UIO_FASTIOV, &iov, &iter); - if 
(ret < 0) - return ret; - - ret = __do_readv_writev(type, file, &iter, pos, flags); - kfree(iov); - - return ret; -} - -static size_t compat_readv(struct file *file, - const struct compat_iovec __user *vec, - unsigned long vlen, loff_t *pos, int flags) -{ - ssize_t ret = -EBADF; - - if (!(file->f_mode & FMODE_READ)) - goto out; - - ret = -EINVAL; - if (!(file->f_mode & FMODE_CAN_READ)) - goto out; - - ret = compat_do_readv_writev(READ, file, vec, vlen, pos, flags); - -out: + ret = compat_import_iovec(READ, vec, vlen, UIO_FASTIOV, &iov, &iter); + if (ret >= 0) { + ret = do_iter_read(file, &iter, pos, flags); + kfree(iov); + } if (ret > 0) add_rchar(current, ret); inc_syscr(current); @@ -1276,18 +1234,16 @@ static size_t compat_writev(struct file *file, const struct compat_iovec __user *vec, unsigned long vlen, loff_t *pos, int flags) { - ssize_t ret = -EBADF; - - if (!(file->f_mode & FMODE_WRITE)) - goto out; - - ret = -EINVAL; - if (!(file->f_mode & FMODE_CAN_WRITE)) - goto out; - - ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos, 0); + struct iovec iovstack[UIO_FASTIOV]; + struct iovec *iov = iovstack; + struct iov_iter iter; + ssize_t ret; -out: + ret = compat_import_iovec(WRITE, vec, vlen, UIO_FASTIOV, &iov, &iter); + if (ret >= 0) { + ret = do_iter_write(file, &iter, pos, flags); + kfree(iov); + } if (ret > 0) add_wchar(current, ret); inc_syscw(current); diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index da01f497180a..a11d773e5ff3 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -1112,7 +1112,7 @@ static int flush_commit_list(struct super_block *s, depth = reiserfs_write_unlock_nested(s); if (reiserfs_barrier_flush(s)) __sync_dirty_buffer(jl->j_commit_bh, - REQ_PREFLUSH | REQ_FUA); + REQ_SYNC | REQ_PREFLUSH | REQ_FUA); else sync_dirty_buffer(jl->j_commit_bh); reiserfs_write_lock_nested(s, depth); @@ -1271,7 +1271,7 @@ static int _update_journal_header_block(struct super_block *sb, if (reiserfs_barrier_flush(sb)) __sync_dirty_buffer(journal->j_header_bh, - REQ_PREFLUSH | REQ_FUA); + REQ_SYNC | REQ_PREFLUSH | REQ_FUA); else sync_dirty_buffer(journal->j_header_bh); @@ -2956,7 +2956,7 @@ void reiserfs_wait_on_write_block(struct super_block *s) static void queue_log_writer(struct super_block *s) { - wait_queue_t wait; + wait_queue_entry_t wait; struct reiserfs_journal *journal = SB_JOURNAL(s); set_bit(J_WRITERS_QUEUED, &journal->j_state); diff --git a/fs/select.c b/fs/select.c index d6c652a31e99..5b524a977d91 100644 --- a/fs/select.c +++ b/fs/select.c @@ -180,7 +180,7 @@ static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p) return table->entry++; } -static int __pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key) +static int __pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { struct poll_wqueues *pwq = wait->private; DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task); @@ -206,7 +206,7 @@ static int __pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key) return default_wake_function(&dummy_wait, mode, sync, key); } -static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key) +static int pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { struct poll_table_entry *entry; diff --git a/fs/signalfd.c b/fs/signalfd.c index 270221fcef42..593b022ac11b 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -38,12 +38,12 @@ void signalfd_cleanup(struct sighand_struct *sighand) /* * The lockless check can race with remove_wait_queue() in progress, * but in this case its 
caller should run under rcu_read_lock() and - * sighand_cachep is SLAB_DESTROY_BY_RCU, we can safely return. + * sighand_cachep is SLAB_TYPESAFE_BY_RCU, we can safely return. */ if (likely(!waitqueue_active(wqh))) return; - /* wait_queue_t->func(POLLFREE) should do remove_wait_queue() */ + /* wait_queue_entry_t->func(POLLFREE) should do remove_wait_queue() */ wake_up_poll(wqh, POLLHUP | POLLFREE); } diff --git a/fs/splice.c b/fs/splice.c index 540c4a44756c..ae41201d0325 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -762,7 +762,7 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, iov_iter_bvec(&from, ITER_BVEC | WRITE, array, n, sd.total_len - left); - ret = vfs_iter_write(out, &from, &sd.pos); + ret = vfs_iter_write(out, &from, &sd.pos, 0); if (ret <= 0) break; diff --git a/fs/stat.c b/fs/stat.c index f494b182c7c7..c35610845ab1 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -672,6 +672,7 @@ void __inode_add_bytes(struct inode *inode, loff_t bytes) inode->i_bytes -= 512; } } +EXPORT_SYMBOL(__inode_add_bytes); void inode_add_bytes(struct inode *inode, loff_t bytes) { diff --git a/fs/statfs.c b/fs/statfs.c index 4e4623c7a126..41a6a82da5e2 100644 --- a/fs/statfs.c +++ b/fs/statfs.c @@ -244,6 +244,7 @@ SYSCALL_DEFINE2(ustat, unsigned, dev, struct ustat __user *, ubuf) #ifdef CONFIG_COMPAT static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs *kbuf) { + struct compat_statfs buf; if (sizeof ubuf->f_blocks == 4) { if ((kbuf->f_blocks | kbuf->f_bfree | kbuf->f_bavail | kbuf->f_bsize | kbuf->f_frsize) & 0xffffffff00000000ULL) @@ -257,20 +258,20 @@ static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs * && (kbuf->f_ffree & 0xffffffff00000000ULL)) return -EOVERFLOW; } - if (!access_ok(VERIFY_WRITE, ubuf, sizeof(*ubuf)) || - __put_user(kbuf->f_type, &ubuf->f_type) || - __put_user(kbuf->f_bsize, &ubuf->f_bsize) || - __put_user(kbuf->f_blocks, &ubuf->f_blocks) || - __put_user(kbuf->f_bfree, &ubuf->f_bfree) || - __put_user(kbuf->f_bavail, &ubuf->f_bavail) || - __put_user(kbuf->f_files, &ubuf->f_files) || - __put_user(kbuf->f_ffree, &ubuf->f_ffree) || - __put_user(kbuf->f_namelen, &ubuf->f_namelen) || - __put_user(kbuf->f_fsid.val[0], &ubuf->f_fsid.val[0]) || - __put_user(kbuf->f_fsid.val[1], &ubuf->f_fsid.val[1]) || - __put_user(kbuf->f_frsize, &ubuf->f_frsize) || - __put_user(kbuf->f_flags, &ubuf->f_flags) || - __clear_user(ubuf->f_spare, sizeof(ubuf->f_spare))) + memset(&buf, 0, sizeof(struct compat_statfs)); + buf.f_type = kbuf->f_type; + buf.f_bsize = kbuf->f_bsize; + buf.f_blocks = kbuf->f_blocks; + buf.f_bfree = kbuf->f_bfree; + buf.f_bavail = kbuf->f_bavail; + buf.f_files = kbuf->f_files; + buf.f_ffree = kbuf->f_ffree; + buf.f_namelen = kbuf->f_namelen; + buf.f_fsid.val[0] = kbuf->f_fsid.val[0]; + buf.f_fsid.val[1] = kbuf->f_fsid.val[1]; + buf.f_frsize = kbuf->f_frsize; + buf.f_flags = kbuf->f_flags; + if (copy_to_user(ubuf, &buf, sizeof(struct compat_statfs))) return -EFAULT; return 0; } @@ -299,6 +300,7 @@ COMPAT_SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct compat_statfs __user *, static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstatfs *kbuf) { + struct compat_statfs64 buf; if (sizeof(ubuf->f_bsize) == 4) { if ((kbuf->f_type | kbuf->f_bsize | kbuf->f_namelen | kbuf->f_frsize | kbuf->f_flags) & 0xffffffff00000000ULL) @@ -312,20 +314,20 @@ static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstat && (kbuf->f_ffree & 0xffffffff00000000ULL)) return -EOVERFLOW; } - if 
(!access_ok(VERIFY_WRITE, ubuf, sizeof(*ubuf)) || - __put_user(kbuf->f_type, &ubuf->f_type) || - __put_user(kbuf->f_bsize, &ubuf->f_bsize) || - __put_user(kbuf->f_blocks, &ubuf->f_blocks) || - __put_user(kbuf->f_bfree, &ubuf->f_bfree) || - __put_user(kbuf->f_bavail, &ubuf->f_bavail) || - __put_user(kbuf->f_files, &ubuf->f_files) || - __put_user(kbuf->f_ffree, &ubuf->f_ffree) || - __put_user(kbuf->f_namelen, &ubuf->f_namelen) || - __put_user(kbuf->f_fsid.val[0], &ubuf->f_fsid.val[0]) || - __put_user(kbuf->f_fsid.val[1], &ubuf->f_fsid.val[1]) || - __put_user(kbuf->f_frsize, &ubuf->f_frsize) || - __put_user(kbuf->f_flags, &ubuf->f_flags) || - __clear_user(ubuf->f_spare, sizeof(ubuf->f_spare))) + memset(&buf, 0, sizeof(struct compat_statfs64)); + buf.f_type = kbuf->f_type; + buf.f_bsize = kbuf->f_bsize; + buf.f_blocks = kbuf->f_blocks; + buf.f_bfree = kbuf->f_bfree; + buf.f_bavail = kbuf->f_bavail; + buf.f_files = kbuf->f_files; + buf.f_ffree = kbuf->f_ffree; + buf.f_namelen = kbuf->f_namelen; + buf.f_fsid.val[0] = kbuf->f_fsid.val[0]; + buf.f_fsid.val[1] = kbuf->f_fsid.val[1]; + buf.f_frsize = kbuf->f_frsize; + buf.f_flags = kbuf->f_flags; + if (copy_to_user(ubuf, &buf, sizeof(struct compat_statfs64))) return -EFAULT; return 0; } diff --git a/fs/timerfd.c b/fs/timerfd.c index c543cdb5f8ed..ece0c02d7e63 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -169,7 +169,7 @@ static ktime_t timerfd_get_remaining(struct timerfd_ctx *ctx) } static int timerfd_setup(struct timerfd_ctx *ctx, int flags, - const struct itimerspec *ktmr) + const struct itimerspec64 *ktmr) { enum hrtimer_mode htmode; ktime_t texp; @@ -178,10 +178,10 @@ static int timerfd_setup(struct timerfd_ctx *ctx, int flags, htmode = (flags & TFD_TIMER_ABSTIME) ? HRTIMER_MODE_ABS: HRTIMER_MODE_REL; - texp = timespec_to_ktime(ktmr->it_value); + texp = timespec64_to_ktime(ktmr->it_value); ctx->expired = 0; ctx->ticks = 0; - ctx->tintv = timespec_to_ktime(ktmr->it_interval); + ctx->tintv = timespec64_to_ktime(ktmr->it_interval); if (isalarm(ctx)) { alarm_init(&ctx->t.alarm, @@ -432,16 +432,15 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags) } static int do_timerfd_settime(int ufd, int flags, - const struct itimerspec *new, - struct itimerspec *old) + const struct itimerspec64 *new, + struct itimerspec64 *old) { struct fd f; struct timerfd_ctx *ctx; int ret; if ((flags & ~TFD_SETTIME_FLAGS) || - !timespec_valid(&new->it_value) || - !timespec_valid(&new->it_interval)) + !itimerspec64_valid(new)) return -EINVAL; ret = timerfd_fget(ufd, &f); @@ -487,8 +486,8 @@ static int do_timerfd_settime(int ufd, int flags, hrtimer_forward_now(&ctx->t.tmr, ctx->tintv); } - old->it_value = ktime_to_timespec(timerfd_get_remaining(ctx)); - old->it_interval = ktime_to_timespec(ctx->tintv); + old->it_value = ktime_to_timespec64(timerfd_get_remaining(ctx)); + old->it_interval = ktime_to_timespec64(ctx->tintv); /* * Re-program the timer to the new value ... 
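[Editor's note: the timerfd hunks around this point move the internal plumbing from struct itimerspec to struct itimerspec64, so seconds are carried in a 64-bit field regardless of the userspace ABI in use. Below is a minimal userspace illustration of the 2038 limit that motivates 64-bit seconds; it assumes nothing about the kernel helpers themselves.]

#include <stdint.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
	/* Largest second count a signed 32-bit time_t can represent */
	time_t last32 = (time_t)INT32_MAX;
	struct tm tm;
	char buf[64];

	gmtime_r(&last32, &tm);
	strftime(buf, sizeof(buf), "%Y-%m-%d %H:%M:%S", &tm);
	printf("32-bit time_t overflows after %s UTC\n", buf);

	/* A 64-bit seconds field, as in itimerspec64, reaches far beyond that */
	printf("64-bit seconds maximum: %lld\n", (long long)INT64_MAX);
	return 0;
}
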
@@ -500,7 +499,7 @@ static int do_timerfd_settime(int ufd, int flags, return ret; } -static int do_timerfd_gettime(int ufd, struct itimerspec *t) +static int do_timerfd_gettime(int ufd, struct itimerspec64 *t) { struct fd f; struct timerfd_ctx *ctx; @@ -525,8 +524,8 @@ static int do_timerfd_gettime(int ufd, struct itimerspec *t) hrtimer_restart(&ctx->t.tmr); } } - t->it_value = ktime_to_timespec(timerfd_get_remaining(ctx)); - t->it_interval = ktime_to_timespec(ctx->tintv); + t->it_value = ktime_to_timespec64(timerfd_get_remaining(ctx)); + t->it_interval = ktime_to_timespec64(ctx->tintv); spin_unlock_irq(&ctx->wqh.lock); fdput(f); return 0; @@ -536,15 +535,15 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags, const struct itimerspec __user *, utmr, struct itimerspec __user *, otmr) { - struct itimerspec new, old; + struct itimerspec64 new, old; int ret; - if (copy_from_user(&new, utmr, sizeof(new))) + if (get_itimerspec64(&new, utmr)) return -EFAULT; ret = do_timerfd_settime(ufd, flags, &new, &old); if (ret) return ret; - if (otmr && copy_to_user(otmr, &old, sizeof(old))) + if (otmr && put_itimerspec64(&old, otmr)) return -EFAULT; return ret; @@ -552,11 +551,11 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags, SYSCALL_DEFINE2(timerfd_gettime, int, ufd, struct itimerspec __user *, otmr) { - struct itimerspec kotmr; + struct itimerspec64 kotmr; int ret = do_timerfd_gettime(ufd, &kotmr); if (ret) return ret; - return copy_to_user(otmr, &kotmr, sizeof(kotmr)) ? -EFAULT: 0; + return put_itimerspec64(&kotmr, otmr) ? -EFAULT : 0; } #ifdef CONFIG_COMPAT @@ -564,15 +563,15 @@ COMPAT_SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags, const struct compat_itimerspec __user *, utmr, struct compat_itimerspec __user *, otmr) { - struct itimerspec new, old; + struct itimerspec64 new, old; int ret; - if (get_compat_itimerspec(&new, utmr)) + if (get_compat_itimerspec64(&new, utmr)) return -EFAULT; ret = do_timerfd_settime(ufd, flags, &new, &old); if (ret) return ret; - if (otmr && put_compat_itimerspec(otmr, &old)) + if (otmr && put_compat_itimerspec64(&old, otmr)) return -EFAULT; return ret; } @@ -580,10 +579,10 @@ COMPAT_SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags, COMPAT_SYSCALL_DEFINE2(timerfd_gettime, int, ufd, struct compat_itimerspec __user *, otmr) { - struct itimerspec kotmr; + struct itimerspec64 kotmr; int ret = do_timerfd_gettime(ufd, &kotmr); if (ret) return ret; - return put_compat_itimerspec(otmr, &kotmr) ? -EFAULT: 0; + return put_compat_itimerspec64(&kotmr, otmr) ? -EFAULT : 0; } #endif diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig index b0d0623c83ed..83a961bf7280 100644 --- a/fs/ubifs/Kconfig +++ b/fs/ubifs/Kconfig @@ -61,3 +61,16 @@ config UBIFS_FS_ENCRYPTION feature is similar to ecryptfs, but it is more memory efficient since it avoids caching the encrypted and decrypted pages in the page cache. + +config UBIFS_FS_SECURITY + bool "UBIFS Security Labels" + depends on UBIFS_FS + default y + help + Security labels provide an access control facility to support Linux + Security Models (LSMs) accepted by AppArmor, SELinux, Smack and TOMOYO + Linux. This option enables an extended attribute handler for file + security labels in the ubifs filesystem, so that it requires enabling + the extended attribute support in advance. + + If you are not using a security module, say N. 
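For context on the UBIFS_FS_SECURITY option above: it only gates the in-kernel handler for security.* extended attributes; from userspace such labels are read and written through the generic xattr syscalls. A minimal, hedged sketch of reading a label follows (the attribute name, mount point and buffer size are illustrative; getxattr() failing with ENODATA or EOPNOTSUPP simply means no label is set or no handler is registered):

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/xattr.h>

int main(int argc, char **argv)
{
	/* Illustrative path on a ubifs mount; pass a real file as argv[1]. */
	const char *path = argc > 1 ? argv[1] : "/mnt/ubifs/file";
	char label[256];
	ssize_t len;

	/* Read the file's security label, e.g. the one SELinux stores. */
	len = getxattr(path, "security.selinux", label, sizeof(label) - 1);
	if (len < 0) {
		fprintf(stderr, "getxattr(%s): %s\n", path, strerror(errno));
		return 1;
	}
	label[len] = '\0';
	printf("%s: %s\n", path, label);
	return 0;
}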
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index 718b749fa11a..7cd8a7b95299 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -2391,8 +2391,8 @@ int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head) ubifs_dump_node(c, sa->node); return -EINVAL; } - if (sa->type != UBIFS_INO_NODE && sa->type != UBIFS_DENT_NODE && - sa->type != UBIFS_XENT_NODE) { + if (sb->type != UBIFS_INO_NODE && sb->type != UBIFS_DENT_NODE && + sb->type != UBIFS_XENT_NODE) { ubifs_err(c, "bad node type %d", sb->type); ubifs_dump_node(c, sb->node); return -EINVAL; diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c index 12b9eb5005ff..fdc311246807 100644 --- a/fs/ubifs/ioctl.c +++ b/fs/ubifs/ioctl.c @@ -53,7 +53,7 @@ void ubifs_set_inode_flags(struct inode *inode) * ioctl2ubifs - convert ioctl inode flags to UBIFS inode flags. * @ioctl_flags: flags to convert * - * This function convert ioctl flags (@FS_COMPR_FL, etc) to UBIFS inode flags + * This function converts ioctl flags (@FS_COMPR_FL, etc) to UBIFS inode flags * (@UBIFS_COMPR_FL, etc). */ static int ioctl2ubifs(int ioctl_flags) @@ -78,8 +78,8 @@ static int ioctl2ubifs(int ioctl_flags) * ubifs2ioctl - convert UBIFS inode flags to ioctl inode flags. * @ubifs_flags: flags to convert * - * This function convert UBIFS (@UBIFS_COMPR_FL, etc) to ioctl flags - * (@FS_COMPR_FL, etc). + * This function converts UBIFS inode flags (@UBIFS_COMPR_FL, etc) to ioctl + * flags (@FS_COMPR_FL, etc). */ static int ubifs2ioctl(int ubifs_flags) { diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c index 586d59347fff..3af4472061cc 100644 --- a/fs/ubifs/recovery.c +++ b/fs/ubifs/recovery.c @@ -442,7 +442,6 @@ static void clean_buf(const struct ubifs_info *c, void **buf, int lnum, { int empty_offs, pad_len; - lnum = lnum; dbg_rcvry("cleaning corruption at %d:%d", lnum, *offs); ubifs_assert(!(*offs & 7)); diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 4da10a6d702a..298b4d89eee9 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -1753,13 +1753,23 @@ int ubifs_check_dir_empty(struct inode *dir); /* xattr.c */ extern const struct xattr_handler *ubifs_xattr_handlers[]; ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size); -int ubifs_init_security(struct inode *dentry, struct inode *inode, - const struct qstr *qstr); int ubifs_xattr_set(struct inode *host, const char *name, const void *value, size_t size, int flags); ssize_t ubifs_xattr_get(struct inode *host, const char *name, void *buf, size_t size); +#ifdef CONFIG_UBIFS_FS_SECURITY +extern int ubifs_init_security(struct inode *dentry, struct inode *inode, + const struct qstr *qstr); +#else +static inline int ubifs_init_security(struct inode *dentry, + struct inode *inode, const struct qstr *qstr) +{ + return 0; +} +#endif + + /* super.c */ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum); diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index 3e53fdbf7997..6c9e62c2ef55 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -559,6 +559,7 @@ out_free: return err; } +#ifdef CONFIG_UBIFS_FS_SECURITY static int init_xattrs(struct inode *inode, const struct xattr *xattr_array, void *fs_info) { @@ -599,6 +600,7 @@ int ubifs_init_security(struct inode *dentry, struct inode *inode, } return err; } +#endif static int xattr_get(const struct xattr_handler *handler, struct dentry *dentry, struct inode *inode, @@ -639,15 +641,19 @@ static const struct xattr_handler ubifs_trusted_xattr_handler = { .set = xattr_set, }; +#ifdef CONFIG_UBIFS_FS_SECURITY static const 
struct xattr_handler ubifs_security_xattr_handler = { .prefix = XATTR_SECURITY_PREFIX, .get = xattr_get, .set = xattr_set, }; +#endif const struct xattr_handler *ubifs_xattr_handlers[] = { &ubifs_user_xattr_handler, &ubifs_trusted_xattr_handler, +#ifdef CONFIG_UBIFS_FS_SECURITY &ubifs_security_xattr_handler, +#endif NULL }; diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c index a0376a2c1c29..f80be4c5df9d 100644 --- a/fs/ufs/balloc.c +++ b/fs/ufs/balloc.c @@ -82,7 +82,8 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count) ufs_error (sb, "ufs_free_fragments", "bit already cleared for fragment %u", i); } - + + inode_sub_bytes(inode, count << uspi->s_fshift); fs32_add(sb, &ucg->cg_cs.cs_nffree, count); uspi->cs_total.cs_nffree += count; fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count); @@ -184,6 +185,7 @@ do_more: ufs_error(sb, "ufs_free_blocks", "freeing free fragment"); } ubh_setblock(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno); + inode_sub_bytes(inode, uspi->s_fpb << uspi->s_fshift); if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD) ufs_clusteracct (sb, ucpi, blkno, 1); @@ -398,10 +400,12 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, /* * There is not enough space for user on the device */ - if (!capable(CAP_SYS_RESOURCE) && ufs_freespace(uspi, UFS_MINFREE) <= 0) { - mutex_unlock(&UFS_SB(sb)->s_lock); - UFSD("EXIT (FAILED)\n"); - return 0; + if (unlikely(ufs_freefrags(uspi) <= uspi->s_root_blocks)) { + if (!capable(CAP_SYS_RESOURCE)) { + mutex_unlock(&UFS_SB(sb)->s_lock); + UFSD("EXIT (FAILED)\n"); + return 0; + } } if (goal >= uspi->s_size) @@ -419,12 +423,12 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, if (result) { ufs_clear_frags(inode, result + oldcount, newcount - oldcount, locked_page != NULL); + *err = 0; write_seqlock(&UFS_I(inode)->meta_lock); ufs_cpu_to_data_ptr(sb, p, result); - write_sequnlock(&UFS_I(inode)->meta_lock); - *err = 0; UFS_I(inode)->i_lastfrag = max(UFS_I(inode)->i_lastfrag, fragment + count); + write_sequnlock(&UFS_I(inode)->meta_lock); } mutex_unlock(&UFS_SB(sb)->s_lock); UFSD("EXIT, result %llu\n", (unsigned long long)result); @@ -437,8 +441,10 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, result = ufs_add_fragments(inode, tmp, oldcount, newcount); if (result) { *err = 0; + read_seqlock_excl(&UFS_I(inode)->meta_lock); UFS_I(inode)->i_lastfrag = max(UFS_I(inode)->i_lastfrag, fragment + count); + read_sequnlock_excl(&UFS_I(inode)->meta_lock); ufs_clear_frags(inode, result + oldcount, newcount - oldcount, locked_page != NULL); mutex_unlock(&UFS_SB(sb)->s_lock); @@ -449,39 +455,29 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, /* * allocate new block and move data */ - switch (fs32_to_cpu(sb, usb1->fs_optim)) { - case UFS_OPTSPACE: + if (fs32_to_cpu(sb, usb1->fs_optim) == UFS_OPTSPACE) { request = newcount; - if (uspi->s_minfree < 5 || uspi->cs_total.cs_nffree - > uspi->s_dsize * uspi->s_minfree / (2 * 100)) - break; - usb1->fs_optim = cpu_to_fs32(sb, UFS_OPTTIME); - break; - default: - usb1->fs_optim = cpu_to_fs32(sb, UFS_OPTTIME); - - case UFS_OPTTIME: + if (uspi->cs_total.cs_nffree < uspi->s_space_to_time) + usb1->fs_optim = cpu_to_fs32(sb, UFS_OPTTIME); + } else { request = uspi->s_fpb; - if (uspi->cs_total.cs_nffree < uspi->s_dsize * - (uspi->s_minfree - 2) / 100) - break; - usb1->fs_optim = cpu_to_fs32(sb, UFS_OPTTIME); - break; + if (uspi->cs_total.cs_nffree > uspi->s_time_to_space) + usb1->fs_optim = cpu_to_fs32(sb, UFS_OPTSPACE); } 
result = ufs_alloc_fragments (inode, cgno, goal, request, err); if (result) { ufs_clear_frags(inode, result + oldcount, newcount - oldcount, locked_page != NULL); + mutex_unlock(&UFS_SB(sb)->s_lock); ufs_change_blocknr(inode, fragment - oldcount, oldcount, uspi->s_sbbase + tmp, uspi->s_sbbase + result, locked_page); + *err = 0; write_seqlock(&UFS_I(inode)->meta_lock); ufs_cpu_to_data_ptr(sb, p, result); - write_sequnlock(&UFS_I(inode)->meta_lock); - *err = 0; UFS_I(inode)->i_lastfrag = max(UFS_I(inode)->i_lastfrag, fragment + count); - mutex_unlock(&UFS_SB(sb)->s_lock); + write_sequnlock(&UFS_I(inode)->meta_lock); if (newcount < request) ufs_free_fragments (inode, result + newcount, request - newcount); ufs_free_fragments (inode, tmp, oldcount); @@ -494,6 +490,20 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, return 0; } +static bool try_add_frags(struct inode *inode, unsigned frags) +{ + unsigned size = frags * i_blocksize(inode); + spin_lock(&inode->i_lock); + __inode_add_bytes(inode, size); + if (unlikely((u32)inode->i_blocks != inode->i_blocks)) { + __inode_sub_bytes(inode, size); + spin_unlock(&inode->i_lock); + return false; + } + spin_unlock(&inode->i_lock); + return true; +} + static u64 ufs_add_fragments(struct inode *inode, u64 fragment, unsigned oldcount, unsigned newcount) { @@ -530,6 +540,9 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment, for (i = oldcount; i < newcount; i++) if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i)) return 0; + + if (!try_add_frags(inode, count)) + return 0; /* * Block can be extended */ @@ -647,6 +660,7 @@ cg_found: ubh_setbit (UCPI_UBH(ucpi), ucpi->c_freeoff, goal + i); i = uspi->s_fpb - count; + inode_sub_bytes(inode, i << uspi->s_fshift); fs32_add(sb, &ucg->cg_cs.cs_nffree, i); uspi->cs_total.cs_nffree += i; fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, i); @@ -657,6 +671,8 @@ cg_found: result = ufs_bitmap_search (sb, ucpi, goal, allocsize); if (result == INVBLOCK) return 0; + if (!try_add_frags(inode, count)) + return 0; for (i = 0; i < count; i++) ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, result + i); @@ -716,6 +732,8 @@ norot: return INVBLOCK; ucpi->c_rotor = result; gotit: + if (!try_add_frags(inode, uspi->s_fpb)) + return 0; blkno = ufs_fragstoblks(result); ubh_clrblock (UCPI_UBH(ucpi), ucpi->c_freeoff, blkno); if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD) diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 7e41aee7b69a..f36d6a53687d 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -235,7 +235,8 @@ ufs_extend_tail(struct inode *inode, u64 writes_to, p = ufs_get_direct_data_ptr(uspi, ufsi, block); tmp = ufs_new_fragments(inode, p, lastfrag, ufs_data_ptr_to_cpu(sb, p), - new_size, err, locked_page); + new_size - (lastfrag & uspi->s_fpbmask), err, + locked_page); return tmp != 0; } @@ -284,7 +285,7 @@ ufs_inode_getfrag(struct inode *inode, unsigned index, goal += uspi->s_fpb; } tmp = ufs_new_fragments(inode, p, ufs_blknum(new_fragment), - goal, uspi->s_fpb, err, locked_page); + goal, nfrags, err, locked_page); if (!tmp) { *err = -ENOSPC; @@ -400,11 +401,20 @@ static int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buff u64 phys64 = 0; unsigned frag = fragment & uspi->s_fpbmask; - if (!create) { - phys64 = ufs_frag_map(inode, offsets, depth); - goto out; - } + phys64 = ufs_frag_map(inode, offsets, depth); + if (!create) + goto done; + if (phys64) { + if (fragment >= UFS_NDIR_FRAGMENT) + goto done; + read_seqlock_excl(&UFS_I(inode)->meta_lock); + if (fragment 
< UFS_I(inode)->i_lastfrag) { + read_sequnlock_excl(&UFS_I(inode)->meta_lock); + goto done; + } + read_sequnlock_excl(&UFS_I(inode)->meta_lock); + } /* This code entered only while writing ....? */ mutex_lock(&UFS_I(inode)->truncate_mutex); @@ -448,6 +458,11 @@ out: } mutex_unlock(&UFS_I(inode)->truncate_mutex); return err; + +done: + if (phys64) + map_bh(bh_result, sb, phys64 + frag); + return 0; } static int ufs_writepage(struct page *page, struct writeback_control *wbc) @@ -551,10 +566,8 @@ static int ufs1_read_inode(struct inode *inode, struct ufs_inode *ufs_inode) */ inode->i_mode = mode = fs16_to_cpu(sb, ufs_inode->ui_mode); set_nlink(inode, fs16_to_cpu(sb, ufs_inode->ui_nlink)); - if (inode->i_nlink == 0) { - ufs_error (sb, "ufs_read_inode", "inode %lu has zero nlink\n", inode->i_ino); - return -1; - } + if (inode->i_nlink == 0) + return -ESTALE; /* * Linux now has 32-bit uid and gid, so we can support EFT. @@ -563,9 +576,9 @@ static int ufs1_read_inode(struct inode *inode, struct ufs_inode *ufs_inode) i_gid_write(inode, ufs_get_inode_gid(sb, ufs_inode)); inode->i_size = fs64_to_cpu(sb, ufs_inode->ui_size); - inode->i_atime.tv_sec = fs32_to_cpu(sb, ufs_inode->ui_atime.tv_sec); - inode->i_ctime.tv_sec = fs32_to_cpu(sb, ufs_inode->ui_ctime.tv_sec); - inode->i_mtime.tv_sec = fs32_to_cpu(sb, ufs_inode->ui_mtime.tv_sec); + inode->i_atime.tv_sec = (signed)fs32_to_cpu(sb, ufs_inode->ui_atime.tv_sec); + inode->i_ctime.tv_sec = (signed)fs32_to_cpu(sb, ufs_inode->ui_ctime.tv_sec); + inode->i_mtime.tv_sec = (signed)fs32_to_cpu(sb, ufs_inode->ui_mtime.tv_sec); inode->i_mtime.tv_nsec = 0; inode->i_atime.tv_nsec = 0; inode->i_ctime.tv_nsec = 0; @@ -599,10 +612,8 @@ static int ufs2_read_inode(struct inode *inode, struct ufs2_inode *ufs2_inode) */ inode->i_mode = mode = fs16_to_cpu(sb, ufs2_inode->ui_mode); set_nlink(inode, fs16_to_cpu(sb, ufs2_inode->ui_nlink)); - if (inode->i_nlink == 0) { - ufs_error (sb, "ufs_read_inode", "inode %lu has zero nlink\n", inode->i_ino); - return -1; - } + if (inode->i_nlink == 0) + return -ESTALE; /* * Linux now has 32-bit uid and gid, so we can support EFT. 
@@ -642,7 +653,7 @@ struct inode *ufs_iget(struct super_block *sb, unsigned long ino) struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; struct buffer_head * bh; struct inode *inode; - int err; + int err = -EIO; UFSD("ENTER, ino %lu\n", ino); @@ -677,9 +688,10 @@ struct inode *ufs_iget(struct super_block *sb, unsigned long ino) err = ufs1_read_inode(inode, ufs_inode + ufs_inotofsbo(inode->i_ino)); } - + brelse(bh); if (err) goto bad_inode; + inode->i_version++; ufsi->i_lastfrag = (inode->i_size + uspi->s_fsize - 1) >> uspi->s_fshift; @@ -688,15 +700,13 @@ struct inode *ufs_iget(struct super_block *sb, unsigned long ino) ufs_set_inode_ops(inode); - brelse(bh); - UFSD("EXIT\n"); unlock_new_inode(inode); return inode; bad_inode: iget_failed(inode); - return ERR_PTR(-EIO); + return ERR_PTR(err); } static void ufs1_update_inode(struct inode *inode, struct ufs_inode *ufs_inode) @@ -841,8 +851,11 @@ void ufs_evict_inode(struct inode * inode) truncate_inode_pages_final(&inode->i_data); if (want_delete) { inode->i_size = 0; - if (inode->i_blocks) + if (inode->i_blocks && + (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode))) ufs_truncate_blocks(inode); + ufs_update_inode(inode, inode_needs_sync(inode)); } invalidate_inode_buffers(inode); @@ -868,7 +881,6 @@ static inline void free_data(struct to_free *ctx, u64 from, unsigned count) ctx->to = from + count; } -#define DIRECT_BLOCK ((inode->i_size + uspi->s_bsize - 1) >> uspi->s_bshift) #define DIRECT_FRAGMENT ((inode->i_size + uspi->s_fsize - 1) >> uspi->s_fshift) static void ufs_trunc_direct(struct inode *inode) @@ -1100,25 +1112,30 @@ out: return err; } -static void __ufs_truncate_blocks(struct inode *inode) +static void ufs_truncate_blocks(struct inode *inode) { struct ufs_inode_info *ufsi = UFS_I(inode); struct super_block *sb = inode->i_sb; struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; unsigned offsets[4]; - int depth = ufs_block_to_path(inode, DIRECT_BLOCK, offsets); + int depth; int depth2; unsigned i; struct ufs_buffer_head *ubh[3]; void *p; u64 block; - if (!depth) - return; + if (inode->i_size) { + sector_t last = (inode->i_size - 1) >> uspi->s_bshift; + depth = ufs_block_to_path(inode, last, offsets); + if (!depth) + return; + } else { + depth = 1; + } - /* find the last non-zero in offsets[] */ for (depth2 = depth - 1; depth2; depth2--) - if (offsets[depth2]) + if (offsets[depth2] != uspi->s_apb - 1) break; mutex_lock(&ufsi->truncate_mutex); @@ -1127,9 +1144,8 @@ static void __ufs_truncate_blocks(struct inode *inode) offsets[0] = UFS_IND_BLOCK; } else { /* get the blocks that should be partially emptied */ - p = ufs_get_direct_data_ptr(uspi, ufsi, offsets[0]); + p = ufs_get_direct_data_ptr(uspi, ufsi, offsets[0]++); for (i = 0; i < depth2; i++) { - offsets[i]++; /* next branch is fully freed */ block = ufs_data_ptr_to_cpu(sb, p); if (!block) break; @@ -1140,7 +1156,7 @@ static void __ufs_truncate_blocks(struct inode *inode) write_sequnlock(&ufsi->meta_lock); break; } - p = ubh_get_data_ptr(uspi, ubh[i], offsets[i + 1]); + p = ubh_get_data_ptr(uspi, ubh[i], offsets[i + 1]++); } while (i--) free_branch_tail(inode, offsets[i + 1], ubh[i], depth - i - 1); @@ -1155,7 +1171,9 @@ static void __ufs_truncate_blocks(struct inode *inode) free_full_branch(inode, block, i - UFS_IND_BLOCK + 1); } } + read_seqlock_excl(&ufsi->meta_lock); ufsi->i_lastfrag = DIRECT_FRAGMENT; + read_sequnlock_excl(&ufsi->meta_lock); mark_inode_dirty(inode); mutex_unlock(&ufsi->truncate_mutex); } @@ -1183,7 +1201,7 @@ static int 
ufs_truncate(struct inode *inode, loff_t size) truncate_setsize(inode, size); - __ufs_truncate_blocks(inode); + ufs_truncate_blocks(inode); inode->i_mtime = inode->i_ctime = current_time(inode); mark_inode_dirty(inode); out: @@ -1191,16 +1209,6 @@ out: return err; } -static void ufs_truncate_blocks(struct inode *inode) -{ - if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || - S_ISLNK(inode->i_mode))) - return; - if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) - return; - __ufs_truncate_blocks(inode); -} - int ufs_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 131b2b77c818..0a4f58a5073c 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -480,7 +480,7 @@ static void ufs_setup_cstotal(struct super_block *sb) usb3 = ubh_get_usb_third(uspi); if ((mtype == UFS_MOUNT_UFSTYPE_44BSD && - (usb1->fs_flags & UFS_FLAGS_UPDATED)) || + (usb2->fs_un.fs_u2.fs_maxbsize == usb1->fs_bsize)) || mtype == UFS_MOUNT_UFSTYPE_UFS2) { /*we have statistic in different place, then usual*/ uspi->cs_total.cs_ndir = fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_ndir); @@ -596,9 +596,7 @@ static void ufs_put_cstotal(struct super_block *sb) usb2 = ubh_get_usb_second(uspi); usb3 = ubh_get_usb_third(uspi); - if ((mtype == UFS_MOUNT_UFSTYPE_44BSD && - (usb1->fs_flags & UFS_FLAGS_UPDATED)) || - mtype == UFS_MOUNT_UFSTYPE_UFS2) { + if (mtype == UFS_MOUNT_UFSTYPE_UFS2) { /*we have statistic in different place, then usual*/ usb2->fs_un.fs_u2.cs_ndir = cpu_to_fs64(sb, uspi->cs_total.cs_ndir); @@ -608,16 +606,26 @@ static void ufs_put_cstotal(struct super_block *sb) cpu_to_fs64(sb, uspi->cs_total.cs_nifree); usb3->fs_un1.fs_u2.cs_nffree = cpu_to_fs64(sb, uspi->cs_total.cs_nffree); - } else { - usb1->fs_cstotal.cs_ndir = - cpu_to_fs32(sb, uspi->cs_total.cs_ndir); - usb1->fs_cstotal.cs_nbfree = - cpu_to_fs32(sb, uspi->cs_total.cs_nbfree); - usb1->fs_cstotal.cs_nifree = - cpu_to_fs32(sb, uspi->cs_total.cs_nifree); - usb1->fs_cstotal.cs_nffree = - cpu_to_fs32(sb, uspi->cs_total.cs_nffree); + goto out; + } + + if (mtype == UFS_MOUNT_UFSTYPE_44BSD && + (usb2->fs_un.fs_u2.fs_maxbsize == usb1->fs_bsize)) { + /* store stats in both old and new places */ + usb2->fs_un.fs_u2.cs_ndir = + cpu_to_fs64(sb, uspi->cs_total.cs_ndir); + usb2->fs_un.fs_u2.cs_nbfree = + cpu_to_fs64(sb, uspi->cs_total.cs_nbfree); + usb3->fs_un1.fs_u2.cs_nifree = + cpu_to_fs64(sb, uspi->cs_total.cs_nifree); + usb3->fs_un1.fs_u2.cs_nffree = + cpu_to_fs64(sb, uspi->cs_total.cs_nffree); } + usb1->fs_cstotal.cs_ndir = cpu_to_fs32(sb, uspi->cs_total.cs_ndir); + usb1->fs_cstotal.cs_nbfree = cpu_to_fs32(sb, uspi->cs_total.cs_nbfree); + usb1->fs_cstotal.cs_nifree = cpu_to_fs32(sb, uspi->cs_total.cs_nifree); + usb1->fs_cstotal.cs_nffree = cpu_to_fs32(sb, uspi->cs_total.cs_nffree); +out: ubh_mark_buffer_dirty(USPI_UBH(uspi)); ufs_print_super_stuff(sb, usb1, usb2, usb3); UFSD("EXIT\n"); @@ -746,6 +754,23 @@ static void ufs_put_super(struct super_block *sb) return; } +static u64 ufs_max_bytes(struct super_block *sb) +{ + struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; + int bits = uspi->s_apbshift; + u64 res; + + if (bits > 21) + res = ~0ULL; + else + res = UFS_NDADDR + (1LL << bits) + (1LL << (2*bits)) + + (1LL << (3*bits)); + + if (res >= (MAX_LFS_FILESIZE >> uspi->s_bshift)) + return MAX_LFS_FILESIZE; + return res << uspi->s_bshift; +} + static int ufs_fill_super(struct super_block *sb, void *data, int silent) { struct ufs_sb_info * sbi; @@ -812,9 +837,8 @@ static int 
ufs_fill_super(struct super_block *sb, void *data, int silent) uspi->s_dirblksize = UFS_SECTOR_SIZE; super_block_offset=UFS_SBLOCK; - /* Keep 2Gig file limit. Some UFS variants need to override - this but as I don't know which I'll let those in the know loosen - the rules */ + sb->s_maxbytes = MAX_LFS_FILESIZE; + switch (sbi->s_mount_opt & UFS_MOUNT_UFSTYPE) { case UFS_MOUNT_UFSTYPE_44BSD: UFSD("ufstype=44bsd\n"); @@ -980,6 +1004,13 @@ again: flags |= UFS_ST_SUN; } + if ((flags & UFS_ST_MASK) == UFS_ST_44BSD && + uspi->s_postblformat == UFS_42POSTBLFMT) { + if (!silent) + pr_err("this is not a 44bsd filesystem"); + goto failed; + } + /* * Check ufs magic number */ @@ -1127,8 +1158,8 @@ magic_found: uspi->s_cgmask = fs32_to_cpu(sb, usb1->fs_cgmask); if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) { - uspi->s_u2_size = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_size); - uspi->s_u2_dsize = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_dsize); + uspi->s_size = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_size); + uspi->s_dsize = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_dsize); } else { uspi->s_size = fs32_to_cpu(sb, usb1->fs_size); uspi->s_dsize = fs32_to_cpu(sb, usb1->fs_dsize); @@ -1177,6 +1208,18 @@ magic_found: uspi->s_postbloff = fs32_to_cpu(sb, usb3->fs_postbloff); uspi->s_rotbloff = fs32_to_cpu(sb, usb3->fs_rotbloff); + uspi->s_root_blocks = mul_u64_u32_div(uspi->s_dsize, + uspi->s_minfree, 100); + if (uspi->s_minfree <= 5) { + uspi->s_time_to_space = ~0ULL; + uspi->s_space_to_time = 0; + usb1->fs_optim = cpu_to_fs32(sb, UFS_OPTSPACE); + } else { + uspi->s_time_to_space = (uspi->s_root_blocks / 2) + 1; + uspi->s_space_to_time = mul_u64_u32_div(uspi->s_dsize, + uspi->s_minfree - 2, 100) - 1; + } + /* * Compute another frequently used values */ @@ -1212,6 +1255,7 @@ magic_found: "fast symlink size (%u)\n", uspi->s_maxsymlinklen); uspi->s_maxsymlinklen = maxsymlen; } + sb->s_maxbytes = ufs_max_bytes(sb); sb->s_max_links = UFS_LINK_MAX; inode = ufs_iget(sb, UFS_ROOTINO); @@ -1365,19 +1409,17 @@ static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf) mutex_lock(&UFS_SB(sb)->s_lock); usb3 = ubh_get_usb_third(uspi); - if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) { + if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) buf->f_type = UFS2_MAGIC; - buf->f_blocks = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_dsize); - } else { + else buf->f_type = UFS_MAGIC; - buf->f_blocks = uspi->s_dsize; - } - buf->f_bfree = ufs_blkstofrags(uspi->cs_total.cs_nbfree) + - uspi->cs_total.cs_nffree; + + buf->f_blocks = uspi->s_dsize; + buf->f_bfree = ufs_freefrags(uspi); buf->f_ffree = uspi->cs_total.cs_nifree; buf->f_bsize = sb->s_blocksize; - buf->f_bavail = (buf->f_bfree > (((long)buf->f_blocks / 100) * uspi->s_minfree)) - ? (buf->f_bfree - (((long)buf->f_blocks / 100) * uspi->s_minfree)) : 0; + buf->f_bavail = (buf->f_bfree > uspi->s_root_blocks) + ? 
(buf->f_bfree - uspi->s_root_blocks) : 0; buf->f_files = uspi->s_ncg * uspi->s_ipg; buf->f_namelen = UFS_MAXNAMLEN; buf->f_fsid.val[0] = (u32)id; diff --git a/fs/ufs/ufs_fs.h b/fs/ufs/ufs_fs.h index 0cbd5d340b67..150eef6f1233 100644 --- a/fs/ufs/ufs_fs.h +++ b/fs/ufs/ufs_fs.h @@ -733,10 +733,8 @@ struct ufs_sb_private_info { __u32 s_dblkno; /* offset of first data after cg */ __u32 s_cgoffset; /* cylinder group offset in cylinder */ __u32 s_cgmask; /* used to calc mod fs_ntrak */ - __u32 s_size; /* number of blocks (fragments) in fs */ - __u32 s_dsize; /* number of data blocks in fs */ - __u64 s_u2_size; /* ufs2: number of blocks (fragments) in fs */ - __u64 s_u2_dsize; /*ufs2: number of data blocks in fs */ + __u64 s_size; /* number of blocks (fragments) in fs */ + __u64 s_dsize; /* number of data blocks in fs */ __u32 s_ncg; /* number of cylinder groups */ __u32 s_bsize; /* size of basic blocks */ __u32 s_fsize; /* size of fragments */ @@ -793,6 +791,9 @@ struct ufs_sb_private_info { __u32 s_maxsymlinklen;/* upper limit on fast symlinks' size */ __s32 fs_magic; /* filesystem magic */ unsigned int s_dirblksize; + __u64 s_root_blocks; + __u64 s_time_to_space; + __u64 s_space_to_time; }; /* diff --git a/fs/ufs/util.c b/fs/ufs/util.c index f41ad0a6106f..02497a492eb2 100644 --- a/fs/ufs/util.c +++ b/fs/ufs/util.c @@ -243,9 +243,8 @@ ufs_set_inode_dev(struct super_block *sb, struct ufs_inode_info *ufsi, dev_t dev struct page *ufs_get_locked_page(struct address_space *mapping, pgoff_t index) { - struct page *page; - - page = find_lock_page(mapping, index); + struct inode *inode = mapping->host; + struct page *page = find_lock_page(mapping, index); if (!page) { page = read_mapping_page(mapping, index, NULL); @@ -253,7 +252,7 @@ struct page *ufs_get_locked_page(struct address_space *mapping, printk(KERN_ERR "ufs_change_blocknr: " "read_mapping_page error: ino %lu, index: %lu\n", mapping->host->i_ino, index); - goto out; + return page; } lock_page(page); @@ -262,8 +261,7 @@ struct page *ufs_get_locked_page(struct address_space *mapping, /* Truncate got there first */ unlock_page(page); put_page(page); - page = NULL; - goto out; + return NULL; } if (!PageUptodate(page) || PageError(page)) { @@ -272,11 +270,12 @@ struct page *ufs_get_locked_page(struct address_space *mapping, printk(KERN_ERR "ufs_change_blocknr: " "can not read page: ino %lu, index: %lu\n", - mapping->host->i_ino, index); + inode->i_ino, index); - page = ERR_PTR(-EIO); + return ERR_PTR(-EIO); } } -out: + if (!page_has_buffers(page)) + create_empty_buffers(page, 1 << inode->i_blkbits, 0); return page; } diff --git a/fs/ufs/util.h b/fs/ufs/util.h index b7fbf53dbc81..9fc7119a1551 100644 --- a/fs/ufs/util.h +++ b/fs/ufs/util.h @@ -350,16 +350,11 @@ static inline void *ubh_get_data_ptr(struct ufs_sb_private_info *uspi, #define ubh_blkmap(ubh,begin,bit) \ ((*ubh_get_addr(ubh, (begin) + ((bit) >> 3)) >> ((bit) & 7)) & (0xff >> (UFS_MAXFRAG - uspi->s_fpb))) -/* - * Determine the number of available frags given a - * percentage to hold in reserve. 
- */ static inline u64 -ufs_freespace(struct ufs_sb_private_info *uspi, int percentreserved) +ufs_freefrags(struct ufs_sb_private_info *uspi) { return ufs_blkstofrags(uspi->cs_total.cs_nbfree) + - uspi->cs_total.cs_nffree - - (uspi->s_dsize * (percentreserved) / 100); + uspi->cs_total.cs_nffree; } /* @@ -473,15 +468,19 @@ static inline unsigned _ubh_find_last_zero_bit_( static inline int _ubh_isblockset_(struct ufs_sb_private_info * uspi, struct ufs_buffer_head * ubh, unsigned begin, unsigned block) { + u8 mask; switch (uspi->s_fpb) { case 8: return (*ubh_get_addr (ubh, begin + block) == 0xff); case 4: - return (*ubh_get_addr (ubh, begin + (block >> 1)) == (0x0f << ((block & 0x01) << 2))); + mask = 0x0f << ((block & 0x01) << 2); + return (*ubh_get_addr (ubh, begin + (block >> 1)) & mask) == mask; case 2: - return (*ubh_get_addr (ubh, begin + (block >> 2)) == (0x03 << ((block & 0x03) << 1))); + mask = 0x03 << ((block & 0x03) << 1); + return (*ubh_get_addr (ubh, begin + (block >> 2)) & mask) == mask; case 1: - return (*ubh_get_addr (ubh, begin + (block >> 3)) == (0x01 << (block & 0x07))); + mask = 0x01 << (block & 0x07); + return (*ubh_get_addr (ubh, begin + (block >> 3)) & mask) == mask; } return 0; } diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index f7555fc25877..6148ccd6cccf 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -81,7 +81,7 @@ struct userfaultfd_unmap_ctx { struct userfaultfd_wait_queue { struct uffd_msg msg; - wait_queue_t wq; + wait_queue_entry_t wq; struct userfaultfd_ctx *ctx; bool waken; }; @@ -91,7 +91,7 @@ struct userfaultfd_wake_range { unsigned long len; }; -static int userfaultfd_wake_function(wait_queue_t *wq, unsigned mode, +static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode, int wake_flags, void *key) { struct userfaultfd_wake_range *range = key; @@ -129,7 +129,7 @@ static int userfaultfd_wake_function(wait_queue_t *wq, unsigned mode, * wouldn't be enough, the smp_mb__before_spinlock is * enough to avoid an explicit smp_mb() here. */ - list_del_init(&wq->task_list); + list_del_init(&wq->entry); out: return ret; } @@ -340,9 +340,28 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason) bool must_wait, return_to_userland; long blocking_state; - BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); - ret = VM_FAULT_SIGBUS; + + /* + * We don't do userfault handling for the final child pid update. + * + * We also don't do userfault handling during + * coredumping. hugetlbfs has the special + * follow_hugetlb_page() to skip missing pages in the + * FOLL_DUMP case, anon memory also checks for FOLL_DUMP with + * the no_page_table() helper in follow_page_mask(), but the + * shmem_vm_ops->fault method is invoked even during + * coredumping without mmap_sem and it ends up here. + */ + if (current->flags & (PF_EXITING|PF_DUMPCORE)) + goto out; + + /* + * Coredumping runs without mmap_sem so we can only check that + * the mmap_sem is held, if PF_DUMPCORE was not set. + */ + WARN_ON_ONCE(!rwsem_is_locked(&mm->mmap_sem)); + ctx = vmf->vma->vm_userfaultfd_ctx.ctx; if (!ctx) goto out; @@ -361,12 +380,6 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason) goto out; /* - * We don't do userfault handling for the final child pid update. - */ - if (current->flags & PF_EXITING) - goto out; - - /* * Check that we can return VM_FAULT_RETRY. * * NOTE: it should become possible to return VM_FAULT_RETRY @@ -509,13 +522,13 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason) * and it's fine not to block on the spinlock. 
The uwq on this * kernel stack can be released after the list_del_init. */ - if (!list_empty_careful(&uwq.wq.task_list)) { + if (!list_empty_careful(&uwq.wq.entry)) { spin_lock(&ctx->fault_pending_wqh.lock); /* * No need of list_del_init(), the uwq on the stack * will be freed shortly anyway. */ - list_del(&uwq.wq.task_list); + list_del(&uwq.wq.entry); spin_unlock(&ctx->fault_pending_wqh.lock); } @@ -847,7 +860,7 @@ wakeup: static inline struct userfaultfd_wait_queue *find_userfault_in( wait_queue_head_t *wqh) { - wait_queue_t *wq; + wait_queue_entry_t *wq; struct userfaultfd_wait_queue *uwq; VM_BUG_ON(!spin_is_locked(&wqh->lock)); @@ -856,7 +869,7 @@ static inline struct userfaultfd_wait_queue *find_userfault_in( if (!waitqueue_active(wqh)) goto out; /* walk in reverse to provide FIFO behavior to read userfaults */ - wq = list_last_entry(&wqh->task_list, typeof(*wq), task_list); + wq = list_last_entry(&wqh->head, typeof(*wq), entry); uwq = container_of(wq, struct userfaultfd_wait_queue, wq); out: return uwq; @@ -990,14 +1003,14 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, * changes __remove_wait_queue() to use * list_del_init() in turn breaking the * !list_empty_careful() check in - * handle_userfault(). The uwq->wq.task_list + * handle_userfault(). The uwq->wq.head list * must never be empty at any time during the * refile, or the waitqueue could disappear * from under us. The "wait_queue_head_t" * parameter of __remove_wait_queue() is unused * anyway. */ - list_del(&uwq->wq.task_list); + list_del(&uwq->wq.entry); __add_wait_queue(&ctx->fault_wqh, &uwq->wq); write_seqcount_end(&ctx->refile_seq); @@ -1019,7 +1032,7 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, fork_nctx = (struct userfaultfd_ctx *) (unsigned long) uwq->msg.arg.reserved.reserved1; - list_move(&uwq->wq.task_list, &fork_event); + list_move(&uwq->wq.entry, &fork_event); spin_unlock(&ctx->event_wqh.lock); ret = 0; break; @@ -1056,8 +1069,8 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, if (!list_empty(&fork_event)) { uwq = list_first_entry(&fork_event, typeof(*uwq), - wq.task_list); - list_del(&uwq->wq.task_list); + wq.entry); + list_del(&uwq->wq.entry); __add_wait_queue(&ctx->event_wqh, &uwq->wq); userfaultfd_event_complete(ctx, uwq); } @@ -1734,17 +1747,17 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd, static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f) { struct userfaultfd_ctx *ctx = f->private_data; - wait_queue_t *wq; + wait_queue_entry_t *wq; struct userfaultfd_wait_queue *uwq; unsigned long pending = 0, total = 0; spin_lock(&ctx->fault_pending_wqh.lock); - list_for_each_entry(wq, &ctx->fault_pending_wqh.task_list, task_list) { + list_for_each_entry(wq, &ctx->fault_pending_wqh.head, entry) { uwq = container_of(wq, struct userfaultfd_wait_queue, wq); pending++; total++; } - list_for_each_entry(wq, &ctx->fault_wqh.task_list, task_list) { + list_for_each_entry(wq, &ctx->fault_wqh.head, entry) { uwq = container_of(wq, struct userfaultfd_wait_queue, wq); total++; } diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 5c90f82b8f6b..a6e955bfead8 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -98,8 +98,7 @@ xfs-y += xfs_aops.o \ xfs_sysfs.o \ xfs_trans.o \ xfs_xattr.o \ - kmem.o \ - uuid.o + kmem.o # low-level transaction/log code xfs-y += xfs_log.o \ diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index f02eb7673392..a7048eafa8e6 100644 --- 
a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -1280,7 +1280,6 @@ xfs_bmap_read_extents( xfs_bmbt_rec_t *frp; xfs_fsblock_t nextbno; xfs_extnum_t num_recs; - xfs_extnum_t start; num_recs = xfs_btree_get_numrecs(block); if (unlikely(i + num_recs > room)) { @@ -1303,7 +1302,6 @@ xfs_bmap_read_extents( * Copy records into the extent records. */ frp = XFS_BMBT_REC_ADDR(mp, block, 1); - start = i; for (j = 0; j < num_recs; j++, i++, frp++) { xfs_bmbt_rec_host_t *trp = xfs_iext_get_ext(ifp, i); trp->l0 = be64_to_cpu(frp->l0); @@ -2065,8 +2063,10 @@ xfs_bmap_add_extent_delay_real( } temp = xfs_bmap_worst_indlen(bma->ip, temp); temp2 = xfs_bmap_worst_indlen(bma->ip, temp2); - diff = (int)(temp + temp2 - startblockval(PREV.br_startblock) - - (bma->cur ? bma->cur->bc_private.b.allocated : 0)); + diff = (int)(temp + temp2 - + (startblockval(PREV.br_startblock) - + (bma->cur ? + bma->cur->bc_private.b.allocated : 0))); if (diff > 0) { error = xfs_mod_fdblocks(bma->ip->i_mount, -((int64_t)diff), false); @@ -2123,7 +2123,6 @@ xfs_bmap_add_extent_delay_real( temp = da_new; if (bma->cur) temp += bma->cur->bc_private.b.allocated; - ASSERT(temp <= da_old); if (temp < da_old) xfs_mod_fdblocks(bma->ip->i_mount, (int64_t)(da_old - temp), false); diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 5392674bf893..3a673ba201aa 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -4395,7 +4395,7 @@ xfs_btree_visit_blocks( xfs_btree_readahead_ptr(cur, ptr, 1); /* save for the next iteration of the loop */ - lptr = *ptr; + xfs_btree_copy_ptrs(cur, &lptr, ptr, 1); } /* for each buffer in the level */ diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index b177ef33cd4c..82a38d86ebad 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -1629,13 +1629,28 @@ xfs_refcount_recover_cow_leftovers( if (mp->m_sb.sb_agblocks >= XFS_REFC_COW_START) return -EOPNOTSUPP; - error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp); + INIT_LIST_HEAD(&debris); + + /* + * In this first part, we use an empty transaction to gather up + * all the leftover CoW extents so that we can subsequently + * delete them. The empty transaction is used to avoid + * a buffer lock deadlock if there happens to be a loop in the + * refcountbt because we're allowed to re-grab a buffer that is + * already attached to our transaction. When we're done + * recording the CoW debris we cancel the (empty) transaction + * and everything goes away cleanly. + */ + error = xfs_trans_alloc_empty(mp, &tp); if (error) return error; - cur = xfs_refcountbt_init_cursor(mp, NULL, agbp, agno, NULL); + + error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp); + if (error) + goto out_trans; + cur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno, NULL); /* Find all the leftover CoW staging extents. */ - INIT_LIST_HEAD(&debris); memset(&low, 0, sizeof(low)); memset(&high, 0, sizeof(high)); low.rc.rc_startblock = XFS_REFC_COW_START; @@ -1645,10 +1660,11 @@ xfs_refcount_recover_cow_leftovers( if (error) goto out_cursor; xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); - xfs_buf_relse(agbp); + xfs_trans_brelse(tp, agbp); + xfs_trans_cancel(tp); /* Now iterate the list to free the leftovers */ - list_for_each_entry(rr, &debris, rr_list) { + list_for_each_entry_safe(rr, n, &debris, rr_list) { /* Set up transaction. 
*/ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0, &tp); if (error) @@ -1676,8 +1692,16 @@ xfs_refcount_recover_cow_leftovers( error = xfs_trans_commit(tp); if (error) goto out_free; + + list_del(&rr->rr_list); + kmem_free(rr); } + return error; +out_defer: + xfs_defer_cancel(&dfops); +out_trans: + xfs_trans_cancel(tp); out_free: /* Free the leftover list */ list_for_each_entry_safe(rr, n, &debris, rr_list) { @@ -1688,11 +1712,6 @@ out_free: out_cursor: xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); - xfs_buf_relse(agbp); - goto out_free; - -out_defer: - xfs_defer_cancel(&dfops); - xfs_trans_cancel(tp); - goto out_free; + xfs_trans_brelse(tp, agbp); + goto out_trans; } diff --git a/fs/xfs/uuid.c b/fs/xfs/uuid.c deleted file mode 100644 index b83f76b6d410..000000000000 --- a/fs/xfs/uuid.c +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include <xfs.h> - -/* IRIX interpretation of an uuid_t */ -typedef struct { - __be32 uu_timelow; - __be16 uu_timemid; - __be16 uu_timehi; - __be16 uu_clockseq; - __be16 uu_node[3]; -} xfs_uu_t; - -/* - * uuid_getnodeuniq - obtain the node unique fields of a UUID. - * - * This is not in any way a standard or condoned UUID function; - * it just something that's needed for user-level file handles. - */ -void -uuid_getnodeuniq(uuid_t *uuid, int fsid [2]) -{ - xfs_uu_t *uup = (xfs_uu_t *)uuid; - - fsid[0] = (be16_to_cpu(uup->uu_clockseq) << 16) | - be16_to_cpu(uup->uu_timemid); - fsid[1] = be32_to_cpu(uup->uu_timelow); -} - -int -uuid_is_nil(uuid_t *uuid) -{ - int i; - char *cp = (char *)uuid; - - if (uuid == NULL) - return 0; - /* implied check of version number here... */ - for (i = 0; i < sizeof *uuid; i++) - if (*cp++) return 0; /* not nil */ - return 1; /* is nil */ -} - -int -uuid_equal(uuid_t *uuid1, uuid_t *uuid2) -{ - return memcmp(uuid1, uuid2, sizeof(uuid_t)) ? 0 : 1; -} diff --git a/fs/xfs/uuid.h b/fs/xfs/uuid.h deleted file mode 100644 index 104db0f3bed6..000000000000 --- a/fs/xfs/uuid.h +++ /dev/null @@ -1,35 +0,0 @@ -/* - * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_SUPPORT_UUID_H__ -#define __XFS_SUPPORT_UUID_H__ - -typedef struct { - unsigned char __u_bits[16]; -} uuid_t; - -extern int uuid_is_nil(uuid_t *uuid); -extern int uuid_equal(uuid_t *uuid1, uuid_t *uuid2); -extern void uuid_getnodeuniq(uuid_t *uuid, int fsid [2]); - -static inline void -uuid_copy(uuid_t *dst, uuid_t *src) -{ - memcpy(dst, src, sizeof(uuid_t)); -} - -#endif /* __XFS_SUPPORT_UUID_H__ */ diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 09af0f7cd55e..d20c29b9c95b 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -276,7 +276,7 @@ xfs_end_io( struct xfs_inode *ip = XFS_I(ioend->io_inode); xfs_off_t offset = ioend->io_offset; size_t size = ioend->io_size; - int error = ioend->io_bio->bi_error; + int error; /* * Just clean up the in-memory strutures if the fs has been shut down. @@ -289,6 +289,7 @@ xfs_end_io( /* * Clean up any COW blocks on an I/O error. */ + error = blk_status_to_errno(ioend->io_bio->bi_status); if (unlikely(error)) { switch (ioend->io_type) { case XFS_IO_COW: @@ -332,7 +333,7 @@ xfs_end_bio( else if (ioend->io_append_trans) queue_work(mp->m_data_workqueue, &ioend->io_work); else - xfs_destroy_ioend(ioend, bio->bi_error); + xfs_destroy_ioend(ioend, blk_status_to_errno(bio->bi_status)); } STATIC int @@ -500,11 +501,12 @@ xfs_submit_ioend( * time. */ if (status) { - ioend->io_bio->bi_error = status; + ioend->io_bio->bi_status = errno_to_blk_status(status); bio_endio(ioend->io_bio); return status; } + ioend->io_bio->bi_write_hint = ioend->io_inode->i_write_hint; submit_bio(ioend->io_bio); return 0; } @@ -564,6 +566,7 @@ xfs_chain_bio( bio_chain(ioend->io_bio, new); bio_get(ioend->io_bio); /* for xfs_destroy_ioend */ ioend->io_bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc); + ioend->io_bio->bi_write_hint = ioend->io_inode->i_write_hint; submit_bio(ioend->io_bio); ioend->io_bio = new; } @@ -1316,9 +1319,12 @@ xfs_vm_bmap( * The swap code (ab-)uses ->bmap to get a block mapping and then * bypasses the file system for actual I/O. We really can't allow * that on reflinks inodes, so we have to skip out here. And yes, - * 0 is the magic code for a bmap error.. + * 0 is the magic code for a bmap error. + * + * Since we don't pass back blockdev info, we can't return bmap + * information for rt files either. */ - if (xfs_is_reflink_inode(ip)) + if (xfs_is_reflink_inode(ip) || XFS_IS_REALTIME_INODE(ip)) return 0; filemap_write_and_wait(mapping); diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 2b954308a1d6..9e3cc2146d5b 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -582,9 +582,13 @@ xfs_getbmap( } break; default: + /* Local format data forks report no extents. */ + if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL) { + bmv->bmv_entries = 0; + return 0; + } if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS && - ip->i_d.di_format != XFS_DINODE_FMT_BTREE && - ip->i_d.di_format != XFS_DINODE_FMT_LOCAL) + ip->i_d.di_format != XFS_DINODE_FMT_BTREE) return -EINVAL; if (xfs_get_extsz_hint(ip) || @@ -712,7 +716,7 @@ xfs_getbmap( * extents. 
*/ if (map[i].br_startblock == DELAYSTARTBLOCK && - map[i].br_startoff <= XFS_B_TO_FSB(mp, XFS_ISIZE(ip))) + map[i].br_startoff < XFS_B_TO_FSB(mp, XFS_ISIZE(ip))) ASSERT((iflags & BMV_IF_DELALLOC) != 0); if (map[i].br_startblock == HOLESTARTBLOCK && diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 62fa39276a24..438505f395e7 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -97,12 +97,16 @@ static inline void xfs_buf_ioacct_inc( struct xfs_buf *bp) { - if (bp->b_flags & (XBF_NO_IOACCT|_XBF_IN_FLIGHT)) + if (bp->b_flags & XBF_NO_IOACCT) return; ASSERT(bp->b_flags & XBF_ASYNC); - bp->b_flags |= _XBF_IN_FLIGHT; - percpu_counter_inc(&bp->b_target->bt_io_count); + spin_lock(&bp->b_lock); + if (!(bp->b_state & XFS_BSTATE_IN_FLIGHT)) { + bp->b_state |= XFS_BSTATE_IN_FLIGHT; + percpu_counter_inc(&bp->b_target->bt_io_count); + } + spin_unlock(&bp->b_lock); } /* @@ -110,14 +114,24 @@ xfs_buf_ioacct_inc( * freed and unaccount from the buftarg. */ static inline void -xfs_buf_ioacct_dec( +__xfs_buf_ioacct_dec( struct xfs_buf *bp) { - if (!(bp->b_flags & _XBF_IN_FLIGHT)) - return; + lockdep_assert_held(&bp->b_lock); - bp->b_flags &= ~_XBF_IN_FLIGHT; - percpu_counter_dec(&bp->b_target->bt_io_count); + if (bp->b_state & XFS_BSTATE_IN_FLIGHT) { + bp->b_state &= ~XFS_BSTATE_IN_FLIGHT; + percpu_counter_dec(&bp->b_target->bt_io_count); + } +} + +static inline void +xfs_buf_ioacct_dec( + struct xfs_buf *bp) +{ + spin_lock(&bp->b_lock); + __xfs_buf_ioacct_dec(bp); + spin_unlock(&bp->b_lock); } /* @@ -149,9 +163,9 @@ xfs_buf_stale( * unaccounted (released to LRU) before that occurs. Drop in-flight * status now to preserve accounting consistency. */ - xfs_buf_ioacct_dec(bp); - spin_lock(&bp->b_lock); + __xfs_buf_ioacct_dec(bp); + atomic_set(&bp->b_lru_ref, 0); if (!(bp->b_state & XFS_BSTATE_DISPOSE) && (list_lru_del(&bp->b_target->bt_lru, &bp->b_lru))) @@ -979,12 +993,12 @@ xfs_buf_rele( * ensures the decrement occurs only once per-buf. */ if ((atomic_read(&bp->b_hold) == 1) && !list_empty(&bp->b_lru)) - xfs_buf_ioacct_dec(bp); + __xfs_buf_ioacct_dec(bp); goto out_unlock; } /* the last reference has been dropped ... */ - xfs_buf_ioacct_dec(bp); + __xfs_buf_ioacct_dec(bp); if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) { /* * If the buffer is added to the LRU take a new reference to the @@ -1213,8 +1227,11 @@ xfs_buf_bio_end_io( * don't overwrite existing errors - otherwise we can lose errors on * buffers that require multiple bios to complete. */ - if (bio->bi_error) - cmpxchg(&bp->b_io_error, 0, bio->bi_error); + if (bio->bi_status) { + int error = blk_status_to_errno(bio->bi_status); + + cmpxchg(&bp->b_io_error, 0, error); + } if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ)) invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp)); diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 8d1d44f87ce9..1508121f29f2 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -63,7 +63,6 @@ typedef enum { #define _XBF_KMEM (1 << 21)/* backed by heap memory */ #define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */ #define _XBF_COMPOUND (1 << 23)/* compound buffer */ -#define _XBF_IN_FLIGHT (1 << 25) /* I/O in flight, for accounting purposes */ typedef unsigned int xfs_buf_flags_t; @@ -84,14 +83,14 @@ typedef unsigned int xfs_buf_flags_t; { _XBF_PAGES, "PAGES" }, \ { _XBF_KMEM, "KMEM" }, \ { _XBF_DELWRI_Q, "DELWRI_Q" }, \ - { _XBF_COMPOUND, "COMPOUND" }, \ - { _XBF_IN_FLIGHT, "IN_FLIGHT" } + { _XBF_COMPOUND, "COMPOUND" } /* * Internal state flags. 
*/ #define XFS_BSTATE_DISPOSE (1 << 0) /* buffer being discarded */ +#define XFS_BSTATE_IN_FLIGHT (1 << 1) /* I/O in flight */ /* * The xfs_buftarg contains 2 notions of "sector size" - diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 35703a801372..17f27a2fb5e2 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -237,7 +237,11 @@ xfs_file_dax_read( if (!count) return 0; /* skip atime */ - xfs_ilock(ip, XFS_IOLOCK_SHARED); + if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) { + if (iocb->ki_flags & IOCB_NOWAIT) + return -EAGAIN; + xfs_ilock(ip, XFS_IOLOCK_SHARED); + } ret = dax_iomap_rw(iocb, to, &xfs_iomap_ops); xfs_iunlock(ip, XFS_IOLOCK_SHARED); @@ -541,7 +545,11 @@ xfs_file_dio_aio_write( iolock = XFS_IOLOCK_SHARED; } - xfs_ilock(ip, iolock); + if (!xfs_ilock_nowait(ip, iolock)) { + if (iocb->ki_flags & IOCB_NOWAIT) + return -EAGAIN; + xfs_ilock(ip, iolock); + } ret = xfs_file_aio_write_checks(iocb, from, &iolock); if (ret) @@ -553,9 +561,15 @@ xfs_file_dio_aio_write( * otherwise demote the lock if we had to take the exclusive lock * for other reasons in xfs_file_aio_write_checks. */ - if (unaligned_io) - inode_dio_wait(inode); - else if (iolock == XFS_IOLOCK_EXCL) { + if (unaligned_io) { + /* If we are going to wait for other DIO to finish, bail */ + if (iocb->ki_flags & IOCB_NOWAIT) { + if (atomic_read(&inode->i_dio_count)) + return -EAGAIN; + } else { + inode_dio_wait(inode); + } + } else if (iolock == XFS_IOLOCK_EXCL) { xfs_ilock_demote(ip, XFS_IOLOCK_EXCL); iolock = XFS_IOLOCK_SHARED; } @@ -585,7 +599,12 @@ xfs_file_dax_write( size_t count; loff_t pos; - xfs_ilock(ip, iolock); + if (!xfs_ilock_nowait(ip, iolock)) { + if (iocb->ki_flags & IOCB_NOWAIT) + return -EAGAIN; + xfs_ilock(ip, iolock); + } + ret = xfs_file_aio_write_checks(iocb, from, &iolock); if (ret) goto out; @@ -892,6 +911,7 @@ xfs_file_open( return -EFBIG; if (XFS_FORCED_SHUTDOWN(XFS_M(inode->i_sb))) return -EIO; + file->f_mode |= FMODE_AIO_NOWAIT; return 0; } @@ -1043,49 +1063,17 @@ xfs_find_get_desired_pgoff( index = startoff >> PAGE_SHIFT; endoff = XFS_FSB_TO_B(mp, map->br_startoff + map->br_blockcount); - end = endoff >> PAGE_SHIFT; + end = (endoff - 1) >> PAGE_SHIFT; do { int want; unsigned nr_pages; unsigned int i; - want = min_t(pgoff_t, end - index, PAGEVEC_SIZE); + want = min_t(pgoff_t, end - index, PAGEVEC_SIZE - 1) + 1; nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index, want); - /* - * No page mapped into given range. If we are searching holes - * and if this is the first time we got into the loop, it means - * that the given offset is landed in a hole, return it. - * - * If we have already stepped through some block buffers to find - * holes but they all contains data. In this case, the last - * offset is already updated and pointed to the end of the last - * mapped page, if it does not reach the endpoint to search, - * that means there should be a hole between them. - */ - if (nr_pages == 0) { - /* Data search found nothing */ - if (type == DATA_OFF) - break; - - ASSERT(type == HOLE_OFF); - if (lastoff == startoff || lastoff < endoff) { - found = true; - *offset = lastoff; - } - break; - } - - /* - * At lease we found one page. If this is the first time we - * step into the loop, and if the first page index offset is - * greater than the given search offset, a hole was found. 
- */ - if (type == HOLE_OFF && lastoff == startoff && - lastoff < page_offset(pvec.pages[0])) { - found = true; + if (nr_pages == 0) break; - } for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; @@ -1098,18 +1086,18 @@ xfs_find_get_desired_pgoff( * file mapping. However, page->index will not change * because we have a reference on the page. * - * Searching done if the page index is out of range. - * If the current offset is not reaches the end of - * the specified search range, there should be a hole - * between them. + * If current page offset is beyond where we've ended, + * we've found a hole. */ - if (page->index > end) { - if (type == HOLE_OFF && lastoff < endoff) { - *offset = lastoff; - found = true; - } + if (type == HOLE_OFF && lastoff < endoff && + lastoff < page_offset(pvec.pages[i])) { + found = true; + *offset = lastoff; goto out; } + /* Searching done if the page index is out of range. */ + if (page->index > end) + goto out; lock_page(page); /* @@ -1151,21 +1139,20 @@ xfs_find_get_desired_pgoff( /* * The number of returned pages less than our desired, search - * done. In this case, nothing was found for searching data, - * but we found a hole behind the last offset. + * done. */ - if (nr_pages < want) { - if (type == HOLE_OFF) { - *offset = lastoff; - found = true; - } + if (nr_pages < want) break; - } index = pvec.pages[i - 1]->index + 1; pagevec_release(&pvec); } while (index <= end); + /* No page at lastoff and we are not done - we found a hole. */ + if (type == HOLE_OFF && lastoff < endoff) { + *offset = lastoff; + found = true; + } out: pagevec_release(&pvec); return found; diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index 3683819887a5..814ed729881d 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -828,6 +828,7 @@ xfs_getfsmap( struct xfs_fsmap dkeys[2]; /* per-dev keys */ struct xfs_getfsmap_dev handlers[XFS_GETFSMAP_DEVS]; struct xfs_getfsmap_info info = { NULL }; + bool use_rmap; int i; int error = 0; @@ -837,12 +838,14 @@ xfs_getfsmap( !xfs_getfsmap_is_valid_device(mp, &head->fmh_keys[1])) return -EINVAL; + use_rmap = capable(CAP_SYS_ADMIN) && + xfs_sb_version_hasrmapbt(&mp->m_sb); head->fmh_entries = 0; /* Set up our device handlers. 
*/ memset(handlers, 0, sizeof(handlers)); handlers[0].dev = new_encode_dev(mp->m_ddev_targp->bt_dev); - if (xfs_sb_version_hasrmapbt(&mp->m_sb)) + if (use_rmap) handlers[0].fn = xfs_getfsmap_datadev_rmapbt; else handlers[0].fn = xfs_getfsmap_datadev_bnobt; diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index f61c84f8e31a..b9c12e1cc23a 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -66,7 +66,6 @@ xfs_inode_alloc( XFS_STATS_INC(mp, vn_active); ASSERT(atomic_read(&ip->i_pincount) == 0); - ASSERT(!spin_is_locked(&ip->i_flags_lock)); ASSERT(!xfs_isiflocked(ip)); ASSERT(ip->i_ino == 0); @@ -190,7 +189,7 @@ xfs_perag_set_reclaim_tag( { struct xfs_mount *mp = pag->pag_mount; - ASSERT(spin_is_locked(&pag->pag_ici_lock)); + lockdep_assert_held(&pag->pag_ici_lock); if (pag->pag_ici_reclaimable++) return; @@ -212,7 +211,7 @@ xfs_perag_clear_reclaim_tag( { struct xfs_mount *mp = pag->pag_mount; - ASSERT(spin_is_locked(&pag->pag_ici_lock)); + lockdep_assert_held(&pag->pag_ici_lock); if (--pag->pag_ici_reclaimable) return; @@ -270,12 +269,12 @@ xfs_inew_wait( DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_INEW_BIT); do { - prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); + prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); if (!xfs_iflags_test(ip, XFS_INEW)) break; schedule(); } while (true); - finish_wait(wq, &wait.wait); + finish_wait(wq, &wait.wq_entry); } /* diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index ec9826c56500..c0a1e840a588 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -622,12 +622,12 @@ __xfs_iflock( DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT); do { - prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE); + prepare_to_wait_exclusive(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); if (xfs_isiflocked(ip)) io_schedule(); } while (!xfs_iflock_nowait(ip)); - finish_wait(wq, &wait.wait); + finish_wait(wq, &wait.wq_entry); } STATIC uint @@ -2486,11 +2486,11 @@ __xfs_iunpin_wait( xfs_iunpin(ip); do { - prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); + prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); if (xfs_ipincount(ip)) io_schedule(); } while (xfs_ipincount(ip)); - finish_wait(wq, &wait.wait); + finish_wait(wq, &wait.wq_entry); } void diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 08cb7d1a4a3a..013cc78d7daf 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -834,9 +834,7 @@ xfs_inode_item_format_convert( in_f->ilf_dsize = in_f32->ilf_dsize; in_f->ilf_ino = in_f32->ilf_ino; /* copy biggest field of ilf_u */ - memcpy(in_f->ilf_u.ilfu_uuid.__u_bits, - in_f32->ilf_u.ilfu_uuid.__u_bits, - sizeof(uuid_t)); + uuid_copy(&in_f->ilf_u.ilfu_uuid, &in_f32->ilf_u.ilfu_uuid); in_f->ilf_blkno = in_f32->ilf_blkno; in_f->ilf_len = in_f32->ilf_len; in_f->ilf_boffset = in_f32->ilf_boffset; @@ -851,9 +849,7 @@ xfs_inode_item_format_convert( in_f->ilf_dsize = in_f64->ilf_dsize; in_f->ilf_ino = in_f64->ilf_ino; /* copy biggest field of ilf_u */ - memcpy(in_f->ilf_u.ilfu_uuid.__u_bits, - in_f64->ilf_u.ilfu_uuid.__u_bits, - sizeof(uuid_t)); + uuid_copy(&in_f->ilf_u.ilfu_uuid, &in_f64->ilf_u.ilfu_uuid); in_f->ilf_blkno = in_f64->ilf_blkno; in_f->ilf_len = in_f64->ilf_len; in_f->ilf_boffset = in_f64->ilf_boffset; diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index a63f61c256bd..05dc87e8c1f5 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -995,6 +995,11 @@ xfs_file_iomap_begin( lockmode = xfs_ilock_data_map_shared(ip); } + if ((flags & IOMAP_NOWAIT) && !(ip->i_df.if_flags & 
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index a63f61c256bd..05dc87e8c1f5 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -995,6 +995,11 @@ xfs_file_iomap_begin(
 		lockmode = xfs_ilock_data_map_shared(ip);
 	}
 
+	if ((flags & IOMAP_NOWAIT) && !(ip->i_df.if_flags & XFS_IFEXTENTS)) {
+		error = -EAGAIN;
+		goto out_unlock;
+	}
+
 	ASSERT(offset <= mp->m_super->s_maxbytes);
 	if ((xfs_fsize_t)offset + length > mp->m_super->s_maxbytes)
 		length = mp->m_super->s_maxbytes - offset;
@@ -1016,6 +1021,15 @@ xfs_file_iomap_begin(
 	if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && xfs_is_reflink_inode(ip)) {
 		if (flags & IOMAP_DIRECT) {
+			/*
+			 * A reflinked inode will result in CoW alloc.
+			 * FIXME: It could still overwrite on unshared extents
+			 * and not need allocation.
+			 */
+			if (flags & IOMAP_NOWAIT) {
+				error = -EAGAIN;
+				goto out_unlock;
+			}
 			/* may drop and re-acquire the ilock */
 			error = xfs_reflink_allocate_cow(ip, &imap, &shared,
 					&lockmode);
@@ -1033,6 +1047,14 @@ xfs_file_iomap_begin(
 
 	if ((flags & IOMAP_WRITE) && imap_needs_alloc(inode, &imap, nimaps)) {
 		/*
+		 * If nowait is set bail since we are going to make
+		 * allocations.
+		 */
+		if (flags & IOMAP_NOWAIT) {
+			error = -EAGAIN;
+			goto out_unlock;
+		}
+		/*
 		 * We cap the maximum length we map here to MAX_WRITEBACK_PAGES
 		 * pages to keep the chunks of work done where somewhat symmetric
 		 * with the work writeback does. This is a completely arbitrary
@@ -1068,7 +1090,7 @@ xfs_file_iomap_begin(
 	/* optionally associate a dax device with the iomap bdev */
 	bdev = iomap->bdev;
 	if (blk_queue_dax(bdev->bd_queue))
-		iomap->dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
+		iomap->dax_dev = fs_dax_get_by_host(bdev->bd_disk->disk_name);
 	else
 		iomap->dax_dev = NULL;
 
@@ -1149,7 +1171,7 @@ xfs_file_iomap_end(
 	unsigned		flags,
 	struct iomap		*iomap)
 {
-	put_dax(iomap->dax_dev);
+	fs_put_dax(iomap->dax_dev);
 	if ((flags & IOMAP_WRITE) && iomap->type == IOMAP_DELALLOC)
 		return xfs_file_iomap_end_delalloc(XFS_I(inode), offset,
 				length, written, iomap);
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index 044fb0e15390..2d167fe643ec 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -19,6 +19,7 @@
 #define __XFS_LINUX__
 
 #include <linux/types.h>
+#include <linux/uuid.h>
 
 /*
  * Kernel specific type declarations for XFS
@@ -42,7 +43,6 @@ typedef __u32			xfs_nlink_t;
 
 #include "kmem.h"
 #include "mrlock.h"
-#include "uuid.h"
 
 #include <linux/semaphore.h>
 #include <linux/mm.h>
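The xfs_iomap.c hunks above make xfs_file_iomap_begin() return -EAGAIN instead of blocking when the caller passed IOMAP_NOWAIT and serving the mapping would require reading in the extent list or allocating blocks. From userspace this behaviour is reached through preadv2()/pwritev2() with RWF_NOWAIT. The program below is an illustrative sketch of that usage, not part of the patch; it assumes a glibc that exposes pwritev2() and RWF_NOWAIT, a kernel that honours the flag for direct I/O, and uses an example file name of its own ("testfile").

/* Userspace view of the nowait path: ask for EAGAIN instead of sleeping. */
#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/uio.h>
#include <unistd.h>

int main(void)
{
	void *buf;
	struct iovec iov;
	ssize_t ret;
	/* O_DIRECT so the request goes down the direct I/O iomap path. */
	int fd = open("testfile", O_WRONLY | O_CREAT | O_DIRECT, 0644);

	if (fd < 0 || posix_memalign(&buf, 4096, 4096))
		return 1;
	memset(buf, 'x', 4096);
	iov.iov_base = buf;
	iov.iov_len = 4096;

	/* RWF_NOWAIT: fail with EAGAIN rather than sleep for allocation. */
	ret = pwritev2(fd, &iov, 1, 0, RWF_NOWAIT);
	if (ret < 0 && errno == EAGAIN)
		printf("would block - fall back to a blocking write or retry\n");
	else if (ret < 0)
		perror("pwritev2");
	else
		printf("wrote %zd bytes without blocking\n", ret);

	free(buf);
	close(fd);
	return 0;
}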
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index cd0b077deb35..8cec1e5505a4 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -352,13 +352,13 @@ xlog_header_check_mount(
 {
 	ASSERT(head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM));
 
-	if (uuid_is_nil(&head->h_fs_uuid)) {
+	if (uuid_is_null(&head->h_fs_uuid)) {
 		/*
 		 * IRIX doesn't write the h_fs_uuid or h_fmt fields. If
-		 * h_fs_uuid is nil, we assume this log was last mounted
+		 * h_fs_uuid is null, we assume this log was last mounted
 		 * by IRIX and continue.
 		 */
-		xfs_warn(mp, "nil uuid in log - IRIX style log");
+		xfs_warn(mp, "null uuid in log - IRIX style log");
 	} else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) {
 		xfs_warn(mp, "log has mismatched uuid - can't recover");
 		xlog_header_check_dump(mp, head);
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 7147d4a8d207..43d07f9c4e9e 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -74,20 +74,19 @@ xfs_uuid_mount(
 	int			hole, i;
 
 	/* Publish UUID in struct super_block */
-	BUILD_BUG_ON(sizeof(mp->m_super->s_uuid) != sizeof(uuid_t));
-	memcpy(&mp->m_super->s_uuid, uuid, sizeof(uuid_t));
+	uuid_copy(&mp->m_super->s_uuid, uuid);
 
 	if (mp->m_flags & XFS_MOUNT_NOUUID)
 		return 0;
 
-	if (uuid_is_nil(uuid)) {
-		xfs_warn(mp, "Filesystem has nil UUID - can't mount");
+	if (uuid_is_null(uuid)) {
+		xfs_warn(mp, "Filesystem has null UUID - can't mount");
 		return -EINVAL;
 	}
 
 	mutex_lock(&xfs_uuid_table_mutex);
 	for (i = 0, hole = -1; i < xfs_uuid_table_size; i++) {
-		if (uuid_is_nil(&xfs_uuid_table[i])) {
+		if (uuid_is_null(&xfs_uuid_table[i])) {
 			hole = i;
 			continue;
 		}
@@ -124,7 +123,7 @@ xfs_uuid_unmount(
 
 	mutex_lock(&xfs_uuid_table_mutex);
 	for (i = 0; i < xfs_uuid_table_size; i++) {
-		if (uuid_is_nil(&xfs_uuid_table[i]))
+		if (uuid_is_null(&xfs_uuid_table[i]))
 			continue;
 		if (!uuid_equal(uuid, &xfs_uuid_table[i]))
 			continue;
@@ -793,7 +792,10 @@ xfs_mountfs(
 	 * Copies the low order bits of the timestamp and the randomly
 	 * set "sequence" number out of a UUID.
 	 */
-	uuid_getnodeuniq(&sbp->sb_uuid, mp->m_fixedfsid);
+	mp->m_fixedfsid[0] =
+		(get_unaligned_be16(&sbp->sb_uuid.b[8]) << 16) |
+		 get_unaligned_be16(&sbp->sb_uuid.b[4]);
+	mp->m_fixedfsid[1] = get_unaligned_be32(&sbp->sb_uuid.b[0]);
 
 	mp->m_dmevmask = 0;	/* not persistent; set after each mount */
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 47d239dcf3f4..97df4db13b2e 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -52,6 +52,7 @@
 #include "xfs_reflink.h"
 
 #include <linux/namei.h>
+#include <linux/dax.h>
 #include <linux/init.h>
 #include <linux/slab.h>
 #include <linux/mount.h>
@@ -1765,7 +1766,8 @@ STATIC int __init
 xfs_init_zones(void)
 {
 	xfs_ioend_bioset = bioset_create(4 * MAX_BUF_PER_PAGE,
-			offsetof(struct xfs_ioend, io_inline_bio));
+			offsetof(struct xfs_ioend, io_inline_bio),
+			BIOSET_NEED_BVECS);
 	if (!xfs_ioend_bioset)
 		goto out;
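The xfs_mount.c hunk above open-codes what uuid_getnodeuniq() used to do, now that the private XFS uuid helpers are gone: the two fsid words are built from big-endian loads of the raw UUID bytes. Below is a worked userspace example of that derivation; the be16_at()/be32_at() helpers are simplified stand-ins for the kernel's get_unaligned_be16()/get_unaligned_be32(), and the byte values are arbitrary.

/* Worked example of deriving m_fixedfsid[] from the raw superblock UUID. */
#include <stdio.h>
#include <stdint.h>

static uint32_t be16_at(const uint8_t *p)
{
	return ((uint32_t)p[0] << 8) | p[1];
}

static uint32_t be32_at(const uint8_t *p)
{
	return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
	       ((uint32_t)p[2] << 8)  | p[3];
}

int main(void)
{
	/* Raw UUID bytes as stored in the superblock (sb_uuid.b[]). */
	uint8_t b[16] = {
		0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
		0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10,
	};
	uint32_t fixedfsid[2];

	/* Same byte selection as the open-coded replacement above. */
	fixedfsid[0] = (be16_at(&b[8]) << 16) | be16_at(&b[4]);
	fixedfsid[1] = be32_at(&b[0]);

	printf("fsid: %08x %08x\n", fixedfsid[0], fixedfsid[1]);
	return 0;
}

With these example bytes it prints "fsid: 090a0506 01020304": bytes 8-9 and 4-5 form the first word, bytes 0-3 the second, matching the patched xfs_mountfs() code.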