summaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/Makefile3
-rw-r--r--fs/affs/super.c2
-rw-r--r--fs/afs/cmservice.c19
-rw-r--r--fs/afs/internal.h2
-rw-r--r--fs/afs/rxrpc.c86
-rw-r--r--fs/aio.c42
-rw-r--r--fs/autofs4/root.c4
-rw-r--r--fs/bio-integrity.c657
-rw-r--r--fs/bio.c2037
-rw-r--r--fs/btrfs/ctree.h14
-rw-r--r--fs/btrfs/disk-io.c10
-rw-r--r--fs/btrfs/extent-tree.c6
-rw-r--r--fs/btrfs/file.c8
-rw-r--r--fs/btrfs/inode-map.c24
-rw-r--r--fs/btrfs/ioctl.c10
-rw-r--r--fs/btrfs/send.c7
-rw-r--r--fs/btrfs/super.c22
-rw-r--r--fs/ceph/caps.c2
-rw-r--r--fs/ceph/dir.c33
-rw-r--r--fs/ceph/file.c3
-rw-r--r--fs/ceph/inode.c71
-rw-r--r--fs/ceph/ioctl.c3
-rw-r--r--fs/ceph/locks.c1
-rw-r--r--fs/ceph/super.h1
-rw-r--r--fs/cifs/cifsfs.c14
-rw-r--r--fs/cifs/cifsglob.h8
-rw-r--r--fs/cifs/cifsproto.h3
-rw-r--r--fs/cifs/cifssmb.c3
-rw-r--r--fs/cifs/file.c35
-rw-r--r--fs/cifs/inode.c3
-rw-r--r--fs/cifs/misc.c74
-rw-r--r--fs/cifs/smb1ops.c11
-rw-r--r--fs/cifs/smb2misc.c18
-rw-r--r--fs/cifs/smb2ops.c14
-rw-r--r--fs/cifs/smb2pdu.c2
-rw-r--r--fs/compat.c14
-rw-r--r--fs/coredump.c7
-rw-r--r--fs/dcache.c433
-rw-r--r--fs/exec.c6
-rw-r--r--fs/ext4/balloc.c2
-rw-r--r--fs/ext4/ext4.h17
-rw-r--r--fs/ext4/extents.c109
-rw-r--r--fs/ext4/extents_status.c2
-rw-r--r--fs/ext4/file.c2
-rw-r--r--fs/ext4/inode.c53
-rw-r--r--fs/ext4/mballoc.c18
-rw-r--r--fs/ext4/page-io.c5
-rw-r--r--fs/ext4/super.c51
-rw-r--r--fs/ext4/xattr.c23
-rw-r--r--fs/fcntl.c12
-rw-r--r--fs/fuse/control.c2
-rw-r--r--fs/fuse/dir.c146
-rw-r--r--fs/fuse/file.c84
-rw-r--r--fs/fuse/fuse_i.h10
-rw-r--r--fs/fuse/inode.c16
-rw-r--r--fs/hugetlbfs/inode.c5
-rw-r--r--fs/ioprio.c241
-rw-r--r--fs/kernfs/dir.c10
-rw-r--r--fs/kernfs/file.c60
-rw-r--r--fs/kernfs/inode.c14
-rw-r--r--fs/kernfs/kernfs-internal.h5
-rw-r--r--fs/kernfs/mount.c22
-rw-r--r--fs/locks.c91
-rw-r--r--fs/namei.c6
-rw-r--r--fs/nfsd/nfs4acl.c19
-rw-r--r--fs/nfsd/nfs4callback.c4
-rw-r--r--fs/nfsd/nfs4state.c40
-rw-r--r--fs/nfsd/nfs4xdr.c8
-rw-r--r--fs/notify/fanotify/fanotify_user.c2
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c8
-rw-r--r--fs/open.c21
-rw-r--r--fs/posix_acl.c6
-rw-r--r--fs/splice.c6
-rw-r--r--fs/super.c5
-rw-r--r--fs/sysfs/file.c95
-rw-r--r--fs/sysfs/group.c10
-rw-r--r--fs/sysfs/mount.c7
-rw-r--r--fs/ubifs/super.c2
-rw-r--r--fs/xfs/xfs_aops.c51
-rw-r--r--fs/xfs/xfs_attr.c24
-rw-r--r--fs/xfs/xfs_attr_leaf.c21
-rw-r--r--fs/xfs/xfs_attr_list.c1
-rw-r--r--fs/xfs/xfs_attr_remote.c8
-rw-r--r--fs/xfs/xfs_bmap.c17
-rw-r--r--fs/xfs/xfs_bmap_util.c13
-rw-r--r--fs/xfs/xfs_buf.c16
-rw-r--r--fs/xfs/xfs_da_btree.h2
-rw-r--r--fs/xfs/xfs_export.c2
-rw-r--r--fs/xfs/xfs_file.c18
-rw-r--r--fs/xfs/xfs_inode.c5
-rw-r--r--fs/xfs/xfs_inode.h2
-rw-r--r--fs/xfs/xfs_iops.c53
-rw-r--r--fs/xfs/xfs_log.c63
-rw-r--r--fs/xfs/xfs_mount.c2
-rw-r--r--fs/xfs/xfs_qm.c26
-rw-r--r--fs/xfs/xfs_sb.c4
-rw-r--r--fs/xfs/xfs_super.c4
-rw-r--r--fs/xfs/xfs_trace.h1
98 files changed, 1389 insertions, 3895 deletions
diff --git a/fs/Makefile b/fs/Makefile
index f9cb9876e466..4030cbfbc9af 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -14,14 +14,13 @@ obj-y := open.o read_write.o file_table.o super.o \
stack.o fs_struct.o statfs.o
ifeq ($(CONFIG_BLOCK),y)
-obj-y += buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o
+obj-y += buffer.o block_dev.o direct-io.o mpage.o
else
obj-y += no-block.o
endif
obj-$(CONFIG_PROC_FS) += proc_namespace.o
-obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o
obj-y += notify/
obj-$(CONFIG_EPOLL) += eventpoll.o
obj-$(CONFIG_ANON_INODES) += anon_inodes.o
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 6d589f28bf9b..895ac7dc9dbf 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -340,8 +340,6 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
&blocksize,&sbi->s_prefix,
sbi->s_volume, &mount_flags)) {
printk(KERN_ERR "AFFS: Error parsing options\n");
- kfree(sbi->s_prefix);
- kfree(sbi);
return -EINVAL;
}
/* N.B. after this point s_prefix must be released */
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index 1c8c6cc6de30..4b0eff6da674 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -130,6 +130,15 @@ static void afs_cm_destructor(struct afs_call *call)
{
_enter("");
+ /* Break the callbacks here so that we do it after the final ACK is
+ * received. The step number here must match the final number in
+ * afs_deliver_cb_callback().
+ */
+ if (call->unmarshall == 6) {
+ ASSERT(call->server && call->count && call->request);
+ afs_break_callbacks(call->server, call->count, call->request);
+ }
+
afs_put_server(call->server);
call->server = NULL;
kfree(call->buffer);
@@ -272,6 +281,16 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb,
_debug("trailer");
if (skb->len != 0)
return -EBADMSG;
+
+ /* Record that the message was unmarshalled successfully so
+ * that the call destructor can know do the callback breaking
+ * work, even if the final ACK isn't received.
+ *
+ * If the step number changes, then afs_cm_destructor() must be
+ * updated also.
+ */
+ call->unmarshall++;
+ case 6:
break;
}
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index be75b500005d..590b55f46d61 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -75,7 +75,7 @@ struct afs_call {
const struct afs_call_type *type; /* type of call */
const struct afs_wait_mode *wait_mode; /* completion wait mode */
wait_queue_head_t waitq; /* processes awaiting completion */
- work_func_t async_workfn;
+ void (*async_workfn)(struct afs_call *call); /* asynchronous work function */
struct work_struct async_work; /* asynchronous work processor */
struct work_struct work; /* actual work processor */
struct sk_buff_head rx_queue; /* received packets */
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index ef943df73b8c..03a3beb17004 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -25,7 +25,7 @@ static void afs_wake_up_call_waiter(struct afs_call *);
static int afs_wait_for_call_to_complete(struct afs_call *);
static void afs_wake_up_async_call(struct afs_call *);
static int afs_dont_wait_for_call_to_complete(struct afs_call *);
-static void afs_process_async_call(struct work_struct *);
+static void afs_process_async_call(struct afs_call *);
static void afs_rx_interceptor(struct sock *, unsigned long, struct sk_buff *);
static int afs_deliver_cm_op_id(struct afs_call *, struct sk_buff *, bool);
@@ -58,6 +58,13 @@ static void afs_collect_incoming_call(struct work_struct *);
static struct sk_buff_head afs_incoming_calls;
static DECLARE_WORK(afs_collect_incoming_call_work, afs_collect_incoming_call);
+static void afs_async_workfn(struct work_struct *work)
+{
+ struct afs_call *call = container_of(work, struct afs_call, async_work);
+
+ call->async_workfn(call);
+}
+
/*
* open an RxRPC socket and bind it to be a server for callback notifications
* - the socket is left in blocking mode and non-blocking ops use MSG_DONTWAIT
@@ -184,6 +191,28 @@ static void afs_free_call(struct afs_call *call)
}
/*
+ * End a call but do not free it
+ */
+static void afs_end_call_nofree(struct afs_call *call)
+{
+ if (call->rxcall) {
+ rxrpc_kernel_end_call(call->rxcall);
+ call->rxcall = NULL;
+ }
+ if (call->type->destructor)
+ call->type->destructor(call);
+}
+
+/*
+ * End a call and free it
+ */
+static void afs_end_call(struct afs_call *call)
+{
+ afs_end_call_nofree(call);
+ afs_free_call(call);
+}
+
+/*
* allocate a call with flat request and reply buffers
*/
struct afs_call *afs_alloc_flat_call(const struct afs_call_type *type,
@@ -326,7 +355,8 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp,
atomic_read(&afs_outstanding_calls));
call->wait_mode = wait_mode;
- INIT_WORK(&call->async_work, afs_process_async_call);
+ call->async_workfn = afs_process_async_call;
+ INIT_WORK(&call->async_work, afs_async_workfn);
memset(&srx, 0, sizeof(srx));
srx.srx_family = AF_RXRPC;
@@ -383,11 +413,8 @@ error_do_abort:
rxrpc_kernel_abort_call(rxcall, RX_USER_ABORT);
while ((skb = skb_dequeue(&call->rx_queue)))
afs_free_skb(skb);
- rxrpc_kernel_end_call(rxcall);
- call->rxcall = NULL;
error_kill_call:
- call->type->destructor(call);
- afs_free_call(call);
+ afs_end_call(call);
_leave(" = %d", ret);
return ret;
}
@@ -509,12 +536,8 @@ static void afs_deliver_to_call(struct afs_call *call)
if (call->state >= AFS_CALL_COMPLETE) {
while ((skb = skb_dequeue(&call->rx_queue)))
afs_free_skb(skb);
- if (call->incoming) {
- rxrpc_kernel_end_call(call->rxcall);
- call->rxcall = NULL;
- call->type->destructor(call);
- afs_free_call(call);
- }
+ if (call->incoming)
+ afs_end_call(call);
}
_leave("");
@@ -564,10 +587,7 @@ static int afs_wait_for_call_to_complete(struct afs_call *call)
}
_debug("call complete");
- rxrpc_kernel_end_call(call->rxcall);
- call->rxcall = NULL;
- call->type->destructor(call);
- afs_free_call(call);
+ afs_end_call(call);
_leave(" = %d", ret);
return ret;
}
@@ -603,11 +623,8 @@ static int afs_dont_wait_for_call_to_complete(struct afs_call *call)
/*
* delete an asynchronous call
*/
-static void afs_delete_async_call(struct work_struct *work)
+static void afs_delete_async_call(struct afs_call *call)
{
- struct afs_call *call =
- container_of(work, struct afs_call, async_work);
-
_enter("");
afs_free_call(call);
@@ -620,11 +637,8 @@ static void afs_delete_async_call(struct work_struct *work)
* - on a multiple-thread workqueue this work item may try to run on several
* CPUs at the same time
*/
-static void afs_process_async_call(struct work_struct *work)
+static void afs_process_async_call(struct afs_call *call)
{
- struct afs_call *call =
- container_of(work, struct afs_call, async_work);
-
_enter("");
if (!skb_queue_empty(&call->rx_queue))
@@ -637,10 +651,7 @@ static void afs_process_async_call(struct work_struct *work)
call->reply = NULL;
/* kill the call */
- rxrpc_kernel_end_call(call->rxcall);
- call->rxcall = NULL;
- if (call->type->destructor)
- call->type->destructor(call);
+ afs_end_call_nofree(call);
/* we can't just delete the call because the work item may be
* queued */
@@ -663,13 +674,6 @@ void afs_transfer_reply(struct afs_call *call, struct sk_buff *skb)
call->reply_size += len;
}
-static void afs_async_workfn(struct work_struct *work)
-{
- struct afs_call *call = container_of(work, struct afs_call, async_work);
-
- call->async_workfn(work);
-}
-
/*
* accept the backlog of incoming calls
*/
@@ -790,10 +794,7 @@ void afs_send_empty_reply(struct afs_call *call)
_debug("oom");
rxrpc_kernel_abort_call(call->rxcall, RX_USER_ABORT);
default:
- rxrpc_kernel_end_call(call->rxcall);
- call->rxcall = NULL;
- call->type->destructor(call);
- afs_free_call(call);
+ afs_end_call(call);
_leave(" [error]");
return;
}
@@ -823,17 +824,16 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len)
call->state = AFS_CALL_AWAIT_ACK;
n = rxrpc_kernel_send_data(call->rxcall, &msg, len);
if (n >= 0) {
+ /* Success */
_leave(" [replied]");
return;
}
+
if (n == -ENOMEM) {
_debug("oom");
rxrpc_kernel_abort_call(call->rxcall, RX_USER_ABORT);
}
- rxrpc_kernel_end_call(call->rxcall);
- call->rxcall = NULL;
- call->type->destructor(call);
- afs_free_call(call);
+ afs_end_call(call);
_leave(" [error]");
}
diff --git a/fs/aio.c b/fs/aio.c
index 12a3de0ee6da..a0ed6c7d2cd2 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -112,6 +112,11 @@ struct kioctx {
struct work_struct free_work;
+ /*
+ * signals when all in-flight requests are done
+ */
+ struct completion *requests_done;
+
struct {
/*
* This counts the number of available slots in the ringbuffer,
@@ -508,6 +513,10 @@ static void free_ioctx_reqs(struct percpu_ref *ref)
{
struct kioctx *ctx = container_of(ref, struct kioctx, reqs);
+ /* At this point we know that there are no any in-flight requests */
+ if (ctx->requests_done)
+ complete(ctx->requests_done);
+
INIT_WORK(&ctx->free_work, free_ioctx);
schedule_work(&ctx->free_work);
}
@@ -718,7 +727,8 @@ err:
* when the processes owning a context have all exited to encourage
* the rapid destruction of the kioctx.
*/
-static void kill_ioctx(struct mm_struct *mm, struct kioctx *ctx)
+static void kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
+ struct completion *requests_done)
{
if (!atomic_xchg(&ctx->dead, 1)) {
struct kioctx_table *table;
@@ -747,7 +757,11 @@ static void kill_ioctx(struct mm_struct *mm, struct kioctx *ctx)
if (ctx->mmap_size)
vm_munmap(ctx->mmap_base, ctx->mmap_size);
+ ctx->requests_done = requests_done;
percpu_ref_kill(&ctx->users);
+ } else {
+ if (requests_done)
+ complete(requests_done);
}
}
@@ -809,7 +823,7 @@ void exit_aio(struct mm_struct *mm)
*/
ctx->mmap_size = 0;
- kill_ioctx(mm, ctx);
+ kill_ioctx(mm, ctx, NULL);
}
}
@@ -1185,7 +1199,7 @@ SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp)
if (!IS_ERR(ioctx)) {
ret = put_user(ioctx->user_id, ctxp);
if (ret)
- kill_ioctx(current->mm, ioctx);
+ kill_ioctx(current->mm, ioctx, NULL);
percpu_ref_put(&ioctx->users);
}
@@ -1203,8 +1217,22 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx)
{
struct kioctx *ioctx = lookup_ioctx(ctx);
if (likely(NULL != ioctx)) {
- kill_ioctx(current->mm, ioctx);
+ struct completion requests_done =
+ COMPLETION_INITIALIZER_ONSTACK(requests_done);
+
+ /* Pass requests_done to kill_ioctx() where it can be set
+ * in a thread-safe way. If we try to set it here then we have
+ * a race condition if two io_destroy() called simultaneously.
+ */
+ kill_ioctx(current->mm, ioctx, &requests_done);
percpu_ref_put(&ioctx->users);
+
+ /* Wait until all IO for the context are done. Otherwise kernel
+ * keep using user-space buffers even if user thinks the context
+ * is destroyed.
+ */
+ wait_for_completion(&requests_done);
+
return 0;
}
pr_debug("EINVAL: io_destroy: invalid context id\n");
@@ -1299,10 +1327,8 @@ rw_common:
&iovec, compat)
: aio_setup_single_vector(req, rw, buf, &nr_segs,
iovec);
- if (ret)
- return ret;
-
- ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes);
+ if (!ret)
+ ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes);
if (ret < 0) {
if (iovec != &inline_vec)
kfree(iovec);
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 2caf36ac3e93..cc87c1abac97 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -179,7 +179,7 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry)
spin_lock(&active->d_lock);
/* Already gone? */
- if (!d_count(active))
+ if ((int) d_count(active) <= 0)
goto next;
qstr = &active->d_name;
@@ -230,7 +230,7 @@ static struct dentry *autofs4_lookup_expiring(struct dentry *dentry)
spin_lock(&expiring->d_lock);
- /* Bad luck, we've already been dentry_iput */
+ /* We've already been dentry_iput or unlinked */
if (!expiring->d_inode)
goto next;
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
deleted file mode 100644
index 1c2ce0c87711..000000000000
--- a/fs/bio-integrity.c
+++ /dev/null
@@ -1,657 +0,0 @@
-/*
- * bio-integrity.c - bio data integrity extensions
- *
- * Copyright (C) 2007, 2008, 2009 Oracle Corporation
- * Written by: Martin K. Petersen <martin.petersen@oracle.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License version
- * 2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; see the file COPYING. If not, write to
- * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139,
- * USA.
- *
- */
-
-#include <linux/blkdev.h>
-#include <linux/mempool.h>
-#include <linux/export.h>
-#include <linux/bio.h>
-#include <linux/workqueue.h>
-#include <linux/slab.h>
-
-#define BIP_INLINE_VECS 4
-
-static struct kmem_cache *bip_slab;
-static struct workqueue_struct *kintegrityd_wq;
-
-/**
- * bio_integrity_alloc - Allocate integrity payload and attach it to bio
- * @bio: bio to attach integrity metadata to
- * @gfp_mask: Memory allocation mask
- * @nr_vecs: Number of integrity metadata scatter-gather elements
- *
- * Description: This function prepares a bio for attaching integrity
- * metadata. nr_vecs specifies the maximum number of pages containing
- * integrity metadata that can be attached.
- */
-struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
- gfp_t gfp_mask,
- unsigned int nr_vecs)
-{
- struct bio_integrity_payload *bip;
- struct bio_set *bs = bio->bi_pool;
- unsigned long idx = BIO_POOL_NONE;
- unsigned inline_vecs;
-
- if (!bs) {
- bip = kmalloc(sizeof(struct bio_integrity_payload) +
- sizeof(struct bio_vec) * nr_vecs, gfp_mask);
- inline_vecs = nr_vecs;
- } else {
- bip = mempool_alloc(bs->bio_integrity_pool, gfp_mask);
- inline_vecs = BIP_INLINE_VECS;
- }
-
- if (unlikely(!bip))
- return NULL;
-
- memset(bip, 0, sizeof(*bip));
-
- if (nr_vecs > inline_vecs) {
- bip->bip_vec = bvec_alloc(gfp_mask, nr_vecs, &idx,
- bs->bvec_integrity_pool);
- if (!bip->bip_vec)
- goto err;
- } else {
- bip->bip_vec = bip->bip_inline_vecs;
- }
-
- bip->bip_slab = idx;
- bip->bip_bio = bio;
- bio->bi_integrity = bip;
-
- return bip;
-err:
- mempool_free(bip, bs->bio_integrity_pool);
- return NULL;
-}
-EXPORT_SYMBOL(bio_integrity_alloc);
-
-/**
- * bio_integrity_free - Free bio integrity payload
- * @bio: bio containing bip to be freed
- *
- * Description: Used to free the integrity portion of a bio. Usually
- * called from bio_free().
- */
-void bio_integrity_free(struct bio *bio)
-{
- struct bio_integrity_payload *bip = bio->bi_integrity;
- struct bio_set *bs = bio->bi_pool;
-
- if (bip->bip_owns_buf)
- kfree(bip->bip_buf);
-
- if (bs) {
- if (bip->bip_slab != BIO_POOL_NONE)
- bvec_free(bs->bvec_integrity_pool, bip->bip_vec,
- bip->bip_slab);
-
- mempool_free(bip, bs->bio_integrity_pool);
- } else {
- kfree(bip);
- }
-
- bio->bi_integrity = NULL;
-}
-EXPORT_SYMBOL(bio_integrity_free);
-
-static inline unsigned int bip_integrity_vecs(struct bio_integrity_payload *bip)
-{
- if (bip->bip_slab == BIO_POOL_NONE)
- return BIP_INLINE_VECS;
-
- return bvec_nr_vecs(bip->bip_slab);
-}
-
-/**
- * bio_integrity_add_page - Attach integrity metadata
- * @bio: bio to update
- * @page: page containing integrity metadata
- * @len: number of bytes of integrity metadata in page
- * @offset: start offset within page
- *
- * Description: Attach a page containing integrity metadata to bio.
- */
-int bio_integrity_add_page(struct bio *bio, struct page *page,
- unsigned int len, unsigned int offset)
-{
- struct bio_integrity_payload *bip = bio->bi_integrity;
- struct bio_vec *iv;
-
- if (bip->bip_vcnt >= bip_integrity_vecs(bip)) {
- printk(KERN_ERR "%s: bip_vec full\n", __func__);
- return 0;
- }
-
- iv = bip->bip_vec + bip->bip_vcnt;
-
- iv->bv_page = page;
- iv->bv_len = len;
- iv->bv_offset = offset;
- bip->bip_vcnt++;
-
- return len;
-}
-EXPORT_SYMBOL(bio_integrity_add_page);
-
-static int bdev_integrity_enabled(struct block_device *bdev, int rw)
-{
- struct blk_integrity *bi = bdev_get_integrity(bdev);
-
- if (bi == NULL)
- return 0;
-
- if (rw == READ && bi->verify_fn != NULL &&
- (bi->flags & INTEGRITY_FLAG_READ))
- return 1;
-
- if (rw == WRITE && bi->generate_fn != NULL &&
- (bi->flags & INTEGRITY_FLAG_WRITE))
- return 1;
-
- return 0;
-}
-
-/**
- * bio_integrity_enabled - Check whether integrity can be passed
- * @bio: bio to check
- *
- * Description: Determines whether bio_integrity_prep() can be called
- * on this bio or not. bio data direction and target device must be
- * set prior to calling. The functions honors the write_generate and
- * read_verify flags in sysfs.
- */
-int bio_integrity_enabled(struct bio *bio)
-{
- if (!bio_is_rw(bio))
- return 0;
-
- /* Already protected? */
- if (bio_integrity(bio))
- return 0;
-
- return bdev_integrity_enabled(bio->bi_bdev, bio_data_dir(bio));
-}
-EXPORT_SYMBOL(bio_integrity_enabled);
-
-/**
- * bio_integrity_hw_sectors - Convert 512b sectors to hardware ditto
- * @bi: blk_integrity profile for device
- * @sectors: Number of 512 sectors to convert
- *
- * Description: The block layer calculates everything in 512 byte
- * sectors but integrity metadata is done in terms of the hardware
- * sector size of the storage device. Convert the block layer sectors
- * to physical sectors.
- */
-static inline unsigned int bio_integrity_hw_sectors(struct blk_integrity *bi,
- unsigned int sectors)
-{
- /* At this point there are only 512b or 4096b DIF/EPP devices */
- if (bi->sector_size == 4096)
- return sectors >>= 3;
-
- return sectors;
-}
-
-static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi,
- unsigned int sectors)
-{
- return bio_integrity_hw_sectors(bi, sectors) * bi->tuple_size;
-}
-
-/**
- * bio_integrity_tag_size - Retrieve integrity tag space
- * @bio: bio to inspect
- *
- * Description: Returns the maximum number of tag bytes that can be
- * attached to this bio. Filesystems can use this to determine how
- * much metadata to attach to an I/O.
- */
-unsigned int bio_integrity_tag_size(struct bio *bio)
-{
- struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
-
- BUG_ON(bio->bi_iter.bi_size == 0);
-
- return bi->tag_size * (bio->bi_iter.bi_size / bi->sector_size);
-}
-EXPORT_SYMBOL(bio_integrity_tag_size);
-
-static int bio_integrity_tag(struct bio *bio, void *tag_buf, unsigned int len,
- int set)
-{
- struct bio_integrity_payload *bip = bio->bi_integrity;
- struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
- unsigned int nr_sectors;
-
- BUG_ON(bip->bip_buf == NULL);
-
- if (bi->tag_size == 0)
- return -1;
-
- nr_sectors = bio_integrity_hw_sectors(bi,
- DIV_ROUND_UP(len, bi->tag_size));
-
- if (nr_sectors * bi->tuple_size > bip->bip_iter.bi_size) {
- printk(KERN_ERR "%s: tag too big for bio: %u > %u\n", __func__,
- nr_sectors * bi->tuple_size, bip->bip_iter.bi_size);
- return -1;
- }
-
- if (set)
- bi->set_tag_fn(bip->bip_buf, tag_buf, nr_sectors);
- else
- bi->get_tag_fn(bip->bip_buf, tag_buf, nr_sectors);
-
- return 0;
-}
-
-/**
- * bio_integrity_set_tag - Attach a tag buffer to a bio
- * @bio: bio to attach buffer to
- * @tag_buf: Pointer to a buffer containing tag data
- * @len: Length of the included buffer
- *
- * Description: Use this function to tag a bio by leveraging the extra
- * space provided by devices formatted with integrity protection. The
- * size of the integrity buffer must be <= to the size reported by
- * bio_integrity_tag_size().
- */
-int bio_integrity_set_tag(struct bio *bio, void *tag_buf, unsigned int len)
-{
- BUG_ON(bio_data_dir(bio) != WRITE);
-
- return bio_integrity_tag(bio, tag_buf, len, 1);
-}
-EXPORT_SYMBOL(bio_integrity_set_tag);
-
-/**
- * bio_integrity_get_tag - Retrieve a tag buffer from a bio
- * @bio: bio to retrieve buffer from
- * @tag_buf: Pointer to a buffer for the tag data
- * @len: Length of the target buffer
- *
- * Description: Use this function to retrieve the tag buffer from a
- * completed I/O. The size of the integrity buffer must be <= to the
- * size reported by bio_integrity_tag_size().
- */
-int bio_integrity_get_tag(struct bio *bio, void *tag_buf, unsigned int len)
-{
- BUG_ON(bio_data_dir(bio) != READ);
-
- return bio_integrity_tag(bio, tag_buf, len, 0);
-}
-EXPORT_SYMBOL(bio_integrity_get_tag);
-
-/**
- * bio_integrity_generate_verify - Generate/verify integrity metadata for a bio
- * @bio: bio to generate/verify integrity metadata for
- * @operate: operate number, 1 for generate, 0 for verify
- */
-static int bio_integrity_generate_verify(struct bio *bio, int operate)
-{
- struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
- struct blk_integrity_exchg bix;
- struct bio_vec *bv;
- sector_t sector;
- unsigned int sectors, ret = 0, i;
- void *prot_buf = bio->bi_integrity->bip_buf;
-
- if (operate)
- sector = bio->bi_iter.bi_sector;
- else
- sector = bio->bi_integrity->bip_iter.bi_sector;
-
- bix.disk_name = bio->bi_bdev->bd_disk->disk_name;
- bix.sector_size = bi->sector_size;
-
- bio_for_each_segment_all(bv, bio, i) {
- void *kaddr = kmap_atomic(bv->bv_page);
- bix.data_buf = kaddr + bv->bv_offset;
- bix.data_size = bv->bv_len;
- bix.prot_buf = prot_buf;
- bix.sector = sector;
-
- if (operate)
- bi->generate_fn(&bix);
- else {
- ret = bi->verify_fn(&bix);
- if (ret) {
- kunmap_atomic(kaddr);
- return ret;
- }
- }
-
- sectors = bv->bv_len / bi->sector_size;
- sector += sectors;
- prot_buf += sectors * bi->tuple_size;
-
- kunmap_atomic(kaddr);
- }
- return ret;
-}
-
-/**
- * bio_integrity_generate - Generate integrity metadata for a bio
- * @bio: bio to generate integrity metadata for
- *
- * Description: Generates integrity metadata for a bio by calling the
- * block device's generation callback function. The bio must have a
- * bip attached with enough room to accommodate the generated
- * integrity metadata.
- */
-static void bio_integrity_generate(struct bio *bio)
-{
- bio_integrity_generate_verify(bio, 1);
-}
-
-static inline unsigned short blk_integrity_tuple_size(struct blk_integrity *bi)
-{
- if (bi)
- return bi->tuple_size;
-
- return 0;
-}
-
-/**
- * bio_integrity_prep - Prepare bio for integrity I/O
- * @bio: bio to prepare
- *
- * Description: Allocates a buffer for integrity metadata, maps the
- * pages and attaches them to a bio. The bio must have data
- * direction, target device and start sector set priot to calling. In
- * the WRITE case, integrity metadata will be generated using the
- * block device's integrity function. In the READ case, the buffer
- * will be prepared for DMA and a suitable end_io handler set up.
- */
-int bio_integrity_prep(struct bio *bio)
-{
- struct bio_integrity_payload *bip;
- struct blk_integrity *bi;
- struct request_queue *q;
- void *buf;
- unsigned long start, end;
- unsigned int len, nr_pages;
- unsigned int bytes, offset, i;
- unsigned int sectors;
-
- bi = bdev_get_integrity(bio->bi_bdev);
- q = bdev_get_queue(bio->bi_bdev);
- BUG_ON(bi == NULL);
- BUG_ON(bio_integrity(bio));
-
- sectors = bio_integrity_hw_sectors(bi, bio_sectors(bio));
-
- /* Allocate kernel buffer for protection data */
- len = sectors * blk_integrity_tuple_size(bi);
- buf = kmalloc(len, GFP_NOIO | q->bounce_gfp);
- if (unlikely(buf == NULL)) {
- printk(KERN_ERR "could not allocate integrity buffer\n");
- return -ENOMEM;
- }
-
- end = (((unsigned long) buf) + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
- start = ((unsigned long) buf) >> PAGE_SHIFT;
- nr_pages = end - start;
-
- /* Allocate bio integrity payload and integrity vectors */
- bip = bio_integrity_alloc(bio, GFP_NOIO, nr_pages);
- if (unlikely(bip == NULL)) {
- printk(KERN_ERR "could not allocate data integrity bioset\n");
- kfree(buf);
- return -EIO;
- }
-
- bip->bip_owns_buf = 1;
- bip->bip_buf = buf;
- bip->bip_iter.bi_size = len;
- bip->bip_iter.bi_sector = bio->bi_iter.bi_sector;
-
- /* Map it */
- offset = offset_in_page(buf);
- for (i = 0 ; i < nr_pages ; i++) {
- int ret;
- bytes = PAGE_SIZE - offset;
-
- if (len <= 0)
- break;
-
- if (bytes > len)
- bytes = len;
-
- ret = bio_integrity_add_page(bio, virt_to_page(buf),
- bytes, offset);
-
- if (ret == 0)
- return 0;
-
- if (ret < bytes)
- break;
-
- buf += bytes;
- len -= bytes;
- offset = 0;
- }
-
- /* Install custom I/O completion handler if read verify is enabled */
- if (bio_data_dir(bio) == READ) {
- bip->bip_end_io = bio->bi_end_io;
- bio->bi_end_io = bio_integrity_endio;
- }
-
- /* Auto-generate integrity metadata if this is a write */
- if (bio_data_dir(bio) == WRITE)
- bio_integrity_generate(bio);
-
- return 0;
-}
-EXPORT_SYMBOL(bio_integrity_prep);
-
-/**
- * bio_integrity_verify - Verify integrity metadata for a bio
- * @bio: bio to verify
- *
- * Description: This function is called to verify the integrity of a
- * bio. The data in the bio io_vec is compared to the integrity
- * metadata returned by the HBA.
- */
-static int bio_integrity_verify(struct bio *bio)
-{
- return bio_integrity_generate_verify(bio, 0);
-}
-
-/**
- * bio_integrity_verify_fn - Integrity I/O completion worker
- * @work: Work struct stored in bio to be verified
- *
- * Description: This workqueue function is called to complete a READ
- * request. The function verifies the transferred integrity metadata
- * and then calls the original bio end_io function.
- */
-static void bio_integrity_verify_fn(struct work_struct *work)
-{
- struct bio_integrity_payload *bip =
- container_of(work, struct bio_integrity_payload, bip_work);
- struct bio *bio = bip->bip_bio;
- int error;
-
- error = bio_integrity_verify(bio);
-
- /* Restore original bio completion handler */
- bio->bi_end_io = bip->bip_end_io;
- bio_endio_nodec(bio, error);
-}
-
-/**
- * bio_integrity_endio - Integrity I/O completion function
- * @bio: Protected bio
- * @error: Pointer to errno
- *
- * Description: Completion for integrity I/O
- *
- * Normally I/O completion is done in interrupt context. However,
- * verifying I/O integrity is a time-consuming task which must be run
- * in process context. This function postpones completion
- * accordingly.
- */
-void bio_integrity_endio(struct bio *bio, int error)
-{
- struct bio_integrity_payload *bip = bio->bi_integrity;
-
- BUG_ON(bip->bip_bio != bio);
-
- /* In case of an I/O error there is no point in verifying the
- * integrity metadata. Restore original bio end_io handler
- * and run it.
- */
- if (error) {
- bio->bi_end_io = bip->bip_end_io;
- bio_endio(bio, error);
-
- return;
- }
-
- INIT_WORK(&bip->bip_work, bio_integrity_verify_fn);
- queue_work(kintegrityd_wq, &bip->bip_work);
-}
-EXPORT_SYMBOL(bio_integrity_endio);
-
-/**
- * bio_integrity_advance - Advance integrity vector
- * @bio: bio whose integrity vector to update
- * @bytes_done: number of data bytes that have been completed
- *
- * Description: This function calculates how many integrity bytes the
- * number of completed data bytes correspond to and advances the
- * integrity vector accordingly.
- */
-void bio_integrity_advance(struct bio *bio, unsigned int bytes_done)
-{
- struct bio_integrity_payload *bip = bio->bi_integrity;
- struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
- unsigned bytes = bio_integrity_bytes(bi, bytes_done >> 9);
-
- bvec_iter_advance(bip->bip_vec, &bip->bip_iter, bytes);
-}
-EXPORT_SYMBOL(bio_integrity_advance);
-
-/**
- * bio_integrity_trim - Trim integrity vector
- * @bio: bio whose integrity vector to update
- * @offset: offset to first data sector
- * @sectors: number of data sectors
- *
- * Description: Used to trim the integrity vector in a cloned bio.
- * The ivec will be advanced corresponding to 'offset' data sectors
- * and the length will be truncated corresponding to 'len' data
- * sectors.
- */
-void bio_integrity_trim(struct bio *bio, unsigned int offset,
- unsigned int sectors)
-{
- struct bio_integrity_payload *bip = bio->bi_integrity;
- struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
-
- bio_integrity_advance(bio, offset << 9);
- bip->bip_iter.bi_size = bio_integrity_bytes(bi, sectors);
-}
-EXPORT_SYMBOL(bio_integrity_trim);
-
-/**
- * bio_integrity_clone - Callback for cloning bios with integrity metadata
- * @bio: New bio
- * @bio_src: Original bio
- * @gfp_mask: Memory allocation mask
- *
- * Description: Called to allocate a bip when cloning a bio
- */
-int bio_integrity_clone(struct bio *bio, struct bio *bio_src,
- gfp_t gfp_mask)
-{
- struct bio_integrity_payload *bip_src = bio_src->bi_integrity;
- struct bio_integrity_payload *bip;
-
- BUG_ON(bip_src == NULL);
-
- bip = bio_integrity_alloc(bio, gfp_mask, bip_src->bip_vcnt);
-
- if (bip == NULL)
- return -EIO;
-
- memcpy(bip->bip_vec, bip_src->bip_vec,
- bip_src->bip_vcnt * sizeof(struct bio_vec));
-
- bip->bip_vcnt = bip_src->bip_vcnt;
- bip->bip_iter = bip_src->bip_iter;
-
- return 0;
-}
-EXPORT_SYMBOL(bio_integrity_clone);
-
-int bioset_integrity_create(struct bio_set *bs, int pool_size)
-{
- if (bs->bio_integrity_pool)
- return 0;
-
- bs->bio_integrity_pool = mempool_create_slab_pool(pool_size, bip_slab);
- if (!bs->bio_integrity_pool)
- return -1;
-
- bs->bvec_integrity_pool = biovec_create_pool(bs, pool_size);
- if (!bs->bvec_integrity_pool) {
- mempool_destroy(bs->bio_integrity_pool);
- return -1;
- }
-
- return 0;
-}
-EXPORT_SYMBOL(bioset_integrity_create);
-
-void bioset_integrity_free(struct bio_set *bs)
-{
- if (bs->bio_integrity_pool)
- mempool_destroy(bs->bio_integrity_pool);
-
- if (bs->bvec_integrity_pool)
- mempool_destroy(bs->bvec_integrity_pool);
-}
-EXPORT_SYMBOL(bioset_integrity_free);
-
-void __init bio_integrity_init(void)
-{
- /*
- * kintegrityd won't block much but may burn a lot of CPU cycles.
- * Make it highpri CPU intensive wq with max concurrency of 1.
- */
- kintegrityd_wq = alloc_workqueue("kintegrityd", WQ_MEM_RECLAIM |
- WQ_HIGHPRI | WQ_CPU_INTENSIVE, 1);
- if (!kintegrityd_wq)
- panic("Failed to create kintegrityd\n");
-
- bip_slab = kmem_cache_create("bio_integrity_payload",
- sizeof(struct bio_integrity_payload) +
- sizeof(struct bio_vec) * BIP_INLINE_VECS,
- 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
- if (!bip_slab)
- panic("Failed to create slab\n");
-}
diff --git a/fs/bio.c b/fs/bio.c
deleted file mode 100644
index 6f0362b77806..000000000000
--- a/fs/bio.c
+++ /dev/null
@@ -1,2037 +0,0 @@
-/*
- * Copyright (C) 2001 Jens Axboe <axboe@kernel.dk>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public Licens
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-
- *
- */
-#include <linux/mm.h>
-#include <linux/swap.h>
-#include <linux/bio.h>
-#include <linux/blkdev.h>
-#include <linux/uio.h>
-#include <linux/iocontext.h>
-#include <linux/slab.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/export.h>
-#include <linux/mempool.h>
-#include <linux/workqueue.h>
-#include <linux/cgroup.h>
-#include <scsi/sg.h> /* for struct sg_iovec */
-
-#include <trace/events/block.h>
-
-/*
- * Test patch to inline a certain number of bi_io_vec's inside the bio
- * itself, to shrink a bio data allocation from two mempool calls to one
- */
-#define BIO_INLINE_VECS 4
-
-/*
- * if you change this list, also change bvec_alloc or things will
- * break badly! cannot be bigger than what you can fit into an
- * unsigned short
- */
-#define BV(x) { .nr_vecs = x, .name = "biovec-"__stringify(x) }
-static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = {
- BV(1), BV(4), BV(16), BV(64), BV(128), BV(BIO_MAX_PAGES),
-};
-#undef BV
-
-/*
- * fs_bio_set is the bio_set containing bio and iovec memory pools used by
- * IO code that does not need private memory pools.
- */
-struct bio_set *fs_bio_set;
-EXPORT_SYMBOL(fs_bio_set);
-
-/*
- * Our slab pool management
- */
-struct bio_slab {
- struct kmem_cache *slab;
- unsigned int slab_ref;
- unsigned int slab_size;
- char name[8];
-};
-static DEFINE_MUTEX(bio_slab_lock);
-static struct bio_slab *bio_slabs;
-static unsigned int bio_slab_nr, bio_slab_max;
-
-static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size)
-{
- unsigned int sz = sizeof(struct bio) + extra_size;
- struct kmem_cache *slab = NULL;
- struct bio_slab *bslab, *new_bio_slabs;
- unsigned int new_bio_slab_max;
- unsigned int i, entry = -1;
-
- mutex_lock(&bio_slab_lock);
-
- i = 0;
- while (i < bio_slab_nr) {
- bslab = &bio_slabs[i];
-
- if (!bslab->slab && entry == -1)
- entry = i;
- else if (bslab->slab_size == sz) {
- slab = bslab->slab;
- bslab->slab_ref++;
- break;
- }
- i++;
- }
-
- if (slab)
- goto out_unlock;
-
- if (bio_slab_nr == bio_slab_max && entry == -1) {
- new_bio_slab_max = bio_slab_max << 1;
- new_bio_slabs = krealloc(bio_slabs,
- new_bio_slab_max * sizeof(struct bio_slab),
- GFP_KERNEL);
- if (!new_bio_slabs)
- goto out_unlock;
- bio_slab_max = new_bio_slab_max;
- bio_slabs = new_bio_slabs;
- }
- if (entry == -1)
- entry = bio_slab_nr++;
-
- bslab = &bio_slabs[entry];
-
- snprintf(bslab->name, sizeof(bslab->name), "bio-%d", entry);
- slab = kmem_cache_create(bslab->name, sz, 0, SLAB_HWCACHE_ALIGN, NULL);
- if (!slab)
- goto out_unlock;
-
- bslab->slab = slab;
- bslab->slab_ref = 1;
- bslab->slab_size = sz;
-out_unlock:
- mutex_unlock(&bio_slab_lock);
- return slab;
-}
-
-static void bio_put_slab(struct bio_set *bs)
-{
- struct bio_slab *bslab = NULL;
- unsigned int i;
-
- mutex_lock(&bio_slab_lock);
-
- for (i = 0; i < bio_slab_nr; i++) {
- if (bs->bio_slab == bio_slabs[i].slab) {
- bslab = &bio_slabs[i];
- break;
- }
- }
-
- if (WARN(!bslab, KERN_ERR "bio: unable to find slab!\n"))
- goto out;
-
- WARN_ON(!bslab->slab_ref);
-
- if (--bslab->slab_ref)
- goto out;
-
- kmem_cache_destroy(bslab->slab);
- bslab->slab = NULL;
-
-out:
- mutex_unlock(&bio_slab_lock);
-}
-
-unsigned int bvec_nr_vecs(unsigned short idx)
-{
- return bvec_slabs[idx].nr_vecs;
-}
-
-void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned int idx)
-{
- BIO_BUG_ON(idx >= BIOVEC_NR_POOLS);
-
- if (idx == BIOVEC_MAX_IDX)
- mempool_free(bv, pool);
- else {
- struct biovec_slab *bvs = bvec_slabs + idx;
-
- kmem_cache_free(bvs->slab, bv);
- }
-}
-
-struct bio_vec *bvec_alloc(gfp_t gfp_mask, int nr, unsigned long *idx,
- mempool_t *pool)
-{
- struct bio_vec *bvl;
-
- /*
- * see comment near bvec_array define!
- */
- switch (nr) {
- case 1:
- *idx = 0;
- break;
- case 2 ... 4:
- *idx = 1;
- break;
- case 5 ... 16:
- *idx = 2;
- break;
- case 17 ... 64:
- *idx = 3;
- break;
- case 65 ... 128:
- *idx = 4;
- break;
- case 129 ... BIO_MAX_PAGES:
- *idx = 5;
- break;
- default:
- return NULL;
- }
-
- /*
- * idx now points to the pool we want to allocate from. only the
- * 1-vec entry pool is mempool backed.
- */
- if (*idx == BIOVEC_MAX_IDX) {
-fallback:
- bvl = mempool_alloc(pool, gfp_mask);
- } else {
- struct biovec_slab *bvs = bvec_slabs + *idx;
- gfp_t __gfp_mask = gfp_mask & ~(__GFP_WAIT | __GFP_IO);
-
- /*
- * Make this allocation restricted and don't dump info on
- * allocation failures, since we'll fallback to the mempool
- * in case of failure.
- */
- __gfp_mask |= __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
-
- /*
- * Try a slab allocation. If this fails and __GFP_WAIT
- * is set, retry with the 1-entry mempool
- */
- bvl = kmem_cache_alloc(bvs->slab, __gfp_mask);
- if (unlikely(!bvl && (gfp_mask & __GFP_WAIT))) {
- *idx = BIOVEC_MAX_IDX;
- goto fallback;
- }
- }
-
- return bvl;
-}
-
-static void __bio_free(struct bio *bio)
-{
- bio_disassociate_task(bio);
-
- if (bio_integrity(bio))
- bio_integrity_free(bio);
-}
-
-static void bio_free(struct bio *bio)
-{
- struct bio_set *bs = bio->bi_pool;
- void *p;
-
- __bio_free(bio);
-
- if (bs) {
- if (bio_flagged(bio, BIO_OWNS_VEC))
- bvec_free(bs->bvec_pool, bio->bi_io_vec, BIO_POOL_IDX(bio));
-
- /*
- * If we have front padding, adjust the bio pointer before freeing
- */
- p = bio;
- p -= bs->front_pad;
-
- mempool_free(p, bs->bio_pool);
- } else {
- /* Bio was allocated by bio_kmalloc() */
- kfree(bio);
- }
-}
-
-void bio_init(struct bio *bio)
-{
- memset(bio, 0, sizeof(*bio));
- bio->bi_flags = 1 << BIO_UPTODATE;
- atomic_set(&bio->bi_remaining, 1);
- atomic_set(&bio->bi_cnt, 1);
-}
-EXPORT_SYMBOL(bio_init);
-
-/**
- * bio_reset - reinitialize a bio
- * @bio: bio to reset
- *
- * Description:
- * After calling bio_reset(), @bio will be in the same state as a freshly
- * allocated bio returned bio bio_alloc_bioset() - the only fields that are
- * preserved are the ones that are initialized by bio_alloc_bioset(). See
- * comment in struct bio.
- */
-void bio_reset(struct bio *bio)
-{
- unsigned long flags = bio->bi_flags & (~0UL << BIO_RESET_BITS);
-
- __bio_free(bio);
-
- memset(bio, 0, BIO_RESET_BYTES);
- bio->bi_flags = flags|(1 << BIO_UPTODATE);
- atomic_set(&bio->bi_remaining, 1);
-}
-EXPORT_SYMBOL(bio_reset);
-
-static void bio_chain_endio(struct bio *bio, int error)
-{
- bio_endio(bio->bi_private, error);
- bio_put(bio);
-}
-
-/**
- * bio_chain - chain bio completions
- *
- * The caller won't have a bi_end_io called when @bio completes - instead,
- * @parent's bi_end_io won't be called until both @parent and @bio have
- * completed; the chained bio will also be freed when it completes.
- *
- * The caller must not set bi_private or bi_end_io in @bio.
- */
-void bio_chain(struct bio *bio, struct bio *parent)
-{
- BUG_ON(bio->bi_private || bio->bi_end_io);
-
- bio->bi_private = parent;
- bio->bi_end_io = bio_chain_endio;
- atomic_inc(&parent->bi_remaining);
-}
-EXPORT_SYMBOL(bio_chain);
-
-static void bio_alloc_rescue(struct work_struct *work)
-{
- struct bio_set *bs = container_of(work, struct bio_set, rescue_work);
- struct bio *bio;
-
- while (1) {
- spin_lock(&bs->rescue_lock);
- bio = bio_list_pop(&bs->rescue_list);
- spin_unlock(&bs->rescue_lock);
-
- if (!bio)
- break;
-
- generic_make_request(bio);
- }
-}
-
-static void punt_bios_to_rescuer(struct bio_set *bs)
-{
- struct bio_list punt, nopunt;
- struct bio *bio;
-
- /*
- * In order to guarantee forward progress we must punt only bios that
- * were allocated from this bio_set; otherwise, if there was a bio on
- * there for a stacking driver higher up in the stack, processing it
- * could require allocating bios from this bio_set, and doing that from
- * our own rescuer would be bad.
- *
- * Since bio lists are singly linked, pop them all instead of trying to
- * remove from the middle of the list:
- */
-
- bio_list_init(&punt);
- bio_list_init(&nopunt);
-
- while ((bio = bio_list_pop(current->bio_list)))
- bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio);
-
- *current->bio_list = nopunt;
-
- spin_lock(&bs->rescue_lock);
- bio_list_merge(&bs->rescue_list, &punt);
- spin_unlock(&bs->rescue_lock);
-
- queue_work(bs->rescue_workqueue, &bs->rescue_work);
-}
-
-/**
- * bio_alloc_bioset - allocate a bio for I/O
- * @gfp_mask: the GFP_ mask given to the slab allocator
- * @nr_iovecs: number of iovecs to pre-allocate
- * @bs: the bio_set to allocate from.
- *
- * Description:
- * If @bs is NULL, uses kmalloc() to allocate the bio; else the allocation is
- * backed by the @bs's mempool.
- *
- * When @bs is not NULL, if %__GFP_WAIT is set then bio_alloc will always be
- * able to allocate a bio. This is due to the mempool guarantees. To make this
- * work, callers must never allocate more than 1 bio at a time from this pool.
- * Callers that need to allocate more than 1 bio must always submit the
- * previously allocated bio for IO before attempting to allocate a new one.
- * Failure to do so can cause deadlocks under memory pressure.
- *
- * Note that when running under generic_make_request() (i.e. any block
- * driver), bios are not submitted until after you return - see the code in
- * generic_make_request() that converts recursion into iteration, to prevent
- * stack overflows.
- *
- * This would normally mean allocating multiple bios under
- * generic_make_request() would be susceptible to deadlocks, but we have
- * deadlock avoidance code that resubmits any blocked bios from a rescuer
- * thread.
- *
- * However, we do not guarantee forward progress for allocations from other
- * mempools. Doing multiple allocations from the same mempool under
- * generic_make_request() should be avoided - instead, use bio_set's front_pad
- * for per bio allocations.
- *
- * RETURNS:
- * Pointer to new bio on success, NULL on failure.
- */
-struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
-{
- gfp_t saved_gfp = gfp_mask;
- unsigned front_pad;
- unsigned inline_vecs;
- unsigned long idx = BIO_POOL_NONE;
- struct bio_vec *bvl = NULL;
- struct bio *bio;
- void *p;
-
- if (!bs) {
- if (nr_iovecs > UIO_MAXIOV)
- return NULL;
-
- p = kmalloc(sizeof(struct bio) +
- nr_iovecs * sizeof(struct bio_vec),
- gfp_mask);
- front_pad = 0;
- inline_vecs = nr_iovecs;
- } else {
- /*
- * generic_make_request() converts recursion to iteration; this
- * means if we're running beneath it, any bios we allocate and
- * submit will not be submitted (and thus freed) until after we
- * return.
- *
- * This exposes us to a potential deadlock if we allocate
- * multiple bios from the same bio_set() while running
- * underneath generic_make_request(). If we were to allocate
- * multiple bios (say a stacking block driver that was splitting
- * bios), we would deadlock if we exhausted the mempool's
- * reserve.
- *
- * We solve this, and guarantee forward progress, with a rescuer
- * workqueue per bio_set. If we go to allocate and there are
- * bios on current->bio_list, we first try the allocation
- * without __GFP_WAIT; if that fails, we punt those bios we
- * would be blocking to the rescuer workqueue before we retry
- * with the original gfp_flags.
- */
-
- if (current->bio_list && !bio_list_empty(current->bio_list))
- gfp_mask &= ~__GFP_WAIT;
-
- p = mempool_alloc(bs->bio_pool, gfp_mask);
- if (!p && gfp_mask != saved_gfp) {
- punt_bios_to_rescuer(bs);
- gfp_mask = saved_gfp;
- p = mempool_alloc(bs->bio_pool, gfp_mask);
- }
-
- front_pad = bs->front_pad;
- inline_vecs = BIO_INLINE_VECS;
- }
-
- if (unlikely(!p))
- return NULL;
-
- bio = p + front_pad;
- bio_init(bio);
-
- if (nr_iovecs > inline_vecs) {
- bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, bs->bvec_pool);
- if (!bvl && gfp_mask != saved_gfp) {
- punt_bios_to_rescuer(bs);
- gfp_mask = saved_gfp;
- bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, bs->bvec_pool);
- }
-
- if (unlikely(!bvl))
- goto err_free;
-
- bio->bi_flags |= 1 << BIO_OWNS_VEC;
- } else if (nr_iovecs) {
- bvl = bio->bi_inline_vecs;
- }
-
- bio->bi_pool = bs;
- bio->bi_flags |= idx << BIO_POOL_OFFSET;
- bio->bi_max_vecs = nr_iovecs;
- bio->bi_io_vec = bvl;
- return bio;
-
-err_free:
- mempool_free(p, bs->bio_pool);
- return NULL;
-}
-EXPORT_SYMBOL(bio_alloc_bioset);
-
-void zero_fill_bio(struct bio *bio)
-{
- unsigned long flags;
- struct bio_vec bv;
- struct bvec_iter iter;
-
- bio_for_each_segment(bv, bio, iter) {
- char *data = bvec_kmap_irq(&bv, &flags);
- memset(data, 0, bv.bv_len);
- flush_dcache_page(bv.bv_page);
- bvec_kunmap_irq(data, &flags);
- }
-}
-EXPORT_SYMBOL(zero_fill_bio);
-
-/**
- * bio_put - release a reference to a bio
- * @bio: bio to release reference to
- *
- * Description:
- * Put a reference to a &struct bio, either one you have gotten with
- * bio_alloc, bio_get or bio_clone. The last put of a bio will free it.
- **/
-void bio_put(struct bio *bio)
-{
- BIO_BUG_ON(!atomic_read(&bio->bi_cnt));
-
- /*
- * last put frees it
- */
- if (atomic_dec_and_test(&bio->bi_cnt))
- bio_free(bio);
-}
-EXPORT_SYMBOL(bio_put);
-
-inline int bio_phys_segments(struct request_queue *q, struct bio *bio)
-{
- if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
- blk_recount_segments(q, bio);
-
- return bio->bi_phys_segments;
-}
-EXPORT_SYMBOL(bio_phys_segments);
-
-/**
- * __bio_clone_fast - clone a bio that shares the original bio's biovec
- * @bio: destination bio
- * @bio_src: bio to clone
- *
- * Clone a &bio. Caller will own the returned bio, but not
- * the actual data it points to. Reference count of returned
- * bio will be one.
- *
- * Caller must ensure that @bio_src is not freed before @bio.
- */
-void __bio_clone_fast(struct bio *bio, struct bio *bio_src)
-{
- BUG_ON(bio->bi_pool && BIO_POOL_IDX(bio) != BIO_POOL_NONE);
-
- /*
- * most users will be overriding ->bi_bdev with a new target,
- * so we don't set nor calculate new physical/hw segment counts here
- */
- bio->bi_bdev = bio_src->bi_bdev;
- bio->bi_flags |= 1 << BIO_CLONED;
- bio->bi_rw = bio_src->bi_rw;
- bio->bi_iter = bio_src->bi_iter;
- bio->bi_io_vec = bio_src->bi_io_vec;
-}
-EXPORT_SYMBOL(__bio_clone_fast);
-
-/**
- * bio_clone_fast - clone a bio that shares the original bio's biovec
- * @bio: bio to clone
- * @gfp_mask: allocation priority
- * @bs: bio_set to allocate from
- *
- * Like __bio_clone_fast, only also allocates the returned bio
- */
-struct bio *bio_clone_fast(struct bio *bio, gfp_t gfp_mask, struct bio_set *bs)
-{
- struct bio *b;
-
- b = bio_alloc_bioset(gfp_mask, 0, bs);
- if (!b)
- return NULL;
-
- __bio_clone_fast(b, bio);
-
- if (bio_integrity(bio)) {
- int ret;
-
- ret = bio_integrity_clone(b, bio, gfp_mask);
-
- if (ret < 0) {
- bio_put(b);
- return NULL;
- }
- }
-
- return b;
-}
-EXPORT_SYMBOL(bio_clone_fast);
-
-/**
- * bio_clone_bioset - clone a bio
- * @bio_src: bio to clone
- * @gfp_mask: allocation priority
- * @bs: bio_set to allocate from
- *
- * Clone bio. Caller will own the returned bio, but not the actual data it
- * points to. Reference count of returned bio will be one.
- */
-struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
- struct bio_set *bs)
-{
- struct bvec_iter iter;
- struct bio_vec bv;
- struct bio *bio;
-
- /*
- * Pre immutable biovecs, __bio_clone() used to just do a memcpy from
- * bio_src->bi_io_vec to bio->bi_io_vec.
- *
- * We can't do that anymore, because:
- *
- * - The point of cloning the biovec is to produce a bio with a biovec
- * the caller can modify: bi_idx and bi_bvec_done should be 0.
- *
- * - The original bio could've had more than BIO_MAX_PAGES biovecs; if
- * we tried to clone the whole thing bio_alloc_bioset() would fail.
- * But the clone should succeed as long as the number of biovecs we
- * actually need to allocate is fewer than BIO_MAX_PAGES.
- *
- * - Lastly, bi_vcnt should not be looked at or relied upon by code
- * that does not own the bio - reason being drivers don't use it for
- * iterating over the biovec anymore, so expecting it to be kept up
- * to date (i.e. for clones that share the parent biovec) is just
- * asking for trouble and would force extra work on
- * __bio_clone_fast() anyways.
- */
-
- bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), bs);
- if (!bio)
- return NULL;
-
- bio->bi_bdev = bio_src->bi_bdev;
- bio->bi_rw = bio_src->bi_rw;
- bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector;
- bio->bi_iter.bi_size = bio_src->bi_iter.bi_size;
-
- if (bio->bi_rw & REQ_DISCARD)
- goto integrity_clone;
-
- if (bio->bi_rw & REQ_WRITE_SAME) {
- bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0];
- goto integrity_clone;
- }
-
- bio_for_each_segment(bv, bio_src, iter)
- bio->bi_io_vec[bio->bi_vcnt++] = bv;
-
-integrity_clone:
- if (bio_integrity(bio_src)) {
- int ret;
-
- ret = bio_integrity_clone(bio, bio_src, gfp_mask);
- if (ret < 0) {
- bio_put(bio);
- return NULL;
- }
- }
-
- return bio;
-}
-EXPORT_SYMBOL(bio_clone_bioset);
-
-/**
- * bio_get_nr_vecs - return approx number of vecs
- * @bdev: I/O target
- *
- * Return the approximate number of pages we can send to this target.
- * There's no guarantee that you will be able to fit this number of pages
- * into a bio, it does not account for dynamic restrictions that vary
- * on offset.
- */
-int bio_get_nr_vecs(struct block_device *bdev)
-{
- struct request_queue *q = bdev_get_queue(bdev);
- int nr_pages;
-
- nr_pages = min_t(unsigned,
- queue_max_segments(q),
- queue_max_sectors(q) / (PAGE_SIZE >> 9) + 1);
-
- return min_t(unsigned, nr_pages, BIO_MAX_PAGES);
-
-}
-EXPORT_SYMBOL(bio_get_nr_vecs);
-
-static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
- *page, unsigned int len, unsigned int offset,
- unsigned int max_sectors)
-{
- int retried_segments = 0;
- struct bio_vec *bvec;
-
- /*
- * cloned bio must not modify vec list
- */
- if (unlikely(bio_flagged(bio, BIO_CLONED)))
- return 0;
-
- if (((bio->bi_iter.bi_size + len) >> 9) > max_sectors)
- return 0;
-
- /*
- * For filesystems with a blocksize smaller than the pagesize
- * we will often be called with the same page as last time and
- * a consecutive offset. Optimize this special case.
- */
- if (bio->bi_vcnt > 0) {
- struct bio_vec *prev = &bio->bi_io_vec[bio->bi_vcnt - 1];
-
- if (page == prev->bv_page &&
- offset == prev->bv_offset + prev->bv_len) {
- unsigned int prev_bv_len = prev->bv_len;
- prev->bv_len += len;
-
- if (q->merge_bvec_fn) {
- struct bvec_merge_data bvm = {
- /* prev_bvec is already charged in
- bi_size, discharge it in order to
- simulate merging updated prev_bvec
- as new bvec. */
- .bi_bdev = bio->bi_bdev,
- .bi_sector = bio->bi_iter.bi_sector,
- .bi_size = bio->bi_iter.bi_size -
- prev_bv_len,
- .bi_rw = bio->bi_rw,
- };
-
- if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len) {
- prev->bv_len -= len;
- return 0;
- }
- }
-
- goto done;
- }
- }
-
- if (bio->bi_vcnt >= bio->bi_max_vecs)
- return 0;
-
- /*
- * we might lose a segment or two here, but rather that than
- * make this too complex.
- */
-
- while (bio->bi_phys_segments >= queue_max_segments(q)) {
-
- if (retried_segments)
- return 0;
-
- retried_segments = 1;
- blk_recount_segments(q, bio);
- }
-
- /*
- * setup the new entry, we might clear it again later if we
- * cannot add the page
- */
- bvec = &bio->bi_io_vec[bio->bi_vcnt];
- bvec->bv_page = page;
- bvec->bv_len = len;
- bvec->bv_offset = offset;
-
- /*
- * if queue has other restrictions (eg varying max sector size
- * depending on offset), it can specify a merge_bvec_fn in the
- * queue to get further control
- */
- if (q->merge_bvec_fn) {
- struct bvec_merge_data bvm = {
- .bi_bdev = bio->bi_bdev,
- .bi_sector = bio->bi_iter.bi_sector,
- .bi_size = bio->bi_iter.bi_size,
- .bi_rw = bio->bi_rw,
- };
-
- /*
- * merge_bvec_fn() returns number of bytes it can accept
- * at this offset
- */
- if (q->merge_bvec_fn(q, &bvm, bvec) < bvec->bv_len) {
- bvec->bv_page = NULL;
- bvec->bv_len = 0;
- bvec->bv_offset = 0;
- return 0;
- }
- }
-
- /* If we may be able to merge these biovecs, force a recount */
- if (bio->bi_vcnt && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec)))
- bio->bi_flags &= ~(1 << BIO_SEG_VALID);
-
- bio->bi_vcnt++;
- bio->bi_phys_segments++;
- done:
- bio->bi_iter.bi_size += len;
- return len;
-}
-
-/**
- * bio_add_pc_page - attempt to add page to bio
- * @q: the target queue
- * @bio: destination bio
- * @page: page to add
- * @len: vec entry length
- * @offset: vec entry offset
- *
- * Attempt to add a page to the bio_vec maplist. This can fail for a
- * number of reasons, such as the bio being full or target block device
- * limitations. The target block device must allow bio's up to PAGE_SIZE,
- * so it is always possible to add a single page to an empty bio.
- *
- * This should only be used by REQ_PC bios.
- */
-int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page *page,
- unsigned int len, unsigned int offset)
-{
- return __bio_add_page(q, bio, page, len, offset,
- queue_max_hw_sectors(q));
-}
-EXPORT_SYMBOL(bio_add_pc_page);
-
-/**
- * bio_add_page - attempt to add page to bio
- * @bio: destination bio
- * @page: page to add
- * @len: vec entry length
- * @offset: vec entry offset
- *
- * Attempt to add a page to the bio_vec maplist. This can fail for a
- * number of reasons, such as the bio being full or target block device
- * limitations. The target block device must allow bio's up to PAGE_SIZE,
- * so it is always possible to add a single page to an empty bio.
- */
-int bio_add_page(struct bio *bio, struct page *page, unsigned int len,
- unsigned int offset)
-{
- struct request_queue *q = bdev_get_queue(bio->bi_bdev);
- return __bio_add_page(q, bio, page, len, offset, queue_max_sectors(q));
-}
-EXPORT_SYMBOL(bio_add_page);
-
-struct submit_bio_ret {
- struct completion event;
- int error;
-};
-
-static void submit_bio_wait_endio(struct bio *bio, int error)
-{
- struct submit_bio_ret *ret = bio->bi_private;
-
- ret->error = error;
- complete(&ret->event);
-}
-
-/**
- * submit_bio_wait - submit a bio, and wait until it completes
- * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead)
- * @bio: The &struct bio which describes the I/O
- *
- * Simple wrapper around submit_bio(). Returns 0 on success, or the error from
- * bio_endio() on failure.
- */
-int submit_bio_wait(int rw, struct bio *bio)
-{
- struct submit_bio_ret ret;
-
- rw |= REQ_SYNC;
- init_completion(&ret.event);
- bio->bi_private = &ret;
- bio->bi_end_io = submit_bio_wait_endio;
- submit_bio(rw, bio);
- wait_for_completion(&ret.event);
-
- return ret.error;
-}
-EXPORT_SYMBOL(submit_bio_wait);
-
-/**
- * bio_advance - increment/complete a bio by some number of bytes
- * @bio: bio to advance
- * @bytes: number of bytes to complete
- *
- * This updates bi_sector, bi_size and bi_idx; if the number of bytes to
- * complete doesn't align with a bvec boundary, then bv_len and bv_offset will
- * be updated on the last bvec as well.
- *
- * @bio will then represent the remaining, uncompleted portion of the io.
- */
-void bio_advance(struct bio *bio, unsigned bytes)
-{
- if (bio_integrity(bio))
- bio_integrity_advance(bio, bytes);
-
- bio_advance_iter(bio, &bio->bi_iter, bytes);
-}
-EXPORT_SYMBOL(bio_advance);
-
-/**
- * bio_alloc_pages - allocates a single page for each bvec in a bio
- * @bio: bio to allocate pages for
- * @gfp_mask: flags for allocation
- *
- * Allocates pages up to @bio->bi_vcnt.
- *
- * Returns 0 on success, -ENOMEM on failure. On failure, any allocated pages are
- * freed.
- */
-int bio_alloc_pages(struct bio *bio, gfp_t gfp_mask)
-{
- int i;
- struct bio_vec *bv;
-
- bio_for_each_segment_all(bv, bio, i) {
- bv->bv_page = alloc_page(gfp_mask);
- if (!bv->bv_page) {
- while (--bv >= bio->bi_io_vec)
- __free_page(bv->bv_page);
- return -ENOMEM;
- }
- }
-
- return 0;
-}
-EXPORT_SYMBOL(bio_alloc_pages);
-
-/**
- * bio_copy_data - copy contents of data buffers from one chain of bios to
- * another
- * @src: source bio list
- * @dst: destination bio list
- *
- * If @src and @dst are single bios, bi_next must be NULL - otherwise, treats
- * @src and @dst as linked lists of bios.
- *
- * Stops when it reaches the end of either @src or @dst - that is, copies
- * min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of bios).
- */
-void bio_copy_data(struct bio *dst, struct bio *src)
-{
- struct bvec_iter src_iter, dst_iter;
- struct bio_vec src_bv, dst_bv;
- void *src_p, *dst_p;
- unsigned bytes;
-
- src_iter = src->bi_iter;
- dst_iter = dst->bi_iter;
-
- while (1) {
- if (!src_iter.bi_size) {
- src = src->bi_next;
- if (!src)
- break;
-
- src_iter = src->bi_iter;
- }
-
- if (!dst_iter.bi_size) {
- dst = dst->bi_next;
- if (!dst)
- break;
-
- dst_iter = dst->bi_iter;
- }
-
- src_bv = bio_iter_iovec(src, src_iter);
- dst_bv = bio_iter_iovec(dst, dst_iter);
-
- bytes = min(src_bv.bv_len, dst_bv.bv_len);
-
- src_p = kmap_atomic(src_bv.bv_page);
- dst_p = kmap_atomic(dst_bv.bv_page);
-
- memcpy(dst_p + dst_bv.bv_offset,
- src_p + src_bv.bv_offset,
- bytes);
-
- kunmap_atomic(dst_p);
- kunmap_atomic(src_p);
-
- bio_advance_iter(src, &src_iter, bytes);
- bio_advance_iter(dst, &dst_iter, bytes);
- }
-}
-EXPORT_SYMBOL(bio_copy_data);
-
-struct bio_map_data {
- int nr_sgvecs;
- int is_our_pages;
- struct sg_iovec sgvecs[];
-};
-
-static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio,
- const struct sg_iovec *iov, int iov_count,
- int is_our_pages)
-{
- memcpy(bmd->sgvecs, iov, sizeof(struct sg_iovec) * iov_count);
- bmd->nr_sgvecs = iov_count;
- bmd->is_our_pages = is_our_pages;
- bio->bi_private = bmd;
-}
-
-static struct bio_map_data *bio_alloc_map_data(int nr_segs,
- unsigned int iov_count,
- gfp_t gfp_mask)
-{
- if (iov_count > UIO_MAXIOV)
- return NULL;
-
- return kmalloc(sizeof(struct bio_map_data) +
- sizeof(struct sg_iovec) * iov_count, gfp_mask);
-}
-
-static int __bio_copy_iov(struct bio *bio, const struct sg_iovec *iov, int iov_count,
- int to_user, int from_user, int do_free_page)
-{
- int ret = 0, i;
- struct bio_vec *bvec;
- int iov_idx = 0;
- unsigned int iov_off = 0;
-
- bio_for_each_segment_all(bvec, bio, i) {
- char *bv_addr = page_address(bvec->bv_page);
- unsigned int bv_len = bvec->bv_len;
-
- while (bv_len && iov_idx < iov_count) {
- unsigned int bytes;
- char __user *iov_addr;
-
- bytes = min_t(unsigned int,
- iov[iov_idx].iov_len - iov_off, bv_len);
- iov_addr = iov[iov_idx].iov_base + iov_off;
-
- if (!ret) {
- if (to_user)
- ret = copy_to_user(iov_addr, bv_addr,
- bytes);
-
- if (from_user)
- ret = copy_from_user(bv_addr, iov_addr,
- bytes);
-
- if (ret)
- ret = -EFAULT;
- }
-
- bv_len -= bytes;
- bv_addr += bytes;
- iov_addr += bytes;
- iov_off += bytes;
-
- if (iov[iov_idx].iov_len == iov_off) {
- iov_idx++;
- iov_off = 0;
- }
- }
-
- if (do_free_page)
- __free_page(bvec->bv_page);
- }
-
- return ret;
-}
-
-/**
- * bio_uncopy_user - finish previously mapped bio
- * @bio: bio being terminated
- *
- * Free pages allocated from bio_copy_user() and write back data
- * to user space in case of a read.
- */
-int bio_uncopy_user(struct bio *bio)
-{
- struct bio_map_data *bmd = bio->bi_private;
- struct bio_vec *bvec;
- int ret = 0, i;
-
- if (!bio_flagged(bio, BIO_NULL_MAPPED)) {
- /*
- * if we're in a workqueue, the request is orphaned, so
- * don't copy into a random user address space, just free.
- */
- if (current->mm)
- ret = __bio_copy_iov(bio, bmd->sgvecs, bmd->nr_sgvecs,
- bio_data_dir(bio) == READ,
- 0, bmd->is_our_pages);
- else if (bmd->is_our_pages)
- bio_for_each_segment_all(bvec, bio, i)
- __free_page(bvec->bv_page);
- }
- kfree(bmd);
- bio_put(bio);
- return ret;
-}
-EXPORT_SYMBOL(bio_uncopy_user);
-
-/**
- * bio_copy_user_iov - copy user data to bio
- * @q: destination block queue
- * @map_data: pointer to the rq_map_data holding pages (if necessary)
- * @iov: the iovec.
- * @iov_count: number of elements in the iovec
- * @write_to_vm: bool indicating writing to pages or not
- * @gfp_mask: memory allocation flags
- *
- * Prepares and returns a bio for indirect user io, bouncing data
- * to/from kernel pages as necessary. Must be paired with
- * call bio_uncopy_user() on io completion.
- */
-struct bio *bio_copy_user_iov(struct request_queue *q,
- struct rq_map_data *map_data,
- const struct sg_iovec *iov, int iov_count,
- int write_to_vm, gfp_t gfp_mask)
-{
- struct bio_map_data *bmd;
- struct bio_vec *bvec;
- struct page *page;
- struct bio *bio;
- int i, ret;
- int nr_pages = 0;
- unsigned int len = 0;
- unsigned int offset = map_data ? map_data->offset & ~PAGE_MASK : 0;
-
- for (i = 0; i < iov_count; i++) {
- unsigned long uaddr;
- unsigned long end;
- unsigned long start;
-
- uaddr = (unsigned long)iov[i].iov_base;
- end = (uaddr + iov[i].iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
- start = uaddr >> PAGE_SHIFT;
-
- /*
- * Overflow, abort
- */
- if (end < start)
- return ERR_PTR(-EINVAL);
-
- nr_pages += end - start;
- len += iov[i].iov_len;
- }
-
- if (offset)
- nr_pages++;
-
- bmd = bio_alloc_map_data(nr_pages, iov_count, gfp_mask);
- if (!bmd)
- return ERR_PTR(-ENOMEM);
-
- ret = -ENOMEM;
- bio = bio_kmalloc(gfp_mask, nr_pages);
- if (!bio)
- goto out_bmd;
-
- if (!write_to_vm)
- bio->bi_rw |= REQ_WRITE;
-
- ret = 0;
-
- if (map_data) {
- nr_pages = 1 << map_data->page_order;
- i = map_data->offset / PAGE_SIZE;
- }
- while (len) {
- unsigned int bytes = PAGE_SIZE;
-
- bytes -= offset;
-
- if (bytes > len)
- bytes = len;
-
- if (map_data) {
- if (i == map_data->nr_entries * nr_pages) {
- ret = -ENOMEM;
- break;
- }
-
- page = map_data->pages[i / nr_pages];
- page += (i % nr_pages);
-
- i++;
- } else {
- page = alloc_page(q->bounce_gfp | gfp_mask);
- if (!page) {
- ret = -ENOMEM;
- break;
- }
- }
-
- if (bio_add_pc_page(q, bio, page, bytes, offset) < bytes)
- break;
-
- len -= bytes;
- offset = 0;
- }
-
- if (ret)
- goto cleanup;
-
- /*
- * success
- */
- if ((!write_to_vm && (!map_data || !map_data->null_mapped)) ||
- (map_data && map_data->from_user)) {
- ret = __bio_copy_iov(bio, iov, iov_count, 0, 1, 0);
- if (ret)
- goto cleanup;
- }
-
- bio_set_map_data(bmd, bio, iov, iov_count, map_data ? 0 : 1);
- return bio;
-cleanup:
- if (!map_data)
- bio_for_each_segment_all(bvec, bio, i)
- __free_page(bvec->bv_page);
-
- bio_put(bio);
-out_bmd:
- kfree(bmd);
- return ERR_PTR(ret);
-}
-
-/**
- * bio_copy_user - copy user data to bio
- * @q: destination block queue
- * @map_data: pointer to the rq_map_data holding pages (if necessary)
- * @uaddr: start of user address
- * @len: length in bytes
- * @write_to_vm: bool indicating writing to pages or not
- * @gfp_mask: memory allocation flags
- *
- * Prepares and returns a bio for indirect user io, bouncing data
- * to/from kernel pages as necessary. Must be paired with
- * call bio_uncopy_user() on io completion.
- */
-struct bio *bio_copy_user(struct request_queue *q, struct rq_map_data *map_data,
- unsigned long uaddr, unsigned int len,
- int write_to_vm, gfp_t gfp_mask)
-{
- struct sg_iovec iov;
-
- iov.iov_base = (void __user *)uaddr;
- iov.iov_len = len;
-
- return bio_copy_user_iov(q, map_data, &iov, 1, write_to_vm, gfp_mask);
-}
-EXPORT_SYMBOL(bio_copy_user);
-
-static struct bio *__bio_map_user_iov(struct request_queue *q,
- struct block_device *bdev,
- const struct sg_iovec *iov, int iov_count,
- int write_to_vm, gfp_t gfp_mask)
-{
- int i, j;
- int nr_pages = 0;
- struct page **pages;
- struct bio *bio;
- int cur_page = 0;
- int ret, offset;
-
- for (i = 0; i < iov_count; i++) {
- unsigned long uaddr = (unsigned long)iov[i].iov_base;
- unsigned long len = iov[i].iov_len;
- unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
- unsigned long start = uaddr >> PAGE_SHIFT;
-
- /*
- * Overflow, abort
- */
- if (end < start)
- return ERR_PTR(-EINVAL);
-
- nr_pages += end - start;
- /*
- * buffer must be aligned to at least hardsector size for now
- */
- if (uaddr & queue_dma_alignment(q))
- return ERR_PTR(-EINVAL);
- }
-
- if (!nr_pages)
- return ERR_PTR(-EINVAL);
-
- bio = bio_kmalloc(gfp_mask, nr_pages);
- if (!bio)
- return ERR_PTR(-ENOMEM);
-
- ret = -ENOMEM;
- pages = kcalloc(nr_pages, sizeof(struct page *), gfp_mask);
- if (!pages)
- goto out;
-
- for (i = 0; i < iov_count; i++) {
- unsigned long uaddr = (unsigned long)iov[i].iov_base;
- unsigned long len = iov[i].iov_len;
- unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
- unsigned long start = uaddr >> PAGE_SHIFT;
- const int local_nr_pages = end - start;
- const int page_limit = cur_page + local_nr_pages;
-
- ret = get_user_pages_fast(uaddr, local_nr_pages,
- write_to_vm, &pages[cur_page]);
- if (ret < local_nr_pages) {
- ret = -EFAULT;
- goto out_unmap;
- }
-
- offset = uaddr & ~PAGE_MASK;
- for (j = cur_page; j < page_limit; j++) {
- unsigned int bytes = PAGE_SIZE - offset;
-
- if (len <= 0)
- break;
-
- if (bytes > len)
- bytes = len;
-
- /*
- * sorry...
- */
- if (bio_add_pc_page(q, bio, pages[j], bytes, offset) <
- bytes)
- break;
-
- len -= bytes;
- offset = 0;
- }
-
- cur_page = j;
- /*
- * release the pages we didn't map into the bio, if any
- */
- while (j < page_limit)
- page_cache_release(pages[j++]);
- }
-
- kfree(pages);
-
- /*
- * set data direction, and check if mapped pages need bouncing
- */
- if (!write_to_vm)
- bio->bi_rw |= REQ_WRITE;
-
- bio->bi_bdev = bdev;
- bio->bi_flags |= (1 << BIO_USER_MAPPED);
- return bio;
-
- out_unmap:
- for (i = 0; i < nr_pages; i++) {
- if(!pages[i])
- break;
- page_cache_release(pages[i]);
- }
- out:
- kfree(pages);
- bio_put(bio);
- return ERR_PTR(ret);
-}
-
-/**
- * bio_map_user - map user address into bio
- * @q: the struct request_queue for the bio
- * @bdev: destination block device
- * @uaddr: start of user address
- * @len: length in bytes
- * @write_to_vm: bool indicating writing to pages or not
- * @gfp_mask: memory allocation flags
- *
- * Map the user space address into a bio suitable for io to a block
- * device. Returns an error pointer in case of error.
- */
-struct bio *bio_map_user(struct request_queue *q, struct block_device *bdev,
- unsigned long uaddr, unsigned int len, int write_to_vm,
- gfp_t gfp_mask)
-{
- struct sg_iovec iov;
-
- iov.iov_base = (void __user *)uaddr;
- iov.iov_len = len;
-
- return bio_map_user_iov(q, bdev, &iov, 1, write_to_vm, gfp_mask);
-}
-EXPORT_SYMBOL(bio_map_user);
-
-/**
- * bio_map_user_iov - map user sg_iovec table into bio
- * @q: the struct request_queue for the bio
- * @bdev: destination block device
- * @iov: the iovec.
- * @iov_count: number of elements in the iovec
- * @write_to_vm: bool indicating writing to pages or not
- * @gfp_mask: memory allocation flags
- *
- * Map the user space address into a bio suitable for io to a block
- * device. Returns an error pointer in case of error.
- */
-struct bio *bio_map_user_iov(struct request_queue *q, struct block_device *bdev,
- const struct sg_iovec *iov, int iov_count,
- int write_to_vm, gfp_t gfp_mask)
-{
- struct bio *bio;
-
- bio = __bio_map_user_iov(q, bdev, iov, iov_count, write_to_vm,
- gfp_mask);
- if (IS_ERR(bio))
- return bio;
-
- /*
- * subtle -- if __bio_map_user() ended up bouncing a bio,
- * it would normally disappear when its bi_end_io is run.
- * however, we need it for the unmap, so grab an extra
- * reference to it
- */
- bio_get(bio);
-
- return bio;
-}
-
-static void __bio_unmap_user(struct bio *bio)
-{
- struct bio_vec *bvec;
- int i;
-
- /*
- * make sure we dirty pages we wrote to
- */
- bio_for_each_segment_all(bvec, bio, i) {
- if (bio_data_dir(bio) == READ)
- set_page_dirty_lock(bvec->bv_page);
-
- page_cache_release(bvec->bv_page);
- }
-
- bio_put(bio);
-}
-
-/**
- * bio_unmap_user - unmap a bio
- * @bio: the bio being unmapped
- *
- * Unmap a bio previously mapped by bio_map_user(). Must be called with
- * a process context.
- *
- * bio_unmap_user() may sleep.
- */
-void bio_unmap_user(struct bio *bio)
-{
- __bio_unmap_user(bio);
- bio_put(bio);
-}
-EXPORT_SYMBOL(bio_unmap_user);
-
-static void bio_map_kern_endio(struct bio *bio, int err)
-{
- bio_put(bio);
-}
-
-static struct bio *__bio_map_kern(struct request_queue *q, void *data,
- unsigned int len, gfp_t gfp_mask)
-{
- unsigned long kaddr = (unsigned long)data;
- unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
- unsigned long start = kaddr >> PAGE_SHIFT;
- const int nr_pages = end - start;
- int offset, i;
- struct bio *bio;
-
- bio = bio_kmalloc(gfp_mask, nr_pages);
- if (!bio)
- return ERR_PTR(-ENOMEM);
-
- offset = offset_in_page(kaddr);
- for (i = 0; i < nr_pages; i++) {
- unsigned int bytes = PAGE_SIZE - offset;
-
- if (len <= 0)
- break;
-
- if (bytes > len)
- bytes = len;
-
- if (bio_add_pc_page(q, bio, virt_to_page(data), bytes,
- offset) < bytes)
- break;
-
- data += bytes;
- len -= bytes;
- offset = 0;
- }
-
- bio->bi_end_io = bio_map_kern_endio;
- return bio;
-}
-
-/**
- * bio_map_kern - map kernel address into bio
- * @q: the struct request_queue for the bio
- * @data: pointer to buffer to map
- * @len: length in bytes
- * @gfp_mask: allocation flags for bio allocation
- *
- * Map the kernel address into a bio suitable for io to a block
- * device. Returns an error pointer in case of error.
- */
-struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len,
- gfp_t gfp_mask)
-{
- struct bio *bio;
-
- bio = __bio_map_kern(q, data, len, gfp_mask);
- if (IS_ERR(bio))
- return bio;
-
- if (bio->bi_iter.bi_size == len)
- return bio;
-
- /*
- * Don't support partial mappings.
- */
- bio_put(bio);
- return ERR_PTR(-EINVAL);
-}
-EXPORT_SYMBOL(bio_map_kern);
-
-static void bio_copy_kern_endio(struct bio *bio, int err)
-{
- struct bio_vec *bvec;
- const int read = bio_data_dir(bio) == READ;
- struct bio_map_data *bmd = bio->bi_private;
- int i;
- char *p = bmd->sgvecs[0].iov_base;
-
- bio_for_each_segment_all(bvec, bio, i) {
- char *addr = page_address(bvec->bv_page);
-
- if (read)
- memcpy(p, addr, bvec->bv_len);
-
- __free_page(bvec->bv_page);
- p += bvec->bv_len;
- }
-
- kfree(bmd);
- bio_put(bio);
-}
-
-/**
- * bio_copy_kern - copy kernel address into bio
- * @q: the struct request_queue for the bio
- * @data: pointer to buffer to copy
- * @len: length in bytes
- * @gfp_mask: allocation flags for bio and page allocation
- * @reading: data direction is READ
- *
- * copy the kernel address into a bio suitable for io to a block
- * device. Returns an error pointer in case of error.
- */
-struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len,
- gfp_t gfp_mask, int reading)
-{
- struct bio *bio;
- struct bio_vec *bvec;
- int i;
-
- bio = bio_copy_user(q, NULL, (unsigned long)data, len, 1, gfp_mask);
- if (IS_ERR(bio))
- return bio;
-
- if (!reading) {
- void *p = data;
-
- bio_for_each_segment_all(bvec, bio, i) {
- char *addr = page_address(bvec->bv_page);
-
- memcpy(addr, p, bvec->bv_len);
- p += bvec->bv_len;
- }
- }
-
- bio->bi_end_io = bio_copy_kern_endio;
-
- return bio;
-}
-EXPORT_SYMBOL(bio_copy_kern);
-
-/*
- * bio_set_pages_dirty() and bio_check_pages_dirty() are support functions
- * for performing direct-IO in BIOs.
- *
- * The problem is that we cannot run set_page_dirty() from interrupt context
- * because the required locks are not interrupt-safe. So what we can do is to
- * mark the pages dirty _before_ performing IO. And in interrupt context,
- * check that the pages are still dirty. If so, fine. If not, redirty them
- * in process context.
- *
- * We special-case compound pages here: normally this means reads into hugetlb
- * pages. The logic in here doesn't really work right for compound pages
- * because the VM does not uniformly chase down the head page in all cases.
- * But dirtiness of compound pages is pretty meaningless anyway: the VM doesn't
- * handle them at all. So we skip compound pages here at an early stage.
- *
- * Note that this code is very hard to test under normal circumstances because
- * direct-io pins the pages with get_user_pages(). This makes
- * is_page_cache_freeable return false, and the VM will not clean the pages.
- * But other code (eg, flusher threads) could clean the pages if they are mapped
- * pagecache.
- *
- * Simply disabling the call to bio_set_pages_dirty() is a good way to test the
- * deferred bio dirtying paths.
- */
-
-/*
- * bio_set_pages_dirty() will mark all the bio's pages as dirty.
- */
-void bio_set_pages_dirty(struct bio *bio)
-{
- struct bio_vec *bvec;
- int i;
-
- bio_for_each_segment_all(bvec, bio, i) {
- struct page *page = bvec->bv_page;
-
- if (page && !PageCompound(page))
- set_page_dirty_lock(page);
- }
-}
-
-static void bio_release_pages(struct bio *bio)
-{
- struct bio_vec *bvec;
- int i;
-
- bio_for_each_segment_all(bvec, bio, i) {
- struct page *page = bvec->bv_page;
-
- if (page)
- put_page(page);
- }
-}
-
-/*
- * bio_check_pages_dirty() will check that all the BIO's pages are still dirty.
- * If they are, then fine. If, however, some pages are clean then they must
- * have been written out during the direct-IO read. So we take another ref on
- * the BIO and the offending pages and re-dirty the pages in process context.
- *
- * It is expected that bio_check_pages_dirty() will wholly own the BIO from
- * here on. It will run one page_cache_release() against each page and will
- * run one bio_put() against the BIO.
- */
-
-static void bio_dirty_fn(struct work_struct *work);
-
-static DECLARE_WORK(bio_dirty_work, bio_dirty_fn);
-static DEFINE_SPINLOCK(bio_dirty_lock);
-static struct bio *bio_dirty_list;
-
-/*
- * This runs in process context
- */
-static void bio_dirty_fn(struct work_struct *work)
-{
- unsigned long flags;
- struct bio *bio;
-
- spin_lock_irqsave(&bio_dirty_lock, flags);
- bio = bio_dirty_list;
- bio_dirty_list = NULL;
- spin_unlock_irqrestore(&bio_dirty_lock, flags);
-
- while (bio) {
- struct bio *next = bio->bi_private;
-
- bio_set_pages_dirty(bio);
- bio_release_pages(bio);
- bio_put(bio);
- bio = next;
- }
-}
-
-void bio_check_pages_dirty(struct bio *bio)
-{
- struct bio_vec *bvec;
- int nr_clean_pages = 0;
- int i;
-
- bio_for_each_segment_all(bvec, bio, i) {
- struct page *page = bvec->bv_page;
-
- if (PageDirty(page) || PageCompound(page)) {
- page_cache_release(page);
- bvec->bv_page = NULL;
- } else {
- nr_clean_pages++;
- }
- }
-
- if (nr_clean_pages) {
- unsigned long flags;
-
- spin_lock_irqsave(&bio_dirty_lock, flags);
- bio->bi_private = bio_dirty_list;
- bio_dirty_list = bio;
- spin_unlock_irqrestore(&bio_dirty_lock, flags);
- schedule_work(&bio_dirty_work);
- } else {
- bio_put(bio);
- }
-}
-
-#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
-void bio_flush_dcache_pages(struct bio *bi)
-{
- struct bio_vec bvec;
- struct bvec_iter iter;
-
- bio_for_each_segment(bvec, bi, iter)
- flush_dcache_page(bvec.bv_page);
-}
-EXPORT_SYMBOL(bio_flush_dcache_pages);
-#endif
-
-/**
- * bio_endio - end I/O on a bio
- * @bio: bio
- * @error: error, if any
- *
- * Description:
- * bio_endio() will end I/O on the whole bio. bio_endio() is the
- * preferred way to end I/O on a bio, it takes care of clearing
- * BIO_UPTODATE on error. @error is 0 on success, and and one of the
- * established -Exxxx (-EIO, for instance) error values in case
- * something went wrong. No one should call bi_end_io() directly on a
- * bio unless they own it and thus know that it has an end_io
- * function.
- **/
-void bio_endio(struct bio *bio, int error)
-{
- while (bio) {
- BUG_ON(atomic_read(&bio->bi_remaining) <= 0);
-
- if (error)
- clear_bit(BIO_UPTODATE, &bio->bi_flags);
- else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
- error = -EIO;
-
- if (!atomic_dec_and_test(&bio->bi_remaining))
- return;
-
- /*
- * Need to have a real endio function for chained bios,
- * otherwise various corner cases will break (like stacking
- * block devices that save/restore bi_end_io) - however, we want
- * to avoid unbounded recursion and blowing the stack. Tail call
- * optimization would handle this, but compiling with frame
- * pointers also disables gcc's sibling call optimization.
- */
- if (bio->bi_end_io == bio_chain_endio) {
- struct bio *parent = bio->bi_private;
- bio_put(bio);
- bio = parent;
- } else {
- if (bio->bi_end_io)
- bio->bi_end_io(bio, error);
- bio = NULL;
- }
- }
-}
-EXPORT_SYMBOL(bio_endio);
-
-/**
- * bio_endio_nodec - end I/O on a bio, without decrementing bi_remaining
- * @bio: bio
- * @error: error, if any
- *
- * For code that has saved and restored bi_end_io; thing hard before using this
- * function, probably you should've cloned the entire bio.
- **/
-void bio_endio_nodec(struct bio *bio, int error)
-{
- atomic_inc(&bio->bi_remaining);
- bio_endio(bio, error);
-}
-EXPORT_SYMBOL(bio_endio_nodec);
-
-/**
- * bio_split - split a bio
- * @bio: bio to split
- * @sectors: number of sectors to split from the front of @bio
- * @gfp: gfp mask
- * @bs: bio set to allocate from
- *
- * Allocates and returns a new bio which represents @sectors from the start of
- * @bio, and updates @bio to represent the remaining sectors.
- *
- * The newly allocated bio will point to @bio's bi_io_vec; it is the caller's
- * responsibility to ensure that @bio is not freed before the split.
- */
-struct bio *bio_split(struct bio *bio, int sectors,
- gfp_t gfp, struct bio_set *bs)
-{
- struct bio *split = NULL;
-
- BUG_ON(sectors <= 0);
- BUG_ON(sectors >= bio_sectors(bio));
-
- split = bio_clone_fast(bio, gfp, bs);
- if (!split)
- return NULL;
-
- split->bi_iter.bi_size = sectors << 9;
-
- if (bio_integrity(split))
- bio_integrity_trim(split, 0, sectors);
-
- bio_advance(bio, split->bi_iter.bi_size);
-
- return split;
-}
-EXPORT_SYMBOL(bio_split);
-
-/**
- * bio_trim - trim a bio
- * @bio: bio to trim
- * @offset: number of sectors to trim from the front of @bio
- * @size: size we want to trim @bio to, in sectors
- */
-void bio_trim(struct bio *bio, int offset, int size)
-{
- /* 'bio' is a cloned bio which we need to trim to match
- * the given offset and size.
- */
-
- size <<= 9;
- if (offset == 0 && size == bio->bi_iter.bi_size)
- return;
-
- clear_bit(BIO_SEG_VALID, &bio->bi_flags);
-
- bio_advance(bio, offset << 9);
-
- bio->bi_iter.bi_size = size;
-}
-EXPORT_SYMBOL_GPL(bio_trim);
-
-/*
- * create memory pools for biovec's in a bio_set.
- * use the global biovec slabs created for general use.
- */
-mempool_t *biovec_create_pool(struct bio_set *bs, int pool_entries)
-{
- struct biovec_slab *bp = bvec_slabs + BIOVEC_MAX_IDX;
-
- return mempool_create_slab_pool(pool_entries, bp->slab);
-}
-
-void bioset_free(struct bio_set *bs)
-{
- if (bs->rescue_workqueue)
- destroy_workqueue(bs->rescue_workqueue);
-
- if (bs->bio_pool)
- mempool_destroy(bs->bio_pool);
-
- if (bs->bvec_pool)
- mempool_destroy(bs->bvec_pool);
-
- bioset_integrity_free(bs);
- bio_put_slab(bs);
-
- kfree(bs);
-}
-EXPORT_SYMBOL(bioset_free);
-
-/**
- * bioset_create - Create a bio_set
- * @pool_size: Number of bio and bio_vecs to cache in the mempool
- * @front_pad: Number of bytes to allocate in front of the returned bio
- *
- * Description:
- * Set up a bio_set to be used with @bio_alloc_bioset. Allows the caller
- * to ask for a number of bytes to be allocated in front of the bio.
- * Front pad allocation is useful for embedding the bio inside
- * another structure, to avoid allocating extra data to go with the bio.
- * Note that the bio must be embedded at the END of that structure always,
- * or things will break badly.
- */
-struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad)
-{
- unsigned int back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec);
- struct bio_set *bs;
-
- bs = kzalloc(sizeof(*bs), GFP_KERNEL);
- if (!bs)
- return NULL;
-
- bs->front_pad = front_pad;
-
- spin_lock_init(&bs->rescue_lock);
- bio_list_init(&bs->rescue_list);
- INIT_WORK(&bs->rescue_work, bio_alloc_rescue);
-
- bs->bio_slab = bio_find_or_create_slab(front_pad + back_pad);
- if (!bs->bio_slab) {
- kfree(bs);
- return NULL;
- }
-
- bs->bio_pool = mempool_create_slab_pool(pool_size, bs->bio_slab);
- if (!bs->bio_pool)
- goto bad;
-
- bs->bvec_pool = biovec_create_pool(bs, pool_size);
- if (!bs->bvec_pool)
- goto bad;
-
- bs->rescue_workqueue = alloc_workqueue("bioset", WQ_MEM_RECLAIM, 0);
- if (!bs->rescue_workqueue)
- goto bad;
-
- return bs;
-bad:
- bioset_free(bs);
- return NULL;
-}
-EXPORT_SYMBOL(bioset_create);
-
-#ifdef CONFIG_BLK_CGROUP
-/**
- * bio_associate_current - associate a bio with %current
- * @bio: target bio
- *
- * Associate @bio with %current if it hasn't been associated yet. Block
- * layer will treat @bio as if it were issued by %current no matter which
- * task actually issues it.
- *
- * This function takes an extra reference of @task's io_context and blkcg
- * which will be put when @bio is released. The caller must own @bio,
- * ensure %current->io_context exists, and is responsible for synchronizing
- * calls to this function.
- */
-int bio_associate_current(struct bio *bio)
-{
- struct io_context *ioc;
- struct cgroup_subsys_state *css;
-
- if (bio->bi_ioc)
- return -EBUSY;
-
- ioc = current->io_context;
- if (!ioc)
- return -ENOENT;
-
- /* acquire active ref on @ioc and associate */
- get_io_context_active(ioc);
- bio->bi_ioc = ioc;
-
- /* associate blkcg if exists */
- rcu_read_lock();
- css = task_css(current, blkio_cgrp_id);
- if (css && css_tryget(css))
- bio->bi_css = css;
- rcu_read_unlock();
-
- return 0;
-}
-
-/**
- * bio_disassociate_task - undo bio_associate_current()
- * @bio: target bio
- */
-void bio_disassociate_task(struct bio *bio)
-{
- if (bio->bi_ioc) {
- put_io_context(bio->bi_ioc);
- bio->bi_ioc = NULL;
- }
- if (bio->bi_css) {
- css_put(bio->bi_css);
- bio->bi_css = NULL;
- }
-}
-
-#endif /* CONFIG_BLK_CGROUP */
-
-static void __init biovec_init_slabs(void)
-{
- int i;
-
- for (i = 0; i < BIOVEC_NR_POOLS; i++) {
- int size;
- struct biovec_slab *bvs = bvec_slabs + i;
-
- if (bvs->nr_vecs <= BIO_INLINE_VECS) {
- bvs->slab = NULL;
- continue;
- }
-
- size = bvs->nr_vecs * sizeof(struct bio_vec);
- bvs->slab = kmem_cache_create(bvs->name, size, 0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
- }
-}
-
-static int __init init_bio(void)
-{
- bio_slab_max = 2;
- bio_slab_nr = 0;
- bio_slabs = kzalloc(bio_slab_max * sizeof(struct bio_slab), GFP_KERNEL);
- if (!bio_slabs)
- panic("bio: can't allocate bios\n");
-
- bio_integrity_init();
- biovec_init_slabs();
-
- fs_bio_set = bioset_create(BIO_POOL_SIZE, 0);
- if (!fs_bio_set)
- panic("bio: can't allocate bios\n");
-
- if (bioset_integrity_create(fs_bio_set, BIO_POOL_SIZE))
- panic("bio: can't create integrity pool\n");
-
- return 0;
-}
-subsys_initcall(init_bio);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 4c48df572bd6..ba6b88528dc7 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2058,6 +2058,20 @@ struct btrfs_ioctl_defrag_range_args {
#define btrfs_raw_test_opt(o, opt) ((o) & BTRFS_MOUNT_##opt)
#define btrfs_test_opt(root, opt) ((root)->fs_info->mount_opt & \
BTRFS_MOUNT_##opt)
+#define btrfs_set_and_info(root, opt, fmt, args...) \
+{ \
+ if (!btrfs_test_opt(root, opt)) \
+ btrfs_info(root->fs_info, fmt, ##args); \
+ btrfs_set_opt(root->fs_info->mount_opt, opt); \
+}
+
+#define btrfs_clear_and_info(root, opt, fmt, args...) \
+{ \
+ if (btrfs_test_opt(root, opt)) \
+ btrfs_info(root->fs_info, fmt, ##args); \
+ btrfs_clear_opt(root->fs_info->mount_opt, opt); \
+}
+
/*
* Inode flags
*/
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 029d46c2e170..983314932af3 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2861,7 +2861,7 @@ retry_root_backup:
printk(KERN_ERR "BTRFS: failed to read log tree\n");
free_extent_buffer(log_tree_root->node);
kfree(log_tree_root);
- goto fail_trans_kthread;
+ goto fail_qgroup;
}
/* returns with log_tree_root freed on success */
ret = btrfs_recover_log_trees(log_tree_root);
@@ -2870,24 +2870,24 @@ retry_root_backup:
"Failed to recover log tree");
free_extent_buffer(log_tree_root->node);
kfree(log_tree_root);
- goto fail_trans_kthread;
+ goto fail_qgroup;
}
if (sb->s_flags & MS_RDONLY) {
ret = btrfs_commit_super(tree_root);
if (ret)
- goto fail_trans_kthread;
+ goto fail_qgroup;
}
}
ret = btrfs_find_orphan_roots(tree_root);
if (ret)
- goto fail_trans_kthread;
+ goto fail_qgroup;
if (!(sb->s_flags & MS_RDONLY)) {
ret = btrfs_cleanup_fs_roots(fs_info);
if (ret)
- goto fail_trans_kthread;
+ goto fail_qgroup;
ret = btrfs_recover_relocation(tree_root);
if (ret < 0) {
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 1306487c82cf..5590af92094b 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1542,6 +1542,7 @@ again:
ret = 0;
}
if (ret) {
+ key.objectid = bytenr;
key.type = BTRFS_EXTENT_ITEM_KEY;
key.offset = num_bytes;
btrfs_release_path(path);
@@ -3542,11 +3543,13 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
return extended_to_chunk(flags | tmp);
}
-static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
+static u64 get_alloc_profile(struct btrfs_root *root, u64 orig_flags)
{
unsigned seq;
+ u64 flags;
do {
+ flags = orig_flags;
seq = read_seqbegin(&root->fs_info->profiles_lock);
if (flags & BTRFS_BLOCK_GROUP_DATA)
@@ -5719,6 +5722,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
if (ret > 0 && skinny_metadata) {
skinny_metadata = false;
+ key.objectid = bytenr;
key.type = BTRFS_EXTENT_ITEM_KEY;
key.offset = num_bytes;
btrfs_release_path(path);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index eb742c07e7a4..ae6af072b635 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -800,7 +800,7 @@ next_slot:
if (start > key.offset && end < extent_end) {
BUG_ON(del_nr > 0);
if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
- ret = -EINVAL;
+ ret = -EOPNOTSUPP;
break;
}
@@ -846,7 +846,7 @@ next_slot:
*/
if (start <= key.offset && end < extent_end) {
if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
- ret = -EINVAL;
+ ret = -EOPNOTSUPP;
break;
}
@@ -872,7 +872,7 @@ next_slot:
if (start > key.offset && end >= extent_end) {
BUG_ON(del_nr > 0);
if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
- ret = -EINVAL;
+ ret = -EOPNOTSUPP;
break;
}
@@ -1777,7 +1777,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
start_pos = round_down(pos, root->sectorsize);
if (start_pos > i_size_read(inode)) {
/* Expand hole size to cover write data, preventing empty gap */
- end_pos = round_up(pos + iov->iov_len, root->sectorsize);
+ end_pos = round_up(pos + count, root->sectorsize);
err = btrfs_cont_expand(inode, i_size_read(inode), end_pos);
if (err) {
mutex_unlock(&inode->i_mutex);
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index cc8ca193d830..86935f5ae291 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -176,7 +176,11 @@ static void start_caching(struct btrfs_root *root)
tsk = kthread_run(caching_kthread, root, "btrfs-ino-cache-%llu\n",
root->root_key.objectid);
- BUG_ON(IS_ERR(tsk)); /* -ENOMEM */
+ if (IS_ERR(tsk)) {
+ btrfs_warn(root->fs_info, "failed to start inode caching task");
+ btrfs_clear_and_info(root, CHANGE_INODE_CACHE,
+ "disabling inode map caching");
+ }
}
int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid)
@@ -205,24 +209,14 @@ again:
void btrfs_return_ino(struct btrfs_root *root, u64 objectid)
{
- struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
struct btrfs_free_space_ctl *pinned = root->free_ino_pinned;
if (!btrfs_test_opt(root, INODE_MAP_CACHE))
return;
-
again:
if (root->cached == BTRFS_CACHE_FINISHED) {
- __btrfs_add_free_space(ctl, objectid, 1);
+ __btrfs_add_free_space(pinned, objectid, 1);
} else {
- /*
- * If we are in the process of caching free ino chunks,
- * to avoid adding the same inode number to the free_ino
- * tree twice due to cross transaction, we'll leave it
- * in the pinned tree until a transaction is committed
- * or the caching work is done.
- */
-
down_write(&root->fs_info->commit_root_sem);
spin_lock(&root->cache_lock);
if (root->cached == BTRFS_CACHE_FINISHED) {
@@ -234,11 +228,7 @@ again:
start_caching(root);
- if (objectid <= root->cache_progress ||
- objectid >= root->highest_objectid)
- __btrfs_add_free_space(ctl, objectid, 1);
- else
- __btrfs_add_free_space(pinned, objectid, 1);
+ __btrfs_add_free_space(pinned, objectid, 1);
up_write(&root->fs_info->commit_root_sem);
}
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index f45040a4bb76..3f52bb7a58d2 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -3066,7 +3066,7 @@ process_slot:
new_key.offset + datal,
1);
if (ret) {
- if (ret != -EINVAL)
+ if (ret != -EOPNOTSUPP)
btrfs_abort_transaction(trans,
root, ret);
btrfs_end_transaction(trans, root);
@@ -3120,6 +3120,8 @@ process_slot:
} else if (type == BTRFS_FILE_EXTENT_INLINE) {
u64 skip = 0;
u64 trim = 0;
+ u64 aligned_end = 0;
+
if (off > key.offset) {
skip = off - key.offset;
new_key.offset += skip;
@@ -3136,12 +3138,14 @@ process_slot:
size -= skip + trim;
datal -= skip + trim;
+ aligned_end = ALIGN(new_key.offset + datal,
+ root->sectorsize);
ret = btrfs_drop_extents(trans, root, inode,
new_key.offset,
- new_key.offset + datal,
+ aligned_end,
1);
if (ret) {
- if (ret != -EINVAL)
+ if (ret != -EOPNOTSUPP)
btrfs_abort_transaction(trans,
root, ret);
btrfs_end_transaction(trans, root);
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 1ac3ca98c429..fd38b5053479 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -349,6 +349,11 @@ static int fs_path_ensure_buf(struct fs_path *p, int len)
if (p->buf_len >= len)
return 0;
+ if (len > PATH_MAX) {
+ WARN_ON(1);
+ return -ENOMEM;
+ }
+
path_len = p->end - p->start;
old_buf_len = p->buf_len;
@@ -1663,7 +1668,7 @@ static int get_first_ref(struct btrfs_root *root, u64 ino,
goto out;
}
- if (key.type == BTRFS_INODE_REF_KEY) {
+ if (found_key.type == BTRFS_INODE_REF_KEY) {
struct btrfs_inode_ref *iref;
iref = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_inode_ref);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 5011aadacab8..9601d25a4607 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -385,20 +385,6 @@ static match_table_t tokens = {
{Opt_err, NULL},
};
-#define btrfs_set_and_info(root, opt, fmt, args...) \
-{ \
- if (!btrfs_test_opt(root, opt)) \
- btrfs_info(root->fs_info, fmt, ##args); \
- btrfs_set_opt(root->fs_info->mount_opt, opt); \
-}
-
-#define btrfs_clear_and_info(root, opt, fmt, args...) \
-{ \
- if (btrfs_test_opt(root, opt)) \
- btrfs_info(root->fs_info, fmt, ##args); \
- btrfs_clear_opt(root->fs_info->mount_opt, opt); \
-}
-
/*
* Regular mount options parser. Everything that is needed only when
* reading in a new superblock is parsed here.
@@ -1186,7 +1172,6 @@ static struct dentry *mount_subvol(const char *subvol_name, int flags,
return ERR_PTR(-ENOMEM);
mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name,
newargs);
- kfree(newargs);
if (PTR_RET(mnt) == -EBUSY) {
if (flags & MS_RDONLY) {
@@ -1196,17 +1181,22 @@ static struct dentry *mount_subvol(const char *subvol_name, int flags,
int r;
mnt = vfs_kern_mount(&btrfs_fs_type, flags | MS_RDONLY, device_name,
newargs);
- if (IS_ERR(mnt))
+ if (IS_ERR(mnt)) {
+ kfree(newargs);
return ERR_CAST(mnt);
+ }
r = btrfs_remount(mnt->mnt_sb, &flags, NULL);
if (r < 0) {
/* FIXME: release vfsmount mnt ??*/
+ kfree(newargs);
return ERR_PTR(r);
}
}
}
+ kfree(newargs);
+
if (IS_ERR(mnt))
return ERR_CAST(mnt);
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 2e5e648eb5c3..c561b628ebce 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -3261,7 +3261,7 @@ int ceph_encode_inode_release(void **p, struct inode *inode,
rel->seq = cpu_to_le32(cap->seq);
rel->issue_seq = cpu_to_le32(cap->issue_seq),
rel->mseq = cpu_to_le32(cap->mseq);
- rel->caps = cpu_to_le32(cap->issued);
+ rel->caps = cpu_to_le32(cap->implemented);
rel->wanted = cpu_to_le32(cap->mds_wanted);
rel->dname_len = 0;
rel->dname_seq = 0;
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 766410a12c2c..c29d6ae68874 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -141,7 +141,7 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx,
/* start at beginning? */
if (ctx->pos == 2 || last == NULL ||
- ctx->pos < ceph_dentry(last)->offset) {
+ fpos_cmp(ctx->pos, ceph_dentry(last)->offset) < 0) {
if (list_empty(&parent->d_subdirs))
goto out_unlock;
p = parent->d_subdirs.prev;
@@ -182,9 +182,16 @@ more:
spin_unlock(&dentry->d_lock);
spin_unlock(&parent->d_lock);
+ /* make sure a dentry wasn't dropped while we didn't have parent lock */
+ if (!ceph_dir_is_complete(dir)) {
+ dout(" lost dir complete on %p; falling back to mds\n", dir);
+ dput(dentry);
+ err = -EAGAIN;
+ goto out;
+ }
+
dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, ctx->pos,
dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
- ctx->pos = di->offset;
if (!dir_emit(ctx, dentry->d_name.name,
dentry->d_name.len,
ceph_translate_ino(dentry->d_sb, dentry->d_inode->i_ino),
@@ -198,19 +205,12 @@ more:
return 0;
}
+ ctx->pos = di->offset + 1;
+
if (last)
dput(last);
last = dentry;
- ctx->pos++;
-
- /* make sure a dentry wasn't dropped while we didn't have parent lock */
- if (!ceph_dir_is_complete(dir)) {
- dout(" lost dir complete on %p; falling back to mds\n", dir);
- err = -EAGAIN;
- goto out;
- }
-
spin_lock(&parent->d_lock);
p = p->prev; /* advance to next dentry */
goto more;
@@ -296,6 +296,8 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
err = __dcache_readdir(file, ctx, shared_gen);
if (err != -EAGAIN)
return err;
+ frag = fpos_frag(ctx->pos);
+ off = fpos_off(ctx->pos);
} else {
spin_unlock(&ci->i_ceph_lock);
}
@@ -446,7 +448,6 @@ more:
if (atomic_read(&ci->i_release_count) == fi->dir_release_count) {
dout(" marking %p complete\n", inode);
__ceph_dir_set_complete(ci, fi->dir_release_count);
- ci->i_max_offset = ctx->pos;
}
spin_unlock(&ci->i_ceph_lock);
@@ -935,14 +936,16 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
* to do it here.
*/
- /* d_move screws up d_subdirs order */
- ceph_dir_clear_complete(new_dir);
-
d_move(old_dentry, new_dentry);
/* ensure target dentry is invalidated, despite
rehashing bug in vfs_rename_dir */
ceph_invalidate_dentry_lease(new_dentry);
+
+ /* d_move screws up sibling dentries' offsets */
+ ceph_dir_clear_complete(old_dir);
+ ceph_dir_clear_complete(new_dir);
+
}
ceph_mdsc_put_request(req);
return err;
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 39da1c2efa50..88a6df4cbe6d 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -1221,9 +1221,6 @@ static long ceph_fallocate(struct file *file, int mode,
if (!S_ISREG(inode->i_mode))
return -EOPNOTSUPP;
- if (IS_SWAPFILE(inode))
- return -ETXTBSY;
-
mutex_lock(&inode->i_mutex);
if (ceph_snap(inode) != CEPH_NOSNAP) {
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 0b0728e5be2d..233c6f96910a 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -744,7 +744,6 @@ static int fill_inode(struct inode *inode,
!__ceph_dir_is_complete(ci)) {
dout(" marking %p complete (empty)\n", inode);
__ceph_dir_set_complete(ci, atomic_read(&ci->i_release_count));
- ci->i_max_offset = 2;
}
no_change:
/* only update max_size on auth cap */
@@ -890,41 +889,6 @@ out_unlock:
}
/*
- * Set dentry's directory position based on the current dir's max, and
- * order it in d_subdirs, so that dcache_readdir behaves.
- *
- * Always called under directory's i_mutex.
- */
-static void ceph_set_dentry_offset(struct dentry *dn)
-{
- struct dentry *dir = dn->d_parent;
- struct inode *inode = dir->d_inode;
- struct ceph_inode_info *ci;
- struct ceph_dentry_info *di;
-
- BUG_ON(!inode);
-
- ci = ceph_inode(inode);
- di = ceph_dentry(dn);
-
- spin_lock(&ci->i_ceph_lock);
- if (!__ceph_dir_is_complete(ci)) {
- spin_unlock(&ci->i_ceph_lock);
- return;
- }
- di->offset = ceph_inode(inode)->i_max_offset++;
- spin_unlock(&ci->i_ceph_lock);
-
- spin_lock(&dir->d_lock);
- spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED);
- list_move(&dn->d_u.d_child, &dir->d_subdirs);
- dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset,
- dn->d_u.d_child.prev, dn->d_u.d_child.next);
- spin_unlock(&dn->d_lock);
- spin_unlock(&dir->d_lock);
-}
-
-/*
* splice a dentry to an inode.
* caller must hold directory i_mutex for this to be safe.
*
@@ -933,7 +897,7 @@ static void ceph_set_dentry_offset(struct dentry *dn)
* the caller) if we fail.
*/
static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
- bool *prehash, bool set_offset)
+ bool *prehash)
{
struct dentry *realdn;
@@ -965,8 +929,6 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
}
if ((!prehash || *prehash) && d_unhashed(dn))
d_rehash(dn);
- if (set_offset)
- ceph_set_dentry_offset(dn);
out:
return dn;
}
@@ -987,7 +949,6 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
{
struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
struct inode *in = NULL;
- struct ceph_mds_reply_inode *ininfo;
struct ceph_vino vino;
struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
int err = 0;
@@ -1161,6 +1122,9 @@ retry_lookup:
/* rename? */
if (req->r_old_dentry && req->r_op == CEPH_MDS_OP_RENAME) {
+ struct inode *olddir = req->r_old_dentry_dir;
+ BUG_ON(!olddir);
+
dout(" src %p '%.*s' dst %p '%.*s'\n",
req->r_old_dentry,
req->r_old_dentry->d_name.len,
@@ -1180,13 +1144,10 @@ retry_lookup:
rehashing bug in vfs_rename_dir */
ceph_invalidate_dentry_lease(dn);
- /*
- * d_move() puts the renamed dentry at the end of
- * d_subdirs. We need to assign it an appropriate
- * directory offset so we can behave when dir is
- * complete.
- */
- ceph_set_dentry_offset(req->r_old_dentry);
+ /* d_move screws up sibling dentries' offsets */
+ ceph_dir_clear_complete(dir);
+ ceph_dir_clear_complete(olddir);
+
dout("dn %p gets new offset %lld\n", req->r_old_dentry,
ceph_dentry(req->r_old_dentry)->offset);
@@ -1213,8 +1174,9 @@ retry_lookup:
/* attach proper inode */
if (!dn->d_inode) {
+ ceph_dir_clear_complete(dir);
ihold(in);
- dn = splice_dentry(dn, in, &have_lease, true);
+ dn = splice_dentry(dn, in, &have_lease);
if (IS_ERR(dn)) {
err = PTR_ERR(dn);
goto done;
@@ -1235,17 +1197,16 @@ retry_lookup:
(req->r_op == CEPH_MDS_OP_LOOKUPSNAP ||
req->r_op == CEPH_MDS_OP_MKSNAP)) {
struct dentry *dn = req->r_dentry;
+ struct inode *dir = req->r_locked_dir;
/* fill out a snapdir LOOKUPSNAP dentry */
BUG_ON(!dn);
- BUG_ON(!req->r_locked_dir);
- BUG_ON(ceph_snap(req->r_locked_dir) != CEPH_SNAPDIR);
- ininfo = rinfo->targeti.in;
- vino.ino = le64_to_cpu(ininfo->ino);
- vino.snap = le64_to_cpu(ininfo->snapid);
+ BUG_ON(!dir);
+ BUG_ON(ceph_snap(dir) != CEPH_SNAPDIR);
dout(" linking snapped dir %p to dn %p\n", in, dn);
+ ceph_dir_clear_complete(dir);
ihold(in);
- dn = splice_dentry(dn, in, NULL, true);
+ dn = splice_dentry(dn, in, NULL);
if (IS_ERR(dn)) {
err = PTR_ERR(dn);
goto done;
@@ -1407,7 +1368,7 @@ retry_lookup:
}
if (!dn->d_inode) {
- dn = splice_dentry(dn, in, NULL, false);
+ dn = splice_dentry(dn, in, NULL);
if (IS_ERR(dn)) {
err = PTR_ERR(dn);
dn = NULL;
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index fdf941b44ff1..a822a6e58290 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -109,6 +109,8 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
return PTR_ERR(req);
req->r_inode = inode;
ihold(inode);
+ req->r_num_caps = 1;
+
req->r_inode_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL;
req->r_args.setlayout.layout.fl_stripe_unit =
@@ -153,6 +155,7 @@ static long ceph_ioctl_set_layout_policy (struct file *file, void __user *arg)
return PTR_ERR(req);
req->r_inode = inode;
ihold(inode);
+ req->r_num_caps = 1;
req->r_args.setlayout.layout.fl_stripe_unit =
cpu_to_le32(l.stripe_unit);
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index d94ba0df9f4d..191398852a2e 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -45,6 +45,7 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file,
return PTR_ERR(req);
req->r_inode = inode;
ihold(inode);
+ req->r_num_caps = 1;
/* mds requires start and length rather than start and end */
if (LLONG_MAX == fl->fl_end)
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 7866cd05a6bb..ead05cc1f447 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -266,7 +266,6 @@ struct ceph_inode_info {
struct timespec i_rctime;
u64 i_rbytes, i_rfiles, i_rsubdirs;
u64 i_files, i_subdirs;
- u64 i_max_offset; /* largest readdir offset, set with complete dir */
struct rb_root i_fragtree;
struct mutex i_fragtree_mutex;
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index df9c9141c099..5be1f997ecde 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -253,6 +253,11 @@ cifs_alloc_inode(struct super_block *sb)
cifs_set_oplock_level(cifs_inode, 0);
cifs_inode->delete_pending = false;
cifs_inode->invalid_mapping = false;
+ clear_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, &cifs_inode->flags);
+ clear_bit(CIFS_INODE_PENDING_WRITERS, &cifs_inode->flags);
+ clear_bit(CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, &cifs_inode->flags);
+ spin_lock_init(&cifs_inode->writers_lock);
+ cifs_inode->writers = 0;
cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */
cifs_inode->server_eof = 0;
cifs_inode->uniqueid = 0;
@@ -732,19 +737,26 @@ static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, loff_t pos)
{
struct inode *inode = file_inode(iocb->ki_filp);
+ struct cifsInodeInfo *cinode = CIFS_I(inode);
ssize_t written;
int rc;
+ written = cifs_get_writer(cinode);
+ if (written)
+ return written;
+
written = generic_file_aio_write(iocb, iov, nr_segs, pos);
if (CIFS_CACHE_WRITE(CIFS_I(inode)))
- return written;
+ goto out;
rc = filemap_fdatawrite(inode->i_mapping);
if (rc)
cifs_dbg(FYI, "cifs_file_aio_write: %d rc on %p inode\n",
rc, inode);
+out:
+ cifs_put_writer(cinode);
return written;
}
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index c0f3718b77a8..30f6e9251a4a 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -228,6 +228,8 @@ struct smb_version_operations {
/* verify the message */
int (*check_message)(char *, unsigned int);
bool (*is_oplock_break)(char *, struct TCP_Server_Info *);
+ void (*downgrade_oplock)(struct TCP_Server_Info *,
+ struct cifsInodeInfo *, bool);
/* process transaction2 response */
bool (*check_trans2)(struct mid_q_entry *, struct TCP_Server_Info *,
char *, int);
@@ -1113,6 +1115,12 @@ struct cifsInodeInfo {
unsigned int epoch; /* used to track lease state changes */
bool delete_pending; /* DELETE_ON_CLOSE is set */
bool invalid_mapping; /* pagecache is invalid */
+ unsigned long flags;
+#define CIFS_INODE_PENDING_OPLOCK_BREAK (0) /* oplock break in progress */
+#define CIFS_INODE_PENDING_WRITERS (1) /* Writes in progress */
+#define CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2 (2) /* Downgrade oplock to L2 */
+ spinlock_t writers_lock;
+ unsigned int writers; /* Number of writers on this inode */
unsigned long time; /* jiffies of last update of inode */
u64 server_eof; /* current file size on server -- protected by i_lock */
u64 uniqueid; /* server inode number */
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index acc4ee8ed075..ca7980a1e303 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -127,6 +127,9 @@ extern u64 cifs_UnixTimeToNT(struct timespec);
extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time,
int offset);
extern void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock);
+extern int cifs_get_writer(struct cifsInodeInfo *cinode);
+extern void cifs_put_writer(struct cifsInodeInfo *cinode);
+extern void cifs_done_oplock_break(struct cifsInodeInfo *cinode);
extern int cifs_unlock_range(struct cifsFileInfo *cfile,
struct file_lock *flock, const unsigned int xid);
extern int cifs_push_mandatory_locks(struct cifsFileInfo *cfile);
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index f3264bd7a83d..6ce4e0954b98 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -6197,6 +6197,9 @@ QAllEAsRetry:
cifs_dbg(FYI, "ea length %d\n", list_len);
if (list_len <= 8) {
cifs_dbg(FYI, "empty EA list returned from server\n");
+ /* didn't find the named attribute */
+ if (ea_name)
+ rc = -ENODATA;
goto QAllEAsOut;
}
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 8add25538a3b..5ed03e0b8b40 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -2599,7 +2599,7 @@ cifs_writev(struct kiocb *iocb, const struct iovec *iov,
ssize_t err;
err = generic_write_sync(file, iocb->ki_pos - rc, rc);
- if (rc < 0)
+ if (err < 0)
rc = err;
}
} else {
@@ -2621,12 +2621,20 @@ cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov,
struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
ssize_t written;
+ written = cifs_get_writer(cinode);
+ if (written)
+ return written;
+
if (CIFS_CACHE_WRITE(cinode)) {
if (cap_unix(tcon->ses) &&
(CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability))
- && ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0))
- return generic_file_aio_write(iocb, iov, nr_segs, pos);
- return cifs_writev(iocb, iov, nr_segs, pos);
+ && ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) {
+ written = generic_file_aio_write(
+ iocb, iov, nr_segs, pos);
+ goto out;
+ }
+ written = cifs_writev(iocb, iov, nr_segs, pos);
+ goto out;
}
/*
* For non-oplocked files in strict cache mode we need to write the data
@@ -2646,6 +2654,8 @@ cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov,
inode);
cinode->oplock = 0;
}
+out:
+ cifs_put_writer(cinode);
return written;
}
@@ -2872,7 +2882,7 @@ ssize_t cifs_user_readv(struct kiocb *iocb, const struct iovec *iov,
cifs_uncached_readv_complete);
if (!rdata) {
rc = -ENOMEM;
- goto error;
+ break;
}
rc = cifs_read_allocate_pages(rdata, npages);
@@ -3621,6 +3631,13 @@ static int cifs_launder_page(struct page *page)
return rc;
}
+static int
+cifs_pending_writers_wait(void *unused)
+{
+ schedule();
+ return 0;
+}
+
void cifs_oplock_break(struct work_struct *work)
{
struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo,
@@ -3628,8 +3645,15 @@ void cifs_oplock_break(struct work_struct *work)
struct inode *inode = cfile->dentry->d_inode;
struct cifsInodeInfo *cinode = CIFS_I(inode);
struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
+ struct TCP_Server_Info *server = tcon->ses->server;
int rc = 0;
+ wait_on_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS,
+ cifs_pending_writers_wait, TASK_UNINTERRUPTIBLE);
+
+ server->ops->downgrade_oplock(server, cinode,
+ test_bit(CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, &cinode->flags));
+
if (!CIFS_CACHE_WRITE(cinode) && CIFS_CACHE_READ(cinode) &&
cifs_has_mand_locks(cinode)) {
cifs_dbg(FYI, "Reset oplock to None for inode=%p due to mand locks\n",
@@ -3666,6 +3690,7 @@ void cifs_oplock_break(struct work_struct *work)
cinode);
cifs_dbg(FYI, "Oplock release rc = %d\n", rc);
}
+ cifs_done_oplock_break(cinode);
}
/*
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index aadc2b68678b..a22d667f1069 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1737,6 +1737,9 @@ cifs_inode_needs_reval(struct inode *inode)
if (cifs_i->time == 0)
return true;
+ if (!cifs_sb->actimeo)
+ return true;
+
if (!time_in_range(jiffies, cifs_i->time,
cifs_i->time + cifs_sb->actimeo))
return true;
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 2f9f3790679d..3b0c62e622da 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -466,8 +466,22 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv)
cifs_dbg(FYI, "file id match, oplock break\n");
pCifsInode = CIFS_I(netfile->dentry->d_inode);
- cifs_set_oplock_level(pCifsInode,
- pSMB->OplockLevel ? OPLOCK_READ : 0);
+ set_bit(CIFS_INODE_PENDING_OPLOCK_BREAK,
+ &pCifsInode->flags);
+
+ /*
+ * Set flag if the server downgrades the oplock
+ * to L2 else clear.
+ */
+ if (pSMB->OplockLevel)
+ set_bit(
+ CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2,
+ &pCifsInode->flags);
+ else
+ clear_bit(
+ CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2,
+ &pCifsInode->flags);
+
queue_work(cifsiod_wq,
&netfile->oplock_break);
netfile->oplock_break_cancelled = false;
@@ -551,6 +565,62 @@ void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock)
cinode->oplock = 0;
}
+static int
+cifs_oplock_break_wait(void *unused)
+{
+ schedule();
+ return signal_pending(current) ? -ERESTARTSYS : 0;
+}
+
+/*
+ * We wait for oplock breaks to be processed before we attempt to perform
+ * writes.
+ */
+int cifs_get_writer(struct cifsInodeInfo *cinode)
+{
+ int rc;
+
+start:
+ rc = wait_on_bit(&cinode->flags, CIFS_INODE_PENDING_OPLOCK_BREAK,
+ cifs_oplock_break_wait, TASK_KILLABLE);
+ if (rc)
+ return rc;
+
+ spin_lock(&cinode->writers_lock);
+ if (!cinode->writers)
+ set_bit(CIFS_INODE_PENDING_WRITERS, &cinode->flags);
+ cinode->writers++;
+ /* Check to see if we have started servicing an oplock break */
+ if (test_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, &cinode->flags)) {
+ cinode->writers--;
+ if (cinode->writers == 0) {
+ clear_bit(CIFS_INODE_PENDING_WRITERS, &cinode->flags);
+ wake_up_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS);
+ }
+ spin_unlock(&cinode->writers_lock);
+ goto start;
+ }
+ spin_unlock(&cinode->writers_lock);
+ return 0;
+}
+
+void cifs_put_writer(struct cifsInodeInfo *cinode)
+{
+ spin_lock(&cinode->writers_lock);
+ cinode->writers--;
+ if (cinode->writers == 0) {
+ clear_bit(CIFS_INODE_PENDING_WRITERS, &cinode->flags);
+ wake_up_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS);
+ }
+ spin_unlock(&cinode->writers_lock);
+}
+
+void cifs_done_oplock_break(struct cifsInodeInfo *cinode)
+{
+ clear_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, &cinode->flags);
+ wake_up_bit(&cinode->flags, CIFS_INODE_PENDING_OPLOCK_BREAK);
+}
+
bool
backup_cred(struct cifs_sb_info *cifs_sb)
{
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index 526fb89f9230..d1fdfa848703 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -372,6 +372,16 @@ coalesce_t2(char *second_buf, struct smb_hdr *target_hdr)
return 0;
}
+static void
+cifs_downgrade_oplock(struct TCP_Server_Info *server,
+ struct cifsInodeInfo *cinode, bool set_level2)
+{
+ if (set_level2)
+ cifs_set_oplock_level(cinode, OPLOCK_READ);
+ else
+ cifs_set_oplock_level(cinode, 0);
+}
+
static bool
cifs_check_trans2(struct mid_q_entry *mid, struct TCP_Server_Info *server,
char *buf, int malformed)
@@ -1019,6 +1029,7 @@ struct smb_version_operations smb1_operations = {
.clear_stats = cifs_clear_stats,
.print_stats = cifs_print_stats,
.is_oplock_break = is_valid_oplock_break,
+ .downgrade_oplock = cifs_downgrade_oplock,
.check_trans2 = cifs_check_trans2,
.need_neg = cifs_need_neg,
.negotiate = cifs_negotiate,
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index fb3966265b6e..b8021fde987d 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -575,9 +575,21 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
else
cfile->oplock_break_cancelled = false;
- server->ops->set_oplock_level(cinode,
- rsp->OplockLevel ? SMB2_OPLOCK_LEVEL_II : 0,
- 0, NULL);
+ set_bit(CIFS_INODE_PENDING_OPLOCK_BREAK,
+ &cinode->flags);
+
+ /*
+ * Set flag if the server downgrades the oplock
+ * to L2 else clear.
+ */
+ if (rsp->OplockLevel)
+ set_bit(
+ CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2,
+ &cinode->flags);
+ else
+ clear_bit(
+ CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2,
+ &cinode->flags);
queue_work(cifsiod_wq, &cfile->oplock_break);
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 192f51a12cf1..35ddc3ed119d 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -905,6 +905,17 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon,
}
static void
+smb2_downgrade_oplock(struct TCP_Server_Info *server,
+ struct cifsInodeInfo *cinode, bool set_level2)
+{
+ if (set_level2)
+ server->ops->set_oplock_level(cinode, SMB2_OPLOCK_LEVEL_II,
+ 0, NULL);
+ else
+ server->ops->set_oplock_level(cinode, 0, 0, NULL);
+}
+
+static void
smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock,
unsigned int epoch, bool *purge_cache)
{
@@ -1110,6 +1121,7 @@ struct smb_version_operations smb20_operations = {
.clear_stats = smb2_clear_stats,
.print_stats = smb2_print_stats,
.is_oplock_break = smb2_is_valid_oplock_break,
+ .downgrade_oplock = smb2_downgrade_oplock,
.need_neg = smb2_need_neg,
.negotiate = smb2_negotiate,
.negotiate_wsize = smb2_negotiate_wsize,
@@ -1184,6 +1196,7 @@ struct smb_version_operations smb21_operations = {
.clear_stats = smb2_clear_stats,
.print_stats = smb2_print_stats,
.is_oplock_break = smb2_is_valid_oplock_break,
+ .downgrade_oplock = smb2_downgrade_oplock,
.need_neg = smb2_need_neg,
.negotiate = smb2_negotiate,
.negotiate_wsize = smb2_negotiate_wsize,
@@ -1259,6 +1272,7 @@ struct smb_version_operations smb30_operations = {
.print_stats = smb2_print_stats,
.dump_share_caps = smb2_dump_share_caps,
.is_oplock_break = smb2_is_valid_oplock_break,
+ .downgrade_oplock = smb2_downgrade_oplock,
.need_neg = smb2_need_neg,
.negotiate = smb2_negotiate,
.negotiate_wsize = smb2_negotiate_wsize,
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 860344701067..3802f8c94acc 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -1352,7 +1352,6 @@ SMB2_set_compression(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_fid, u64 volatile_fid)
{
int rc;
- char *res_key = NULL;
struct compress_ioctl fsctl_input;
char *ret_data = NULL;
@@ -1365,7 +1364,6 @@ SMB2_set_compression(const unsigned int xid, struct cifs_tcon *tcon,
2 /* in data len */, &ret_data /* out data */, NULL);
cifs_dbg(FYI, "set compression rc %d\n", rc);
- kfree(res_key);
return rc;
}
diff --git a/fs/compat.c b/fs/compat.c
index ca926ad0430c..66d3d3c6b4b2 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -457,9 +457,9 @@ COMPAT_SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
case F_GETLK64:
case F_SETLK64:
case F_SETLKW64:
- case F_GETLKP:
- case F_SETLKP:
- case F_SETLKPW:
+ case F_OFD_GETLK:
+ case F_OFD_SETLK:
+ case F_OFD_SETLKW:
ret = get_compat_flock64(&f, compat_ptr(arg));
if (ret != 0)
break;
@@ -468,7 +468,7 @@ COMPAT_SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
conv_cmd = convert_fcntl_cmd(cmd);
ret = sys_fcntl(fd, conv_cmd, (unsigned long)&f);
set_fs(old_fs);
- if ((conv_cmd == F_GETLK || conv_cmd == F_GETLKP) && ret == 0) {
+ if ((conv_cmd == F_GETLK || conv_cmd == F_OFD_GETLK) && ret == 0) {
/* need to return lock information - see above for commentary */
if (f.l_start > COMPAT_LOFF_T_MAX)
ret = -EOVERFLOW;
@@ -493,9 +493,9 @@ COMPAT_SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd,
case F_GETLK64:
case F_SETLK64:
case F_SETLKW64:
- case F_GETLKP:
- case F_SETLKP:
- case F_SETLKPW:
+ case F_OFD_GETLK:
+ case F_OFD_SETLK:
+ case F_OFD_SETLKW:
return -EINVAL;
}
return compat_sys_fcntl64(fd, cmd, arg);
diff --git a/fs/coredump.c b/fs/coredump.c
index e3ad709a4232..0b2528fb640e 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -73,10 +73,15 @@ static int expand_corename(struct core_name *cn, int size)
static int cn_vprintf(struct core_name *cn, const char *fmt, va_list arg)
{
int free, need;
+ va_list arg_copy;
again:
free = cn->size - cn->used;
- need = vsnprintf(cn->corename + cn->used, free, fmt, arg);
+
+ va_copy(arg_copy, arg);
+ need = vsnprintf(cn->corename + cn->used, free, fmt, arg_copy);
+ va_end(arg_copy);
+
if (need < free) {
cn->used += need;
return 0;
diff --git a/fs/dcache.c b/fs/dcache.c
index 40707d88a945..be2bea834bf4 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -246,16 +246,8 @@ static void __d_free(struct rcu_head *head)
kmem_cache_free(dentry_cache, dentry);
}
-/*
- * no locks, please.
- */
-static void d_free(struct dentry *dentry)
+static void dentry_free(struct dentry *dentry)
{
- BUG_ON((int)dentry->d_lockref.count > 0);
- this_cpu_dec(nr_dentry);
- if (dentry->d_op && dentry->d_op->d_release)
- dentry->d_op->d_release(dentry);
-
/* if dentry was never visible to RCU, immediate free is OK */
if (!(dentry->d_flags & DCACHE_RCUACCESS))
__d_free(&dentry->d_u.d_rcu);
@@ -403,56 +395,6 @@ static void dentry_lru_add(struct dentry *dentry)
d_lru_add(dentry);
}
-/*
- * Remove a dentry with references from the LRU.
- *
- * If we are on the shrink list, then we can get to try_prune_one_dentry() and
- * lose our last reference through the parent walk. In this case, we need to
- * remove ourselves from the shrink list, not the LRU.
- */
-static void dentry_lru_del(struct dentry *dentry)
-{
- if (dentry->d_flags & DCACHE_LRU_LIST) {
- if (dentry->d_flags & DCACHE_SHRINK_LIST)
- return d_shrink_del(dentry);
- d_lru_del(dentry);
- }
-}
-
-/**
- * d_kill - kill dentry and return parent
- * @dentry: dentry to kill
- * @parent: parent dentry
- *
- * The dentry must already be unhashed and removed from the LRU.
- *
- * If this is the root of the dentry tree, return NULL.
- *
- * dentry->d_lock and parent->d_lock must be held by caller, and are dropped by
- * d_kill.
- */
-static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent)
- __releases(dentry->d_lock)
- __releases(parent->d_lock)
- __releases(dentry->d_inode->i_lock)
-{
- list_del(&dentry->d_u.d_child);
- /*
- * Inform d_walk() that we are no longer attached to the
- * dentry tree
- */
- dentry->d_flags |= DCACHE_DENTRY_KILLED;
- if (parent)
- spin_unlock(&parent->d_lock);
- dentry_iput(dentry);
- /*
- * dentry_iput drops the locks, at which point nobody (except
- * transient RCU lookups) can reach this dentry.
- */
- d_free(dentry);
- return parent;
-}
-
/**
* d_drop - drop a dentry
* @dentry: dentry to drop
@@ -499,37 +441,12 @@ void d_drop(struct dentry *dentry)
}
EXPORT_SYMBOL(d_drop);
-/*
- * Finish off a dentry we've decided to kill.
- * dentry->d_lock must be held, returns with it unlocked.
- * If ref is non-zero, then decrement the refcount too.
- * Returns dentry requiring refcount drop, or NULL if we're done.
- */
-static struct dentry *
-dentry_kill(struct dentry *dentry, int unlock_on_failure)
- __releases(dentry->d_lock)
+static void __dentry_kill(struct dentry *dentry)
{
- struct inode *inode;
- struct dentry *parent;
-
- inode = dentry->d_inode;
- if (inode && !spin_trylock(&inode->i_lock)) {
-relock:
- if (unlock_on_failure) {
- spin_unlock(&dentry->d_lock);
- cpu_relax();
- }
- return dentry; /* try again with same dentry */
- }
- if (IS_ROOT(dentry))
- parent = NULL;
- else
+ struct dentry *parent = NULL;
+ bool can_free = true;
+ if (!IS_ROOT(dentry))
parent = dentry->d_parent;
- if (parent && !spin_trylock(&parent->d_lock)) {
- if (inode)
- spin_unlock(&inode->i_lock);
- goto relock;
- }
/*
* The dentry is now unrecoverably dead to the world.
@@ -543,10 +460,103 @@ relock:
if ((dentry->d_flags & DCACHE_OP_PRUNE) && !d_unhashed(dentry))
dentry->d_op->d_prune(dentry);
- dentry_lru_del(dentry);
+ if (dentry->d_flags & DCACHE_LRU_LIST) {
+ if (!(dentry->d_flags & DCACHE_SHRINK_LIST))
+ d_lru_del(dentry);
+ }
/* if it was on the hash then remove it */
__d_drop(dentry);
- return d_kill(dentry, parent);
+ list_del(&dentry->d_u.d_child);
+ /*
+ * Inform d_walk() that we are no longer attached to the
+ * dentry tree
+ */
+ dentry->d_flags |= DCACHE_DENTRY_KILLED;
+ if (parent)
+ spin_unlock(&parent->d_lock);
+ dentry_iput(dentry);
+ /*
+ * dentry_iput drops the locks, at which point nobody (except
+ * transient RCU lookups) can reach this dentry.
+ */
+ BUG_ON((int)dentry->d_lockref.count > 0);
+ this_cpu_dec(nr_dentry);
+ if (dentry->d_op && dentry->d_op->d_release)
+ dentry->d_op->d_release(dentry);
+
+ spin_lock(&dentry->d_lock);
+ if (dentry->d_flags & DCACHE_SHRINK_LIST) {
+ dentry->d_flags |= DCACHE_MAY_FREE;
+ can_free = false;
+ }
+ spin_unlock(&dentry->d_lock);
+ if (likely(can_free))
+ dentry_free(dentry);
+}
+
+/*
+ * Finish off a dentry we've decided to kill.
+ * dentry->d_lock must be held, returns with it unlocked.
+ * If ref is non-zero, then decrement the refcount too.
+ * Returns dentry requiring refcount drop, or NULL if we're done.
+ */
+static struct dentry *dentry_kill(struct dentry *dentry)
+ __releases(dentry->d_lock)
+{
+ struct inode *inode = dentry->d_inode;
+ struct dentry *parent = NULL;
+
+ if (inode && unlikely(!spin_trylock(&inode->i_lock)))
+ goto failed;
+
+ if (!IS_ROOT(dentry)) {
+ parent = dentry->d_parent;
+ if (unlikely(!spin_trylock(&parent->d_lock))) {
+ if (inode)
+ spin_unlock(&inode->i_lock);
+ goto failed;
+ }
+ }
+
+ __dentry_kill(dentry);
+ return parent;
+
+failed:
+ spin_unlock(&dentry->d_lock);
+ cpu_relax();
+ return dentry; /* try again with same dentry */
+}
+
+static inline struct dentry *lock_parent(struct dentry *dentry)
+{
+ struct dentry *parent = dentry->d_parent;
+ if (IS_ROOT(dentry))
+ return NULL;
+ if (likely(spin_trylock(&parent->d_lock)))
+ return parent;
+ spin_unlock(&dentry->d_lock);
+ rcu_read_lock();
+again:
+ parent = ACCESS_ONCE(dentry->d_parent);
+ spin_lock(&parent->d_lock);
+ /*
+ * We can't blindly lock dentry until we are sure
+ * that we won't violate the locking order.
+ * Any changes of dentry->d_parent must have
+ * been done with parent->d_lock held, so
+ * spin_lock() above is enough of a barrier
+ * for checking if it's still our child.
+ */
+ if (unlikely(parent != dentry->d_parent)) {
+ spin_unlock(&parent->d_lock);
+ goto again;
+ }
+ rcu_read_unlock();
+ if (parent != dentry)
+ spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
+ else
+ parent = NULL;
+ return parent;
}
/*
@@ -602,7 +612,7 @@ repeat:
return;
kill_it:
- dentry = dentry_kill(dentry, 1);
+ dentry = dentry_kill(dentry);
if (dentry)
goto repeat;
}
@@ -815,64 +825,15 @@ restart:
}
EXPORT_SYMBOL(d_prune_aliases);
-/*
- * Try to throw away a dentry - free the inode, dput the parent.
- * Requires dentry->d_lock is held, and dentry->d_count == 0.
- * Releases dentry->d_lock.
- *
- * This may fail if locks cannot be acquired no problem, just try again.
- */
-static struct dentry * try_prune_one_dentry(struct dentry *dentry)
- __releases(dentry->d_lock)
-{
- struct dentry *parent;
-
- parent = dentry_kill(dentry, 0);
- /*
- * If dentry_kill returns NULL, we have nothing more to do.
- * if it returns the same dentry, trylocks failed. In either
- * case, just loop again.
- *
- * Otherwise, we need to prune ancestors too. This is necessary
- * to prevent quadratic behavior of shrink_dcache_parent(), but
- * is also expected to be beneficial in reducing dentry cache
- * fragmentation.
- */
- if (!parent)
- return NULL;
- if (parent == dentry)
- return dentry;
-
- /* Prune ancestors. */
- dentry = parent;
- while (dentry) {
- if (lockref_put_or_lock(&dentry->d_lockref))
- return NULL;
- dentry = dentry_kill(dentry, 1);
- }
- return NULL;
-}
-
static void shrink_dentry_list(struct list_head *list)
{
- struct dentry *dentry;
-
- rcu_read_lock();
- for (;;) {
- dentry = list_entry_rcu(list->prev, struct dentry, d_lru);
- if (&dentry->d_lru == list)
- break; /* empty */
+ struct dentry *dentry, *parent;
- /*
- * Get the dentry lock, and re-verify that the dentry is
- * this on the shrinking list. If it is, we know that
- * DCACHE_SHRINK_LIST and DCACHE_LRU_LIST are set.
- */
+ while (!list_empty(list)) {
+ struct inode *inode;
+ dentry = list_entry(list->prev, struct dentry, d_lru);
spin_lock(&dentry->d_lock);
- if (dentry != list_entry(list->prev, struct dentry, d_lru)) {
- spin_unlock(&dentry->d_lock);
- continue;
- }
+ parent = lock_parent(dentry);
/*
* The dispose list is isolated and dentries are not accounted
@@ -885,30 +846,63 @@ static void shrink_dentry_list(struct list_head *list)
* We found an inuse dentry which was not removed from
* the LRU because of laziness during lookup. Do not free it.
*/
- if (dentry->d_lockref.count) {
+ if ((int)dentry->d_lockref.count > 0) {
spin_unlock(&dentry->d_lock);
+ if (parent)
+ spin_unlock(&parent->d_lock);
continue;
}
- rcu_read_unlock();
- /*
- * If 'try_to_prune()' returns a dentry, it will
- * be the same one we passed in, and d_lock will
- * have been held the whole time, so it will not
- * have been added to any other lists. We failed
- * to get the inode lock.
- *
- * We just add it back to the shrink list.
- */
- dentry = try_prune_one_dentry(dentry);
- rcu_read_lock();
- if (dentry) {
+ if (unlikely(dentry->d_flags & DCACHE_DENTRY_KILLED)) {
+ bool can_free = dentry->d_flags & DCACHE_MAY_FREE;
+ spin_unlock(&dentry->d_lock);
+ if (parent)
+ spin_unlock(&parent->d_lock);
+ if (can_free)
+ dentry_free(dentry);
+ continue;
+ }
+
+ inode = dentry->d_inode;
+ if (inode && unlikely(!spin_trylock(&inode->i_lock))) {
d_shrink_add(dentry, list);
spin_unlock(&dentry->d_lock);
+ if (parent)
+ spin_unlock(&parent->d_lock);
+ continue;
+ }
+
+ __dentry_kill(dentry);
+
+ /*
+ * We need to prune ancestors too. This is necessary to prevent
+ * quadratic behavior of shrink_dcache_parent(), but is also
+ * expected to be beneficial in reducing dentry cache
+ * fragmentation.
+ */
+ dentry = parent;
+ while (dentry && !lockref_put_or_lock(&dentry->d_lockref)) {
+ parent = lock_parent(dentry);
+ if (dentry->d_lockref.count != 1) {
+ dentry->d_lockref.count--;
+ spin_unlock(&dentry->d_lock);
+ if (parent)
+ spin_unlock(&parent->d_lock);
+ break;
+ }
+ inode = dentry->d_inode; /* can't be NULL */
+ if (unlikely(!spin_trylock(&inode->i_lock))) {
+ spin_unlock(&dentry->d_lock);
+ if (parent)
+ spin_unlock(&parent->d_lock);
+ cpu_relax();
+ continue;
+ }
+ __dentry_kill(dentry);
+ dentry = parent;
}
}
- rcu_read_unlock();
}
static enum lru_status
@@ -1261,34 +1255,23 @@ static enum d_walk_ret select_collect(void *_data, struct dentry *dentry)
if (data->start == dentry)
goto out;
- /*
- * move only zero ref count dentries to the dispose list.
- *
- * Those which are presently on the shrink list, being processed
- * by shrink_dentry_list(), shouldn't be moved. Otherwise the
- * loop in shrink_dcache_parent() might not make any progress
- * and loop forever.
- */
- if (dentry->d_lockref.count) {
- dentry_lru_del(dentry);
- } else if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) {
- /*
- * We can't use d_lru_shrink_move() because we
- * need to get the global LRU lock and do the
- * LRU accounting.
- */
- d_lru_del(dentry);
- d_shrink_add(dentry, &data->dispose);
+ if (dentry->d_flags & DCACHE_SHRINK_LIST) {
data->found++;
- ret = D_WALK_NORETRY;
+ } else {
+ if (dentry->d_flags & DCACHE_LRU_LIST)
+ d_lru_del(dentry);
+ if (!dentry->d_lockref.count) {
+ d_shrink_add(dentry, &data->dispose);
+ data->found++;
+ }
}
/*
* We can return to the caller if we have found some (this
* ensures forward progress). We'll be coming back to find
* the rest.
*/
- if (data->found && need_resched())
- ret = D_WALK_QUIT;
+ if (!list_empty(&data->dispose))
+ ret = need_resched() ? D_WALK_QUIT : D_WALK_NORETRY;
out:
return ret;
}
@@ -1318,45 +1301,35 @@ void shrink_dcache_parent(struct dentry *parent)
}
EXPORT_SYMBOL(shrink_dcache_parent);
-static enum d_walk_ret umount_collect(void *_data, struct dentry *dentry)
+static enum d_walk_ret umount_check(void *_data, struct dentry *dentry)
{
- struct select_data *data = _data;
- enum d_walk_ret ret = D_WALK_CONTINUE;
+ /* it has busy descendents; complain about those instead */
+ if (!list_empty(&dentry->d_subdirs))
+ return D_WALK_CONTINUE;
- if (dentry->d_lockref.count) {
- dentry_lru_del(dentry);
- if (likely(!list_empty(&dentry->d_subdirs)))
- goto out;
- if (dentry == data->start && dentry->d_lockref.count == 1)
- goto out;
- printk(KERN_ERR
- "BUG: Dentry %p{i=%lx,n=%s}"
- " still in use (%d)"
- " [unmount of %s %s]\n",
+ /* root with refcount 1 is fine */
+ if (dentry == _data && dentry->d_lockref.count == 1)
+ return D_WALK_CONTINUE;
+
+ printk(KERN_ERR "BUG: Dentry %p{i=%lx,n=%pd} "
+ " still in use (%d) [unmount of %s %s]\n",
dentry,
dentry->d_inode ?
dentry->d_inode->i_ino : 0UL,
- dentry->d_name.name,
+ dentry,
dentry->d_lockref.count,
dentry->d_sb->s_type->name,
dentry->d_sb->s_id);
- BUG();
- } else if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) {
- /*
- * We can't use d_lru_shrink_move() because we
- * need to get the global LRU lock and do the
- * LRU accounting.
- */
- if (dentry->d_flags & DCACHE_LRU_LIST)
- d_lru_del(dentry);
- d_shrink_add(dentry, &data->dispose);
- data->found++;
- ret = D_WALK_NORETRY;
- }
-out:
- if (data->found && need_resched())
- ret = D_WALK_QUIT;
- return ret;
+ WARN_ON(1);
+ return D_WALK_CONTINUE;
+}
+
+static void do_one_tree(struct dentry *dentry)
+{
+ shrink_dcache_parent(dentry);
+ d_walk(dentry, dentry, umount_check, NULL);
+ d_drop(dentry);
+ dput(dentry);
}
/*
@@ -1366,40 +1339,15 @@ void shrink_dcache_for_umount(struct super_block *sb)
{
struct dentry *dentry;
- if (down_read_trylock(&sb->s_umount))
- BUG();
+ WARN(down_read_trylock(&sb->s_umount), "s_umount should've been locked");
dentry = sb->s_root;
sb->s_root = NULL;
- for (;;) {
- struct select_data data;
-
- INIT_LIST_HEAD(&data.dispose);
- data.start = dentry;
- data.found = 0;
-
- d_walk(dentry, &data, umount_collect, NULL);
- if (!data.found)
- break;
-
- shrink_dentry_list(&data.dispose);
- cond_resched();
- }
- d_drop(dentry);
- dput(dentry);
+ do_one_tree(dentry);
while (!hlist_bl_empty(&sb->s_anon)) {
- struct select_data data;
- dentry = hlist_bl_entry(hlist_bl_first(&sb->s_anon), struct dentry, d_hash);
-
- INIT_LIST_HEAD(&data.dispose);
- data.start = NULL;
- data.found = 0;
-
- d_walk(dentry, &data, umount_collect, NULL);
- if (data.found)
- shrink_dentry_list(&data.dispose);
- cond_resched();
+ dentry = dget(hlist_bl_entry(hlist_bl_first(&sb->s_anon), struct dentry, d_hash));
+ do_one_tree(dentry);
}
}
@@ -1647,8 +1595,7 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode)
unsigned add_flags = d_flags_for_inode(inode);
spin_lock(&dentry->d_lock);
- dentry->d_flags &= ~DCACHE_ENTRY_TYPE;
- dentry->d_flags |= add_flags;
+ __d_set_type(dentry, add_flags);
if (inode)
hlist_add_head(&dentry->d_alias, &inode->i_dentry);
dentry->d_inode = inode;
diff --git a/fs/exec.c b/fs/exec.c
index 476f3ebf437e..238b7aa26f68 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -657,10 +657,10 @@ int setup_arg_pages(struct linux_binprm *bprm,
unsigned long rlim_stack;
#ifdef CONFIG_STACK_GROWSUP
- /* Limit stack size to 1GB */
+ /* Limit stack size */
stack_base = rlimit_max(RLIMIT_STACK);
- if (stack_base > (1 << 30))
- stack_base = 1 << 30;
+ if (stack_base > STACK_SIZE_MAX)
+ stack_base = STACK_SIZE_MAX;
/* Make sure we didn't let the argument array grow too large. */
if (vma->vm_end - vma->vm_start > stack_base)
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 6ea7b1436bbc..5c56785007e0 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -667,7 +667,7 @@ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb)
continue;
x = ext4_count_free(bitmap_bh->b_data,
- EXT4_BLOCKS_PER_GROUP(sb) / 8);
+ EXT4_CLUSTERS_PER_GROUP(sb) / 8);
printk(KERN_DEBUG "group %u: stored = %d, counted = %u\n",
i, ext4_free_group_clusters(sb, gdp), x);
bitmap_count += x;
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index f1c65dc7cc0a..66946aa62127 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2466,23 +2466,6 @@ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
up_write(&EXT4_I(inode)->i_data_sem);
}
-/*
- * Update i_disksize after writeback has been started. Races with truncate
- * are avoided by checking i_size under i_data_sem.
- */
-static inline void ext4_wb_update_i_disksize(struct inode *inode, loff_t newsize)
-{
- loff_t i_size;
-
- down_write(&EXT4_I(inode)->i_data_sem);
- i_size = i_size_read(inode);
- if (newsize > i_size)
- newsize = i_size;
- if (newsize > EXT4_I(inode)->i_disksize)
- EXT4_I(inode)->i_disksize = newsize;
- up_write(&EXT4_I(inode)->i_data_sem);
-}
-
struct ext4_group_info {
unsigned long bb_state;
struct rb_root bb_free_root;
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 82df3ce9874a..01b0c208f625 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3313,6 +3313,11 @@ static int ext4_split_extent(handle_t *handle,
return PTR_ERR(path);
depth = ext_depth(inode);
ex = path[depth].p_ext;
+ if (!ex) {
+ EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
+ (unsigned long) map->m_lblk);
+ return -EIO;
+ }
uninitialized = ext4_ext_is_uninitialized(ex);
split_flag1 = 0;
@@ -3694,6 +3699,12 @@ static int ext4_convert_initialized_extents(handle_t *handle,
}
depth = ext_depth(inode);
ex = path[depth].p_ext;
+ if (!ex) {
+ EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
+ (unsigned long) map->m_lblk);
+ err = -EIO;
+ goto out;
+ }
}
err = ext4_ext_get_access(handle, inode, path + depth);
@@ -4730,6 +4741,9 @@ static long ext4_zero_range(struct file *file, loff_t offset,
trace_ext4_zero_range(inode, offset, len, mode);
+ if (!S_ISREG(inode->i_mode))
+ return -EINVAL;
+
/*
* Write out all dirty pages to avoid race conditions
* Then release them.
@@ -4878,9 +4892,6 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
if (mode & FALLOC_FL_PUNCH_HOLE)
return ext4_punch_hole(inode, offset, len);
- if (mode & FALLOC_FL_COLLAPSE_RANGE)
- return ext4_collapse_range(inode, offset, len);
-
ret = ext4_convert_inline_data(inode);
if (ret)
return ret;
@@ -4892,6 +4903,9 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
return -EOPNOTSUPP;
+ if (mode & FALLOC_FL_COLLAPSE_RANGE)
+ return ext4_collapse_range(inode, offset, len);
+
if (mode & FALLOC_FL_ZERO_RANGE)
return ext4_zero_range(file, offset, len, mode);
@@ -5229,18 +5243,19 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr))
update = 1;
- *start = ex_last->ee_block +
+ *start = le32_to_cpu(ex_last->ee_block) +
ext4_ext_get_actual_len(ex_last);
while (ex_start <= ex_last) {
- ex_start->ee_block -= shift;
- if (ex_start >
- EXT_FIRST_EXTENT(path[depth].p_hdr)) {
- if (ext4_ext_try_to_merge_right(inode,
- path, ex_start - 1))
- ex_last--;
- }
- ex_start++;
+ le32_add_cpu(&ex_start->ee_block, -shift);
+ /* Try to merge to the left. */
+ if ((ex_start >
+ EXT_FIRST_EXTENT(path[depth].p_hdr)) &&
+ ext4_ext_try_to_merge_right(inode,
+ path, ex_start - 1))
+ ex_last--;
+ else
+ ex_start++;
}
err = ext4_ext_dirty(handle, inode, path + depth);
if (err)
@@ -5255,7 +5270,7 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
if (err)
goto out;
- path[depth].p_idx->ei_block -= shift;
+ le32_add_cpu(&path[depth].p_idx->ei_block, -shift);
err = ext4_ext_dirty(handle, inode, path + depth);
if (err)
goto out;
@@ -5300,7 +5315,8 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
return ret;
}
- stop_block = extent->ee_block + ext4_ext_get_actual_len(extent);
+ stop_block = le32_to_cpu(extent->ee_block) +
+ ext4_ext_get_actual_len(extent);
ext4_ext_drop_refs(path);
kfree(path);
@@ -5313,10 +5329,18 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
* enough to accomodate the shift.
*/
path = ext4_ext_find_extent(inode, start - 1, NULL, 0);
+ if (IS_ERR(path))
+ return PTR_ERR(path);
depth = path->p_depth;
extent = path[depth].p_ext;
- ex_start = extent->ee_block;
- ex_end = extent->ee_block + ext4_ext_get_actual_len(extent);
+ if (extent) {
+ ex_start = le32_to_cpu(extent->ee_block);
+ ex_end = le32_to_cpu(extent->ee_block) +
+ ext4_ext_get_actual_len(extent);
+ } else {
+ ex_start = 0;
+ ex_end = 0;
+ }
ext4_ext_drop_refs(path);
kfree(path);
@@ -5331,7 +5355,13 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
return PTR_ERR(path);
depth = path->p_depth;
extent = path[depth].p_ext;
- current_block = extent->ee_block;
+ if (!extent) {
+ EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
+ (unsigned long) start);
+ return -EIO;
+ }
+
+ current_block = le32_to_cpu(extent->ee_block);
if (start > current_block) {
/* Hole, move to the next extent */
ret = mext_next_extent(inode, path, &extent);
@@ -5365,17 +5395,18 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
ext4_lblk_t punch_start, punch_stop;
handle_t *handle;
unsigned int credits;
- loff_t new_size;
+ loff_t new_size, ioffset;
int ret;
- BUG_ON(offset + len > i_size_read(inode));
-
/* Collapse range works only on fs block size aligned offsets. */
if (offset & (EXT4_BLOCK_SIZE(sb) - 1) ||
len & (EXT4_BLOCK_SIZE(sb) - 1))
return -EINVAL;
if (!S_ISREG(inode->i_mode))
+ return -EINVAL;
+
+ if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1)
return -EOPNOTSUPP;
trace_ext4_collapse_range(inode, offset, len);
@@ -5383,22 +5414,34 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb);
punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb);
+ /* Call ext4_force_commit to flush all data in case of data=journal. */
+ if (ext4_should_journal_data(inode)) {
+ ret = ext4_force_commit(inode->i_sb);
+ if (ret)
+ return ret;
+ }
+
+ /*
+ * Need to round down offset to be aligned with page size boundary
+ * for page size > block size.
+ */
+ ioffset = round_down(offset, PAGE_SIZE);
+
/* Write out all dirty pages */
- ret = filemap_write_and_wait_range(inode->i_mapping, offset, -1);
+ ret = filemap_write_and_wait_range(inode->i_mapping, ioffset,
+ LLONG_MAX);
if (ret)
return ret;
/* Take mutex lock */
mutex_lock(&inode->i_mutex);
- /* It's not possible punch hole on append only file */
- if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) {
- ret = -EPERM;
- goto out_mutex;
- }
-
- if (IS_SWAPFILE(inode)) {
- ret = -ETXTBSY;
+ /*
+ * There is no need to overlap collapse range with EOF, in which case
+ * it is effectively a truncate operation
+ */
+ if (offset + len >= i_size_read(inode)) {
+ ret = -EINVAL;
goto out_mutex;
}
@@ -5408,7 +5451,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
goto out_mutex;
}
- truncate_pagecache_range(inode, offset, -1);
+ truncate_pagecache(inode, ioffset);
/* Wait for existing dio to complete */
ext4_inode_block_unlocked_dio(inode);
@@ -5425,7 +5468,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
ext4_discard_preallocations(inode);
ret = ext4_es_remove_extent(inode, punch_start,
- EXT_MAX_BLOCKS - punch_start - 1);
+ EXT_MAX_BLOCKS - punch_start);
if (ret) {
up_write(&EXT4_I(inode)->i_data_sem);
goto out_stop;
@@ -5436,6 +5479,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
up_write(&EXT4_I(inode)->i_data_sem);
goto out_stop;
}
+ ext4_discard_preallocations(inode);
ret = ext4_ext_shift_extents(inode, handle, punch_stop,
punch_stop - punch_start);
@@ -5445,10 +5489,9 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
}
new_size = i_size_read(inode) - len;
- truncate_setsize(inode, new_size);
+ i_size_write(inode, new_size);
EXT4_I(inode)->i_disksize = new_size;
- ext4_discard_preallocations(inode);
up_write(&EXT4_I(inode)->i_data_sem);
if (IS_SYNC(inode))
ext4_handle_sync(handle);
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 0a014a7194b2..0ebc21204b51 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -810,7 +810,7 @@ retry:
newes.es_lblk = end + 1;
newes.es_len = len2;
- block = 0x7FDEADBEEF;
+ block = 0x7FDEADBEEFULL;
if (ext4_es_is_written(&orig_es) ||
ext4_es_is_unwritten(&orig_es))
block = ext4_es_pblock(&orig_es) +
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index ca7502d89fde..063fc1538355 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -82,7 +82,7 @@ ext4_unaligned_aio(struct inode *inode, const struct iovec *iov,
size_t count = iov_length(iov, nr_segs);
loff_t final_size = pos + count;
- if (pos >= inode->i_size)
+ if (pos >= i_size_read(inode))
return 0;
if ((pos & blockmask) || (final_size & blockmask))
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 5b0d2c7d5408..d7b7462a0e13 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -522,6 +522,10 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
if (unlikely(map->m_len > INT_MAX))
map->m_len = INT_MAX;
+ /* We can handle the block number less than EXT_MAX_BLOCKS */
+ if (unlikely(map->m_lblk >= EXT_MAX_BLOCKS))
+ return -EIO;
+
/* Lookup extent status tree firstly */
if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
ext4_es_lru_add(inode);
@@ -2243,13 +2247,23 @@ static int mpage_map_and_submit_extent(handle_t *handle,
return err;
} while (map->m_len);
- /* Update on-disk size after IO is submitted */
+ /*
+ * Update on-disk size after IO is submitted. Races with
+ * truncate are avoided by checking i_size under i_data_sem.
+ */
disksize = ((loff_t)mpd->first_page) << PAGE_CACHE_SHIFT;
if (disksize > EXT4_I(inode)->i_disksize) {
int err2;
-
- ext4_wb_update_i_disksize(inode, disksize);
+ loff_t i_size;
+
+ down_write(&EXT4_I(inode)->i_data_sem);
+ i_size = i_size_read(inode);
+ if (disksize > i_size)
+ disksize = i_size;
+ if (disksize > EXT4_I(inode)->i_disksize)
+ EXT4_I(inode)->i_disksize = disksize;
err2 = ext4_mark_inode_dirty(handle, inode);
+ up_write(&EXT4_I(inode)->i_data_sem);
if (err2)
ext4_error(inode->i_sb,
"Failed to mark inode %lu dirty",
@@ -3527,15 +3541,6 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
}
mutex_lock(&inode->i_mutex);
- /* It's not possible punch hole on append only file */
- if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) {
- ret = -EPERM;
- goto out_mutex;
- }
- if (IS_SWAPFILE(inode)) {
- ret = -ETXTBSY;
- goto out_mutex;
- }
/* No need to punch hole beyond i_size */
if (offset >= inode->i_size)
@@ -3616,7 +3621,6 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
ret = ext4_free_hole_blocks(handle, inode, first_block,
stop_block);
- ext4_discard_preallocations(inode);
up_write(&EXT4_I(inode)->i_data_sem);
if (IS_SYNC(inode))
ext4_handle_sync(handle);
@@ -4423,21 +4427,20 @@ out_brelse:
*
* We are called from a few places:
*
- * - Within generic_file_write() for O_SYNC files.
+ * - Within generic_file_aio_write() -> generic_write_sync() for O_SYNC files.
* Here, there will be no transaction running. We wait for any running
* transaction to commit.
*
- * - Within sys_sync(), kupdate and such.
- * We wait on commit, if tol to.
+ * - Within flush work (sys_sync(), kupdate and such).
+ * We wait on commit, if told to.
*
- * - Within prune_icache() (PF_MEMALLOC == true)
- * Here we simply return. We can't afford to block kswapd on the
- * journal commit.
+ * - Within iput_final() -> write_inode_now()
+ * We wait on commit, if told to.
*
* In all cases it is actually safe for us to return without doing anything,
* because the inode has been copied into a raw inode buffer in
- * ext4_mark_inode_dirty(). This is a correctness thing for O_SYNC and for
- * knfsd.
+ * ext4_mark_inode_dirty(). This is a correctness thing for WB_SYNC_ALL
+ * writeback.
*
* Note that we are absolutely dependent upon all inode dirtiers doing the
* right thing: they *must* call mark_inode_dirty() after dirtying info in
@@ -4449,15 +4452,15 @@ out_brelse:
* stuff();
* inode->i_size = expr;
*
- * is in error because a kswapd-driven write_inode() could occur while
- * `stuff()' is running, and the new i_size will be lost. Plus the inode
- * will no longer be on the superblock's dirty inode list.
+ * is in error because write_inode() could occur while `stuff()' is running,
+ * and the new i_size will be lost. Plus the inode will no longer be on the
+ * superblock's dirty inode list.
*/
int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
{
int err;
- if (current->flags & PF_MEMALLOC)
+ if (WARN_ON_ONCE(current->flags & PF_MEMALLOC))
return 0;
if (EXT4_SB(inode->i_sb)->s_journal) {
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index a888cac76e9c..c8238a26818c 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -989,7 +989,7 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
poff = block % blocks_per_page;
page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
if (!page)
- return -EIO;
+ return -ENOMEM;
BUG_ON(page->mapping != inode->i_mapping);
e4b->bd_bitmap_page = page;
e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
@@ -1003,7 +1003,7 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
pnum = block / blocks_per_page;
page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
if (!page)
- return -EIO;
+ return -ENOMEM;
BUG_ON(page->mapping != inode->i_mapping);
e4b->bd_buddy_page = page;
return 0;
@@ -1168,7 +1168,11 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
unlock_page(page);
}
}
- if (page == NULL || !PageUptodate(page)) {
+ if (page == NULL) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ if (!PageUptodate(page)) {
ret = -EIO;
goto err;
}
@@ -1197,7 +1201,11 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
unlock_page(page);
}
}
- if (page == NULL || !PageUptodate(page)) {
+ if (page == NULL) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ if (!PageUptodate(page)) {
ret = -EIO;
goto err;
}
@@ -5008,6 +5016,8 @@ error_return:
*/
static int ext4_trim_extent(struct super_block *sb, int start, int count,
ext4_group_t group, struct ext4_buddy *e4b)
+__releases(bitlock)
+__acquires(bitlock)
{
struct ext4_free_extent ex;
int ret = 0;
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index ab95508e3d40..c18d95b50540 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -308,13 +308,14 @@ static void ext4_end_bio(struct bio *bio, int error)
if (error) {
struct inode *inode = io_end->inode;
- ext4_warning(inode->i_sb, "I/O error writing to inode %lu "
+ ext4_warning(inode->i_sb, "I/O error %d writing to inode %lu "
"(offset %llu size %ld starting block %llu)",
- inode->i_ino,
+ error, inode->i_ino,
(unsigned long long) io_end->offset,
(long) io_end->size,
(unsigned long long)
bi_sector >> (inode->i_blkbits - 9));
+ mapping_set_error(inode->i_mapping, error);
}
if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index f3c667091618..6f9e6fadac04 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3869,19 +3869,38 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
goto failed_mount2;
}
}
+
+ /*
+ * set up enough so that it can read an inode,
+ * and create new inode for buddy allocator
+ */
+ sbi->s_gdb_count = db_count;
+ if (!test_opt(sb, NOLOAD) &&
+ EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL))
+ sb->s_op = &ext4_sops;
+ else
+ sb->s_op = &ext4_nojournal_sops;
+
+ ext4_ext_init(sb);
+ err = ext4_mb_init(sb);
+ if (err) {
+ ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)",
+ err);
+ goto failed_mount2;
+ }
+
if (!ext4_check_descriptors(sb, &first_not_zeroed)) {
ext4_msg(sb, KERN_ERR, "group descriptors corrupted!");
- goto failed_mount2;
+ goto failed_mount2a;
}
if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
if (!ext4_fill_flex_info(sb)) {
ext4_msg(sb, KERN_ERR,
"unable to initialize "
"flex_bg meta info!");
- goto failed_mount2;
+ goto failed_mount2a;
}
- sbi->s_gdb_count = db_count;
get_random_bytes(&sbi->s_next_generation, sizeof(u32));
spin_lock_init(&sbi->s_next_gen_lock);
@@ -3916,14 +3935,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_stripe = ext4_get_stripe_size(sbi);
sbi->s_extent_max_zeroout_kb = 32;
- /*
- * set up enough so that it can read an inode
- */
- if (!test_opt(sb, NOLOAD) &&
- EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL))
- sb->s_op = &ext4_sops;
- else
- sb->s_op = &ext4_nojournal_sops;
sb->s_export_op = &ext4_export_ops;
sb->s_xattr = ext4_xattr_handlers;
#ifdef CONFIG_QUOTA
@@ -4113,21 +4124,13 @@ no_journal:
if (err) {
ext4_msg(sb, KERN_ERR, "failed to reserve %llu clusters for "
"reserved pool", ext4_calculate_resv_clusters(sb));
- goto failed_mount4a;
+ goto failed_mount5;
}
err = ext4_setup_system_zone(sb);
if (err) {
ext4_msg(sb, KERN_ERR, "failed to initialize system "
"zone (%d)", err);
- goto failed_mount4a;
- }
-
- ext4_ext_init(sb);
- err = ext4_mb_init(sb);
- if (err) {
- ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)",
- err);
goto failed_mount5;
}
@@ -4204,11 +4207,8 @@ failed_mount8:
failed_mount7:
ext4_unregister_li_request(sb);
failed_mount6:
- ext4_mb_release(sb);
-failed_mount5:
- ext4_ext_release(sb);
ext4_release_system_zone(sb);
-failed_mount4a:
+failed_mount5:
dput(sb->s_root);
sb->s_root = NULL;
failed_mount4:
@@ -4232,11 +4232,14 @@ failed_mount3:
percpu_counter_destroy(&sbi->s_extent_cache_cnt);
if (sbi->s_mmp_tsk)
kthread_stop(sbi->s_mmp_tsk);
+failed_mount2a:
+ ext4_mb_release(sb);
failed_mount2:
for (i = 0; i < db_count; i++)
brelse(sbi->s_group_desc[i]);
ext4_kvfree(sbi->s_group_desc);
failed_mount:
+ ext4_ext_release(sb);
if (sbi->s_chksum_driver)
crypto_free_shash(sbi->s_chksum_driver);
if (sbi->s_proc) {
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 1f5cf5880718..4eec399ec807 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -520,8 +520,8 @@ static void ext4_xattr_update_super_block(handle_t *handle,
}
/*
- * Release the xattr block BH: If the reference count is > 1, decrement
- * it; otherwise free the block.
+ * Release the xattr block BH: If the reference count is > 1, decrement it;
+ * otherwise free the block.
*/
static void
ext4_xattr_release_block(handle_t *handle, struct inode *inode,
@@ -542,16 +542,31 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
if (ce)
mb_cache_entry_free(ce);
get_bh(bh);
+ unlock_buffer(bh);
ext4_free_blocks(handle, inode, bh, 0, 1,
EXT4_FREE_BLOCKS_METADATA |
EXT4_FREE_BLOCKS_FORGET);
- unlock_buffer(bh);
} else {
le32_add_cpu(&BHDR(bh)->h_refcount, -1);
if (ce)
mb_cache_entry_release(ce);
+ /*
+ * Beware of this ugliness: Releasing of xattr block references
+ * from different inodes can race and so we have to protect
+ * from a race where someone else frees the block (and releases
+ * its journal_head) before we are done dirtying the buffer. In
+ * nojournal mode this race is harmless and we actually cannot
+ * call ext4_handle_dirty_xattr_block() with locked buffer as
+ * that function can call sync_dirty_buffer() so for that case
+ * we handle the dirtying after unlocking the buffer.
+ */
+ if (ext4_handle_valid(handle))
+ error = ext4_handle_dirty_xattr_block(handle, inode,
+ bh);
unlock_buffer(bh);
- error = ext4_handle_dirty_xattr_block(handle, inode, bh);
+ if (!ext4_handle_valid(handle))
+ error = ext4_handle_dirty_xattr_block(handle, inode,
+ bh);
if (IS_SYNC(inode))
ext4_handle_sync(handle);
dquot_free_block(inode, EXT4_C2B(EXT4_SB(inode->i_sb), 1));
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 9ead1596399a..72c82f69b01b 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -274,15 +274,15 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
break;
#if BITS_PER_LONG != 32
/* 32-bit arches must use fcntl64() */
- case F_GETLKP:
+ case F_OFD_GETLK:
#endif
case F_GETLK:
err = fcntl_getlk(filp, cmd, (struct flock __user *) arg);
break;
#if BITS_PER_LONG != 32
/* 32-bit arches must use fcntl64() */
- case F_SETLKP:
- case F_SETLKPW:
+ case F_OFD_SETLK:
+ case F_OFD_SETLKW:
#endif
/* Fallthrough */
case F_SETLK:
@@ -399,13 +399,13 @@ SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
switch (cmd) {
case F_GETLK64:
- case F_GETLKP:
+ case F_OFD_GETLK:
err = fcntl_getlk64(f.file, cmd, (struct flock64 __user *) arg);
break;
case F_SETLK64:
case F_SETLKW64:
- case F_SETLKP:
- case F_SETLKPW:
+ case F_OFD_SETLK:
+ case F_OFD_SETLKW:
err = fcntl_setlk64(fd, f.file, cmd,
(struct flock64 __user *) arg);
break;
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index a0b0855d00a9..205e0d5d5307 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -348,7 +348,7 @@ int __init fuse_ctl_init(void)
return register_filesystem(&fuse_ctl_fs_type);
}
-void fuse_ctl_cleanup(void)
+void __exit fuse_ctl_cleanup(void)
{
unregister_filesystem(&fuse_ctl_fs_type);
}
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 5b4e035b364c..42198359fa1b 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -679,6 +679,14 @@ static int fuse_symlink(struct inode *dir, struct dentry *entry,
return create_new_entry(fc, req, dir, entry, S_IFLNK);
}
+static inline void fuse_update_ctime(struct inode *inode)
+{
+ if (!IS_NOCMTIME(inode)) {
+ inode->i_ctime = current_fs_time(inode->i_sb);
+ mark_inode_dirty_sync(inode);
+ }
+}
+
static int fuse_unlink(struct inode *dir, struct dentry *entry)
{
int err;
@@ -713,6 +721,7 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry)
fuse_invalidate_attr(inode);
fuse_invalidate_attr(dir);
fuse_invalidate_entry_cache(entry);
+ fuse_update_ctime(inode);
} else if (err == -EINTR)
fuse_invalidate_entry(entry);
return err;
@@ -743,23 +752,26 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry)
return err;
}
-static int fuse_rename(struct inode *olddir, struct dentry *oldent,
- struct inode *newdir, struct dentry *newent)
+static int fuse_rename_common(struct inode *olddir, struct dentry *oldent,
+ struct inode *newdir, struct dentry *newent,
+ unsigned int flags, int opcode, size_t argsize)
{
int err;
- struct fuse_rename_in inarg;
+ struct fuse_rename2_in inarg;
struct fuse_conn *fc = get_fuse_conn(olddir);
- struct fuse_req *req = fuse_get_req_nopages(fc);
+ struct fuse_req *req;
+ req = fuse_get_req_nopages(fc);
if (IS_ERR(req))
return PTR_ERR(req);
- memset(&inarg, 0, sizeof(inarg));
+ memset(&inarg, 0, argsize);
inarg.newdir = get_node_id(newdir);
- req->in.h.opcode = FUSE_RENAME;
+ inarg.flags = flags;
+ req->in.h.opcode = opcode;
req->in.h.nodeid = get_node_id(olddir);
req->in.numargs = 3;
- req->in.args[0].size = sizeof(inarg);
+ req->in.args[0].size = argsize;
req->in.args[0].value = &inarg;
req->in.args[1].size = oldent->d_name.len + 1;
req->in.args[1].value = oldent->d_name.name;
@@ -771,15 +783,22 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent,
if (!err) {
/* ctime changes */
fuse_invalidate_attr(oldent->d_inode);
+ fuse_update_ctime(oldent->d_inode);
+
+ if (flags & RENAME_EXCHANGE) {
+ fuse_invalidate_attr(newent->d_inode);
+ fuse_update_ctime(newent->d_inode);
+ }
fuse_invalidate_attr(olddir);
if (olddir != newdir)
fuse_invalidate_attr(newdir);
/* newent will end up negative */
- if (newent->d_inode) {
+ if (!(flags & RENAME_EXCHANGE) && newent->d_inode) {
fuse_invalidate_attr(newent->d_inode);
fuse_invalidate_entry_cache(newent);
+ fuse_update_ctime(newent->d_inode);
}
} else if (err == -EINTR) {
/* If request was interrupted, DEITY only knows if the
@@ -795,6 +814,36 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent,
return err;
}
+static int fuse_rename(struct inode *olddir, struct dentry *oldent,
+ struct inode *newdir, struct dentry *newent)
+{
+ return fuse_rename_common(olddir, oldent, newdir, newent, 0,
+ FUSE_RENAME, sizeof(struct fuse_rename_in));
+}
+
+static int fuse_rename2(struct inode *olddir, struct dentry *oldent,
+ struct inode *newdir, struct dentry *newent,
+ unsigned int flags)
+{
+ struct fuse_conn *fc = get_fuse_conn(olddir);
+ int err;
+
+ if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
+ return -EINVAL;
+
+ if (fc->no_rename2 || fc->minor < 23)
+ return -EINVAL;
+
+ err = fuse_rename_common(olddir, oldent, newdir, newent, flags,
+ FUSE_RENAME2, sizeof(struct fuse_rename2_in));
+ if (err == -ENOSYS) {
+ fc->no_rename2 = 1;
+ err = -EINVAL;
+ }
+ return err;
+
+}
+
static int fuse_link(struct dentry *entry, struct inode *newdir,
struct dentry *newent)
{
@@ -829,6 +878,7 @@ static int fuse_link(struct dentry *entry, struct inode *newdir,
inc_nlink(inode);
spin_unlock(&fc->lock);
fuse_invalidate_attr(inode);
+ fuse_update_ctime(inode);
} else if (err == -EINTR) {
fuse_invalidate_attr(inode);
}
@@ -846,6 +896,8 @@ static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr,
attr->size = i_size_read(inode);
attr->mtime = inode->i_mtime.tv_sec;
attr->mtimensec = inode->i_mtime.tv_nsec;
+ attr->ctime = inode->i_ctime.tv_sec;
+ attr->ctimensec = inode->i_ctime.tv_nsec;
}
stat->dev = inode->i_sb->s_dev;
@@ -1504,7 +1556,7 @@ static bool update_mtime(unsigned ivalid, bool trust_local_mtime)
}
static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg,
- bool trust_local_mtime)
+ bool trust_local_cmtime)
{
unsigned ivalid = iattr->ia_valid;
@@ -1523,13 +1575,18 @@ static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg,
if (!(ivalid & ATTR_ATIME_SET))
arg->valid |= FATTR_ATIME_NOW;
}
- if ((ivalid & ATTR_MTIME) && update_mtime(ivalid, trust_local_mtime)) {
+ if ((ivalid & ATTR_MTIME) && update_mtime(ivalid, trust_local_cmtime)) {
arg->valid |= FATTR_MTIME;
arg->mtime = iattr->ia_mtime.tv_sec;
arg->mtimensec = iattr->ia_mtime.tv_nsec;
- if (!(ivalid & ATTR_MTIME_SET) && !trust_local_mtime)
+ if (!(ivalid & ATTR_MTIME_SET) && !trust_local_cmtime)
arg->valid |= FATTR_MTIME_NOW;
}
+ if ((ivalid & ATTR_CTIME) && trust_local_cmtime) {
+ arg->valid |= FATTR_CTIME;
+ arg->ctime = iattr->ia_ctime.tv_sec;
+ arg->ctimensec = iattr->ia_ctime.tv_nsec;
+ }
}
/*
@@ -1597,39 +1654,38 @@ static void fuse_setattr_fill(struct fuse_conn *fc, struct fuse_req *req,
/*
* Flush inode->i_mtime to the server
*/
-int fuse_flush_mtime(struct file *file, bool nofail)
+int fuse_flush_times(struct inode *inode, struct fuse_file *ff)
{
- struct inode *inode = file->f_mapping->host;
- struct fuse_inode *fi = get_fuse_inode(inode);
struct fuse_conn *fc = get_fuse_conn(inode);
- struct fuse_req *req = NULL;
+ struct fuse_req *req;
struct fuse_setattr_in inarg;
struct fuse_attr_out outarg;
int err;
- if (nofail) {
- req = fuse_get_req_nofail_nopages(fc, file);
- } else {
- req = fuse_get_req_nopages(fc);
- if (IS_ERR(req))
- return PTR_ERR(req);
- }
+ req = fuse_get_req_nopages(fc);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
memset(&inarg, 0, sizeof(inarg));
memset(&outarg, 0, sizeof(outarg));
- inarg.valid |= FATTR_MTIME;
+ inarg.valid = FATTR_MTIME;
inarg.mtime = inode->i_mtime.tv_sec;
inarg.mtimensec = inode->i_mtime.tv_nsec;
-
+ if (fc->minor >= 23) {
+ inarg.valid |= FATTR_CTIME;
+ inarg.ctime = inode->i_ctime.tv_sec;
+ inarg.ctimensec = inode->i_ctime.tv_nsec;
+ }
+ if (ff) {
+ inarg.valid |= FATTR_FH;
+ inarg.fh = ff->fh;
+ }
fuse_setattr_fill(fc, req, inode, &inarg, &outarg);
fuse_request_send(fc, req);
err = req->out.h.error;
fuse_put_request(fc, req);
- if (!err)
- clear_bit(FUSE_I_MTIME_DIRTY, &fi->state);
-
return err;
}
@@ -1653,7 +1709,7 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr,
bool is_wb = fc->writeback_cache;
loff_t oldsize;
int err;
- bool trust_local_mtime = is_wb && S_ISREG(inode->i_mode);
+ bool trust_local_cmtime = is_wb && S_ISREG(inode->i_mode);
if (!(fc->flags & FUSE_DEFAULT_PERMISSIONS))
attr->ia_valid |= ATTR_FORCE;
@@ -1678,11 +1734,13 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr,
if (is_truncate) {
fuse_set_nowrite(inode);
set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
+ if (trust_local_cmtime && attr->ia_size != inode->i_size)
+ attr->ia_valid |= ATTR_MTIME | ATTR_CTIME;
}
memset(&inarg, 0, sizeof(inarg));
memset(&outarg, 0, sizeof(outarg));
- iattr_to_fattr(attr, &inarg, trust_local_mtime);
+ iattr_to_fattr(attr, &inarg, trust_local_cmtime);
if (file) {
struct fuse_file *ff = file->private_data;
inarg.valid |= FATTR_FH;
@@ -1711,9 +1769,12 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr,
spin_lock(&fc->lock);
/* the kernel maintains i_mtime locally */
- if (trust_local_mtime && (attr->ia_valid & ATTR_MTIME)) {
- inode->i_mtime = attr->ia_mtime;
- clear_bit(FUSE_I_MTIME_DIRTY, &fi->state);
+ if (trust_local_cmtime) {
+ if (attr->ia_valid & ATTR_MTIME)
+ inode->i_mtime = attr->ia_mtime;
+ if (attr->ia_valid & ATTR_CTIME)
+ inode->i_ctime = attr->ia_ctime;
+ /* FIXME: clear I_DIRTY_SYNC? */
}
fuse_change_attributes_common(inode, &outarg.attr,
@@ -1810,8 +1871,10 @@ static int fuse_setxattr(struct dentry *entry, const char *name,
fc->no_setxattr = 1;
err = -EOPNOTSUPP;
}
- if (!err)
+ if (!err) {
fuse_invalidate_attr(inode);
+ fuse_update_ctime(inode);
+ }
return err;
}
@@ -1941,20 +2004,11 @@ static int fuse_removexattr(struct dentry *entry, const char *name)
fc->no_removexattr = 1;
err = -EOPNOTSUPP;
}
- if (!err)
+ if (!err) {
fuse_invalidate_attr(inode);
- return err;
-}
-
-static int fuse_update_time(struct inode *inode, struct timespec *now,
- int flags)
-{
- if (flags & S_MTIME) {
- inode->i_mtime = *now;
- set_bit(FUSE_I_MTIME_DIRTY, &get_fuse_inode(inode)->state);
- BUG_ON(!S_ISREG(inode->i_mode));
+ fuse_update_ctime(inode);
}
- return 0;
+ return err;
}
static const struct inode_operations fuse_dir_inode_operations = {
@@ -1964,6 +2018,7 @@ static const struct inode_operations fuse_dir_inode_operations = {
.unlink = fuse_unlink,
.rmdir = fuse_rmdir,
.rename = fuse_rename,
+ .rename2 = fuse_rename2,
.link = fuse_link,
.setattr = fuse_setattr,
.create = fuse_create,
@@ -1996,7 +2051,6 @@ static const struct inode_operations fuse_common_inode_operations = {
.getxattr = fuse_getxattr,
.listxattr = fuse_listxattr,
.removexattr = fuse_removexattr,
- .update_time = fuse_update_time,
};
static const struct inode_operations fuse_symlink_inode_operations = {
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 13f8bdec5110..96d513e01a5d 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -223,6 +223,8 @@ void fuse_finish_open(struct inode *inode, struct file *file)
i_size_write(inode, 0);
spin_unlock(&fc->lock);
fuse_invalidate_attr(inode);
+ if (fc->writeback_cache)
+ file_update_time(file);
}
if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache)
fuse_link_write_file(file);
@@ -232,18 +234,26 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
{
struct fuse_conn *fc = get_fuse_conn(inode);
int err;
+ bool lock_inode = (file->f_flags & O_TRUNC) &&
+ fc->atomic_o_trunc &&
+ fc->writeback_cache;
err = generic_file_open(inode, file);
if (err)
return err;
+ if (lock_inode)
+ mutex_lock(&inode->i_mutex);
+
err = fuse_do_open(fc, get_node_id(inode), file, isdir);
- if (err)
- return err;
- fuse_finish_open(inode, file);
+ if (!err)
+ fuse_finish_open(inode, file);
- return 0;
+ if (lock_inode)
+ mutex_unlock(&inode->i_mutex);
+
+ return err;
}
static void fuse_prepare_release(struct fuse_file *ff, int flags, int opcode)
@@ -314,10 +324,7 @@ static int fuse_release(struct inode *inode, struct file *file)
/* see fuse_vma_close() for !writeback_cache case */
if (fc->writeback_cache)
- filemap_write_and_wait(file->f_mapping);
-
- if (test_bit(FUSE_I_MTIME_DIRTY, &get_fuse_inode(inode)->state))
- fuse_flush_mtime(file, true);
+ write_inode_now(inode, 1);
fuse_release_common(file, FUSE_RELEASE);
@@ -439,7 +446,7 @@ static int fuse_flush(struct file *file, fl_owner_t id)
if (fc->no_flush)
return 0;
- err = filemap_write_and_wait(file->f_mapping);
+ err = write_inode_now(inode, 1);
if (err)
return err;
@@ -480,13 +487,6 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end,
if (is_bad_inode(inode))
return -EIO;
- err = filemap_write_and_wait_range(inode->i_mapping, start, end);
- if (err)
- return err;
-
- if ((!isdir && fc->no_fsync) || (isdir && fc->no_fsyncdir))
- return 0;
-
mutex_lock(&inode->i_mutex);
/*
@@ -494,17 +494,17 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end,
* wait for all outstanding writes, before sending the FSYNC
* request.
*/
- err = write_inode_now(inode, 0);
+ err = filemap_write_and_wait_range(inode->i_mapping, start, end);
if (err)
goto out;
fuse_sync_writes(inode);
+ err = sync_inode_metadata(inode, 1);
+ if (err)
+ goto out;
- if (test_bit(FUSE_I_MTIME_DIRTY, &get_fuse_inode(inode)->state)) {
- int err = fuse_flush_mtime(file, false);
- if (err)
- goto out;
- }
+ if ((!isdir && fc->no_fsync) || (isdir && fc->no_fsyncdir))
+ goto out;
req = fuse_get_req_nopages(fc);
if (IS_ERR(req)) {
@@ -1659,13 +1659,13 @@ static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_req *req)
fuse_writepage_free(fc, req);
}
-static struct fuse_file *fuse_write_file_get(struct fuse_conn *fc,
- struct fuse_inode *fi)
+static struct fuse_file *__fuse_write_file_get(struct fuse_conn *fc,
+ struct fuse_inode *fi)
{
struct fuse_file *ff = NULL;
spin_lock(&fc->lock);
- if (!WARN_ON(list_empty(&fi->write_files))) {
+ if (!list_empty(&fi->write_files)) {
ff = list_entry(fi->write_files.next, struct fuse_file,
write_entry);
fuse_file_get(ff);
@@ -1675,6 +1675,29 @@ static struct fuse_file *fuse_write_file_get(struct fuse_conn *fc,
return ff;
}
+static struct fuse_file *fuse_write_file_get(struct fuse_conn *fc,
+ struct fuse_inode *fi)
+{
+ struct fuse_file *ff = __fuse_write_file_get(fc, fi);
+ WARN_ON(!ff);
+ return ff;
+}
+
+int fuse_write_inode(struct inode *inode, struct writeback_control *wbc)
+{
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ struct fuse_file *ff;
+ int err;
+
+ ff = __fuse_write_file_get(fc, fi);
+ err = fuse_flush_times(inode, ff);
+ if (ff)
+ fuse_file_put(ff, 0);
+
+ return err;
+}
+
static int fuse_writepage_locked(struct page *page)
{
struct address_space *mapping = page->mapping;
@@ -2972,6 +2995,9 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
bool lock_inode = !(mode & FALLOC_FL_KEEP_SIZE) ||
(mode & FALLOC_FL_PUNCH_HOLE);
+ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+ return -EOPNOTSUPP;
+
if (fc->no_fallocate)
return -EOPNOTSUPP;
@@ -3017,12 +3043,8 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
if (!(mode & FALLOC_FL_KEEP_SIZE)) {
bool changed = fuse_write_update_size(inode, offset + length);
- if (changed && fc->writeback_cache) {
- struct fuse_inode *fi = get_fuse_inode(inode);
-
- inode->i_mtime = current_fs_time(inode->i_sb);
- set_bit(FUSE_I_MTIME_DIRTY, &fi->state);
- }
+ if (changed && fc->writeback_cache)
+ file_update_time(file);
}
if (mode & FALLOC_FL_PUNCH_HOLE)
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index a257ed8ebee6..7aa5c75e0de1 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -119,8 +119,6 @@ enum {
FUSE_I_INIT_RDPLUS,
/** An operation changing file size is in progress */
FUSE_I_SIZE_UNSTABLE,
- /** i_mtime has been updated locally; a flush to userspace needed */
- FUSE_I_MTIME_DIRTY,
};
struct fuse_conn;
@@ -544,6 +542,9 @@ struct fuse_conn {
/** Is fallocate not implemented by fs? */
unsigned no_fallocate:1;
+ /** Is rename with flags implemented by fs? */
+ unsigned no_rename2:1;
+
/** Use enhanced/automatic page cache invalidation. */
unsigned auto_inval_data:1;
@@ -725,7 +726,7 @@ int fuse_dev_init(void);
void fuse_dev_cleanup(void);
int fuse_ctl_init(void);
-void fuse_ctl_cleanup(void);
+void __exit fuse_ctl_cleanup(void);
/**
* Allocate a request
@@ -891,7 +892,8 @@ int fuse_dev_release(struct inode *inode, struct file *file);
bool fuse_write_update_size(struct inode *inode, loff_t pos);
-int fuse_flush_mtime(struct file *file, bool nofail);
+int fuse_flush_times(struct inode *inode, struct fuse_file *ff);
+int fuse_write_inode(struct inode *inode, struct writeback_control *wbc);
int fuse_do_setattr(struct inode *inode, struct iattr *attr,
struct file *file);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 8d611696fcad..754dcf23de8a 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -175,9 +175,9 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
if (!fc->writeback_cache || !S_ISREG(inode->i_mode)) {
inode->i_mtime.tv_sec = attr->mtime;
inode->i_mtime.tv_nsec = attr->mtimensec;
+ inode->i_ctime.tv_sec = attr->ctime;
+ inode->i_ctime.tv_nsec = attr->ctimensec;
}
- inode->i_ctime.tv_sec = attr->ctime;
- inode->i_ctime.tv_nsec = attr->ctimensec;
if (attr->blksize != 0)
inode->i_blkbits = ilog2(attr->blksize);
@@ -256,6 +256,8 @@ static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr)
inode->i_size = attr->size;
inode->i_mtime.tv_sec = attr->mtime;
inode->i_mtime.tv_nsec = attr->mtimensec;
+ inode->i_ctime.tv_sec = attr->ctime;
+ inode->i_ctime.tv_nsec = attr->ctimensec;
if (S_ISREG(inode->i_mode)) {
fuse_init_common(inode);
fuse_init_file_inode(inode);
@@ -303,7 +305,7 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
if ((inode->i_state & I_NEW)) {
inode->i_flags |= S_NOATIME;
- if (!fc->writeback_cache || !S_ISREG(inode->i_mode))
+ if (!fc->writeback_cache || !S_ISREG(attr->mode))
inode->i_flags |= S_NOCMTIME;
inode->i_generation = generation;
inode->i_data.backing_dev_info = &fc->bdi;
@@ -788,6 +790,7 @@ static const struct super_operations fuse_super_operations = {
.alloc_inode = fuse_alloc_inode,
.destroy_inode = fuse_destroy_inode,
.evict_inode = fuse_evict_inode,
+ .write_inode = fuse_write_inode,
.drop_inode = generic_delete_inode,
.remount_fs = fuse_remount_fs,
.put_super = fuse_put_super,
@@ -890,6 +893,11 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
fc->async_dio = 1;
if (arg->flags & FUSE_WRITEBACK_CACHE)
fc->writeback_cache = 1;
+ if (arg->time_gran && arg->time_gran <= 1000000000)
+ fc->sb->s_time_gran = arg->time_gran;
+ else
+ fc->sb->s_time_gran = 1000000000;
+
} else {
ra_pages = fc->max_read / PAGE_CACHE_SIZE;
fc->no_lock = 1;
@@ -996,7 +1004,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
if (sb->s_flags & MS_MANDLOCK)
goto err;
- sb->s_flags &= ~MS_NOSEC;
+ sb->s_flags &= ~(MS_NOSEC | MS_I_VERSION);
if (!parse_fuse_opt((char *) data, &d, is_bdev))
goto err;
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 204027520937..e19d4c0cacae 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -1030,6 +1030,11 @@ static int __init init_hugetlbfs_fs(void)
int error;
int i;
+ if (!hugepages_supported()) {
+ pr_info("hugetlbfs: disabling because there are no supported hugepage sizes\n");
+ return -ENOTSUPP;
+ }
+
error = bdi_init(&hugetlbfs_backing_dev_info);
if (error)
return error;
diff --git a/fs/ioprio.c b/fs/ioprio.c
deleted file mode 100644
index e50170ca7c33..000000000000
--- a/fs/ioprio.c
+++ /dev/null
@@ -1,241 +0,0 @@
-/*
- * fs/ioprio.c
- *
- * Copyright (C) 2004 Jens Axboe <axboe@kernel.dk>
- *
- * Helper functions for setting/querying io priorities of processes. The
- * system calls closely mimmick getpriority/setpriority, see the man page for
- * those. The prio argument is a composite of prio class and prio data, where
- * the data argument has meaning within that class. The standard scheduling
- * classes have 8 distinct prio levels, with 0 being the highest prio and 7
- * being the lowest.
- *
- * IOW, setting BE scheduling class with prio 2 is done ala:
- *
- * unsigned int prio = (IOPRIO_CLASS_BE << IOPRIO_CLASS_SHIFT) | 2;
- *
- * ioprio_set(PRIO_PROCESS, pid, prio);
- *
- * See also Documentation/block/ioprio.txt
- *
- */
-#include <linux/gfp.h>
-#include <linux/kernel.h>
-#include <linux/export.h>
-#include <linux/ioprio.h>
-#include <linux/blkdev.h>
-#include <linux/capability.h>
-#include <linux/syscalls.h>
-#include <linux/security.h>
-#include <linux/pid_namespace.h>
-
-int set_task_ioprio(struct task_struct *task, int ioprio)
-{
- int err;
- struct io_context *ioc;
- const struct cred *cred = current_cred(), *tcred;
-
- rcu_read_lock();
- tcred = __task_cred(task);
- if (!uid_eq(tcred->uid, cred->euid) &&
- !uid_eq(tcred->uid, cred->uid) && !capable(CAP_SYS_NICE)) {
- rcu_read_unlock();
- return -EPERM;
- }
- rcu_read_unlock();
-
- err = security_task_setioprio(task, ioprio);
- if (err)
- return err;
-
- ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
- if (ioc) {
- ioc->ioprio = ioprio;
- put_io_context(ioc);
- }
-
- return err;
-}
-EXPORT_SYMBOL_GPL(set_task_ioprio);
-
-SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio)
-{
- int class = IOPRIO_PRIO_CLASS(ioprio);
- int data = IOPRIO_PRIO_DATA(ioprio);
- struct task_struct *p, *g;
- struct user_struct *user;
- struct pid *pgrp;
- kuid_t uid;
- int ret;
-
- switch (class) {
- case IOPRIO_CLASS_RT:
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
- /* fall through, rt has prio field too */
- case IOPRIO_CLASS_BE:
- if (data >= IOPRIO_BE_NR || data < 0)
- return -EINVAL;
-
- break;
- case IOPRIO_CLASS_IDLE:
- break;
- case IOPRIO_CLASS_NONE:
- if (data)
- return -EINVAL;
- break;
- default:
- return -EINVAL;
- }
-
- ret = -ESRCH;
- rcu_read_lock();
- switch (which) {
- case IOPRIO_WHO_PROCESS:
- if (!who)
- p = current;
- else
- p = find_task_by_vpid(who);
- if (p)
- ret = set_task_ioprio(p, ioprio);
- break;
- case IOPRIO_WHO_PGRP:
- if (!who)
- pgrp = task_pgrp(current);
- else
- pgrp = find_vpid(who);
- do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
- ret = set_task_ioprio(p, ioprio);
- if (ret)
- break;
- } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
- break;
- case IOPRIO_WHO_USER:
- uid = make_kuid(current_user_ns(), who);
- if (!uid_valid(uid))
- break;
- if (!who)
- user = current_user();
- else
- user = find_user(uid);
-
- if (!user)
- break;
-
- do_each_thread(g, p) {
- if (!uid_eq(task_uid(p), uid))
- continue;
- ret = set_task_ioprio(p, ioprio);
- if (ret)
- goto free_uid;
- } while_each_thread(g, p);
-free_uid:
- if (who)
- free_uid(user);
- break;
- default:
- ret = -EINVAL;
- }
-
- rcu_read_unlock();
- return ret;
-}
-
-static int get_task_ioprio(struct task_struct *p)
-{
- int ret;
-
- ret = security_task_getioprio(p);
- if (ret)
- goto out;
- ret = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, IOPRIO_NORM);
- if (p->io_context)
- ret = p->io_context->ioprio;
-out:
- return ret;
-}
-
-int ioprio_best(unsigned short aprio, unsigned short bprio)
-{
- unsigned short aclass = IOPRIO_PRIO_CLASS(aprio);
- unsigned short bclass = IOPRIO_PRIO_CLASS(bprio);
-
- if (aclass == IOPRIO_CLASS_NONE)
- aclass = IOPRIO_CLASS_BE;
- if (bclass == IOPRIO_CLASS_NONE)
- bclass = IOPRIO_CLASS_BE;
-
- if (aclass == bclass)
- return min(aprio, bprio);
- if (aclass > bclass)
- return bprio;
- else
- return aprio;
-}
-
-SYSCALL_DEFINE2(ioprio_get, int, which, int, who)
-{
- struct task_struct *g, *p;
- struct user_struct *user;
- struct pid *pgrp;
- kuid_t uid;
- int ret = -ESRCH;
- int tmpio;
-
- rcu_read_lock();
- switch (which) {
- case IOPRIO_WHO_PROCESS:
- if (!who)
- p = current;
- else
- p = find_task_by_vpid(who);
- if (p)
- ret = get_task_ioprio(p);
- break;
- case IOPRIO_WHO_PGRP:
- if (!who)
- pgrp = task_pgrp(current);
- else
- pgrp = find_vpid(who);
- do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
- tmpio = get_task_ioprio(p);
- if (tmpio < 0)
- continue;
- if (ret == -ESRCH)
- ret = tmpio;
- else
- ret = ioprio_best(ret, tmpio);
- } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
- break;
- case IOPRIO_WHO_USER:
- uid = make_kuid(current_user_ns(), who);
- if (!who)
- user = current_user();
- else
- user = find_user(uid);
-
- if (!user)
- break;
-
- do_each_thread(g, p) {
- if (!uid_eq(task_uid(p), user->uid))
- continue;
- tmpio = get_task_ioprio(p);
- if (tmpio < 0)
- continue;
- if (ret == -ESRCH)
- ret = tmpio;
- else
- ret = ioprio_best(ret, tmpio);
- } while_each_thread(g, p);
-
- if (who)
- free_uid(user);
- break;
- default:
- ret = -EINVAL;
- }
-
- rcu_read_unlock();
- return ret;
-}
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 78f3403300af..a693f5b01ae6 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -232,9 +232,6 @@ static int kernfs_link_sibling(struct kernfs_node *kn)
struct rb_node **node = &kn->parent->dir.children.rb_node;
struct rb_node *parent = NULL;
- if (kernfs_type(kn) == KERNFS_DIR)
- kn->parent->dir.subdirs++;
-
while (*node) {
struct kernfs_node *pos;
int result;
@@ -249,9 +246,15 @@ static int kernfs_link_sibling(struct kernfs_node *kn)
else
return -EEXIST;
}
+
/* add new node and rebalance the tree */
rb_link_node(&kn->rb, parent, node);
rb_insert_color(&kn->rb, &kn->parent->dir.children);
+
+ /* successfully added, account subdir number */
+ if (kernfs_type(kn) == KERNFS_DIR)
+ kn->parent->dir.subdirs++;
+
return 0;
}
@@ -711,6 +714,7 @@ struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops,
return ERR_PTR(-ENOMEM);
ida_init(&root->ino_ida);
+ INIT_LIST_HEAD(&root->supers);
kn = __kernfs_new_node(root, "", S_IFDIR | S_IRUGO | S_IXUGO,
KERNFS_DIR);
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index 8034706a7af8..e3d37f607f97 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -14,6 +14,7 @@
#include <linux/poll.h>
#include <linux/pagemap.h>
#include <linux/sched.h>
+#include <linux/fsnotify.h>
#include "kernfs-internal.h"
@@ -484,6 +485,8 @@ static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma)
ops = kernfs_ops(of->kn);
rc = ops->mmap(of, vma);
+ if (rc)
+ goto out_put;
/*
* PowerPC's pci_mmap of legacy_mem uses shmem_zero_setup()
@@ -608,6 +611,7 @@ static void kernfs_put_open_node(struct kernfs_node *kn,
static int kernfs_fop_open(struct inode *inode, struct file *file)
{
struct kernfs_node *kn = file->f_path.dentry->d_fsdata;
+ struct kernfs_root *root = kernfs_root(kn);
const struct kernfs_ops *ops;
struct kernfs_open_file *of;
bool has_read, has_write, has_mmap;
@@ -622,14 +626,16 @@ static int kernfs_fop_open(struct inode *inode, struct file *file)
has_write = ops->write || ops->mmap;
has_mmap = ops->mmap;
- /* check perms and supported operations */
- if ((file->f_mode & FMODE_WRITE) &&
- (!(inode->i_mode & S_IWUGO) || !has_write))
- goto err_out;
+ /* see the flag definition for details */
+ if (root->flags & KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK) {
+ if ((file->f_mode & FMODE_WRITE) &&
+ (!(inode->i_mode & S_IWUGO) || !has_write))
+ goto err_out;
- if ((file->f_mode & FMODE_READ) &&
- (!(inode->i_mode & S_IRUGO) || !has_read))
- goto err_out;
+ if ((file->f_mode & FMODE_READ) &&
+ (!(inode->i_mode & S_IRUGO) || !has_read))
+ goto err_out;
+ }
/* allocate a kernfs_open_file for the file */
error = -ENOMEM;
@@ -785,20 +791,48 @@ static unsigned int kernfs_fop_poll(struct file *filp, poll_table *wait)
*/
void kernfs_notify(struct kernfs_node *kn)
{
+ struct kernfs_root *root = kernfs_root(kn);
struct kernfs_open_node *on;
+ struct kernfs_super_info *info;
unsigned long flags;
+ if (WARN_ON(kernfs_type(kn) != KERNFS_FILE))
+ return;
+
+ /* kick poll */
spin_lock_irqsave(&kernfs_open_node_lock, flags);
- if (!WARN_ON(kernfs_type(kn) != KERNFS_FILE)) {
- on = kn->attr.open;
- if (on) {
- atomic_inc(&on->event);
- wake_up_interruptible(&on->poll);
- }
+ on = kn->attr.open;
+ if (on) {
+ atomic_inc(&on->event);
+ wake_up_interruptible(&on->poll);
}
spin_unlock_irqrestore(&kernfs_open_node_lock, flags);
+
+ /* kick fsnotify */
+ mutex_lock(&kernfs_mutex);
+
+ list_for_each_entry(info, &root->supers, node) {
+ struct inode *inode;
+ struct dentry *dentry;
+
+ inode = ilookup(info->sb, kn->ino);
+ if (!inode)
+ continue;
+
+ dentry = d_find_any_alias(inode);
+ if (dentry) {
+ fsnotify_parent(NULL, dentry, FS_MODIFY);
+ fsnotify(inode, FS_MODIFY, inode, FSNOTIFY_EVENT_INODE,
+ NULL, 0);
+ dput(dentry);
+ }
+
+ iput(inode);
+ }
+
+ mutex_unlock(&kernfs_mutex);
}
EXPORT_SYMBOL_GPL(kernfs_notify);
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index abb0f1f53d93..985217626e66 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -48,14 +48,18 @@ void __init kernfs_inode_init(void)
static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn)
{
+ static DEFINE_MUTEX(iattr_mutex);
+ struct kernfs_iattrs *ret;
struct iattr *iattrs;
+ mutex_lock(&iattr_mutex);
+
if (kn->iattr)
- return kn->iattr;
+ goto out_unlock;
kn->iattr = kzalloc(sizeof(struct kernfs_iattrs), GFP_KERNEL);
if (!kn->iattr)
- return NULL;
+ goto out_unlock;
iattrs = &kn->iattr->ia_iattr;
/* assign default attributes */
@@ -65,8 +69,10 @@ static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn)
iattrs->ia_atime = iattrs->ia_mtime = iattrs->ia_ctime = CURRENT_TIME;
simple_xattrs_init(&kn->iattr->xattrs);
-
- return kn->iattr;
+out_unlock:
+ ret = kn->iattr;
+ mutex_unlock(&iattr_mutex);
+ return ret;
}
static int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr)
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
index 8be13b2a079b..dc84a3ef9ca2 100644
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -49,6 +49,8 @@ static inline struct kernfs_root *kernfs_root(struct kernfs_node *kn)
* mount.c
*/
struct kernfs_super_info {
+ struct super_block *sb;
+
/*
* The root associated with this super_block. Each super_block is
* identified by the root and ns it's associated with.
@@ -62,6 +64,9 @@ struct kernfs_super_info {
* an array and compare kernfs_node tag against every entry.
*/
const void *ns;
+
+ /* anchored at kernfs_root->supers, protected by kernfs_mutex */
+ struct list_head node;
};
#define kernfs_info(SB) ((struct kernfs_super_info *)(SB->s_fs_info))
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index 6a5f04ac8704..d171b98a6cdd 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -62,15 +62,16 @@ struct kernfs_root *kernfs_root_from_sb(struct super_block *sb)
return NULL;
}
-static int kernfs_fill_super(struct super_block *sb)
+static int kernfs_fill_super(struct super_block *sb, unsigned long magic)
{
struct kernfs_super_info *info = kernfs_info(sb);
struct inode *inode;
struct dentry *root;
+ info->sb = sb;
sb->s_blocksize = PAGE_CACHE_SIZE;
sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
- sb->s_magic = SYSFS_MAGIC;
+ sb->s_magic = magic;
sb->s_op = &kernfs_sops;
sb->s_time_gran = 1;
@@ -131,6 +132,7 @@ const void *kernfs_super_ns(struct super_block *sb)
* @fs_type: file_system_type of the fs being mounted
* @flags: mount flags specified for the mount
* @root: kernfs_root of the hierarchy being mounted
+ * @magic: file system specific magic number
* @new_sb_created: tell the caller if we allocated a new superblock
* @ns: optional namespace tag of the mount
*
@@ -142,8 +144,8 @@ const void *kernfs_super_ns(struct super_block *sb)
* The return value can be passed to the vfs layer verbatim.
*/
struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags,
- struct kernfs_root *root, bool *new_sb_created,
- const void *ns)
+ struct kernfs_root *root, unsigned long magic,
+ bool *new_sb_created, const void *ns)
{
struct super_block *sb;
struct kernfs_super_info *info;
@@ -166,12 +168,18 @@ struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags,
*new_sb_created = !sb->s_root;
if (!sb->s_root) {
- error = kernfs_fill_super(sb);
+ struct kernfs_super_info *info = kernfs_info(sb);
+
+ error = kernfs_fill_super(sb, magic);
if (error) {
deactivate_locked_super(sb);
return ERR_PTR(error);
}
sb->s_flags |= MS_ACTIVE;
+
+ mutex_lock(&kernfs_mutex);
+ list_add(&info->node, &root->supers);
+ mutex_unlock(&kernfs_mutex);
}
return dget(sb->s_root);
@@ -190,6 +198,10 @@ void kernfs_kill_sb(struct super_block *sb)
struct kernfs_super_info *info = kernfs_info(sb);
struct kernfs_node *root_kn = sb->s_root->d_fsdata;
+ mutex_lock(&kernfs_mutex);
+ list_del(&info->node);
+ mutex_unlock(&kernfs_mutex);
+
/*
* Remove the superblock from fs_supers/s_instances
* so we can't find it, before freeing kernfs_super_info.
diff --git a/fs/locks.c b/fs/locks.c
index 13fc7a6d380a..e390bd9ae068 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -135,7 +135,7 @@
#define IS_POSIX(fl) (fl->fl_flags & FL_POSIX)
#define IS_FLOCK(fl) (fl->fl_flags & FL_FLOCK)
#define IS_LEASE(fl) (fl->fl_flags & (FL_LEASE|FL_DELEG))
-#define IS_FILE_PVT(fl) (fl->fl_flags & FL_FILE_PVT)
+#define IS_OFDLCK(fl) (fl->fl_flags & FL_OFDLCK)
static bool lease_breaking(struct file_lock *fl)
{
@@ -389,18 +389,6 @@ static int flock64_to_posix_lock(struct file *filp, struct file_lock *fl,
fl->fl_ops = NULL;
fl->fl_lmops = NULL;
- /* Ensure that fl->fl_filp has compatible f_mode */
- switch (l->l_type) {
- case F_RDLCK:
- if (!(filp->f_mode & FMODE_READ))
- return -EBADF;
- break;
- case F_WRLCK:
- if (!(filp->f_mode & FMODE_WRITE))
- return -EBADF;
- break;
- }
-
return assign_type(fl, l->l_type);
}
@@ -564,7 +552,7 @@ static void __locks_insert_block(struct file_lock *blocker,
BUG_ON(!list_empty(&waiter->fl_block));
waiter->fl_next = blocker;
list_add_tail(&waiter->fl_block, &blocker->fl_block);
- if (IS_POSIX(blocker) && !IS_FILE_PVT(blocker))
+ if (IS_POSIX(blocker) && !IS_OFDLCK(blocker))
locks_insert_global_blocked(waiter);
}
@@ -759,12 +747,12 @@ EXPORT_SYMBOL(posix_test_lock);
* of tasks (such as posix threads) sharing the same open file table.
* To handle those cases, we just bail out after a few iterations.
*
- * For FL_FILE_PVT locks, the owner is the filp, not the files_struct.
+ * For FL_OFDLCK locks, the owner is the filp, not the files_struct.
* Because the owner is not even nominally tied to a thread of
* execution, the deadlock detection below can't reasonably work well. Just
* skip it for those.
*
- * In principle, we could do a more limited deadlock detection on FL_FILE_PVT
+ * In principle, we could do a more limited deadlock detection on FL_OFDLCK
* locks that just checks for the case where two tasks are attempting to
* upgrade from read to write locks on the same inode.
*/
@@ -791,9 +779,9 @@ static int posix_locks_deadlock(struct file_lock *caller_fl,
/*
* This deadlock detector can't reasonably detect deadlocks with
- * FL_FILE_PVT locks, since they aren't owned by a process, per-se.
+ * FL_OFDLCK locks, since they aren't owned by a process, per-se.
*/
- if (IS_FILE_PVT(caller_fl))
+ if (IS_OFDLCK(caller_fl))
return 0;
while ((block_fl = what_owner_is_waiting_for(block_fl))) {
@@ -1391,11 +1379,10 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
restart:
break_time = flock->fl_break_time;
- if (break_time != 0) {
+ if (break_time != 0)
break_time -= jiffies;
- if (break_time == 0)
- break_time++;
- }
+ if (break_time == 0)
+ break_time++;
locks_insert_block(flock, new_fl);
spin_unlock(&inode->i_lock);
error = wait_event_interruptible_timeout(new_fl->fl_wait,
@@ -1891,7 +1878,7 @@ EXPORT_SYMBOL_GPL(vfs_test_lock);
static int posix_lock_to_flock(struct flock *flock, struct file_lock *fl)
{
- flock->l_pid = IS_FILE_PVT(fl) ? -1 : fl->fl_pid;
+ flock->l_pid = IS_OFDLCK(fl) ? -1 : fl->fl_pid;
#if BITS_PER_LONG == 32
/*
* Make sure we can represent the posix lock via
@@ -1913,7 +1900,7 @@ static int posix_lock_to_flock(struct flock *flock, struct file_lock *fl)
#if BITS_PER_LONG == 32
static void posix_lock_to_flock64(struct flock64 *flock, struct file_lock *fl)
{
- flock->l_pid = IS_FILE_PVT(fl) ? -1 : fl->fl_pid;
+ flock->l_pid = IS_OFDLCK(fl) ? -1 : fl->fl_pid;
flock->l_start = fl->fl_start;
flock->l_len = fl->fl_end == OFFSET_MAX ? 0 :
fl->fl_end - fl->fl_start + 1;
@@ -1942,13 +1929,13 @@ int fcntl_getlk(struct file *filp, unsigned int cmd, struct flock __user *l)
if (error)
goto out;
- if (cmd == F_GETLKP) {
+ if (cmd == F_OFD_GETLK) {
error = -EINVAL;
if (flock.l_pid != 0)
goto out;
cmd = F_GETLK;
- file_lock.fl_flags |= FL_FILE_PVT;
+ file_lock.fl_flags |= FL_OFDLCK;
file_lock.fl_owner = (fl_owner_t)filp;
}
@@ -2035,6 +2022,22 @@ static int do_lock_file_wait(struct file *filp, unsigned int cmd,
return error;
}
+/* Ensure that fl->fl_filp has compatible f_mode for F_SETLK calls */
+static int
+check_fmode_for_setlk(struct file_lock *fl)
+{
+ switch (fl->fl_type) {
+ case F_RDLCK:
+ if (!(fl->fl_file->f_mode & FMODE_READ))
+ return -EBADF;
+ break;
+ case F_WRLCK:
+ if (!(fl->fl_file->f_mode & FMODE_WRITE))
+ return -EBADF;
+ }
+ return 0;
+}
+
/* Apply the lock described by l to an open file descriptor.
* This implements both the F_SETLK and F_SETLKW commands of fcntl().
*/
@@ -2072,27 +2075,31 @@ again:
if (error)
goto out;
+ error = check_fmode_for_setlk(file_lock);
+ if (error)
+ goto out;
+
/*
* If the cmd is requesting file-private locks, then set the
- * FL_FILE_PVT flag and override the owner.
+ * FL_OFDLCK flag and override the owner.
*/
switch (cmd) {
- case F_SETLKP:
+ case F_OFD_SETLK:
error = -EINVAL;
if (flock.l_pid != 0)
goto out;
cmd = F_SETLK;
- file_lock->fl_flags |= FL_FILE_PVT;
+ file_lock->fl_flags |= FL_OFDLCK;
file_lock->fl_owner = (fl_owner_t)filp;
break;
- case F_SETLKPW:
+ case F_OFD_SETLKW:
error = -EINVAL;
if (flock.l_pid != 0)
goto out;
cmd = F_SETLKW;
- file_lock->fl_flags |= FL_FILE_PVT;
+ file_lock->fl_flags |= FL_OFDLCK;
file_lock->fl_owner = (fl_owner_t)filp;
/* Fallthrough */
case F_SETLKW:
@@ -2144,13 +2151,13 @@ int fcntl_getlk64(struct file *filp, unsigned int cmd, struct flock64 __user *l)
if (error)
goto out;
- if (cmd == F_GETLKP) {
+ if (cmd == F_OFD_GETLK) {
error = -EINVAL;
if (flock.l_pid != 0)
goto out;
cmd = F_GETLK64;
- file_lock.fl_flags |= FL_FILE_PVT;
+ file_lock.fl_flags |= FL_OFDLCK;
file_lock.fl_owner = (fl_owner_t)filp;
}
@@ -2207,27 +2214,31 @@ again:
if (error)
goto out;
+ error = check_fmode_for_setlk(file_lock);
+ if (error)
+ goto out;
+
/*
* If the cmd is requesting file-private locks, then set the
- * FL_FILE_PVT flag and override the owner.
+ * FL_OFDLCK flag and override the owner.
*/
switch (cmd) {
- case F_SETLKP:
+ case F_OFD_SETLK:
error = -EINVAL;
if (flock.l_pid != 0)
goto out;
cmd = F_SETLK64;
- file_lock->fl_flags |= FL_FILE_PVT;
+ file_lock->fl_flags |= FL_OFDLCK;
file_lock->fl_owner = (fl_owner_t)filp;
break;
- case F_SETLKPW:
+ case F_OFD_SETLKW:
error = -EINVAL;
if (flock.l_pid != 0)
goto out;
cmd = F_SETLKW64;
- file_lock->fl_flags |= FL_FILE_PVT;
+ file_lock->fl_flags |= FL_OFDLCK;
file_lock->fl_owner = (fl_owner_t)filp;
/* Fallthrough */
case F_SETLKW64:
@@ -2413,8 +2424,8 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
if (IS_POSIX(fl)) {
if (fl->fl_flags & FL_ACCESS)
seq_printf(f, "ACCESS");
- else if (IS_FILE_PVT(fl))
- seq_printf(f, "FLPVT ");
+ else if (IS_OFDLCK(fl))
+ seq_printf(f, "OFDLCK");
else
seq_printf(f, "POSIX ");
diff --git a/fs/namei.c b/fs/namei.c
index c6157c894fce..80168273396b 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1542,7 +1542,7 @@ static inline int walk_component(struct nameidata *nd, struct path *path,
inode = path->dentry->d_inode;
}
err = -ENOENT;
- if (!inode)
+ if (!inode || d_is_negative(path->dentry))
goto out_path_put;
if (should_follow_link(path->dentry, follow)) {
@@ -2249,7 +2249,7 @@ mountpoint_last(struct nameidata *nd, struct path *path)
mutex_unlock(&dir->d_inode->i_mutex);
done:
- if (!dentry->d_inode) {
+ if (!dentry->d_inode || d_is_negative(dentry)) {
error = -ENOENT;
dput(dentry);
goto out;
@@ -2994,7 +2994,7 @@ retry_lookup:
finish_lookup:
/* we _can_ be in RCU mode here */
error = -ENOENT;
- if (d_is_negative(path->dentry)) {
+ if (!inode || d_is_negative(path->dentry)) {
path_to_nameidata(path, nd);
goto out;
}
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index 6f3f392d48af..f66c66b9f182 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -402,8 +402,10 @@ sort_pacl(struct posix_acl *pacl)
* by uid/gid. */
int i, j;
- if (pacl->a_count <= 4)
- return; /* no users or groups */
+ /* no users or groups */
+ if (!pacl || pacl->a_count <= 4)
+ return;
+
i = 1;
while (pacl->a_entries[i].e_tag == ACL_USER)
i++;
@@ -530,13 +532,12 @@ posix_state_to_acl(struct posix_acl_state *state, unsigned int flags)
/*
* ACLs with no ACEs are treated differently in the inheritable
- * and effective cases: when there are no inheritable ACEs, we
- * set a zero-length default posix acl:
+ * and effective cases: when there are no inheritable ACEs,
+ * calls ->set_acl with a NULL ACL structure.
*/
- if (state->empty && (flags & NFS4_ACL_TYPE_DEFAULT)) {
- pacl = posix_acl_alloc(0, GFP_KERNEL);
- return pacl ? pacl : ERR_PTR(-ENOMEM);
- }
+ if (state->empty && (flags & NFS4_ACL_TYPE_DEFAULT))
+ return NULL;
+
/*
* When there are no effective ACEs, the following will end
* up setting a 3-element effective posix ACL with all
@@ -589,7 +590,7 @@ posix_state_to_acl(struct posix_acl_state *state, unsigned int flags)
add_to_mask(state, &state->groups->aces[i].perms);
}
- if (!state->users->n && !state->groups->n) {
+ if (state->users->n || state->groups->n) {
pace++;
pace->e_tag = ACL_MASK;
low_mode_from_nfs4(state->mask.allow, &pace->e_perm, flags);
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 39c8ef875f91..2c73cae9899d 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -654,9 +654,11 @@ static struct rpc_clnt *create_backchannel_client(struct rpc_create_args *args)
static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn, struct nfsd4_session *ses)
{
+ int maxtime = max_cb_time(clp->net);
struct rpc_timeout timeparms = {
- .to_initval = max_cb_time(clp->net),
+ .to_initval = maxtime,
.to_retries = 0,
+ .to_maxval = maxtime,
};
struct rpc_create_args args = {
.net = clp->net,
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 3ba65979a3cd..9a77a5a21557 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1078,6 +1078,18 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
return NULL;
}
clp->cl_name.len = name.len;
+ INIT_LIST_HEAD(&clp->cl_sessions);
+ idr_init(&clp->cl_stateids);
+ atomic_set(&clp->cl_refcount, 0);
+ clp->cl_cb_state = NFSD4_CB_UNKNOWN;
+ INIT_LIST_HEAD(&clp->cl_idhash);
+ INIT_LIST_HEAD(&clp->cl_openowners);
+ INIT_LIST_HEAD(&clp->cl_delegations);
+ INIT_LIST_HEAD(&clp->cl_lru);
+ INIT_LIST_HEAD(&clp->cl_callbacks);
+ INIT_LIST_HEAD(&clp->cl_revoked);
+ spin_lock_init(&clp->cl_lock);
+ rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table");
return clp;
}
@@ -1095,6 +1107,7 @@ free_client(struct nfs4_client *clp)
WARN_ON_ONCE(atomic_read(&ses->se_ref));
free_session(ses);
}
+ rpc_destroy_wait_queue(&clp->cl_cb_waitq);
free_svc_cred(&clp->cl_cred);
kfree(clp->cl_name.data);
idr_destroy(&clp->cl_stateids);
@@ -1347,7 +1360,6 @@ static struct nfs4_client *create_client(struct xdr_netobj name,
if (clp == NULL)
return NULL;
- INIT_LIST_HEAD(&clp->cl_sessions);
ret = copy_cred(&clp->cl_cred, &rqstp->rq_cred);
if (ret) {
spin_lock(&nn->client_lock);
@@ -1355,20 +1367,9 @@ static struct nfs4_client *create_client(struct xdr_netobj name,
spin_unlock(&nn->client_lock);
return NULL;
}
- idr_init(&clp->cl_stateids);
- atomic_set(&clp->cl_refcount, 0);
- clp->cl_cb_state = NFSD4_CB_UNKNOWN;
- INIT_LIST_HEAD(&clp->cl_idhash);
- INIT_LIST_HEAD(&clp->cl_openowners);
- INIT_LIST_HEAD(&clp->cl_delegations);
- INIT_LIST_HEAD(&clp->cl_lru);
- INIT_LIST_HEAD(&clp->cl_callbacks);
- INIT_LIST_HEAD(&clp->cl_revoked);
- spin_lock_init(&clp->cl_lock);
nfsd4_init_callback(&clp->cl_cb_null);
clp->cl_time = get_seconds();
clear_bit(0, &clp->cl_cb_slot_busy);
- rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table");
copy_verf(clp, verf);
rpc_copy_addr((struct sockaddr *) &clp->cl_addr, sa);
gen_confirm(clp);
@@ -3716,9 +3717,16 @@ out:
static __be32
nfsd4_free_lock_stateid(struct nfs4_ol_stateid *stp)
{
- if (check_for_locks(stp->st_file, lockowner(stp->st_stateowner)))
+ struct nfs4_lockowner *lo = lockowner(stp->st_stateowner);
+
+ if (check_for_locks(stp->st_file, lo))
return nfserr_locks_held;
- release_lock_stateid(stp);
+ /*
+ * Currently there's a 1-1 lock stateid<->lockowner
+ * correspondance, and we have to delete the lockowner when we
+ * delete the lock stateid:
+ */
+ unhash_lockowner(lo);
return nfs_ok;
}
@@ -4158,6 +4166,10 @@ static bool same_lockowner_ino(struct nfs4_lockowner *lo, struct inode *inode, c
if (!same_owner_str(&lo->lo_owner, owner, clid))
return false;
+ if (list_empty(&lo->lo_owner.so_stateids)) {
+ WARN_ON_ONCE(1);
+ return false;
+ }
lst = list_first_entry(&lo->lo_owner.so_stateids,
struct nfs4_ol_stateid, st_perstateowner);
return lst->st_file->fi_inode == inode;
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 2723c1badd01..18881f34737a 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -3627,14 +3627,6 @@ nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
/* nfsd4_check_resp_size guarantees enough room for error status */
if (!op->status)
op->status = nfsd4_check_resp_size(resp, 0);
- if (op->status == nfserr_resource && nfsd4_has_session(&resp->cstate)) {
- struct nfsd4_slot *slot = resp->cstate.slot;
-
- if (slot->sl_flags & NFSD4_SLOT_CACHETHIS)
- op->status = nfserr_rep_too_big_to_cache;
- else
- op->status = nfserr_rep_too_big;
- }
if (so) {
so->so_replay.rp_status = op->status;
so->so_replay.rp_buflen = (char *)resp->p - (char *)(statp+1);
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 4e565c814309..732648b270dc 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -698,6 +698,8 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
}
group->overflow_event = &oevent->fse;
+ if (force_o_largefile())
+ event_f_flags |= O_LARGEFILE;
group->fanotify_data.f_flags = event_f_flags;
#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
spin_lock_init(&group->fanotify_data.access_lock);
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index af3f7aa73e13..ee1f88419cb0 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -472,11 +472,15 @@ bail:
void dlm_destroy_master_caches(void)
{
- if (dlm_lockname_cache)
+ if (dlm_lockname_cache) {
kmem_cache_destroy(dlm_lockname_cache);
+ dlm_lockname_cache = NULL;
+ }
- if (dlm_lockres_cache)
+ if (dlm_lockres_cache) {
kmem_cache_destroy(dlm_lockres_cache);
+ dlm_lockres_cache = NULL;
+ }
}
static void dlm_lockres_release(struct kref *kref)
diff --git a/fs/open.c b/fs/open.c
index 3d30eb1fc95e..9d64679cec73 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -254,17 +254,22 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
return -EBADF;
/*
- * It's not possible to punch hole or perform collapse range
- * on append only file
+ * We can only allow pure fallocate on append only files
*/
- if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE)
- && IS_APPEND(inode))
+ if ((mode & ~FALLOC_FL_KEEP_SIZE) && IS_APPEND(inode))
return -EPERM;
if (IS_IMMUTABLE(inode))
return -EPERM;
/*
+ * We can not allow to do any fallocate operation on an active
+ * swapfile
+ */
+ if (IS_SWAPFILE(inode))
+ ret = -ETXTBSY;
+
+ /*
* Revalidate the write permissions, in case security policy has
* changed since the files were opened.
*/
@@ -286,14 +291,6 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0))
return -EFBIG;
- /*
- * There is no need to overlap collapse range with EOF, in which case
- * it is effectively a truncate operation
- */
- if ((mode & FALLOC_FL_COLLAPSE_RANGE) &&
- (offset + len >= i_size_read(inode)))
- return -EINVAL;
-
if (!file->f_op->fallocate)
return -EOPNOTSUPP;
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 9e363e41dacc..0855f772cd41 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -246,6 +246,12 @@ posix_acl_equiv_mode(const struct posix_acl *acl, umode_t *mode_p)
umode_t mode = 0;
int not_equiv = 0;
+ /*
+ * A null ACL can always be presented as mode bits.
+ */
+ if (!acl)
+ return 0;
+
FOREACH_ACL_ENTRY(pa, acl, pe) {
switch (pa->e_tag) {
case ACL_USER_OBJ:
diff --git a/fs/splice.c b/fs/splice.c
index 9bc07d2b53cf..e246954ea48c 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1537,7 +1537,7 @@ static long vmsplice_to_user(struct file *file, const struct iovec __user *uiov,
struct iovec iovstack[UIO_FASTIOV];
struct iovec *iov = iovstack;
struct iov_iter iter;
- ssize_t count = 0;
+ ssize_t count;
pipe = get_pipe_info(file);
if (!pipe)
@@ -1546,8 +1546,9 @@ static long vmsplice_to_user(struct file *file, const struct iovec __user *uiov,
ret = rw_copy_check_uvector(READ, uiov, nr_segs,
ARRAY_SIZE(iovstack), iovstack, &iov);
if (ret <= 0)
- return ret;
+ goto out;
+ count = ret;
iov_iter_init(&iter, iov, nr_segs, count, 0);
sd.len = 0;
@@ -1560,6 +1561,7 @@ static long vmsplice_to_user(struct file *file, const struct iovec __user *uiov,
ret = __splice_from_pipe(pipe, &sd, pipe_to_user);
pipe_unlock(pipe);
+out:
if (iov != iovstack)
kfree(iov);
diff --git a/fs/super.c b/fs/super.c
index e9dc3c3fe159..48377f7463c0 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -800,7 +800,10 @@ void emergency_remount(void)
static DEFINE_IDA(unnamed_dev_ida);
static DEFINE_SPINLOCK(unnamed_dev_lock);/* protects the above */
-static int unnamed_dev_start = 0; /* don't bother trying below it */
+/* Many userspace utilities consider an FSID of 0 invalid.
+ * Always return at least 1 from get_anon_bdev.
+ */
+static int unnamed_dev_start = 1;
int get_anon_bdev(dev_t *p)
{
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 1b8b91b67fdb..e9ef59b3abb1 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -47,12 +47,13 @@ static int sysfs_kf_seq_show(struct seq_file *sf, void *v)
ssize_t count;
char *buf;
- /* acquire buffer and ensure that it's >= PAGE_SIZE */
+ /* acquire buffer and ensure that it's >= PAGE_SIZE and clear */
count = seq_get_buf(sf, &buf);
if (count < PAGE_SIZE) {
seq_commit(sf, -1);
return 0;
}
+ memset(buf, 0, PAGE_SIZE);
/*
* Invoke show(). Control may reach here via seq file lseek even
@@ -453,95 +454,3 @@ void sysfs_remove_bin_file(struct kobject *kobj,
kernfs_remove_by_name(kobj->sd, attr->attr.name);
}
EXPORT_SYMBOL_GPL(sysfs_remove_bin_file);
-
-struct sysfs_schedule_callback_struct {
- struct list_head workq_list;
- struct kobject *kobj;
- void (*func)(void *);
- void *data;
- struct module *owner;
- struct work_struct work;
-};
-
-static struct workqueue_struct *sysfs_workqueue;
-static DEFINE_MUTEX(sysfs_workq_mutex);
-static LIST_HEAD(sysfs_workq);
-static void sysfs_schedule_callback_work(struct work_struct *work)
-{
- struct sysfs_schedule_callback_struct *ss = container_of(work,
- struct sysfs_schedule_callback_struct, work);
-
- (ss->func)(ss->data);
- kobject_put(ss->kobj);
- module_put(ss->owner);
- mutex_lock(&sysfs_workq_mutex);
- list_del(&ss->workq_list);
- mutex_unlock(&sysfs_workq_mutex);
- kfree(ss);
-}
-
-/**
- * sysfs_schedule_callback - helper to schedule a callback for a kobject
- * @kobj: object we're acting for.
- * @func: callback function to invoke later.
- * @data: argument to pass to @func.
- * @owner: module owning the callback code
- *
- * sysfs attribute methods must not unregister themselves or their parent
- * kobject (which would amount to the same thing). Attempts to do so will
- * deadlock, since unregistration is mutually exclusive with driver
- * callbacks.
- *
- * Instead methods can call this routine, which will attempt to allocate
- * and schedule a workqueue request to call back @func with @data as its
- * argument in the workqueue's process context. @kobj will be pinned
- * until @func returns.
- *
- * Returns 0 if the request was submitted, -ENOMEM if storage could not
- * be allocated, -ENODEV if a reference to @owner isn't available,
- * -EAGAIN if a callback has already been scheduled for @kobj.
- */
-int sysfs_schedule_callback(struct kobject *kobj, void (*func)(void *),
- void *data, struct module *owner)
-{
- struct sysfs_schedule_callback_struct *ss, *tmp;
-
- if (!try_module_get(owner))
- return -ENODEV;
-
- mutex_lock(&sysfs_workq_mutex);
- list_for_each_entry_safe(ss, tmp, &sysfs_workq, workq_list)
- if (ss->kobj == kobj) {
- module_put(owner);
- mutex_unlock(&sysfs_workq_mutex);
- return -EAGAIN;
- }
- mutex_unlock(&sysfs_workq_mutex);
-
- if (sysfs_workqueue == NULL) {
- sysfs_workqueue = create_singlethread_workqueue("sysfsd");
- if (sysfs_workqueue == NULL) {
- module_put(owner);
- return -ENOMEM;
- }
- }
-
- ss = kmalloc(sizeof(*ss), GFP_KERNEL);
- if (!ss) {
- module_put(owner);
- return -ENOMEM;
- }
- kobject_get(kobj);
- ss->kobj = kobj;
- ss->func = func;
- ss->data = data;
- ss->owner = owner;
- INIT_WORK(&ss->work, sysfs_schedule_callback_work);
- INIT_LIST_HEAD(&ss->workq_list);
- mutex_lock(&sysfs_workq_mutex);
- list_add_tail(&ss->workq_list, &sysfs_workq);
- mutex_unlock(&sysfs_workq_mutex);
- queue_work(sysfs_workqueue, &ss->work);
- return 0;
-}
-EXPORT_SYMBOL_GPL(sysfs_schedule_callback);
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index aa0406895b53..7d2a860ba788 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -18,7 +18,7 @@
#include "sysfs.h"
-static void remove_files(struct kernfs_node *parent, struct kobject *kobj,
+static void remove_files(struct kernfs_node *parent,
const struct attribute_group *grp)
{
struct attribute *const *attr;
@@ -29,7 +29,7 @@ static void remove_files(struct kernfs_node *parent, struct kobject *kobj,
kernfs_remove_by_name(parent, (*attr)->name);
if (grp->bin_attrs)
for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++)
- sysfs_remove_bin_file(kobj, *bin_attr);
+ kernfs_remove_by_name(parent, (*bin_attr)->attr.name);
}
static int create_files(struct kernfs_node *parent, struct kobject *kobj,
@@ -62,7 +62,7 @@ static int create_files(struct kernfs_node *parent, struct kobject *kobj,
break;
}
if (error) {
- remove_files(parent, kobj, grp);
+ remove_files(parent, grp);
goto exit;
}
}
@@ -79,7 +79,7 @@ static int create_files(struct kernfs_node *parent, struct kobject *kobj,
break;
}
if (error)
- remove_files(parent, kobj, grp);
+ remove_files(parent, grp);
}
exit:
return error;
@@ -224,7 +224,7 @@ void sysfs_remove_group(struct kobject *kobj,
kernfs_get(kn);
}
- remove_files(kn, kobj, grp);
+ remove_files(kn, grp);
if (grp->name)
kernfs_remove(kn);
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index a66ad6196f59..8a49486bf30c 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -13,6 +13,7 @@
#define DEBUG
#include <linux/fs.h>
+#include <linux/magic.h>
#include <linux/mount.h>
#include <linux/init.h>
#include <linux/user_namespace.h>
@@ -38,7 +39,8 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type,
}
ns = kobj_ns_grab_current(KOBJ_NS_TYPE_NET);
- root = kernfs_mount_ns(fs_type, flags, sysfs_root, &new_sb, ns);
+ root = kernfs_mount_ns(fs_type, flags, sysfs_root,
+ SYSFS_MAGIC, &new_sb, ns);
if (IS_ERR(root) || !new_sb)
kobj_ns_drop(KOBJ_NS_TYPE_NET, ns);
return root;
@@ -63,7 +65,8 @@ int __init sysfs_init(void)
{
int err;
- sysfs_root = kernfs_create_root(NULL, 0, NULL);
+ sysfs_root = kernfs_create_root(NULL, KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK,
+ NULL);
if (IS_ERR(sysfs_root))
return PTR_ERR(sysfs_root);
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index a1266089eca1..a81c7b556896 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1556,7 +1556,7 @@ static int ubifs_remount_rw(struct ubifs_info *c)
if (c->space_fixup) {
err = ubifs_fixup_free_space(c);
if (err)
- return err;
+ goto out;
}
err = check_free_space(c);
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 75df77d09f75..0479c32c5eb1 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1344,6 +1344,14 @@ __xfs_get_blocks(
/*
* If this is O_DIRECT or the mpage code calling tell them how large
* the mapping is, so that we can avoid repeated get_blocks calls.
+ *
+ * If the mapping spans EOF, then we have to break the mapping up as the
+ * mapping for blocks beyond EOF must be marked new so that sub block
+ * regions can be correctly zeroed. We can't do this for mappings within
+ * EOF unless the mapping was just allocated or is unwritten, otherwise
+ * the callers would overwrite existing data with zeros. Hence we have
+ * to split the mapping into a range up to and including EOF, and a
+ * second mapping for beyond EOF.
*/
if (direct || size > (1 << inode->i_blkbits)) {
xfs_off_t mapping_size;
@@ -1354,6 +1362,12 @@ __xfs_get_blocks(
ASSERT(mapping_size > 0);
if (mapping_size > size)
mapping_size = size;
+ if (offset < i_size_read(inode) &&
+ offset + mapping_size >= i_size_read(inode)) {
+ /* limit mapping to block that spans EOF */
+ mapping_size = roundup_64(i_size_read(inode) - offset,
+ 1 << inode->i_blkbits);
+ }
if (mapping_size > LONG_MAX)
mapping_size = LONG_MAX;
@@ -1566,6 +1580,16 @@ xfs_vm_write_failed(
xfs_vm_kill_delalloc_range(inode, block_offset,
block_offset + bh->b_size);
+
+ /*
+ * This buffer does not contain data anymore. make sure anyone
+ * who finds it knows that for certain.
+ */
+ clear_buffer_delay(bh);
+ clear_buffer_uptodate(bh);
+ clear_buffer_mapped(bh);
+ clear_buffer_new(bh);
+ clear_buffer_dirty(bh);
}
}
@@ -1599,12 +1623,21 @@ xfs_vm_write_begin(
status = __block_write_begin(page, pos, len, xfs_get_blocks);
if (unlikely(status)) {
struct inode *inode = mapping->host;
+ size_t isize = i_size_read(inode);
xfs_vm_write_failed(inode, page, pos, len);
unlock_page(page);
- if (pos + len > i_size_read(inode))
- truncate_pagecache(inode, i_size_read(inode));
+ /*
+ * If the write is beyond EOF, we only want to kill blocks
+ * allocated in this write, not blocks that were previously
+ * written successfully.
+ */
+ if (pos + len > isize) {
+ ssize_t start = max_t(ssize_t, pos, isize);
+
+ truncate_pagecache_range(inode, start, pos + len);
+ }
page_cache_release(page);
page = NULL;
@@ -1615,9 +1648,12 @@ xfs_vm_write_begin(
}
/*
- * On failure, we only need to kill delalloc blocks beyond EOF because they
- * will never be written. For blocks within EOF, generic_write_end() zeros them
- * so they are safe to leave alone and be written with all the other valid data.
+ * On failure, we only need to kill delalloc blocks beyond EOF in the range of
+ * this specific write because they will never be written. Previous writes
+ * beyond EOF where block allocation succeeded do not need to be trashed, so
+ * only new blocks from this write should be trashed. For blocks within
+ * EOF, generic_write_end() zeros them so they are safe to leave alone and be
+ * written with all the other valid data.
*/
STATIC int
xfs_vm_write_end(
@@ -1640,8 +1676,11 @@ xfs_vm_write_end(
loff_t to = pos + len;
if (to > isize) {
- truncate_pagecache(inode, isize);
+ /* only kill blocks in this write beyond EOF */
+ if (pos > isize)
+ isize = pos;
xfs_vm_kill_delalloc_range(inode, isize, to);
+ truncate_pagecache_range(inode, isize, to);
}
}
return ret;
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index 01b6a0102fbd..abda1124a70f 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -213,7 +213,7 @@ xfs_attr_calc_size(
* Out of line attribute, cannot double split, but
* make room for the attribute value itself.
*/
- uint dblocks = XFS_B_TO_FSB(mp, valuelen);
+ uint dblocks = xfs_attr3_rmt_blocks(mp, valuelen);
nblks += dblocks;
nblks += XFS_NEXTENTADD_SPACE_RES(mp, dblocks, XFS_ATTR_FORK);
}
@@ -698,11 +698,22 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
trace_xfs_attr_leaf_replace(args);
+ /* save the attribute state for later removal*/
args->op_flags |= XFS_DA_OP_RENAME; /* an atomic rename */
args->blkno2 = args->blkno; /* set 2nd entry info*/
args->index2 = args->index;
args->rmtblkno2 = args->rmtblkno;
args->rmtblkcnt2 = args->rmtblkcnt;
+ args->rmtvaluelen2 = args->rmtvaluelen;
+
+ /*
+ * clear the remote attr state now that it is saved so that the
+ * values reflect the state of the attribute we are about to
+ * add, not the attribute we just found and will remove later.
+ */
+ args->rmtblkno = 0;
+ args->rmtblkcnt = 0;
+ args->rmtvaluelen = 0;
}
/*
@@ -794,6 +805,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
args->blkno = args->blkno2;
args->rmtblkno = args->rmtblkno2;
args->rmtblkcnt = args->rmtblkcnt2;
+ args->rmtvaluelen = args->rmtvaluelen2;
if (args->rmtblkno) {
error = xfs_attr_rmtval_remove(args);
if (error)
@@ -999,13 +1011,22 @@ restart:
trace_xfs_attr_node_replace(args);
+ /* save the attribute state for later removal*/
args->op_flags |= XFS_DA_OP_RENAME; /* atomic rename op */
args->blkno2 = args->blkno; /* set 2nd entry info*/
args->index2 = args->index;
args->rmtblkno2 = args->rmtblkno;
args->rmtblkcnt2 = args->rmtblkcnt;
+ args->rmtvaluelen2 = args->rmtvaluelen;
+
+ /*
+ * clear the remote attr state now that it is saved so that the
+ * values reflect the state of the attribute we are about to
+ * add, not the attribute we just found and will remove later.
+ */
args->rmtblkno = 0;
args->rmtblkcnt = 0;
+ args->rmtvaluelen = 0;
}
retval = xfs_attr3_leaf_add(blk->bp, state->args);
@@ -1133,6 +1154,7 @@ restart:
args->blkno = args->blkno2;
args->rmtblkno = args->rmtblkno2;
args->rmtblkcnt = args->rmtblkcnt2;
+ args->rmtvaluelen = args->rmtvaluelen2;
if (args->rmtblkno) {
error = xfs_attr_rmtval_remove(args);
if (error)
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index fe9587fab17a..511c283459b1 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -1229,6 +1229,7 @@ xfs_attr3_leaf_add_work(
name_rmt->valueblk = 0;
args->rmtblkno = 1;
args->rmtblkcnt = xfs_attr3_rmt_blocks(mp, args->valuelen);
+ args->rmtvaluelen = args->valuelen;
}
xfs_trans_log_buf(args->trans, bp,
XFS_DA_LOGRANGE(leaf, xfs_attr3_leaf_name(leaf, args->index),
@@ -2167,11 +2168,11 @@ xfs_attr3_leaf_lookup_int(
if (!xfs_attr_namesp_match(args->flags, entry->flags))
continue;
args->index = probe;
- args->valuelen = be32_to_cpu(name_rmt->valuelen);
+ args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen);
args->rmtblkno = be32_to_cpu(name_rmt->valueblk);
args->rmtblkcnt = xfs_attr3_rmt_blocks(
args->dp->i_mount,
- args->valuelen);
+ args->rmtvaluelen);
return XFS_ERROR(EEXIST);
}
}
@@ -2220,19 +2221,19 @@ xfs_attr3_leaf_getvalue(
name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
ASSERT(name_rmt->namelen == args->namelen);
ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0);
- valuelen = be32_to_cpu(name_rmt->valuelen);
+ args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen);
args->rmtblkno = be32_to_cpu(name_rmt->valueblk);
args->rmtblkcnt = xfs_attr3_rmt_blocks(args->dp->i_mount,
- valuelen);
+ args->rmtvaluelen);
if (args->flags & ATTR_KERNOVAL) {
- args->valuelen = valuelen;
+ args->valuelen = args->rmtvaluelen;
return 0;
}
- if (args->valuelen < valuelen) {
- args->valuelen = valuelen;
+ if (args->valuelen < args->rmtvaluelen) {
+ args->valuelen = args->rmtvaluelen;
return XFS_ERROR(ERANGE);
}
- args->valuelen = valuelen;
+ args->valuelen = args->rmtvaluelen;
}
return 0;
}
@@ -2519,7 +2520,7 @@ xfs_attr3_leaf_clearflag(
ASSERT((entry->flags & XFS_ATTR_LOCAL) == 0);
name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
name_rmt->valueblk = cpu_to_be32(args->rmtblkno);
- name_rmt->valuelen = cpu_to_be32(args->valuelen);
+ name_rmt->valuelen = cpu_to_be32(args->rmtvaluelen);
xfs_trans_log_buf(args->trans, bp,
XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt)));
}
@@ -2677,7 +2678,7 @@ xfs_attr3_leaf_flipflags(
ASSERT((entry1->flags & XFS_ATTR_LOCAL) == 0);
name_rmt = xfs_attr3_leaf_name_remote(leaf1, args->index);
name_rmt->valueblk = cpu_to_be32(args->rmtblkno);
- name_rmt->valuelen = cpu_to_be32(args->valuelen);
+ name_rmt->valuelen = cpu_to_be32(args->rmtvaluelen);
xfs_trans_log_buf(args->trans, bp1,
XFS_DA_LOGRANGE(leaf1, name_rmt, sizeof(*name_rmt)));
}
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index 01db96f60cf0..833fe5d98d80 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -447,6 +447,7 @@ xfs_attr3_leaf_list_int(
args.dp = context->dp;
args.whichfork = XFS_ATTR_FORK;
args.valuelen = valuelen;
+ args.rmtvaluelen = valuelen;
args.value = kmem_alloc(valuelen, KM_SLEEP | KM_NOFS);
args.rmtblkno = be32_to_cpu(name_rmt->valueblk);
args.rmtblkcnt = xfs_attr3_rmt_blocks(
diff --git a/fs/xfs/xfs_attr_remote.c b/fs/xfs/xfs_attr_remote.c
index 6e37823e2932..d2e6e948cec7 100644
--- a/fs/xfs/xfs_attr_remote.c
+++ b/fs/xfs/xfs_attr_remote.c
@@ -337,7 +337,7 @@ xfs_attr_rmtval_get(
struct xfs_buf *bp;
xfs_dablk_t lblkno = args->rmtblkno;
__uint8_t *dst = args->value;
- int valuelen = args->valuelen;
+ int valuelen;
int nmap;
int error;
int blkcnt = args->rmtblkcnt;
@@ -347,7 +347,9 @@ xfs_attr_rmtval_get(
trace_xfs_attr_rmtval_get(args);
ASSERT(!(args->flags & ATTR_KERNOVAL));
+ ASSERT(args->rmtvaluelen == args->valuelen);
+ valuelen = args->rmtvaluelen;
while (valuelen > 0) {
nmap = ATTR_RMTVALUE_MAPSIZE;
error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno,
@@ -415,7 +417,7 @@ xfs_attr_rmtval_set(
* attributes have headers, we can't just do a straight byte to FSB
* conversion and have to take the header space into account.
*/
- blkcnt = xfs_attr3_rmt_blocks(mp, args->valuelen);
+ blkcnt = xfs_attr3_rmt_blocks(mp, args->rmtvaluelen);
error = xfs_bmap_first_unused(args->trans, args->dp, blkcnt, &lfileoff,
XFS_ATTR_FORK);
if (error)
@@ -480,7 +482,7 @@ xfs_attr_rmtval_set(
*/
lblkno = args->rmtblkno;
blkcnt = args->rmtblkcnt;
- valuelen = args->valuelen;
+ valuelen = args->rmtvaluelen;
while (valuelen > 0) {
struct xfs_buf *bp;
xfs_daddr_t dblkno;
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 5b6092ef51ef..f0efc7e970ef 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -5413,6 +5413,7 @@ xfs_bmap_shift_extents(
int whichfork = XFS_DATA_FORK;
int logflags;
xfs_filblks_t blockcount = 0;
+ int total_extents;
if (unlikely(XFS_TEST_ERROR(
(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
@@ -5429,7 +5430,6 @@ xfs_bmap_shift_extents(
ASSERT(current_ext != NULL);
ifp = XFS_IFORK_PTR(ip, whichfork);
-
if (!(ifp->if_flags & XFS_IFEXTENTS)) {
/* Read in all the extents */
error = xfs_iread_extents(tp, ip, whichfork);
@@ -5456,7 +5456,6 @@ xfs_bmap_shift_extents(
/* We are going to change core inode */
logflags = XFS_ILOG_CORE;
-
if (ifp->if_flags & XFS_IFBROOT) {
cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
cur->bc_private.b.firstblock = *firstblock;
@@ -5467,8 +5466,14 @@ xfs_bmap_shift_extents(
logflags |= XFS_ILOG_DEXT;
}
- while (nexts++ < num_exts &&
- *current_ext < XFS_IFORK_NEXTENTS(ip, whichfork)) {
+ /*
+ * There may be delalloc extents in the data fork before the range we
+ * are collapsing out, so we cannot
+ * use the count of real extents here. Instead we have to calculate it
+ * from the incore fork.
+ */
+ total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
+ while (nexts++ < num_exts && *current_ext < total_extents) {
gotp = xfs_iext_get_ext(ifp, *current_ext);
xfs_bmbt_get_all(gotp, &got);
@@ -5556,10 +5561,11 @@ xfs_bmap_shift_extents(
}
(*current_ext)++;
+ total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
}
/* Check if we are done */
- if (*current_ext == XFS_IFORK_NEXTENTS(ip, whichfork))
+ if (*current_ext == total_extents)
*done = 1;
del_cursor:
@@ -5568,6 +5574,5 @@ del_cursor:
error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
xfs_trans_log_inode(tp, ip, logflags);
-
return error;
}
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 01f6a646caa1..296160b8e78c 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1418,6 +1418,8 @@ xfs_zero_file_space(
xfs_off_t end_boundary;
int error;
+ trace_xfs_zero_file_space(ip);
+
granularity = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
/*
@@ -1432,9 +1434,18 @@ xfs_zero_file_space(
ASSERT(end_boundary <= offset + len);
if (start_boundary < end_boundary - 1) {
- /* punch out the page cache over the conversion range */
+ /*
+ * punch out delayed allocation blocks and the page cache over
+ * the conversion range
+ */
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ error = xfs_bmap_punch_delalloc_range(ip,
+ XFS_B_TO_FSBT(mp, start_boundary),
+ XFS_B_TO_FSB(mp, end_boundary - start_boundary));
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
truncate_pagecache_range(VFS_I(ip), start_boundary,
end_boundary - 1);
+
/* convert the blocks */
error = xfs_alloc_file_space(ip, start_boundary,
end_boundary - start_boundary - 1,
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 107f2fdfe41f..cb10a0aaab3a 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1372,21 +1372,29 @@ xfs_buf_iorequest(
xfs_buf_wait_unpin(bp);
xfs_buf_hold(bp);
- /* Set the count to 1 initially, this will stop an I/O
+ /*
+ * Set the count to 1 initially, this will stop an I/O
* completion callout which happens before we have started
* all the I/O from calling xfs_buf_ioend too early.
*/
atomic_set(&bp->b_io_remaining, 1);
_xfs_buf_ioapply(bp);
- _xfs_buf_ioend(bp, 1);
+ /*
+ * If _xfs_buf_ioapply failed, we'll get back here with
+ * only the reference we took above. _xfs_buf_ioend will
+ * drop it to zero, so we'd better not queue it for later,
+ * or we'll free it before it's done.
+ */
+ _xfs_buf_ioend(bp, bp->b_error ? 0 : 1);
xfs_buf_rele(bp);
}
/*
* Waits for I/O to complete on the buffer supplied. It returns immediately if
- * no I/O is pending or there is already a pending error on the buffer. It
- * returns the I/O error code, if any, or 0 if there was no error.
+ * no I/O is pending or there is already a pending error on the buffer, in which
+ * case nothing will ever complete. It returns the I/O error code, if any, or
+ * 0 if there was no error.
*/
int
xfs_buf_iowait(
diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h
index 6e95ea79f5d7..201c6091d26a 100644
--- a/fs/xfs/xfs_da_btree.h
+++ b/fs/xfs/xfs_da_btree.h
@@ -60,10 +60,12 @@ typedef struct xfs_da_args {
int index; /* index of attr of interest in blk */
xfs_dablk_t rmtblkno; /* remote attr value starting blkno */
int rmtblkcnt; /* remote attr value block count */
+ int rmtvaluelen; /* remote attr value length in bytes */
xfs_dablk_t blkno2; /* blkno of 2nd attr leaf of interest */
int index2; /* index of 2nd attr in blk */
xfs_dablk_t rmtblkno2; /* remote attr value starting blkno */
int rmtblkcnt2; /* remote attr value block count */
+ int rmtvaluelen2; /* remote attr value length in bytes */
int op_flags; /* operation flags */
enum xfs_dacmp cmpresult; /* name compare result for lookups */
} xfs_da_args_t;
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index 1399e187d425..753e467aa1a5 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -237,7 +237,7 @@ xfs_fs_nfs_commit_metadata(
if (!lsn)
return 0;
- return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
+ return -_xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
}
const struct export_operations xfs_export_operations = {
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 79e96ce98733..830c1c937b88 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -155,7 +155,7 @@ xfs_dir_fsync(
if (!lsn)
return 0;
- return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
+ return -_xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
}
STATIC int
@@ -295,7 +295,7 @@ xfs_file_aio_read(
xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
if (inode->i_mapping->nrpages) {
- ret = -filemap_write_and_wait_range(
+ ret = filemap_write_and_wait_range(
VFS_I(ip)->i_mapping,
pos, -1);
if (ret) {
@@ -679,7 +679,7 @@ xfs_file_dio_aio_write(
goto out;
if (mapping->nrpages) {
- ret = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
+ ret = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
pos, -1);
if (ret)
goto out;
@@ -837,11 +837,19 @@ xfs_file_fallocate(
unsigned blksize_mask = (1 << inode->i_blkbits) - 1;
if (offset & blksize_mask || len & blksize_mask) {
- error = -EINVAL;
+ error = EINVAL;
+ goto out_unlock;
+ }
+
+ /*
+ * There is no need to overlap collapse range with EOF,
+ * in which case it is effectively a truncate operation
+ */
+ if (offset + len >= i_size_read(inode)) {
+ error = EINVAL;
goto out_unlock;
}
- ASSERT(offset + len < i_size_read(inode));
new_size = i_size_read(inode) - len;
error = xfs_collapse_file_space(ip, offset, len);
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 5e7a38fa6ee6..768087bedbac 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1334,7 +1334,8 @@ int
xfs_create_tmpfile(
struct xfs_inode *dp,
struct dentry *dentry,
- umode_t mode)
+ umode_t mode,
+ struct xfs_inode **ipp)
{
struct xfs_mount *mp = dp->i_mount;
struct xfs_inode *ip = NULL;
@@ -1402,7 +1403,6 @@ xfs_create_tmpfile(
xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
ip->i_d.di_nlink--;
- d_tmpfile(dentry, VFS_I(ip));
error = xfs_iunlink(tp, ip);
if (error)
goto out_trans_abort;
@@ -1415,6 +1415,7 @@ xfs_create_tmpfile(
xfs_qm_dqrele(gdqp);
xfs_qm_dqrele(pdqp);
+ *ipp = ip;
return 0;
out_trans_abort:
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 396cc1fafd0d..f2fcde52b66d 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -334,7 +334,7 @@ int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name,
int xfs_create(struct xfs_inode *dp, struct xfs_name *name,
umode_t mode, xfs_dev_t rdev, struct xfs_inode **ipp);
int xfs_create_tmpfile(struct xfs_inode *dp, struct dentry *dentry,
- umode_t mode);
+ umode_t mode, struct xfs_inode **ipp);
int xfs_remove(struct xfs_inode *dp, struct xfs_name *name,
struct xfs_inode *ip);
int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip,
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 89b07e43ca28..36d630319a27 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -72,8 +72,8 @@ xfs_initxattrs(
int error = 0;
for (xattr = xattr_array; xattr->name != NULL; xattr++) {
- error = xfs_attr_set(ip, xattr->name, xattr->value,
- xattr->value_len, ATTR_SECURE);
+ error = -xfs_attr_set(ip, xattr->name, xattr->value,
+ xattr->value_len, ATTR_SECURE);
if (error < 0)
break;
}
@@ -93,8 +93,8 @@ xfs_init_security(
struct inode *dir,
const struct qstr *qstr)
{
- return security_inode_init_security(inode, dir, qstr,
- &xfs_initxattrs, NULL);
+ return -security_inode_init_security(inode, dir, qstr,
+ &xfs_initxattrs, NULL);
}
static void
@@ -124,15 +124,15 @@ xfs_cleanup_inode(
xfs_dentry_to_name(&teardown, dentry, 0);
xfs_remove(XFS_I(dir), &teardown, XFS_I(inode));
- iput(inode);
}
STATIC int
-xfs_vn_mknod(
+xfs_generic_create(
struct inode *dir,
struct dentry *dentry,
umode_t mode,
- dev_t rdev)
+ dev_t rdev,
+ bool tmpfile) /* unnamed file */
{
struct inode *inode;
struct xfs_inode *ip = NULL;
@@ -156,8 +156,12 @@ xfs_vn_mknod(
if (error)
return error;
- xfs_dentry_to_name(&name, dentry, mode);
- error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip);
+ if (!tmpfile) {
+ xfs_dentry_to_name(&name, dentry, mode);
+ error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip);
+ } else {
+ error = xfs_create_tmpfile(XFS_I(dir), dentry, mode, &ip);
+ }
if (unlikely(error))
goto out_free_acl;
@@ -169,18 +173,22 @@ xfs_vn_mknod(
#ifdef CONFIG_XFS_POSIX_ACL
if (default_acl) {
- error = xfs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
+ error = -xfs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
if (error)
goto out_cleanup_inode;
}
if (acl) {
- error = xfs_set_acl(inode, acl, ACL_TYPE_ACCESS);
+ error = -xfs_set_acl(inode, acl, ACL_TYPE_ACCESS);
if (error)
goto out_cleanup_inode;
}
#endif
- d_instantiate(dentry, inode);
+ if (tmpfile)
+ d_tmpfile(dentry, inode);
+ else
+ d_instantiate(dentry, inode);
+
out_free_acl:
if (default_acl)
posix_acl_release(default_acl);
@@ -189,11 +197,23 @@ xfs_vn_mknod(
return -error;
out_cleanup_inode:
- xfs_cleanup_inode(dir, inode, dentry);
+ if (!tmpfile)
+ xfs_cleanup_inode(dir, inode, dentry);
+ iput(inode);
goto out_free_acl;
}
STATIC int
+xfs_vn_mknod(
+ struct inode *dir,
+ struct dentry *dentry,
+ umode_t mode,
+ dev_t rdev)
+{
+ return xfs_generic_create(dir, dentry, mode, rdev, false);
+}
+
+STATIC int
xfs_vn_create(
struct inode *dir,
struct dentry *dentry,
@@ -353,6 +373,7 @@ xfs_vn_symlink(
out_cleanup_inode:
xfs_cleanup_inode(dir, inode, dentry);
+ iput(inode);
out:
return -error;
}
@@ -1053,11 +1074,7 @@ xfs_vn_tmpfile(
struct dentry *dentry,
umode_t mode)
{
- int error;
-
- error = xfs_create_tmpfile(XFS_I(dir), dentry, mode);
-
- return -error;
+ return xfs_generic_create(dir, dentry, mode, 0, true);
}
static const struct inode_operations xfs_inode_operations = {
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 8497a00e399d..a5f8bd9899d3 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -616,11 +616,13 @@ xfs_log_mount(
int error = 0;
int min_logfsbs;
- if (!(mp->m_flags & XFS_MOUNT_NORECOVERY))
- xfs_notice(mp, "Mounting Filesystem");
- else {
+ if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) {
+ xfs_notice(mp, "Mounting V%d Filesystem",
+ XFS_SB_VERSION_NUM(&mp->m_sb));
+ } else {
xfs_notice(mp,
-"Mounting filesystem in no-recovery mode. Filesystem will be inconsistent.");
+"Mounting V%d filesystem in no-recovery mode. Filesystem will be inconsistent.",
+ XFS_SB_VERSION_NUM(&mp->m_sb));
ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
}
@@ -1181,11 +1183,14 @@ xlog_iodone(xfs_buf_t *bp)
/* log I/O is always issued ASYNC */
ASSERT(XFS_BUF_ISASYNC(bp));
xlog_state_done_syncing(iclog, aborted);
+
/*
- * do not reference the buffer (bp) here as we could race
- * with it being freed after writing the unmount record to the
- * log.
+ * drop the buffer lock now that we are done. Nothing references
+ * the buffer after this, so an unmount waiting on this lock can now
+ * tear it down safely. As such, it is unsafe to reference the buffer
+ * (bp) after the unlock as we could race with it being freed.
*/
+ xfs_buf_unlock(bp);
}
/*
@@ -1368,8 +1373,16 @@ xlog_alloc_log(
bp = xfs_buf_alloc(mp->m_logdev_targp, 0, BTOBB(log->l_iclog_size), 0);
if (!bp)
goto out_free_log;
- bp->b_iodone = xlog_iodone;
+
+ /*
+ * The iclogbuf buffer locks are held over IO but we are not going to do
+ * IO yet. Hence unlock the buffer so that the log IO path can grab it
+ * when appropriately.
+ */
ASSERT(xfs_buf_islocked(bp));
+ xfs_buf_unlock(bp);
+
+ bp->b_iodone = xlog_iodone;
log->l_xbuf = bp;
spin_lock_init(&log->l_icloglock);
@@ -1398,6 +1411,9 @@ xlog_alloc_log(
if (!bp)
goto out_free_iclog;
+ ASSERT(xfs_buf_islocked(bp));
+ xfs_buf_unlock(bp);
+
bp->b_iodone = xlog_iodone;
iclog->ic_bp = bp;
iclog->ic_data = bp->b_addr;
@@ -1422,7 +1438,6 @@ xlog_alloc_log(
iclog->ic_callback_tail = &(iclog->ic_callback);
iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize;
- ASSERT(xfs_buf_islocked(iclog->ic_bp));
init_waitqueue_head(&iclog->ic_force_wait);
init_waitqueue_head(&iclog->ic_write_wait);
@@ -1631,6 +1646,12 @@ xlog_cksum(
* we transition the iclogs to IOERROR state *after* flushing all existing
* iclogs to disk. This is because we don't want anymore new transactions to be
* started or completed afterwards.
+ *
+ * We lock the iclogbufs here so that we can serialise against IO completion
+ * during unmount. We might be processing a shutdown triggered during unmount,
+ * and that can occur asynchronously to the unmount thread, and hence we need to
+ * ensure that completes before tearing down the iclogbufs. Hence we need to
+ * hold the buffer lock across the log IO to acheive that.
*/
STATIC int
xlog_bdstrat(
@@ -1638,6 +1659,7 @@ xlog_bdstrat(
{
struct xlog_in_core *iclog = bp->b_fspriv;
+ xfs_buf_lock(bp);
if (iclog->ic_state & XLOG_STATE_IOERROR) {
xfs_buf_ioerror(bp, EIO);
xfs_buf_stale(bp);
@@ -1645,7 +1667,8 @@ xlog_bdstrat(
/*
* It would seem logical to return EIO here, but we rely on
* the log state machine to propagate I/O errors instead of
- * doing it here.
+ * doing it here. Similarly, IO completion will unlock the
+ * buffer, so we don't do it here.
*/
return 0;
}
@@ -1847,14 +1870,28 @@ xlog_dealloc_log(
xlog_cil_destroy(log);
/*
- * always need to ensure that the extra buffer does not point to memory
- * owned by another log buffer before we free it.
+ * Cycle all the iclogbuf locks to make sure all log IO completion
+ * is done before we tear down these buffers.
*/
+ iclog = log->l_iclog;
+ for (i = 0; i < log->l_iclog_bufs; i++) {
+ xfs_buf_lock(iclog->ic_bp);
+ xfs_buf_unlock(iclog->ic_bp);
+ iclog = iclog->ic_next;
+ }
+
+ /*
+ * Always need to ensure that the extra buffer does not point to memory
+ * owned by another log buffer before we free it. Also, cycle the lock
+ * first to ensure we've completed IO on it.
+ */
+ xfs_buf_lock(log->l_xbuf);
+ xfs_buf_unlock(log->l_xbuf);
xfs_buf_set_empty(log->l_xbuf, BTOBB(log->l_iclog_size));
xfs_buf_free(log->l_xbuf);
iclog = log->l_iclog;
- for (i=0; i<log->l_iclog_bufs; i++) {
+ for (i = 0; i < log->l_iclog_bufs; i++) {
xfs_buf_free(iclog->ic_bp);
next_iclog = iclog->ic_next;
kmem_free(iclog);
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 993cb19e7d39..944f3d9456a8 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -743,8 +743,6 @@ xfs_mountfs(
new_size *= mp->m_sb.sb_inodesize / XFS_DINODE_MIN_SIZE;
if (mp->m_sb.sb_inoalignmt >= XFS_B_TO_FSBT(mp, new_size))
mp->m_inode_cluster_size = new_size;
- xfs_info(mp, "Using inode cluster size of %d bytes",
- mp->m_inode_cluster_size);
}
/*
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 348e4d2ed6e6..dc977b6e6a36 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -843,22 +843,17 @@ xfs_qm_init_quotainfo(
qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP);
- if ((error = list_lru_init(&qinf->qi_lru))) {
- kmem_free(qinf);
- mp->m_quotainfo = NULL;
- return error;
- }
+ error = -list_lru_init(&qinf->qi_lru);
+ if (error)
+ goto out_free_qinf;
/*
* See if quotainodes are setup, and if not, allocate them,
* and change the superblock accordingly.
*/
- if ((error = xfs_qm_init_quotainos(mp))) {
- list_lru_destroy(&qinf->qi_lru);
- kmem_free(qinf);
- mp->m_quotainfo = NULL;
- return error;
- }
+ error = xfs_qm_init_quotainos(mp);
+ if (error)
+ goto out_free_lru;
INIT_RADIX_TREE(&qinf->qi_uquota_tree, GFP_NOFS);
INIT_RADIX_TREE(&qinf->qi_gquota_tree, GFP_NOFS);
@@ -918,7 +913,7 @@ xfs_qm_init_quotainfo(
qinf->qi_isoftlimit = be64_to_cpu(ddqp->d_ino_softlimit);
qinf->qi_rtbhardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit);
qinf->qi_rtbsoftlimit = be64_to_cpu(ddqp->d_rtb_softlimit);
-
+
xfs_qm_dqdestroy(dqp);
} else {
qinf->qi_btimelimit = XFS_QM_BTIMELIMIT;
@@ -935,6 +930,13 @@ xfs_qm_init_quotainfo(
qinf->qi_shrinker.flags = SHRINKER_NUMA_AWARE;
register_shrinker(&qinf->qi_shrinker);
return 0;
+
+out_free_lru:
+ list_lru_destroy(&qinf->qi_lru);
+out_free_qinf:
+ kmem_free(qinf);
+ mp->m_quotainfo = NULL;
+ return error;
}
diff --git a/fs/xfs/xfs_sb.c b/fs/xfs/xfs_sb.c
index 0c0e41bbe4e3..8baf61afae1d 100644
--- a/fs/xfs/xfs_sb.c
+++ b/fs/xfs/xfs_sb.c
@@ -201,10 +201,6 @@ xfs_mount_validate_sb(
* write validation, we don't need to check feature masks.
*/
if (check_version && XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) {
- xfs_alert(mp,
-"Version 5 superblock detected. This kernel has EXPERIMENTAL support enabled!\n"
-"Use of these features in this kernel is at your own risk!");
-
if (xfs_sb_has_compat_feature(sbp,
XFS_SB_FEAT_COMPAT_UNKNOWN)) {
xfs_warn(mp,
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 205376776377..3494eff8e4eb 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1433,11 +1433,11 @@ xfs_fs_fill_super(
if (error)
goto out_free_fsname;
- error = xfs_init_mount_workqueues(mp);
+ error = -xfs_init_mount_workqueues(mp);
if (error)
goto out_close_devices;
- error = xfs_icsb_init_counters(mp);
+ error = -xfs_icsb_init_counters(mp);
if (error)
goto out_destroy_workqueues;
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index a4ae41c179a8..65d8c793a25c 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -603,6 +603,7 @@ DEFINE_INODE_EVENT(xfs_readlink);
DEFINE_INODE_EVENT(xfs_inactive_symlink);
DEFINE_INODE_EVENT(xfs_alloc_file_space);
DEFINE_INODE_EVENT(xfs_free_file_space);
+DEFINE_INODE_EVENT(xfs_zero_file_space);
DEFINE_INODE_EVENT(xfs_collapse_file_space);
DEFINE_INODE_EVENT(xfs_readdir);
#ifdef CONFIG_XFS_POSIX_ACL