From 06058632464845abb1af91521122fd04dd3daaec Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Sat, 13 Apr 2019 09:26:03 -0600
Subject: io_uring: park SQPOLL thread if it's percpu

kthread expects this, or we can throw a warning on exit:

WARNING: CPU: 0 PID: 7822 at kernel/kthread.c:399 __kthread_bind_mask+0x3b/0xc0 kernel/kthread.c:399
Kernel panic - not syncing: panic_on_warn set ...
CPU: 0 PID: 7822 Comm: syz-executor030 Not tainted 5.1.0-rc4-next-20190412
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
 __dump_stack lib/dump_stack.c:77 [inline]
 dump_stack+0x172/0x1f0 lib/dump_stack.c:113
 panic+0x2cb/0x72b kernel/panic.c:214
 __warn.cold+0x20/0x46 kernel/panic.c:576
 report_bug+0x263/0x2b0 lib/bug.c:186
 fixup_bug arch/x86/kernel/traps.c:179 [inline]
 fixup_bug arch/x86/kernel/traps.c:174 [inline]
 do_error_trap+0x11b/0x200 arch/x86/kernel/traps.c:272
 do_invalid_op+0x37/0x50 arch/x86/kernel/traps.c:291
 invalid_op+0x14/0x20 arch/x86/entry/entry_64.S:973
RIP: 0010:__kthread_bind_mask+0x3b/0xc0 kernel/kthread.c:399
Code: 48 89 fb e8 f7 ab 24 00 4c 89 e6 48 89 df e8 ac e1 02 00 31 ff 49 89 c4 48 89 c6 e8 7f ad 24 00 4d 85 e4 75 15 e8 d5 ab 24 00 <0f> 0b e8 ce ab 24 00 5b 41 5c 41 5d 41 5e 5d c3 e8 c0 ab 24 00 4c
RSP: 0018:ffff8880a89bfbb8 EFLAGS: 00010293
RAX: ffff88808ca7a280 RBX: ffff8880a98e4380 RCX: ffffffff814bdd11
RDX: 0000000000000000 RSI: ffffffff814bdd1b RDI: 0000000000000007
RBP: ffff8880a89bfbd8 R08: ffff88808ca7a280 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000
R13: ffffffff87691148 R14: ffff8880a98e43a0 R15: ffffffff81c91e10
 __kthread_bind kernel/kthread.c:412 [inline]
 kthread_unpark+0x123/0x160 kernel/kthread.c:480
 kthread_stop+0xfa/0x6c0 kernel/kthread.c:556
 io_sq_thread_stop fs/io_uring.c:2057 [inline]
 io_sq_thread_stop fs/io_uring.c:2052 [inline]
 io_finish_async+0xab/0x180 fs/io_uring.c:2064
 io_ring_ctx_free fs/io_uring.c:2534 [inline]
 io_ring_ctx_wait_and_kill+0x133/0x510 fs/io_uring.c:2591
 io_uring_release+0x42/0x50 fs/io_uring.c:2599
 __fput+0x2e5/0x8d0 fs/file_table.c:278
 ____fput+0x16/0x20 fs/file_table.c:309
 task_work_run+0x14a/0x1c0 kernel/task_work.c:113
 exit_task_work include/linux/task_work.h:22 [inline]
 do_exit+0x90a/0x2fa0 kernel/exit.c:876
 do_group_exit+0x135/0x370 kernel/exit.c:980
 __do_sys_exit_group kernel/exit.c:991 [inline]
 __se_sys_exit_group kernel/exit.c:989 [inline]
 __x64_sys_exit_group+0x44/0x50 kernel/exit.c:989
 do_syscall_64+0x103/0x610 arch/x86/entry/common.c:290
 entry_SYSCALL_64_after_hwframe+0x49/0xbe

Reported-by: syzbot+6d4a92619eb0ad08602b@syzkaller.appspotmail.com
Fixes: 6c271ce2f1d5 ("io_uring: add submission polling")
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 89aa8412b5f5..e5008c1b82be 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1920,6 +1920,10 @@ static int io_sq_thread(void *data)
 		unuse_mm(cur_mm);
 		mmput(cur_mm);
 	}
+
+	if (kthread_should_park())
+		kthread_parkme();
+
 	return 0;
 }
 
@@ -2054,6 +2058,7 @@ static void io_sq_thread_stop(struct io_ring_ctx *ctx)
 	if (ctx->sqo_thread) {
 		ctx->sqo_stop = 1;
 		mb();
+		kthread_park(ctx->sqo_thread);
 		kthread_stop(ctx->sqo_thread);
 		ctx->sqo_thread = NULL;
 	}
--
cgit v1.2.3
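The fix above follows the generic percpu-kthread teardown protocol rather than anything io_uring specific. A rough sketch (hypothetical worker, not code from the patch) of what threads created with kthread_create_on_cpu() are expected to do, and how their owner should tear them down:

#include <linux/kthread.h>
#include <linux/sched.h>

/* hypothetical worker body; io_sq_thread() now follows the same pattern */
static int my_percpu_worker(void *data)
{
	while (!kthread_should_stop()) {
		/* ... do the actual work ... */

		if (kthread_should_park())
			kthread_parkme();	/* sleep in TASK_PARKED until unparked */
	}
	return 0;
}

/* hypothetical teardown: park first so the task is verifiably inactive,
 * then stop it - mirrors the io_sq_thread_stop() change above */
static void my_worker_stop(struct task_struct *tsk)
{
	kthread_park(tsk);
	kthread_stop(tsk);
}

Without the park step, kthread_stop() still calls kthread_unpark(), and for a CPU-bound kthread the rebind in __kthread_bind_mask() finds the task not parked, which is exactly the warning in the trace above.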
From 917257daa0fea7a007102691c0e27d9216a96768 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Sat, 13 Apr 2019 09:28:55 -0600
Subject: io_uring: only test SQPOLL cpu after we've verified it

We currently call cpu_possible() even if we don't use the CPU. Move the
test under the SQ_AFF branch, which is the only place where we'll use
the value. Do the cpu_possible() test AFTER we've limited it to a max
of NR_CPUS.

This avoids triggering the following warning:

WARNING: CPU: 1 PID: 7600 at include/linux/cpumask.h:121 cpu_max_bits_warn

if CONFIG_DEBUG_PER_CPU_MAPS is enabled.

While in there, also move the SQ thread idle period assignment inside
SETUP_SQPOLL, as we don't use it otherwise either.

Reported-by: syzbot+cd714a07c6de2bc34293@syzkaller.appspotmail.com
Fixes: 6c271ce2f1d5 ("io_uring: add submission polling")
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index e5008c1b82be..24355e0c47f0 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2241,10 +2241,6 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx,
 	mmgrab(current->mm);
 	ctx->sqo_mm = current->mm;
 
-	ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
-	if (!ctx->sq_thread_idle)
-		ctx->sq_thread_idle = HZ;
-
 	ret = -EINVAL;
 	if (!cpu_possible(p->sq_thread_cpu))
 		goto err;
@@ -2254,10 +2250,18 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx,
 		if (!capable(CAP_SYS_ADMIN))
 			goto err;
 
+		ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
+		if (!ctx->sq_thread_idle)
+			ctx->sq_thread_idle = HZ;
+
 		if (p->flags & IORING_SETUP_SQ_AFF) {
 			int cpu;
 
 			cpu = array_index_nospec(p->sq_thread_cpu, NR_CPUS);
+			ret = -EINVAL;
+			if (!cpu_possible(p->sq_thread_cpu))
+				goto err;
+
 			ctx->sqo_thread = kthread_create_on_cpu(io_sq_thread,
 							ctx, cpu,
 							"io_uring-sq");
--
cgit v1.2.3
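For illustration only (this is not the syzbot reproducer), a hypothetical way to reach that cpumask warning on a pre-fix kernel built with CONFIG_DEBUG_PER_CPU_MAPS is simply to ask for SQPOLL affinity to a CPU index that can't exist:

#include <linux/io_uring.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef __NR_io_uring_setup
#define __NR_io_uring_setup 425		/* x86_64 */
#endif

int main(void)
{
	struct io_uring_params p;

	memset(&p, 0, sizeof(p));
	p.flags = IORING_SETUP_SQPOLL | IORING_SETUP_SQ_AFF;
	p.sq_thread_cpu = 0x10000;	/* far beyond any real NR_CPUS */

	/* needs CAP_SYS_ADMIN for SQPOLL; expected to fail with -EINVAL,
	 * the interesting part is only which check fires first */
	return syscall(__NR_io_uring_setup, 4, &p) < 0 ? 0 : 1;
}

After the patch, the value is first clamped with array_index_nospec() and the setup path only looks at the CPU at all when IORING_SETUP_SQ_AFF was requested.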
From 77f1e0a52d26242b6c2dba019f6ebebfb9ff701e Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Fri, 18 Jan 2019 10:34:16 -0700
Subject: bfq: update internal depth state when queue depth changes

A previous commit moved the shallow depth and BFQ depth map calculations
to be done at init time, moving them outside of the hotter IO path. This
potentially causes hangs if the user changes the depth of the scheduler
map, by writing to the 'nr_requests' sysfs file for that device.

Add a blk-mq-sched hook that allows blk-mq to inform the scheduler if
the depth changes, so that the scheduler can update its internal state.

Tested-by: Kai Krakow
Reported-by: Paolo Valente
Fixes: f0635b8a416e ("bfq: calculate shallow depths at init time")
Signed-off-by: Jens Axboe
---
 block/bfq-iosched.c      | 8 +++++++-
 block/blk-mq.c           | 2 ++
 include/linux/elevator.h | 1 +
 3 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index dfb8cb0af13a..5ba1e0d841b4 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -5396,7 +5396,7 @@ static unsigned int bfq_update_depths(struct bfq_data *bfqd,
 	return min_shallow;
 }
 
-static int bfq_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int index)
+static void bfq_depth_updated(struct blk_mq_hw_ctx *hctx)
 {
 	struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
 	struct blk_mq_tags *tags = hctx->sched_tags;
@@ -5404,6 +5404,11 @@ static int bfq_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int index)
 
 	min_shallow = bfq_update_depths(bfqd, &tags->bitmap_tags);
 	sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, min_shallow);
+}
+
+static int bfq_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int index)
+{
+	bfq_depth_updated(hctx);
 	return 0;
 }
 
@@ -5826,6 +5831,7 @@ static struct elevator_type iosched_bfq_mq = {
 		.requests_merged	= bfq_requests_merged,
 		.request_merged		= bfq_request_merged,
 		.has_work		= bfq_has_work,
+		.depth_updated		= bfq_depth_updated,
 		.init_hctx		= bfq_init_hctx,
 		.init_sched		= bfq_init_queue,
 		.exit_sched		= bfq_exit_queue,
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 9516304a38ee..fc60ed7e940e 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -3135,6 +3135,8 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
 		}
 		if (ret)
 			break;
+		if (q->elevator && q->elevator->type->ops.depth_updated)
+			q->elevator->type->ops.depth_updated(hctx);
 	}
 
 	if (!ret)
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 2e9e2763bf47..6e8bc53740f0 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -31,6 +31,7 @@ struct elevator_mq_ops {
 	void (*exit_sched)(struct elevator_queue *);
 	int (*init_hctx)(struct blk_mq_hw_ctx *, unsigned int);
 	void (*exit_hctx)(struct blk_mq_hw_ctx *, unsigned int);
+	void (*depth_updated)(struct blk_mq_hw_ctx *);
 	bool (*allow_merge)(struct request_queue *, struct request *, struct bio *);
 	bool (*bio_merge)(struct blk_mq_hw_ctx *, struct bio *);
--
cgit v1.2.3
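The new ->depth_updated() hook is not BFQ specific. As a rough sketch only (an imaginary "foo" scheduler with invented names, assumed to live under block/ like the in-tree schedulers), any blk-mq scheduler that caches limits derived from the sched_tags depth could refresh them the same way:

#include <linux/blkdev.h>
#include <linux/elevator.h>
#include <linux/sbitmap.h>

#include "blk-mq.h"
#include "blk-mq-tag.h"

/* hypothetical per-queue data for the imaginary "foo" scheduler */
struct foo_data {
	unsigned int async_depth;
};

static void foo_depth_updated(struct blk_mq_hw_ctx *hctx)
{
	struct foo_data *fd = hctx->queue->elevator->elevator_data;
	struct blk_mq_tags *tags = hctx->sched_tags;

	/* recompute anything that was derived from the old queue depth */
	fd->async_depth = max(1U, tags->bitmap_tags.sb.depth / 4);
	sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, fd->async_depth);
}

static struct elevator_type foo_sched = {
	.ops = {
		.depth_updated	= foo_depth_updated,
		/* .init_sched, .init_hctx, ... omitted in this sketch */
	},
	.elevator_name = "foo",
};

Writing a new value to /sys/block/<dev>/queue/nr_requests ends up in blk_mq_update_nr_requests(), which now invokes the hook once per hardware queue after resizing the tags.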
From 3d6770fbd9353988839611bab107e4e891506aad Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Sat, 13 Apr 2019 11:50:54 -0600
Subject: io_uring: drop io_file_put() 'file' argument

Since the fget/fput handling was reworked in commit 09bb839434bd, we
never call io_file_put() with state == NULL (and hence file != NULL)
anymore. Remove that case.

Reported-by: Al Viro
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 24355e0c47f0..f4ddb9d23241 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -682,11 +682,9 @@ static void io_iopoll_req_issued(struct io_kiocb *req)
 		list_add_tail(&req->list, &ctx->poll_list);
 }
 
-static void io_file_put(struct io_submit_state *state, struct file *file)
+static void io_file_put(struct io_submit_state *state)
 {
-	if (!state) {
-		fput(file);
-	} else if (state->file) {
+	if (state->file) {
 		int diff = state->has_refs - state->used_refs;
 
 		if (diff)
@@ -711,7 +709,7 @@ static struct file *io_file_get(struct io_submit_state *state, int fd)
 			state->ios_left--;
 			return state->file;
 		}
-		io_file_put(state, NULL);
+		io_file_put(state);
 	}
 	state->file = fget_many(fd, state->ios_left);
 	if (!state->file)
@@ -1671,7 +1669,7 @@ out:
 static void io_submit_state_end(struct io_submit_state *state)
 {
 	blk_finish_plug(&state->plug);
-	io_file_put(state, NULL);
+	io_file_put(state);
 	if (state->free_reqs)
 		kmem_cache_free_bulk(req_cachep, state->free_reqs,
 					&state->reqs[state->cur_req]);
--
cgit v1.2.3

From b19062a567266ee1f10f6709325f766bbcc07d1c Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Mon, 15 Apr 2019 10:49:38 -0600
Subject: io_uring: fix possible deadlock between io_uring_{enter,register}

If we have multiple threads, one doing io_uring_enter() while the other
is doing io_uring_register(), we can run into a deadlock between the
two. io_uring_register() must wait for existing users of the io_uring
instance to exit. But it does so while holding the io_uring mutex.
Callers of io_uring_enter() may need this mutex to make progress (and
eventually exit).

If we wait for users to exit in io_uring_register(), we can't do so with
the io_uring mutex held without potentially risking a deadlock. Drop the
io_uring mutex while waiting for existing callers to exit. This is safe
and guaranteed to make forward progress, since we already killed the
percpu ref before doing so. Hence later callers of io_uring_enter() will
be rejected.

Reported-by: syzbot+16dc03452dee970a0c3e@syzkaller.appspotmail.com
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index f4ddb9d23241..b35300e4c9a7 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2929,11 +2929,23 @@ SYSCALL_DEFINE2(io_uring_setup, u32, entries,
 
 static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 			       void __user *arg, unsigned nr_args)
+	__releases(ctx->uring_lock)
+	__acquires(ctx->uring_lock)
 {
 	int ret;
 
 	percpu_ref_kill(&ctx->refs);
+
+	/*
+	 * Drop uring mutex before waiting for references to exit. If another
+	 * thread is currently inside io_uring_enter() it might need to grab
+	 * the uring_lock to make progress. If we hold it here across the drain
+	 * wait, then we can deadlock. It's safe to drop the mutex here, since
+	 * no new references will come in after we've killed the percpu ref.
+	 */
+	mutex_unlock(&ctx->uring_lock);
 	wait_for_completion(&ctx->ctx_done);
+	mutex_lock(&ctx->uring_lock);
 
 	switch (opcode) {
 	case IORING_REGISTER_BUFFERS:
--
cgit v1.2.3

From 74f464e97044da33b25aaed00213914b0edf1f2e Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Wed, 17 Apr 2019 08:57:48 -0600
Subject: io_uring: fix CQ overflow condition
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This is a leftover from when the rings initially were not free flowing,
and hence a test for tail + 1 == head would indicate full. Since we now
let them wrap instead of mask them with the size, we need to check if
they drift more than the ring size from each other.

This fixes a case where we'd overwrite CQ ring entries, if the user
failed to reap completions. Both cases would ultimately result in lost
completions as the application violated the depth it asked for. The only
difference is that before this fix we'd return invalid entries for the
overflowed completions, instead of properly flagging it in the
cq_ring->overflow variable.

Reported-by: Stefan Bühler
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index b35300e4c9a7..f65f85d89217 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -338,7 +338,7 @@ static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
 	tail = ctx->cached_cq_tail;
 	/* See comment at the top of the file */
 	smp_rmb();
-	if (tail + 1 == READ_ONCE(ring->r.head))
+	if (tail - READ_ONCE(ring->r.head) == ring->ring_entries)
 		return NULL;
 
 	ctx->cached_cq_tail++;
--
cgit v1.2.3
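To see why the new full test is needed once head and tail are free-running 32-bit indices, here is a small standalone check with made-up values (illustrative only, not part of the patch):

#include <stdio.h>

int main(void)
{
	unsigned int ring_entries = 8;		/* CQ ring size */
	unsigned int head = 0xfffffffcu;	/* consumer index, about to wrap */
	unsigned int tail = head + ring_entries;	/* producer filled the ring */

	/* old test: only catches the single state where tail sits exactly one
	 * behind head, so a completely full ring goes undetected here */
	printf("old check says full: %d\n", tail + 1 == head);

	/* new test: unsigned subtraction absorbs the wrap and reports full
	 * exactly when 'ring_entries' completions are waiting to be reaped */
	printf("new check says full: %d\n", tail - head == ring_entries);
	return 0;
}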
From b40fabc05ea047f6af5933d26a5483873340b0d4 Mon Sep 17 00:00:00 2001
From: Hou Tao
Date: Fri, 19 Apr 2019 10:31:27 +0800
Subject: block: kill all_q_node in request_queue

all_q_node has not been used since commit 4b855ad37194 ("blk-mq: Create
hctx for each present CPU"), so remove it.

Reviewed-by: Chaitanya Kulkarni
Reviewed-by: Ming Lei
Signed-off-by: Hou Tao
Signed-off-by: Jens Axboe
---
 include/linux/blkdev.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 5c58a3b2bf00..317ab30d2904 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -548,7 +548,6 @@ struct request_queue {
 	struct rcu_head		rcu_head;
 	wait_queue_head_t	mq_freeze_wq;
 	struct percpu_ref	q_usage_counter;
-	struct list_head	all_q_node;
 
 	struct blk_mq_tag_set	*tag_set;
 	struct list_head	tag_set_list;
--
cgit v1.2.3

From 6bedf00e55e5dd0a4ed1ad3f06131edd6fb56ec8 Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Wed, 17 Apr 2019 09:11:26 +0800
Subject: block: make sure that bvec length can't be overflow

bvec->bv_offset may be bigger than PAGE_SIZE sometimes, for example when
one bio is split in the middle of one bvec via bio_split(), and
bi_iter.bi_bvec_done is used to build the offset of the first bvec of
the remaining bio. And the remaining bio's bvec may be re-submitted to
the fs layer via ITER_BVEC, such as by loop and nvme-loop.

So we have to make sure that every bvec's offset is less than PAGE_SIZE
in bio_for_each_segment_all(), because some drivers (loop, nvme-loop)
pass the split bvec to the fs layer via ITER_BVEC.

This patch fixes the issue reported by Zhang Yi when running nvme/011.

Cc: Christoph Hellwig
Cc: Yi Zhang
Reported-by: Yi Zhang
Reviewed-by: Christoph Hellwig
Fixes: 6dc4f100c175 ("block: allow bio_for_each_segment_all() to iterate over multi-page bvec")
Signed-off-by: Ming Lei
Signed-off-by: Jens Axboe
---
 include/linux/bvec.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/include/linux/bvec.h b/include/linux/bvec.h
index 3bc91879e1e2..ff13cbc1887d 100644
--- a/include/linux/bvec.h
+++ b/include/linux/bvec.h
@@ -160,8 +160,9 @@ static inline void bvec_advance(const struct bio_vec *bvec,
 		bv->bv_page = nth_page(bv->bv_page, 1);
 		bv->bv_offset = 0;
 	} else {
-		bv->bv_page = bvec->bv_page;
-		bv->bv_offset = bvec->bv_offset;
+		bv->bv_page = bvec_nth_page(bvec->bv_page, bvec->bv_offset /
+					    PAGE_SIZE);
+		bv->bv_offset = bvec->bv_offset & ~PAGE_MASK;
 	}
 	bv->bv_len = min_t(unsigned int, PAGE_SIZE - bv->bv_offset,
 			   bvec->bv_len - iter_all->done);
--
cgit v1.2.3
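As a worked example of the new bvec_advance() math (made-up numbers, assuming 4K pages): a remainder bio produced by bio_split() can start several thousand bytes into a multi-page bvec, and that offset is now folded into a page index plus an in-page offset instead of being handed to bio_for_each_segment_all() callers as-is:

#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1u << PAGE_SHIFT)
#define PAGE_MASK	(~(PAGE_SIZE - 1))

int main(void)
{
	/* e.g. the remainder bio's first bvec starts 5000 bytes into the
	 * original multi-page bvec, so bv_offset > PAGE_SIZE before the fix */
	unsigned int bv_offset = 5000;

	printf("page index into the bvec: %u\n", bv_offset / PAGE_SIZE);	/* 1 */
	printf("offset within that page:  %u\n", bv_offset & ~PAGE_MASK);	/* 904 */
	return 0;
}

With the per-page offset kept below PAGE_SIZE, the bv_len computed from PAGE_SIZE - bv_offset can no longer overflow past the page.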