| author | Linus Torvalds <torvalds@linux-foundation.org> | 2018-01-29 11:51:49 -0800 | 
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2018-01-29 11:51:49 -0800 | 
| commit | 0a4b6e2f80aad46fb55a5cf7b1664c0aef030ee0 (patch) | |
| tree | cefccd67dc1f27bb45830f6b8065dd4a1c05e83b /block | |
| parent | 9697e9da84299d0d715d515dd2cc48f1eceb277d (diff) | |
| parent | 796baeeef85a40b3495a907fb7425086e7010102 (diff) | |
| download | linux-0a4b6e2f80aad46fb55a5cf7b1664c0aef030ee0.tar.bz2 | |
Merge branch 'for-4.16/block' of git://git.kernel.dk/linux-block
Pull block updates from Jens Axboe:
 "This is the main pull request for block IO related changes for the
  4.16 kernel. Nothing major in this pull request, but a good amount of
  improvements and fixes all over the map. This contains:
   - BFQ improvements, fixes, and cleanups from Angelo, Chiara, and
     Paolo.
   - Support for SMR zones for deadline and mq-deadline from Damien and
     Christoph.
   - Set of fixes for bcache by way of Michael Lyle, including fixes
     from himself, Kent, Rui, Tang, and Coly.
   - Series from Matias for lightnvm with fixes from Hans Holmberg,
     Javier, and Matias. Mostly centered around pblk, and the removal
     of rrpc 1.2 in preparation for supporting 2.0.
   - A couple of NVMe pull requests from Christoph. Nothing major in
     here, just fixes and cleanups, and support for command tracing from
     Johannes.
   - Support in blk-throttle for tracking reads and writes separately.
     From Joseph Qi. A few cleanups/fixes also for blk-throttle from
     Weiping.
   - Series from Mike Snitzer that enables dm to register its queue more
     logically, something that's always been problematic on dm since
     it's a stacked device.
   - Series from Ming cleaning up some of the bio accessor use, in
     preparation for supporting multipage bvecs.
   - Various fixes from Ming closing up holes around queue mapping and
     quiescing.
   - BSD partition fix from Richard Narron, fixing a problem where we
     can't mount newer (10/11) FreeBSD partitions.
   - Series from Tejun reworking blk-mq timeout handling. The previous
     scheme relied on atomic bits, but it had races where we would think
     a request had timed out if it got reused at the wrong time.
   - null_blk now supports faking timeouts, to enable us to better
     exercise and test that functionality separately. From me.
   - Kill the separate atomic poll bit in the request struct. After
     this, we don't use the atomic bits on blk-mq anymore at all. From
     me.
   - sgl_alloc/free helpers from Bart [usage sketch below].
   - Heavily contended tag case scalability improvement from me.
   - Various little fixes and cleanups from Arnd, Bart, Corentin,
     Douglas, Eryu, Goldwyn, and myself"
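As a rough illustration of the sgl_alloc/free helpers mentioned in the pull message above, the sketch below allocates and frees a page-backed scatterlist. It is a minimal sketch, assuming a kernel built with CONFIG_SGL_ALLOC; the `demo_sgl_roundtrip()` wrapper name and the 1 MiB size are made up for this example.

```c
/*
 * Hedged sketch only: exercises sgl_alloc()/sgl_free() from
 * lib/scatterlist.c. The function name and buffer size are
 * arbitrary; error handling is kept minimal.
 */
#include <linux/scatterlist.h>
#include <linux/sizes.h>
#include <linux/errno.h>
#include <linux/gfp.h>

static int demo_sgl_roundtrip(void)
{
	unsigned int nents;
	struct scatterlist *sgl;

	/* Allocate a scatterlist whose entries cover 1 MiB of pages. */
	sgl = sgl_alloc(SZ_1M, GFP_KERNEL, &nents);
	if (!sgl)
		return -ENOMEM;

	/* A driver would typically dma_map_sg() and submit I/O here. */

	/* Frees both the pages and the scatterlist allocation. */
	sgl_free(sgl);
	return 0;
}
```

Compared with the open-coded sg_alloc_table() plus per-page allocation loops that callers carried before, the helpers centralize allocation and cleanup; sgl_alloc_order()/sgl_free_order() (also touched by the chaining fix in the shortlog below) expose the allocation order for callers that want larger contiguous chunks.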
* 'for-4.16/block' of git://git.kernel.dk/linux-block: (186 commits)
  block: remove smart1,2.h
  nvme: add tracepoint for nvme_complete_rq
  nvme: add tracepoint for nvme_setup_cmd
  nvme-pci: introduce RECONNECTING state to mark initializing procedure
  nvme-rdma: remove redundant boolean for inline_data
  nvme: don't free uuid pointer before printing it
  nvme-pci: Suspend queues after deleting them
  bsg: use pr_debug instead of hand crafted macros
  blk-mq-debugfs: don't allow write on attributes with seq_operations set
  nvme-pci: Fix queue double allocations
  block: Set BIO_TRACE_COMPLETION on new bio during split
  blk-throttle: use queue_is_rq_based
  block: Remove kblockd_schedule_delayed_work{,_on}()
  blk-mq: Avoid that blk_mq_delay_run_hw_queue() introduces unintended delays
  blk-mq: Rename blk_mq_request_direct_issue() into blk_mq_request_issue_directly()
  lib/scatterlist: Fix chaining support in sgl_alloc_order()
  blk-throttle: track read and write request individually
  block: add bdev_read_only() checks to common helpers
  block: fail op_is_write() requests to read-only partitions
  blk-throttle: export io_serviced_recursive, io_service_bytes_recursive
  ...
Diffstat (limited to 'block')
| -rw-r--r-- | block/bfq-cgroup.c | 7 |
| -rw-r--r-- | block/bfq-iosched.c | 529 |
| -rw-r--r-- | block/bfq-iosched.h | 19 |
| -rw-r--r-- | block/bfq-wf2q.c | 7 |
| -rw-r--r-- | block/bio-integrity.c | 1 |
| -rw-r--r-- | block/bio.c | 30 |
| -rw-r--r-- | block/blk-core.c | 87 |
| -rw-r--r-- | block/blk-exec.c | 2 |
| -rw-r--r-- | block/blk-lib.c | 12 |
| -rw-r--r-- | block/blk-map.c | 4 |
| -rw-r--r-- | block/blk-merge.c | 13 |
| -rw-r--r-- | block/blk-mq-debugfs.c | 22 |
| -rw-r--r-- | block/blk-mq-sched.c | 3 |
| -rw-r--r-- | block/blk-mq-sched.h | 2 |
| -rw-r--r-- | block/blk-mq-sysfs.c | 9 |
| -rw-r--r-- | block/blk-mq-tag.c | 13 |
| -rw-r--r-- | block/blk-mq.c | 667 |
| -rw-r--r-- | block/blk-mq.h | 52 |
| -rw-r--r-- | block/blk-sysfs.c | 47 |
| -rw-r--r-- | block/blk-throttle.c | 146 |
| -rw-r--r-- | block/blk-timeout.c | 26 |
| -rw-r--r-- | block/blk-zoned.c | 42 |
| -rw-r--r-- | block/blk.h | 46 |
| -rw-r--r-- | block/bounce.c | 33 |
| -rw-r--r-- | block/bsg-lib.c | 3 |
| -rw-r--r-- | block/bsg.c | 40 |
| -rw-r--r-- | block/deadline-iosched.c | 114 |
| -rw-r--r-- | block/elevator.c | 12 |
| -rw-r--r-- | block/genhd.c | 23 |
| -rw-r--r-- | block/mq-deadline.c | 141 |
| -rw-r--r-- | block/partitions/msdos.c | 4 |
| -rw-r--r-- | block/scsi_ioctl.c | 34 |
32 files changed, 1506 insertions, 684 deletions
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index da1525ec4c87..d819dc77fe65 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -775,10 +775,11 @@ static void bfq_pd_offline(struct blkg_policy_data *pd)  	unsigned long flags;  	int i; +	spin_lock_irqsave(&bfqd->lock, flags); +  	if (!entity) /* root group */ -		return; +		goto put_async_queues; -	spin_lock_irqsave(&bfqd->lock, flags);  	/*  	 * Empty all service_trees belonging to this group before  	 * deactivating the group itself. @@ -809,6 +810,8 @@ static void bfq_pd_offline(struct blkg_policy_data *pd)  	}  	__bfq_deactivate_entity(entity, false); + +put_async_queues:  	bfq_put_async_queues(bfqd, bfqg);  	spin_unlock_irqrestore(&bfqd->lock, flags); diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index bcb6d21baf12..47e6ec7427c4 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -166,6 +166,20 @@ static const int bfq_async_charge_factor = 10;  /* Default timeout values, in jiffies, approximating CFQ defaults. */  const int bfq_timeout = HZ / 8; +/* + * Time limit for merging (see comments in bfq_setup_cooperator). Set + * to the slowest value that, in our tests, proved to be effective in + * removing false positives, while not causing true positives to miss + * queue merging. + * + * As can be deduced from the low time limit below, queue merging, if + * successful, happens at the very beggining of the I/O of the involved + * cooperating processes, as a consequence of the arrival of the very + * first requests from each cooperator.  After that, there is very + * little chance to find cooperators. + */ +static const unsigned long bfq_merge_time_limit = HZ/10; +  static struct kmem_cache *bfq_pool;  /* Below this threshold (in ns), we consider thinktime immediate. */ @@ -178,7 +192,7 @@ static struct kmem_cache *bfq_pool;  #define BFQQ_SEEK_THR		(sector_t)(8 * 100)  #define BFQQ_SECT_THR_NONROT	(sector_t)(2 * 32)  #define BFQQ_CLOSE_THR		(sector_t)(8 * 1024) -#define BFQQ_SEEKY(bfqq)	(hweight32(bfqq->seek_history) > 32/8) +#define BFQQ_SEEKY(bfqq)	(hweight32(bfqq->seek_history) > 19)  /* Min number of samples required to perform peak-rate update */  #define BFQ_RATE_MIN_SAMPLES	32 @@ -195,15 +209,17 @@ static struct kmem_cache *bfq_pool;   * interactive applications automatically, using the following formula:   * duration = (R / r) * T, where r is the peak rate of the device, and   * R and T are two reference parameters. - * In particular, R is the peak rate of the reference device (see below), - * and T is a reference time: given the systems that are likely to be - * installed on the reference device according to its speed class, T is - * about the maximum time needed, under BFQ and while reading two files in - * parallel, to load typical large applications on these systems. - * In practice, the slower/faster the device at hand is, the more/less it - * takes to load applications with respect to the reference device. - * Accordingly, the longer/shorter BFQ grants weight raising to interactive - * applications. + * In particular, R is the peak rate of the reference device (see + * below), and T is a reference time: given the systems that are + * likely to be installed on the reference device according to its + * speed class, T is about the maximum time needed, under BFQ and + * while reading two files in parallel, to load typical large + * applications on these systems (see the comments on + * max_service_from_wr below, for more details on how T is obtained). 
+ * In practice, the slower/faster the device at hand is, the more/less + * it takes to load applications with respect to the reference device. + * Accordingly, the longer/shorter BFQ grants weight raising to + * interactive applications.   *   * BFQ uses four different reference pairs (R, T), depending on:   * . whether the device is rotational or non-rotational; @@ -240,6 +256,60 @@ static int T_slow[2];  static int T_fast[2];  static int device_speed_thresh[2]; +/* + * BFQ uses the above-detailed, time-based weight-raising mechanism to + * privilege interactive tasks. This mechanism is vulnerable to the + * following false positives: I/O-bound applications that will go on + * doing I/O for much longer than the duration of weight + * raising. These applications have basically no benefit from being + * weight-raised at the beginning of their I/O. On the opposite end, + * while being weight-raised, these applications + * a) unjustly steal throughput to applications that may actually need + * low latency; + * b) make BFQ uselessly perform device idling; device idling results + * in loss of device throughput with most flash-based storage, and may + * increase latencies when used purposelessly. + * + * BFQ tries to reduce these problems, by adopting the following + * countermeasure. To introduce this countermeasure, we need first to + * finish explaining how the duration of weight-raising for + * interactive tasks is computed. + * + * For a bfq_queue deemed as interactive, the duration of weight + * raising is dynamically adjusted, as a function of the estimated + * peak rate of the device, so as to be equal to the time needed to + * execute the 'largest' interactive task we benchmarked so far. By + * largest task, we mean the task for which each involved process has + * to do more I/O than for any of the other tasks we benchmarked. This + * reference interactive task is the start-up of LibreOffice Writer, + * and in this task each process/bfq_queue needs to have at most ~110K + * sectors transferred. + * + * This last piece of information enables BFQ to reduce the actual + * duration of weight-raising for at least one class of I/O-bound + * applications: those doing sequential or quasi-sequential I/O. An + * example is file copy. In fact, once started, the main I/O-bound + * processes of these applications usually consume the above 110K + * sectors in much less time than the processes of an application that + * is starting, because these I/O-bound processes will greedily devote + * almost all their CPU cycles only to their target, + * throughput-friendly I/O operations. This is even more true if BFQ + * happens to be underestimating the device peak rate, and thus + * overestimating the duration of weight raising. But, according to + * our measurements, once transferred 110K sectors, these processes + * have no right to be weight-raised any longer. + * + * Basing on the last consideration, BFQ ends weight-raising for a + * bfq_queue if the latter happens to have received an amount of + * service at least equal to the following constant. The constant is + * set to slightly more than 110K, to have a minimum safety margin. + * + * This early ending of weight-raising reduces the amount of time + * during which interactive false positives cause the two problems + * described at the beginning of these comments. 
+ */ +static const unsigned long max_service_from_wr = 120000; +  #define RQ_BIC(rq)		icq_to_bic((rq)->elv.priv[0])  #define RQ_BFQQ(rq)		((rq)->elv.priv[1]) @@ -403,6 +473,82 @@ static struct request *bfq_choose_req(struct bfq_data *bfqd,  	}  } +/* + * See the comments on bfq_limit_depth for the purpose of + * the depths set in the function. + */ +static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt) +{ +	bfqd->sb_shift = bt->sb.shift; + +	/* +	 * In-word depths if no bfq_queue is being weight-raised: +	 * leaving 25% of tags only for sync reads. +	 * +	 * In next formulas, right-shift the value +	 * (1U<<bfqd->sb_shift), instead of computing directly +	 * (1U<<(bfqd->sb_shift - something)), to be robust against +	 * any possible value of bfqd->sb_shift, without having to +	 * limit 'something'. +	 */ +	/* no more than 50% of tags for async I/O */ +	bfqd->word_depths[0][0] = max((1U<<bfqd->sb_shift)>>1, 1U); +	/* +	 * no more than 75% of tags for sync writes (25% extra tags +	 * w.r.t. async I/O, to prevent async I/O from starving sync +	 * writes) +	 */ +	bfqd->word_depths[0][1] = max(((1U<<bfqd->sb_shift) * 3)>>2, 1U); + +	/* +	 * In-word depths in case some bfq_queue is being weight- +	 * raised: leaving ~63% of tags for sync reads. This is the +	 * highest percentage for which, in our tests, application +	 * start-up times didn't suffer from any regression due to tag +	 * shortage. +	 */ +	/* no more than ~18% of tags for async I/O */ +	bfqd->word_depths[1][0] = max(((1U<<bfqd->sb_shift) * 3)>>4, 1U); +	/* no more than ~37% of tags for sync writes (~20% extra tags) */ +	bfqd->word_depths[1][1] = max(((1U<<bfqd->sb_shift) * 6)>>4, 1U); +} + +/* + * Async I/O can easily starve sync I/O (both sync reads and sync + * writes), by consuming all tags. Similarly, storms of sync writes, + * such as those that sync(2) may trigger, can starve sync reads. + * Limit depths of async I/O and sync writes so as to counter both + * problems. 
+ */ +static void bfq_limit_depth(unsigned int op, struct blk_mq_alloc_data *data) +{ +	struct blk_mq_tags *tags = blk_mq_tags_from_data(data); +	struct bfq_data *bfqd = data->q->elevator->elevator_data; +	struct sbitmap_queue *bt; + +	if (op_is_sync(op) && !op_is_write(op)) +		return; + +	if (data->flags & BLK_MQ_REQ_RESERVED) { +		if (unlikely(!tags->nr_reserved_tags)) { +			WARN_ON_ONCE(1); +			return; +		} +		bt = &tags->breserved_tags; +	} else +		bt = &tags->bitmap_tags; + +	if (unlikely(bfqd->sb_shift != bt->sb.shift)) +		bfq_update_depths(bfqd, bt); + +	data->shallow_depth = +		bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(op)]; + +	bfq_log(bfqd, "[%s] wr_busy %d sync %d depth %u", +			__func__, bfqd->wr_busy_queues, op_is_sync(op), +			data->shallow_depth); +} +  static struct bfq_queue *  bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root,  		     sector_t sector, struct rb_node **ret_parent, @@ -444,6 +590,13 @@ bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root,  	return bfqq;  } +static bool bfq_too_late_for_merging(struct bfq_queue *bfqq) +{ +	return bfqq->service_from_backlogged > 0 && +		time_is_before_jiffies(bfqq->first_IO_time + +				       bfq_merge_time_limit); +} +  void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)  {  	struct rb_node **p, *parent; @@ -454,6 +607,14 @@ void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)  		bfqq->pos_root = NULL;  	} +	/* +	 * bfqq cannot be merged any longer (see comments in +	 * bfq_setup_cooperator): no point in adding bfqq into the +	 * position tree. +	 */ +	if (bfq_too_late_for_merging(bfqq)) +		return; +  	if (bfq_class_idle(bfqq))  		return;  	if (!bfqq->next_rq) @@ -1247,6 +1408,7 @@ static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd,  	if (old_wr_coeff == 1 && wr_or_deserves_wr) {  		/* start a weight-raising period */  		if (interactive) { +			bfqq->service_from_wr = 0;  			bfqq->wr_coeff = bfqd->bfq_wr_coeff;  			bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);  		} else { @@ -1627,6 +1789,8 @@ static void bfq_remove_request(struct request_queue *q,  			rb_erase(&bfqq->pos_node, bfqq->pos_root);  			bfqq->pos_root = NULL;  		} +	} else { +		bfq_pos_tree_add_move(bfqd, bfqq);  	}  	if (rq->cmd_flags & REQ_META) @@ -1933,6 +2097,9 @@ bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)  static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq,  					struct bfq_queue *new_bfqq)  { +	if (bfq_too_late_for_merging(new_bfqq)) +		return false; +  	if (bfq_class_idle(bfqq) || bfq_class_idle(new_bfqq) ||  	    (bfqq->ioprio_class != new_bfqq->ioprio_class))  		return false; @@ -1957,20 +2124,6 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq,  }  /* - * If this function returns true, then bfqq cannot be merged. The idea - * is that true cooperation happens very early after processes start - * to do I/O. Usually, late cooperations are just accidental false - * positives. In case bfqq is weight-raised, such false positives - * would evidently degrade latency guarantees for bfqq. - */ -static bool wr_from_too_long(struct bfq_queue *bfqq) -{ -	return bfqq->wr_coeff > 1 && -		time_is_before_jiffies(bfqq->last_wr_start_finish + -				       msecs_to_jiffies(100)); -} - -/*   * Attempt to schedule a merge of bfqq with the currently in-service   * queue or with a close queue among the scheduled queues.  
Return   * NULL if no merge was scheduled, a pointer to the shared bfq_queue @@ -1983,11 +2136,6 @@ static bool wr_from_too_long(struct bfq_queue *bfqq)   * to maintain. Besides, in such a critical condition as an out of memory,   * the benefits of queue merging may be little relevant, or even negligible.   * - * Weight-raised queues can be merged only if their weight-raising - * period has just started. In fact cooperating processes are usually - * started together. Thus, with this filter we avoid false positives - * that would jeopardize low-latency guarantees. - *   * WARNING: queue merging may impair fairness among non-weight raised   * queues, for at least two reasons: 1) the original weight of a   * merged queue may change during the merged state, 2) even being the @@ -2001,12 +2149,24 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,  {  	struct bfq_queue *in_service_bfqq, *new_bfqq; +	/* +	 * Prevent bfqq from being merged if it has been created too +	 * long ago. The idea is that true cooperating processes, and +	 * thus their associated bfq_queues, are supposed to be +	 * created shortly after each other. This is the case, e.g., +	 * for KVM/QEMU and dump I/O threads. Basing on this +	 * assumption, the following filtering greatly reduces the +	 * probability that two non-cooperating processes, which just +	 * happen to do close I/O for some short time interval, have +	 * their queues merged by mistake. +	 */ +	if (bfq_too_late_for_merging(bfqq)) +		return NULL; +  	if (bfqq->new_bfqq)  		return bfqq->new_bfqq; -	if (!io_struct || -	    wr_from_too_long(bfqq) || -	    unlikely(bfqq == &bfqd->oom_bfqq)) +	if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq))  		return NULL;  	/* If there is only one backlogged queue, don't search. */ @@ -2015,12 +2175,9 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,  	in_service_bfqq = bfqd->in_service_queue; -	if (!in_service_bfqq || in_service_bfqq == bfqq -	    || wr_from_too_long(in_service_bfqq) || -	    unlikely(in_service_bfqq == &bfqd->oom_bfqq)) -		goto check_scheduled; - -	if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) && +	if (in_service_bfqq && in_service_bfqq != bfqq && +	    likely(in_service_bfqq != &bfqd->oom_bfqq) && +	    bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) &&  	    bfqq->entity.parent == in_service_bfqq->entity.parent &&  	    bfq_may_be_close_cooperator(bfqq, in_service_bfqq)) {  		new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq); @@ -2032,12 +2189,10 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,  	 * queues. The only thing we need is that the bio/request is not  	 * NULL, as we need it to establish whether a cooperator exists.  	 
*/ -check_scheduled:  	new_bfqq = bfq_find_close_cooperator(bfqd, bfqq,  			bfq_io_struct_pos(io_struct, request)); -	if (new_bfqq && !wr_from_too_long(new_bfqq) && -	    likely(new_bfqq != &bfqd->oom_bfqq) && +	if (new_bfqq && likely(new_bfqq != &bfqd->oom_bfqq) &&  	    bfq_may_be_close_cooperator(bfqq, new_bfqq))  		return bfq_setup_merge(bfqq, new_bfqq); @@ -2062,7 +2217,8 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq)  	bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq);  	bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node);  	if (unlikely(bfq_bfqq_just_created(bfqq) && -		     !bfq_bfqq_in_large_burst(bfqq))) { +		     !bfq_bfqq_in_large_burst(bfqq) && +		     bfqq->bfqd->low_latency)) {  		/*  		 * bfqq being merged right after being created: bfqq  		 * would have deserved interactive weight raising, but @@ -2917,45 +3073,87 @@ static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq,   * whereas soft_rt_next_start is set to infinity for applications that do   * not.   * - * Unfortunately, even a greedy application may happen to behave in an - * isochronous way if the CPU load is high. In fact, the application may - * stop issuing requests while the CPUs are busy serving other processes, - * then restart, then stop again for a while, and so on. In addition, if - * the disk achieves a low enough throughput with the request pattern - * issued by the application (e.g., because the request pattern is random - * and/or the device is slow), then the application may meet the above - * bandwidth requirement too. To prevent such a greedy application to be - * deemed as soft real-time, a further rule is used in the computation of - * soft_rt_next_start: soft_rt_next_start must be higher than the current - * time plus the maximum time for which the arrival of a request is waited - * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle. - * This filters out greedy applications, as the latter issue instead their - * next request as soon as possible after the last one has been completed - * (in contrast, when a batch of requests is completed, a soft real-time - * application spends some time processing data). + * Unfortunately, even a greedy (i.e., I/O-bound) application may + * happen to meet, occasionally or systematically, both the above + * bandwidth and isochrony requirements. This may happen at least in + * the following circumstances. First, if the CPU load is high. The + * application may stop issuing requests while the CPUs are busy + * serving other processes, then restart, then stop again for a while, + * and so on. The other circumstances are related to the storage + * device: the storage device is highly loaded or reaches a low-enough + * throughput with the I/O of the application (e.g., because the I/O + * is random and/or the device is slow). In all these cases, the + * I/O of the application may be simply slowed down enough to meet + * the bandwidth and isochrony requirements. To reduce the probability + * that greedy applications are deemed as soft real-time in these + * corner cases, a further rule is used in the computation of + * soft_rt_next_start: the return value of this function is forced to + * be higher than the maximum between the following two quantities. + * + * (a) Current time plus: (1) the maximum time for which the arrival + *     of a request is waited for when a sync queue becomes idle, + *     namely bfqd->bfq_slice_idle, and (2) a few extra jiffies. 
We + *     postpone for a moment the reason for adding a few extra + *     jiffies; we get back to it after next item (b).  Lower-bounding + *     the return value of this function with the current time plus + *     bfqd->bfq_slice_idle tends to filter out greedy applications, + *     because the latter issue their next request as soon as possible + *     after the last one has been completed. In contrast, a soft + *     real-time application spends some time processing data, after a + *     batch of its requests has been completed.   * - * Unfortunately, the last filter may easily generate false positives if - * only bfqd->bfq_slice_idle is used as a reference time interval and one - * or both the following cases occur: - * 1) HZ is so low that the duration of a jiffy is comparable to or higher - *    than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with - *    HZ=100. + * (b) Current value of bfqq->soft_rt_next_start. As pointed out + *     above, greedy applications may happen to meet both the + *     bandwidth and isochrony requirements under heavy CPU or + *     storage-device load. In more detail, in these scenarios, these + *     applications happen, only for limited time periods, to do I/O + *     slowly enough to meet all the requirements described so far, + *     including the filtering in above item (a). These slow-speed + *     time intervals are usually interspersed between other time + *     intervals during which these applications do I/O at a very high + *     speed. Fortunately, exactly because of the high speed of the + *     I/O in the high-speed intervals, the values returned by this + *     function happen to be so high, near the end of any such + *     high-speed interval, to be likely to fall *after* the end of + *     the low-speed time interval that follows. These high values are + *     stored in bfqq->soft_rt_next_start after each invocation of + *     this function. As a consequence, if the last value of + *     bfqq->soft_rt_next_start is constantly used to lower-bound the + *     next value that this function may return, then, from the very + *     beginning of a low-speed interval, bfqq->soft_rt_next_start is + *     likely to be constantly kept so high that any I/O request + *     issued during the low-speed interval is considered as arriving + *     to soon for the application to be deemed as soft + *     real-time. Then, in the high-speed interval that follows, the + *     application will not be deemed as soft real-time, just because + *     it will do I/O at a high speed. And so on. + * + * Getting back to the filtering in item (a), in the following two + * cases this filtering might be easily passed by a greedy + * application, if the reference quantity was just + * bfqd->bfq_slice_idle: + * 1) HZ is so low that the duration of a jiffy is comparable to or + *    higher than bfqd->bfq_slice_idle. This happens, e.g., on slow + *    devices with HZ=100. The time granularity may be so coarse + *    that the approximation, in jiffies, of bfqd->bfq_slice_idle + *    is rather lower than the exact value.   * 2) jiffies, instead of increasing at a constant rate, may stop increasing   *    for a while, then suddenly 'jump' by several units to recover the lost   *    increments. This seems to happen, e.g., inside virtual machines. - * To address this issue, we do not use as a reference time interval just - * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. 
In - * particular we add the minimum number of jiffies for which the filter - * seems to be quite precise also in embedded systems and KVM/QEMU virtual - * machines. + * To address this issue, in the filtering in (a) we do not use as a + * reference time interval just bfqd->bfq_slice_idle, but + * bfqd->bfq_slice_idle plus a few jiffies. In particular, we add the + * minimum number of jiffies for which the filter seems to be quite + * precise also in embedded systems and KVM/QEMU virtual machines.   */  static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd,  						struct bfq_queue *bfqq)  { -	return max(bfqq->last_idle_bklogged + -		   HZ * bfqq->service_from_backlogged / -		   bfqd->bfq_wr_max_softrt_rate, -		   jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4); +	return max3(bfqq->soft_rt_next_start, +		    bfqq->last_idle_bklogged + +		    HZ * bfqq->service_from_backlogged / +		    bfqd->bfq_wr_max_softrt_rate, +		    jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4);  }  /** @@ -3000,17 +3198,6 @@ void bfq_bfqq_expire(struct bfq_data *bfqd,  	slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta);  	/* -	 * Increase service_from_backlogged before next statement, -	 * because the possible next invocation of -	 * bfq_bfqq_charge_time would likely inflate -	 * entity->service. In contrast, service_from_backlogged must -	 * contain real service, to enable the soft real-time -	 * heuristic to correctly compute the bandwidth consumed by -	 * bfqq. -	 */ -	bfqq->service_from_backlogged += entity->service; - -	/*  	 * As above explained, charge slow (typically seeky) and  	 * timed-out queues with the time and not the service  	 * received, to favor sequential workloads. @@ -3535,6 +3722,12 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)  				bfqq->entity.prio_changed = 1;  			}  		} +		if (bfqq->wr_coeff > 1 && +		    bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time && +		    bfqq->service_from_wr > max_service_from_wr) { +			/* see comments on max_service_from_wr */ +			bfq_bfqq_end_wr(bfqq); +		}  	}  	/*  	 * To improve latency (for this or other queues), immediately @@ -3630,8 +3823,8 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)  		}  		/* -		 * We exploit the put_rq_private hook to decrement -		 * rq_in_driver, but put_rq_private will not be +		 * We exploit the bfq_finish_request hook to decrement +		 * rq_in_driver, but bfq_finish_request will not be  		 * invoked on this request. So, to avoid unbalance,  		 * just start this request, without incrementing  		 * rq_in_driver. As a negative consequence, @@ -3640,14 +3833,14 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)  		 * bfq_schedule_dispatch to be invoked uselessly.  		 *  		 * As for implementing an exact solution, the -		 * put_request hook, if defined, is probably invoked -		 * also on this request. So, by exploiting this hook, -		 * we could 1) increment rq_in_driver here, and 2) -		 * decrement it in put_request. Such a solution would -		 * let the value of the counter be always accurate, -		 * but it would entail using an extra interface -		 * function. This cost seems higher than the benefit, -		 * being the frequency of non-elevator-private +		 * bfq_finish_request hook, if defined, is probably +		 * invoked also on this request. So, by exploiting +		 * this hook, we could 1) increment rq_in_driver here, +		 * and 2) decrement it in bfq_finish_request. 
Such a +		 * solution would let the value of the counter be +		 * always accurate, but it would entail using an extra +		 * interface function. This cost seems higher than the +		 * benefit, being the frequency of non-elevator-private  		 * requests very low.  		 */  		goto start_rq; @@ -3689,35 +3882,16 @@ exit:  	return rq;  } -static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) -{ -	struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; -	struct request *rq;  #if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) -	struct bfq_queue *in_serv_queue, *bfqq; -	bool waiting_rq, idle_timer_disabled; -#endif - -	spin_lock_irq(&bfqd->lock); - -#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) -	in_serv_queue = bfqd->in_service_queue; -	waiting_rq = in_serv_queue && bfq_bfqq_wait_request(in_serv_queue); - -	rq = __bfq_dispatch_request(hctx); - -	idle_timer_disabled = -		waiting_rq && !bfq_bfqq_wait_request(in_serv_queue); - -#else -	rq = __bfq_dispatch_request(hctx); -#endif -	spin_unlock_irq(&bfqd->lock); +static void bfq_update_dispatch_stats(struct request_queue *q, +				      struct request *rq, +				      struct bfq_queue *in_serv_queue, +				      bool idle_timer_disabled) +{ +	struct bfq_queue *bfqq = rq ? RQ_BFQQ(rq) : NULL; -#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) -	bfqq = rq ? RQ_BFQQ(rq) : NULL;  	if (!idle_timer_disabled && !bfqq) -		return rq; +		return;  	/*  	 * rq and bfqq are guaranteed to exist until this function @@ -3732,7 +3906,7 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)  	 * In addition, the following queue lock guarantees that  	 * bfqq_group(bfqq) exists as well.  	 */ -	spin_lock_irq(hctx->queue->queue_lock); +	spin_lock_irq(q->queue_lock);  	if (idle_timer_disabled)  		/*  		 * Since the idle timer has been disabled, @@ -3751,9 +3925,37 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)  		bfqg_stats_set_start_empty_time(bfqg);  		bfqg_stats_update_io_remove(bfqg, rq->cmd_flags);  	} -	spin_unlock_irq(hctx->queue->queue_lock); +	spin_unlock_irq(q->queue_lock); +} +#else +static inline void bfq_update_dispatch_stats(struct request_queue *q, +					     struct request *rq, +					     struct bfq_queue *in_serv_queue, +					     bool idle_timer_disabled) {}  #endif +static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) +{ +	struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; +	struct request *rq; +	struct bfq_queue *in_serv_queue; +	bool waiting_rq, idle_timer_disabled; + +	spin_lock_irq(&bfqd->lock); + +	in_serv_queue = bfqd->in_service_queue; +	waiting_rq = in_serv_queue && bfq_bfqq_wait_request(in_serv_queue); + +	rq = __bfq_dispatch_request(hctx); + +	idle_timer_disabled = +		waiting_rq && !bfq_bfqq_wait_request(in_serv_queue); + +	spin_unlock_irq(&bfqd->lock); + +	bfq_update_dispatch_stats(hctx->queue, rq, in_serv_queue, +				  idle_timer_disabled); +  	return rq;  } @@ -4002,10 +4204,15 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,  	bfqq->split_time = bfq_smallest_from_now();  	/* -	 * Set to the value for which bfqq will not be deemed as -	 * soft rt when it becomes backlogged. +	 * To not forget the possibly high bandwidth consumed by a +	 * process/queue in the recent past, +	 * bfq_bfqq_softrt_next_start() returns a value at least equal +	 * to the current value of bfqq->soft_rt_next_start (see +	 * comments on bfq_bfqq_softrt_next_start).  
Set +	 * soft_rt_next_start to now, to mean that bfqq has consumed +	 * no bandwidth so far.  	 */ -	bfqq->soft_rt_next_start = bfq_greatest_from_now(); +	bfqq->soft_rt_next_start = jiffies;  	/* first request is almost certainly seeky */  	bfqq->seek_history = 1; @@ -4276,16 +4483,46 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)  	return idle_timer_disabled;  } +#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) +static void bfq_update_insert_stats(struct request_queue *q, +				    struct bfq_queue *bfqq, +				    bool idle_timer_disabled, +				    unsigned int cmd_flags) +{ +	if (!bfqq) +		return; + +	/* +	 * bfqq still exists, because it can disappear only after +	 * either it is merged with another queue, or the process it +	 * is associated with exits. But both actions must be taken by +	 * the same process currently executing this flow of +	 * instructions. +	 * +	 * In addition, the following queue lock guarantees that +	 * bfqq_group(bfqq) exists as well. +	 */ +	spin_lock_irq(q->queue_lock); +	bfqg_stats_update_io_add(bfqq_group(bfqq), bfqq, cmd_flags); +	if (idle_timer_disabled) +		bfqg_stats_update_idle_time(bfqq_group(bfqq)); +	spin_unlock_irq(q->queue_lock); +} +#else +static inline void bfq_update_insert_stats(struct request_queue *q, +					   struct bfq_queue *bfqq, +					   bool idle_timer_disabled, +					   unsigned int cmd_flags) {} +#endif +  static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,  			       bool at_head)  {  	struct request_queue *q = hctx->queue;  	struct bfq_data *bfqd = q->elevator->elevator_data; -#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)  	struct bfq_queue *bfqq = RQ_BFQQ(rq);  	bool idle_timer_disabled = false;  	unsigned int cmd_flags; -#endif  	spin_lock_irq(&bfqd->lock);  	if (blk_mq_sched_try_insert_merge(q, rq)) { @@ -4304,7 +4541,6 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,  		else  			list_add_tail(&rq->queuelist, &bfqd->dispatch);  	} else { -#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)  		idle_timer_disabled = __bfq_insert_request(bfqd, rq);  		/*  		 * Update bfqq, because, if a queue merge has occurred @@ -4312,9 +4548,6 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,  		 * redirected into a new queue.  		 */  		bfqq = RQ_BFQQ(rq); -#else -		__bfq_insert_request(bfqd, rq); -#endif  		if (rq_mergeable(rq)) {  			elv_rqhash_add(q, rq); @@ -4323,35 +4556,17 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,  		}  	} -#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)  	/*  	 * Cache cmd_flags before releasing scheduler lock, because rq  	 * may disappear afterwards (for example, because of a request  	 * merge).  	 */  	cmd_flags = rq->cmd_flags; -#endif +  	spin_unlock_irq(&bfqd->lock); -#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) -	if (!bfqq) -		return; -	/* -	 * bfqq still exists, because it can disappear only after -	 * either it is merged with another queue, or the process it -	 * is associated with exits. But both actions must be taken by -	 * the same process currently executing this flow of -	 * instruction. -	 * -	 * In addition, the following queue lock guarantees that -	 * bfqq_group(bfqq) exists as well. 
-	 */ -	spin_lock_irq(q->queue_lock); -	bfqg_stats_update_io_add(bfqq_group(bfqq), bfqq, cmd_flags); -	if (idle_timer_disabled) -		bfqg_stats_update_idle_time(bfqq_group(bfqq)); -	spin_unlock_irq(q->queue_lock); -#endif +	bfq_update_insert_stats(q, bfqq, idle_timer_disabled, +				cmd_flags);  }  static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx, @@ -4482,7 +4697,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)  		bfq_schedule_dispatch(bfqd);  } -static void bfq_put_rq_priv_body(struct bfq_queue *bfqq) +static void bfq_finish_request_body(struct bfq_queue *bfqq)  {  	bfqq->allocated--; @@ -4512,7 +4727,7 @@ static void bfq_finish_request(struct request *rq)  		spin_lock_irqsave(&bfqd->lock, flags);  		bfq_completed_request(bfqq, bfqd); -		bfq_put_rq_priv_body(bfqq); +		bfq_finish_request_body(bfqq);  		spin_unlock_irqrestore(&bfqd->lock, flags);  	} else { @@ -4533,7 +4748,7 @@ static void bfq_finish_request(struct request *rq)  			bfqg_stats_update_io_remove(bfqq_group(bfqq),  						    rq->cmd_flags);  		} -		bfq_put_rq_priv_body(bfqq); +		bfq_finish_request_body(bfqq);  	}  	rq->elv.priv[0] = NULL; @@ -4818,6 +5033,9 @@ static void bfq_exit_queue(struct elevator_queue *e)  	hrtimer_cancel(&bfqd->idle_slice_timer);  #ifdef CONFIG_BFQ_GROUP_IOSCHED +	/* release oom-queue reference to root group */ +	bfqg_and_blkg_put(bfqd->root_group); +  	blkcg_deactivate_policy(bfqd->queue, &blkcg_policy_bfq);  #else  	spin_lock_irq(&bfqd->lock); @@ -5206,6 +5424,7 @@ static struct elv_fs_entry bfq_attrs[] = {  static struct elevator_type iosched_bfq_mq = {  	.ops.mq = { +		.limit_depth		= bfq_limit_depth,  		.prepare_request	= bfq_prepare_request,  		.finish_request		= bfq_finish_request,  		.exit_icq		= bfq_exit_icq, diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 91c4390903a1..350c39ae2896 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -337,6 +337,11 @@ struct bfq_queue {  	 * last transition from idle to backlogged.  	 */  	unsigned long service_from_backlogged; +	/* +	 * Cumulative service received from the @bfq_queue since its +	 * last transition to weight-raised state. +	 */ +	unsigned long service_from_wr;  	/*  	 * Value of wr start time when switching to soft rt @@ -344,6 +349,8 @@ struct bfq_queue {  	unsigned long wr_start_at_switch_to_srt;  	unsigned long split_time; /* time of last split */ + +	unsigned long first_IO_time; /* time of first I/O for this queue */  };  /** @@ -627,6 +634,18 @@ struct bfq_data {  	struct bfq_io_cq *bio_bic;  	/* bfqq associated with the task issuing current bio for merging */  	struct bfq_queue *bio_bfqq; + +	/* +	 * Cached sbitmap shift, used to compute depth limits in +	 * bfq_update_depths. 
+	 */ +	unsigned int sb_shift; + +	/* +	 * Depth limits used in bfq_limit_depth (see comments on the +	 * function) +	 */ +	unsigned int word_depths[2][2];  };  enum bfqq_state_flags { diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c index e495d3f9b4b0..4498c43245e2 100644 --- a/block/bfq-wf2q.c +++ b/block/bfq-wf2q.c @@ -835,6 +835,13 @@ void bfq_bfqq_served(struct bfq_queue *bfqq, int served)  	struct bfq_entity *entity = &bfqq->entity;  	struct bfq_service_tree *st; +	if (!bfqq->service_from_backlogged) +		bfqq->first_IO_time = jiffies; + +	if (bfqq->wr_coeff > 1) +		bfqq->service_from_wr += served; + +	bfqq->service_from_backlogged += served;  	for_each_entity(entity) {  		st = bfq_entity_service_tree(entity); diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 23b42e8aa03e..9cfdd6c83b5b 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -374,7 +374,6 @@ static void bio_integrity_verify_fn(struct work_struct *work)  /**   * __bio_integrity_endio - Integrity I/O completion function   * @bio:	Protected bio - * @error:	Pointer to errno   *   * Description: Completion for integrity I/O   * diff --git a/block/bio.c b/block/bio.c index 9ef6cf3addb3..e1708db48258 100644 --- a/block/bio.c +++ b/block/bio.c @@ -971,34 +971,6 @@ void bio_advance(struct bio *bio, unsigned bytes)  EXPORT_SYMBOL(bio_advance);  /** - * bio_alloc_pages - allocates a single page for each bvec in a bio - * @bio: bio to allocate pages for - * @gfp_mask: flags for allocation - * - * Allocates pages up to @bio->bi_vcnt. - * - * Returns 0 on success, -ENOMEM on failure. On failure, any allocated pages are - * freed. - */ -int bio_alloc_pages(struct bio *bio, gfp_t gfp_mask) -{ -	int i; -	struct bio_vec *bv; - -	bio_for_each_segment_all(bv, bio, i) { -		bv->bv_page = alloc_page(gfp_mask); -		if (!bv->bv_page) { -			while (--bv >= bio->bi_io_vec) -				__free_page(bv->bv_page); -			return -ENOMEM; -		} -	} - -	return 0; -} -EXPORT_SYMBOL(bio_alloc_pages); - -/**   * bio_copy_data - copy contents of data buffers from one chain of bios to   * another   * @src: source bio list @@ -1838,7 +1810,7 @@ struct bio *bio_split(struct bio *bio, int sectors,  	bio_advance(bio, split->bi_iter.bi_size);  	if (bio_flagged(bio, BIO_TRACE_COMPLETION)) -		bio_set_flag(bio, BIO_TRACE_COMPLETION); +		bio_set_flag(split, BIO_TRACE_COMPLETION);  	return split;  } diff --git a/block/blk-core.c b/block/blk-core.c index 3ba4326a63b5..a2005a485335 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -126,6 +126,8 @@ void blk_rq_init(struct request_queue *q, struct request *rq)  	rq->start_time = jiffies;  	set_start_time_ns(rq);  	rq->part = NULL; +	seqcount_init(&rq->gstate_seq); +	u64_stats_init(&rq->aborted_gstate_sync);  }  EXPORT_SYMBOL(blk_rq_init); @@ -699,6 +701,15 @@ void blk_cleanup_queue(struct request_queue *q)  	queue_flag_set(QUEUE_FLAG_DEAD, q);  	spin_unlock_irq(lock); +	/* +	 * make sure all in-progress dispatch are completed because +	 * blk_freeze_queue() can only complete all requests, and +	 * dispatch may still be in-progress since we dispatch requests +	 * from more than one contexts +	 */ +	if (q->mq_ops) +		blk_mq_quiesce_queue(q); +  	/* for synchronous bio-based driver finish in-flight integrity i/o */  	blk_flush_integrity(); @@ -1646,6 +1657,7 @@ void __blk_put_request(struct request_queue *q, struct request *req)  	lockdep_assert_held(q->queue_lock); +	blk_req_zone_write_unlock(req);  	blk_pm_put_request(req);  	elv_completed_request(q, req); @@ -2055,6 +2067,21 @@ static inline bool 
should_fail_request(struct hd_struct *part,  #endif /* CONFIG_FAIL_MAKE_REQUEST */ +static inline bool bio_check_ro(struct bio *bio, struct hd_struct *part) +{ +	if (part->policy && op_is_write(bio_op(bio))) { +		char b[BDEVNAME_SIZE]; + +		printk(KERN_ERR +		       "generic_make_request: Trying to write " +			"to read-only block-device %s (partno %d)\n", +			bio_devname(bio, b), part->partno); +		return true; +	} + +	return false; +} +  /*   * Remap block n of partition p to block n+start(p) of the disk.   */ @@ -2063,27 +2090,28 @@ static inline int blk_partition_remap(struct bio *bio)  	struct hd_struct *p;  	int ret = 0; +	rcu_read_lock(); +	p = __disk_get_part(bio->bi_disk, bio->bi_partno); +	if (unlikely(!p || should_fail_request(p, bio->bi_iter.bi_size) || +		     bio_check_ro(bio, p))) { +		ret = -EIO; +		goto out; +	} +  	/*  	 * Zone reset does not include bi_size so bio_sectors() is always 0.  	 * Include a test for the reset op code and perform the remap if needed.  	 */ -	if (!bio->bi_partno || -	    (!bio_sectors(bio) && bio_op(bio) != REQ_OP_ZONE_RESET)) -		return 0; +	if (!bio_sectors(bio) && bio_op(bio) != REQ_OP_ZONE_RESET) +		goto out; -	rcu_read_lock(); -	p = __disk_get_part(bio->bi_disk, bio->bi_partno); -	if (likely(p && !should_fail_request(p, bio->bi_iter.bi_size))) { -		bio->bi_iter.bi_sector += p->start_sect; -		bio->bi_partno = 0; -		trace_block_bio_remap(bio->bi_disk->queue, bio, part_devt(p), -				bio->bi_iter.bi_sector - p->start_sect); -	} else { -		printk("%s: fail for partition %d\n", __func__, bio->bi_partno); -		ret = -EIO; -	} -	rcu_read_unlock(); +	bio->bi_iter.bi_sector += p->start_sect; +	bio->bi_partno = 0; +	trace_block_bio_remap(bio->bi_disk->queue, bio, part_devt(p), +			      bio->bi_iter.bi_sector - p->start_sect); +out: +	rcu_read_unlock();  	return ret;  } @@ -2142,15 +2170,19 @@ generic_make_request_checks(struct bio *bio)  	 * For a REQ_NOWAIT based request, return -EOPNOTSUPP  	 * if queue is not a request based queue.  	 */ -  	if ((bio->bi_opf & REQ_NOWAIT) && !queue_is_rq_based(q))  		goto not_supported;  	if (should_fail_request(&bio->bi_disk->part0, bio->bi_iter.bi_size))  		goto end_io; -	if (blk_partition_remap(bio)) -		goto end_io; +	if (!bio->bi_partno) { +		if (unlikely(bio_check_ro(bio, &bio->bi_disk->part0))) +			goto end_io; +	} else { +		if (blk_partition_remap(bio)) +			goto end_io; +	}  	if (bio_check_eod(bio, nr_sectors))  		goto end_io; @@ -2493,8 +2525,7 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *  		 * bypass a potential scheduler on the bottom device for  		 * insert.  		 
*/ -		blk_mq_request_bypass_insert(rq, true); -		return BLK_STS_OK; +		return blk_mq_request_issue_directly(rq);  	}  	spin_lock_irqsave(q->queue_lock, flags); @@ -2846,7 +2877,7 @@ void blk_start_request(struct request *req)  		wbt_issue(req->q->rq_wb, &req->issue_stat);  	} -	BUG_ON(test_bit(REQ_ATOM_COMPLETE, &req->atomic_flags)); +	BUG_ON(blk_rq_is_complete(req));  	blk_add_timer(req);  }  EXPORT_SYMBOL(blk_start_request); @@ -3415,20 +3446,6 @@ int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork,  }  EXPORT_SYMBOL(kblockd_mod_delayed_work_on); -int kblockd_schedule_delayed_work(struct delayed_work *dwork, -				  unsigned long delay) -{ -	return queue_delayed_work(kblockd_workqueue, dwork, delay); -} -EXPORT_SYMBOL(kblockd_schedule_delayed_work); - -int kblockd_schedule_delayed_work_on(int cpu, struct delayed_work *dwork, -				     unsigned long delay) -{ -	return queue_delayed_work_on(cpu, kblockd_workqueue, dwork, delay); -} -EXPORT_SYMBOL(kblockd_schedule_delayed_work_on); -  /**   * blk_start_plug - initialize blk_plug and track it inside the task_struct   * @plug:	The &struct blk_plug that needs to be initialized diff --git a/block/blk-exec.c b/block/blk-exec.c index 5c0f3dc446dc..f7b292f12449 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c @@ -61,7 +61,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,  	 * be reused after dying flag is set  	 */  	if (q->mq_ops) { -		blk_mq_sched_insert_request(rq, at_head, true, false, false); +		blk_mq_sched_insert_request(rq, at_head, true, false);  		return;  	} diff --git a/block/blk-lib.c b/block/blk-lib.c index 2bc544ce3d2e..a676084d4740 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -37,6 +37,9 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,  	if (!q)  		return -ENXIO; +	if (bdev_read_only(bdev)) +		return -EPERM; +  	if (flags & BLKDEV_DISCARD_SECURE) {  		if (!blk_queue_secure_erase(q))  			return -EOPNOTSUPP; @@ -156,6 +159,9 @@ static int __blkdev_issue_write_same(struct block_device *bdev, sector_t sector,  	if (!q)  		return -ENXIO; +	if (bdev_read_only(bdev)) +		return -EPERM; +  	bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1;  	if ((sector | nr_sects) & bs_mask)  		return -EINVAL; @@ -233,6 +239,9 @@ static int __blkdev_issue_write_zeroes(struct block_device *bdev,  	if (!q)  		return -ENXIO; +	if (bdev_read_only(bdev)) +		return -EPERM; +  	/* Ensure that max_write_zeroes_sectors doesn't overflow bi_size */  	max_write_zeroes_sectors = bdev_write_zeroes_sectors(bdev); @@ -287,6 +296,9 @@ static int __blkdev_issue_zero_pages(struct block_device *bdev,  	if (!q)  		return -ENXIO; +	if (bdev_read_only(bdev)) +		return -EPERM; +  	while (nr_sects != 0) {  		bio = next_bio(bio, __blkdev_sectors_to_bio_pages(nr_sects),  			       gfp_mask); diff --git a/block/blk-map.c b/block/blk-map.c index d3a94719f03f..db9373bd31ac 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -119,7 +119,7 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,  	unsigned long align = q->dma_pad_mask | queue_dma_alignment(q);  	struct bio *bio = NULL;  	struct iov_iter i; -	int ret; +	int ret = -EINVAL;  	if (!iter_is_iovec(iter))  		goto fail; @@ -148,7 +148,7 @@ unmap_rq:  	__blk_rq_unmap_user(bio);  fail:  	rq->bio = NULL; -	return -EINVAL; +	return ret;  }  EXPORT_SYMBOL(blk_rq_map_user_iov); diff --git a/block/blk-merge.c b/block/blk-merge.c index f5dedd57dff6..8452fc7164cc 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -128,9 +128,7 
@@ static struct bio *blk_bio_segment_split(struct request_queue *q,  				nsegs++;  				sectors = max_sectors;  			} -			if (sectors) -				goto split; -			/* Make this single bvec as the 1st segment */ +			goto split;  		}  		if (bvprvp && blk_queue_cluster(q)) { @@ -146,22 +144,21 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,  			bvprvp = &bvprv;  			sectors += bv.bv_len >> 9; -			if (nsegs == 1 && seg_size > front_seg_size) -				front_seg_size = seg_size;  			continue;  		}  new_segment:  		if (nsegs == queue_max_segments(q))  			goto split; +		if (nsegs == 1 && seg_size > front_seg_size) +			front_seg_size = seg_size; +  		nsegs++;  		bvprv = bv;  		bvprvp = &bvprv;  		seg_size = bv.bv_len;  		sectors += bv.bv_len >> 9; -		if (nsegs == 1 && seg_size > front_seg_size) -			front_seg_size = seg_size;  	}  	do_split = false; @@ -174,6 +171,8 @@ split:  			bio = new;  	} +	if (nsegs == 1 && seg_size > front_seg_size) +		front_seg_size = seg_size;  	bio->bi_seg_front_size = front_seg_size;  	if (seg_size > bio->bi_seg_back_size)  		bio->bi_seg_back_size = seg_size; diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index b56a4f35720d..21cbc1f071c6 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -289,17 +289,12 @@ static const char *const rqf_name[] = {  	RQF_NAME(HASHED),  	RQF_NAME(STATS),  	RQF_NAME(SPECIAL_PAYLOAD), +	RQF_NAME(ZONE_WRITE_LOCKED), +	RQF_NAME(MQ_TIMEOUT_EXPIRED), +	RQF_NAME(MQ_POLL_SLEPT),  };  #undef RQF_NAME -#define RQAF_NAME(name) [REQ_ATOM_##name] = #name -static const char *const rqaf_name[] = { -	RQAF_NAME(COMPLETE), -	RQAF_NAME(STARTED), -	RQAF_NAME(POLL_SLEPT), -}; -#undef RQAF_NAME -  int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq)  {  	const struct blk_mq_ops *const mq_ops = rq->q->mq_ops; @@ -316,8 +311,7 @@ int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq)  	seq_puts(m, ", .rq_flags=");  	blk_flags_show(m, (__force unsigned int)rq->rq_flags, rqf_name,  		       ARRAY_SIZE(rqf_name)); -	seq_puts(m, ", .atomic_flags="); -	blk_flags_show(m, rq->atomic_flags, rqaf_name, ARRAY_SIZE(rqaf_name)); +	seq_printf(m, ", complete=%d", blk_rq_is_complete(rq));  	seq_printf(m, ", .tag=%d, .internal_tag=%d", rq->tag,  		   rq->internal_tag);  	if (mq_ops->show_rq) @@ -409,7 +403,7 @@ static void hctx_show_busy_rq(struct request *rq, void *data, bool reserved)  	const struct show_busy_params *params = data;  	if (blk_mq_map_queue(rq->q, rq->mq_ctx->cpu) == params->hctx && -	    test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) +	    blk_mq_rq_state(rq) != MQ_RQ_IDLE)  		__blk_mq_debugfs_rq_show(params->m,  					 list_entry_rq(&rq->queuelist));  } @@ -703,7 +697,11 @@ static ssize_t blk_mq_debugfs_write(struct file *file, const char __user *buf,  	const struct blk_mq_debugfs_attr *attr = m->private;  	void *data = d_inode(file->f_path.dentry->d_parent)->i_private; -	if (!attr->write) +	/* +	 * Attributes that only implement .seq_ops are read-only and 'attr' is +	 * the same with 'data' in this case. 
+	 */ +	if (attr == data || !attr->write)  		return -EPERM;  	return attr->write(data, buf, count, ppos); diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index c117bd8fd1f6..55c0a745b427 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -172,7 +172,6 @@ static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)  	WRITE_ONCE(hctx->dispatch_from, ctx);  } -/* return true if hw queue need to be run again */  void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)  {  	struct request_queue *q = hctx->queue; @@ -428,7 +427,7 @@ done:  }  void blk_mq_sched_insert_request(struct request *rq, bool at_head, -				 bool run_queue, bool async, bool can_block) +				 bool run_queue, bool async)  {  	struct request_queue *q = rq->q;  	struct elevator_queue *e = q->elevator; diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index ba1d1418a96d..1e9c9018ace1 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -18,7 +18,7 @@ bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq);  void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx);  void blk_mq_sched_insert_request(struct request *rq, bool at_head, -				 bool run_queue, bool async, bool can_block); +				 bool run_queue, bool async);  void blk_mq_sched_insert_requests(struct request_queue *q,  				  struct blk_mq_ctx *ctx,  				  struct list_head *list, bool run_queue_async); diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index 79969c3c234f..a54b4b070f1c 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -248,7 +248,7 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx)  	return ret;  } -static void __blk_mq_unregister_dev(struct device *dev, struct request_queue *q) +void blk_mq_unregister_dev(struct device *dev, struct request_queue *q)  {  	struct blk_mq_hw_ctx *hctx;  	int i; @@ -265,13 +265,6 @@ static void __blk_mq_unregister_dev(struct device *dev, struct request_queue *q)  	q->mq_sysfs_init_done = false;  } -void blk_mq_unregister_dev(struct device *dev, struct request_queue *q) -{ -	mutex_lock(&q->sysfs_lock); -	__blk_mq_unregister_dev(dev, q); -	mutex_unlock(&q->sysfs_lock); -} -  void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx)  {  	kobject_init(&hctx->kobj, &blk_mq_hw_ktype); diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index c81b40ecd3f1..336dde07b230 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -134,12 +134,6 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)  	ws = bt_wait_ptr(bt, data->hctx);  	drop_ctx = data->ctx == NULL;  	do { -		prepare_to_wait(&ws->wait, &wait, TASK_UNINTERRUPTIBLE); - -		tag = __blk_mq_get_tag(data, bt); -		if (tag != -1) -			break; -  		/*  		 * We're out of tags on this hardware queue, kick any  		 * pending IO submits before going to sleep waiting for @@ -155,6 +149,13 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)  		if (tag != -1)  			break; +		prepare_to_wait_exclusive(&ws->wait, &wait, +						TASK_UNINTERRUPTIBLE); + +		tag = __blk_mq_get_tag(data, bt); +		if (tag != -1) +			break; +  		if (data->ctx)  			blk_mq_put_ctx(data->ctx); diff --git a/block/blk-mq.c b/block/blk-mq.c index 3d3797327491..01f271d40825 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -95,8 +95,7 @@ static void blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx,  {  	struct mq_inflight *mi = priv; -	if (test_bit(REQ_ATOM_STARTED, &rq->atomic_flags) && -	    !test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) { +	if (blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT) {  		/*  		 * 
index[0] counts the specific partition that was asked  		 * for. index[1] counts the ones that are active on the @@ -222,7 +221,7 @@ void blk_mq_quiesce_queue(struct request_queue *q)  	queue_for_each_hw_ctx(q, hctx, i) {  		if (hctx->flags & BLK_MQ_F_BLOCKING) -			synchronize_srcu(hctx->queue_rq_srcu); +			synchronize_srcu(hctx->srcu);  		else  			rcu = true;  	} @@ -272,15 +271,14 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,  {  	struct blk_mq_tags *tags = blk_mq_tags_from_data(data);  	struct request *rq = tags->static_rqs[tag]; - -	rq->rq_flags = 0; +	req_flags_t rq_flags = 0;  	if (data->flags & BLK_MQ_REQ_INTERNAL) {  		rq->tag = -1;  		rq->internal_tag = tag;  	} else {  		if (blk_mq_tag_busy(data->hctx)) { -			rq->rq_flags = RQF_MQ_INFLIGHT; +			rq_flags = RQF_MQ_INFLIGHT;  			atomic_inc(&data->hctx->nr_active);  		}  		rq->tag = tag; @@ -288,27 +286,22 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,  		data->hctx->tags->rqs[rq->tag] = rq;  	} -	INIT_LIST_HEAD(&rq->queuelist);  	/* csd/requeue_work/fifo_time is initialized before use */  	rq->q = data->q;  	rq->mq_ctx = data->ctx; +	rq->rq_flags = rq_flags; +	rq->cpu = -1;  	rq->cmd_flags = op;  	if (data->flags & BLK_MQ_REQ_PREEMPT)  		rq->rq_flags |= RQF_PREEMPT;  	if (blk_queue_io_stat(data->q))  		rq->rq_flags |= RQF_IO_STAT; -	/* do not touch atomic flags, it needs atomic ops against the timer */ -	rq->cpu = -1; +	INIT_LIST_HEAD(&rq->queuelist);  	INIT_HLIST_NODE(&rq->hash);  	RB_CLEAR_NODE(&rq->rb_node);  	rq->rq_disk = NULL;  	rq->part = NULL;  	rq->start_time = jiffies; -#ifdef CONFIG_BLK_CGROUP -	rq->rl = NULL; -	set_start_time_ns(rq); -	rq->io_start_time_ns = 0; -#endif  	rq->nr_phys_segments = 0;  #if defined(CONFIG_BLK_DEV_INTEGRITY)  	rq->nr_integrity_segments = 0; @@ -316,6 +309,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,  	rq->special = NULL;  	/* tag was already set */  	rq->extra_len = 0; +	rq->__deadline = 0;  	INIT_LIST_HEAD(&rq->timeout_list);  	rq->timeout = 0; @@ -324,6 +318,12 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,  	rq->end_io_data = NULL;  	rq->next_rq = NULL; +#ifdef CONFIG_BLK_CGROUP +	rq->rl = NULL; +	set_start_time_ns(rq); +	rq->io_start_time_ns = 0; +#endif +  	data->ctx->rq_dispatched[op_is_sync(op)]++;  	return rq;  } @@ -443,7 +443,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,  		blk_queue_exit(q);  		return ERR_PTR(-EXDEV);  	} -	cpu = cpumask_first(alloc_data.hctx->cpumask); +	cpu = cpumask_first_and(alloc_data.hctx->cpumask, cpu_online_mask);  	alloc_data.ctx = __blk_mq_get_ctx(q, cpu);  	rq = blk_mq_get_request(q, NULL, op, &alloc_data); @@ -485,8 +485,7 @@ void blk_mq_free_request(struct request *rq)  	if (blk_rq_rl(rq))  		blk_put_rl(blk_rq_rl(rq)); -	clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); -	clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags); +	blk_mq_rq_update_state(rq, MQ_RQ_IDLE);  	if (rq->tag != -1)  		blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag);  	if (sched_tag != -1) @@ -532,6 +531,9 @@ static void __blk_mq_complete_request(struct request *rq)  	bool shared = false;  	int cpu; +	WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IN_FLIGHT); +	blk_mq_rq_update_state(rq, MQ_RQ_COMPLETE); +  	if (rq->internal_tag != -1)  		blk_mq_sched_completed_request(rq);  	if (rq->rq_flags & RQF_STATS) { @@ -559,6 +561,56 @@ static void __blk_mq_complete_request(struct request *rq)  	put_cpu();  } +static void hctx_unlock(struct blk_mq_hw_ctx 
*hctx, int srcu_idx) +	__releases(hctx->srcu) +{ +	if (!(hctx->flags & BLK_MQ_F_BLOCKING)) +		rcu_read_unlock(); +	else +		srcu_read_unlock(hctx->srcu, srcu_idx); +} + +static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx) +	__acquires(hctx->srcu) +{ +	if (!(hctx->flags & BLK_MQ_F_BLOCKING)) { +		/* shut up gcc false positive */ +		*srcu_idx = 0; +		rcu_read_lock(); +	} else +		*srcu_idx = srcu_read_lock(hctx->srcu); +} + +static void blk_mq_rq_update_aborted_gstate(struct request *rq, u64 gstate) +{ +	unsigned long flags; + +	/* +	 * blk_mq_rq_aborted_gstate() is used from the completion path and +	 * can thus be called from irq context.  u64_stats_fetch in the +	 * middle of update on the same CPU leads to lockup.  Disable irq +	 * while updating. +	 */ +	local_irq_save(flags); +	u64_stats_update_begin(&rq->aborted_gstate_sync); +	rq->aborted_gstate = gstate; +	u64_stats_update_end(&rq->aborted_gstate_sync); +	local_irq_restore(flags); +} + +static u64 blk_mq_rq_aborted_gstate(struct request *rq) +{ +	unsigned int start; +	u64 aborted_gstate; + +	do { +		start = u64_stats_fetch_begin(&rq->aborted_gstate_sync); +		aborted_gstate = rq->aborted_gstate; +	} while (u64_stats_fetch_retry(&rq->aborted_gstate_sync, start)); + +	return aborted_gstate; +} +  /**   * blk_mq_complete_request - end I/O on a request   * @rq:		the request being processed @@ -570,17 +622,33 @@ static void __blk_mq_complete_request(struct request *rq)  void blk_mq_complete_request(struct request *rq)  {  	struct request_queue *q = rq->q; +	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, rq->mq_ctx->cpu); +	int srcu_idx;  	if (unlikely(blk_should_fake_timeout(q)))  		return; -	if (!blk_mark_rq_complete(rq)) + +	/* +	 * If @rq->aborted_gstate equals the current instance, timeout is +	 * claiming @rq and we lost.  This is synchronized through +	 * hctx_lock().  See blk_mq_timeout_work() for details. +	 * +	 * Completion path never blocks and we can directly use RCU here +	 * instead of hctx_lock() which can be either RCU or SRCU. +	 * However, that would complicate paths which want to synchronize +	 * against us.  Let stay in sync with the issue path so that +	 * hctx_lock() covers both issue and completion paths. +	 */ +	hctx_lock(hctx, &srcu_idx); +	if (blk_mq_rq_aborted_gstate(rq) != rq->gstate)  		__blk_mq_complete_request(rq); +	hctx_unlock(hctx, srcu_idx);  }  EXPORT_SYMBOL(blk_mq_complete_request);  int blk_mq_request_started(struct request *rq)  { -	return test_bit(REQ_ATOM_STARTED, &rq->atomic_flags); +	return blk_mq_rq_state(rq) != MQ_RQ_IDLE;  }  EXPORT_SYMBOL_GPL(blk_mq_request_started); @@ -598,34 +666,27 @@ void blk_mq_start_request(struct request *rq)  		wbt_issue(q->rq_wb, &rq->issue_stat);  	} -	blk_add_timer(rq); - -	WARN_ON_ONCE(test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)); +	WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE);  	/* -	 * Mark us as started and clear complete. Complete might have been -	 * set if requeue raced with timeout, which then marked it as -	 * complete. So be sure to clear complete again when we start -	 * the request, otherwise we'll ignore the completion event. +	 * Mark @rq in-flight which also advances the generation number, +	 * and register for timeout.  Protect with a seqcount to allow the +	 * timeout path to read both @rq->gstate and @rq->deadline +	 * coherently.  	 * -	 * Ensure that ->deadline is visible before we set STARTED, such that -	 * blk_mq_check_expired() is guaranteed to observe our ->deadline when -	 * it observes STARTED. 
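For readers following the new timeout scheme introduced in this hunk, here is a compact user-space sketch of the generation handshake that the gstate/aborted_gstate fields implement. Every name below is invented for illustration; the sketch deliberately ignores the gstate_seq seqcount, the u64_stats sync and the RCU/SRCU pieces that make the real code safe under concurrency, and only mirrors the comparison logic.

#include <stdbool.h>
#include <stdio.h>

enum {
	ST_IDLE      = 0,
	ST_IN_FLIGHT = 1,
	ST_COMPLETE  = 2,
	ST_BITS      = 2,
	ST_MASK      = (1 << ST_BITS) - 1,
	GEN_INC      = 1 << ST_BITS,
};

struct demo_rq {
	unsigned long gstate;          /* generation in the upper bits, state in the low two */
	unsigned long aborted_gstate;  /* snapshot recorded by the timeout scan */
};

static void demo_start(struct demo_rq *rq)
{
	/* the only IDLE -> IN_FLIGHT transition; it also advances the generation */
	rq->gstate = ((rq->gstate & ~(unsigned long)ST_MASK) + GEN_INC) | ST_IN_FLIGHT;
}

static void demo_timeout_scan(struct demo_rq *rq)
{
	/* the timeout path claims the incarnation it found overdue */
	rq->aborted_gstate = rq->gstate;
}

static bool demo_completion_wins(const struct demo_rq *rq)
{
	/* completion may proceed only if timeout has not claimed this incarnation */
	return rq->aborted_gstate != rq->gstate;
}

int main(void)
{
	struct demo_rq rq = { 0, 0 };

	demo_start(&rq);                /* generation 1, IN_FLIGHT */
	demo_timeout_scan(&rq);         /* timeout claims generation 1 */
	printf("completion wins: %d\n", demo_completion_wins(&rq)); /* 0: timeout owns it */

	demo_start(&rq);                /* request reused: generation 2, IN_FLIGHT */
	printf("completion wins: %d\n", demo_completion_wins(&rq)); /* 1: stale claim ignored */
	return 0;
}

The point of bumping the generation on every IDLE to IN_FLIGHT transition is that a timeout claim recorded against one incarnation of a request can never match a reused incarnation, which is exactly the race the old REQ_ATOM_STARTED/REQ_ATOM_COMPLETE bits could lose.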
+	 * This is the only place where a request is marked in-flight.  If +	 * the timeout path reads an in-flight @rq->gstate, the +	 * @rq->deadline it reads together under @rq->gstate_seq is +	 * guaranteed to be the matching one.  	 */ -	smp_wmb(); -	set_bit(REQ_ATOM_STARTED, &rq->atomic_flags); -	if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) { -		/* -		 * Coherence order guarantees these consecutive stores to a -		 * single variable propagate in the specified order. Thus the -		 * clear_bit() is ordered _after_ the set bit. See -		 * blk_mq_check_expired(). -		 * -		 * (the bits must be part of the same byte for this to be -		 * true). -		 */ -		clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags); -	} +	preempt_disable(); +	write_seqcount_begin(&rq->gstate_seq); + +	blk_mq_rq_update_state(rq, MQ_RQ_IN_FLIGHT); +	blk_add_timer(rq); + +	write_seqcount_end(&rq->gstate_seq); +	preempt_enable();  	if (q->dma_drain_size && blk_rq_bytes(rq)) {  		/* @@ -639,13 +700,9 @@ void blk_mq_start_request(struct request *rq)  EXPORT_SYMBOL(blk_mq_start_request);  /* - * When we reach here because queue is busy, REQ_ATOM_COMPLETE - * flag isn't set yet, so there may be race with timeout handler, - * but given rq->deadline is just set in .queue_rq() under - * this situation, the race won't be possible in reality because - * rq->timeout should be set as big enough to cover the window - * between blk_mq_start_request() called from .queue_rq() and - * clearing REQ_ATOM_STARTED here. + * When we reach here because queue is busy, it's safe to change the state + * to IDLE without checking @rq->aborted_gstate because we should still be + * holding the RCU read lock and thus protected against timeout.   */  static void __blk_mq_requeue_request(struct request *rq)  { @@ -657,7 +714,8 @@ static void __blk_mq_requeue_request(struct request *rq)  	wbt_requeue(q->rq_wb, &rq->issue_stat);  	blk_mq_sched_requeue_request(rq); -	if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) { +	if (blk_mq_rq_state(rq) != MQ_RQ_IDLE) { +		blk_mq_rq_update_state(rq, MQ_RQ_IDLE);  		if (q->dma_drain_size && blk_rq_bytes(rq))  			rq->nr_phys_segments--;  	} @@ -689,13 +747,13 @@ static void blk_mq_requeue_work(struct work_struct *work)  		rq->rq_flags &= ~RQF_SOFTBARRIER;  		list_del_init(&rq->queuelist); -		blk_mq_sched_insert_request(rq, true, false, false, true); +		blk_mq_sched_insert_request(rq, true, false, false);  	}  	while (!list_empty(&rq_list)) {  		rq = list_entry(rq_list.next, struct request, queuelist);  		list_del_init(&rq->queuelist); -		blk_mq_sched_insert_request(rq, false, false, false, true); +		blk_mq_sched_insert_request(rq, false, false, false);  	}  	blk_mq_run_hw_queues(q, false); @@ -729,7 +787,7 @@ EXPORT_SYMBOL(blk_mq_add_to_requeue_list);  void blk_mq_kick_requeue_list(struct request_queue *q)  { -	kblockd_schedule_delayed_work(&q->requeue_work, 0); +	kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work, 0);  }  EXPORT_SYMBOL(blk_mq_kick_requeue_list); @@ -755,24 +813,15 @@ EXPORT_SYMBOL(blk_mq_tag_to_rq);  struct blk_mq_timeout_data {  	unsigned long next;  	unsigned int next_set; +	unsigned int nr_expired;  }; -void blk_mq_rq_timed_out(struct request *req, bool reserved) +static void blk_mq_rq_timed_out(struct request *req, bool reserved)  {  	const struct blk_mq_ops *ops = req->q->mq_ops;  	enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER; -	/* -	 * We know that complete is set at this point. 
If STARTED isn't set -	 * anymore, then the request isn't active and the "timeout" should -	 * just be ignored. This can happen due to the bitflag ordering. -	 * Timeout first checks if STARTED is set, and if it is, assumes -	 * the request is active. But if we race with completion, then -	 * both flags will get cleared. So check here again, and ignore -	 * a timeout event with a request that isn't active. -	 */ -	if (!test_bit(REQ_ATOM_STARTED, &req->atomic_flags)) -		return; +	req->rq_flags |= RQF_MQ_TIMEOUT_EXPIRED;  	if (ops->timeout)  		ret = ops->timeout(req, reserved); @@ -782,8 +831,13 @@ void blk_mq_rq_timed_out(struct request *req, bool reserved)  		__blk_mq_complete_request(req);  		break;  	case BLK_EH_RESET_TIMER: +		/* +		 * As nothing prevents from completion happening while +		 * ->aborted_gstate is set, this may lead to ignored +		 * completions and further spurious timeouts. +		 */ +		blk_mq_rq_update_aborted_gstate(req, 0);  		blk_add_timer(req); -		blk_clear_rq_complete(req);  		break;  	case BLK_EH_NOT_HANDLED:  		break; @@ -797,50 +851,51 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,  		struct request *rq, void *priv, bool reserved)  {  	struct blk_mq_timeout_data *data = priv; -	unsigned long deadline; +	unsigned long gstate, deadline; +	int start; -	if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) -		return; +	might_sleep(); -	/* -	 * Ensures that if we see STARTED we must also see our -	 * up-to-date deadline, see blk_mq_start_request(). -	 */ -	smp_rmb(); +	if (rq->rq_flags & RQF_MQ_TIMEOUT_EXPIRED) +		return; -	deadline = READ_ONCE(rq->deadline); +	/* read coherent snapshots of @rq->state_gen and @rq->deadline */ +	while (true) { +		start = read_seqcount_begin(&rq->gstate_seq); +		gstate = READ_ONCE(rq->gstate); +		deadline = blk_rq_deadline(rq); +		if (!read_seqcount_retry(&rq->gstate_seq, start)) +			break; +		cond_resched(); +	} -	/* -	 * The rq being checked may have been freed and reallocated -	 * out already here, we avoid this race by checking rq->deadline -	 * and REQ_ATOM_COMPLETE flag together: -	 * -	 * - if rq->deadline is observed as new value because of -	 *   reusing, the rq won't be timed out because of timing. -	 * - if rq->deadline is observed as previous value, -	 *   REQ_ATOM_COMPLETE flag won't be cleared in reuse path -	 *   because we put a barrier between setting rq->deadline -	 *   and clearing the flag in blk_mq_start_request(), so -	 *   this rq won't be timed out too. -	 */ -	if (time_after_eq(jiffies, deadline)) { -		if (!blk_mark_rq_complete(rq)) { -			/* -			 * Again coherence order ensures that consecutive reads -			 * from the same variable must be in that order. This -			 * ensures that if we see COMPLETE clear, we must then -			 * see STARTED set and we'll ignore this timeout. -			 * -			 * (There's also the MB implied by the test_and_clear()) -			 */ -			blk_mq_rq_timed_out(rq, reserved); -		} +	/* if in-flight && overdue, mark for abortion */ +	if ((gstate & MQ_RQ_STATE_MASK) == MQ_RQ_IN_FLIGHT && +	    time_after_eq(jiffies, deadline)) { +		blk_mq_rq_update_aborted_gstate(rq, gstate); +		data->nr_expired++; +		hctx->nr_expired++;  	} else if (!data->next_set || time_after(data->next, deadline)) {  		data->next = deadline;  		data->next_set = 1;  	}  } +static void blk_mq_terminate_expired(struct blk_mq_hw_ctx *hctx, +		struct request *rq, void *priv, bool reserved) +{ +	/* +	 * We marked @rq->aborted_gstate and waited for RCU.  
If there were +	 * completions that we lost to, they would have finished and +	 * updated @rq->gstate by now; otherwise, the completion path is +	 * now guaranteed to see @rq->aborted_gstate and yield.  If +	 * @rq->aborted_gstate still matches @rq->gstate, @rq is ours. +	 */ +	if (!(rq->rq_flags & RQF_MQ_TIMEOUT_EXPIRED) && +	    READ_ONCE(rq->gstate) == rq->aborted_gstate) +		blk_mq_rq_timed_out(rq, reserved); +} +  static void blk_mq_timeout_work(struct work_struct *work)  {  	struct request_queue *q = @@ -848,7 +903,9 @@ static void blk_mq_timeout_work(struct work_struct *work)  	struct blk_mq_timeout_data data = {  		.next		= 0,  		.next_set	= 0, +		.nr_expired	= 0,  	}; +	struct blk_mq_hw_ctx *hctx;  	int i;  	/* A deadlock might occur if a request is stuck requiring a @@ -867,14 +924,46 @@ static void blk_mq_timeout_work(struct work_struct *work)  	if (!percpu_ref_tryget(&q->q_usage_counter))  		return; +	/* scan for the expired ones and set their ->aborted_gstate */  	blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data); +	if (data.nr_expired) { +		bool has_rcu = false; + +		/* +		 * Wait till everyone sees ->aborted_gstate.  The +		 * sequential waits for SRCUs aren't ideal.  If this ever +		 * becomes a problem, we can add per-hw_ctx rcu_head and +		 * wait in parallel. +		 */ +		queue_for_each_hw_ctx(q, hctx, i) { +			if (!hctx->nr_expired) +				continue; + +			if (!(hctx->flags & BLK_MQ_F_BLOCKING)) +				has_rcu = true; +			else +				synchronize_srcu(hctx->srcu); + +			hctx->nr_expired = 0; +		} +		if (has_rcu) +			synchronize_rcu(); + +		/* terminate the ones we won */ +		blk_mq_queue_tag_busy_iter(q, blk_mq_terminate_expired, NULL); +	} +  	if (data.next_set) {  		data.next = blk_rq_timeout(round_jiffies_up(data.next));  		mod_timer(&q->timeout, data.next);  	} else { -		struct blk_mq_hw_ctx *hctx; - +		/* +		 * Request timeouts are handled as a forward rolling timer. If +		 * we end up here it means that no requests are pending and +		 * also that no request has been pending for a while. Mark +		 * each hctx as idle. +		 */  		queue_for_each_hw_ctx(q, hctx, i) {  			/* the hctx may be unmapped, so check it here */  			if (blk_mq_hw_queue_mapped(hctx)) @@ -1010,66 +1099,67 @@ static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,  /*   * Mark us waiting for a tag. For shared tags, this involves hooking us into - * the tag wakeups. For non-shared tags, we can simply mark us nedeing a - * restart. For both caes, take care to check the condition again after + * the tag wakeups. For non-shared tags, we can simply mark us needing a + * restart. For both cases, take care to check the condition again after   * marking us as waiting.   
*/  static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx **hctx,  				 struct request *rq)  {  	struct blk_mq_hw_ctx *this_hctx = *hctx; -	bool shared_tags = (this_hctx->flags & BLK_MQ_F_TAG_SHARED) != 0;  	struct sbq_wait_state *ws;  	wait_queue_entry_t *wait;  	bool ret; -	if (!shared_tags) { +	if (!(this_hctx->flags & BLK_MQ_F_TAG_SHARED)) {  		if (!test_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state))  			set_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state); -	} else { -		wait = &this_hctx->dispatch_wait; -		if (!list_empty_careful(&wait->entry)) -			return false; -		spin_lock(&this_hctx->lock); -		if (!list_empty(&wait->entry)) { -			spin_unlock(&this_hctx->lock); -			return false; -		} +		/* +		 * It's possible that a tag was freed in the window between the +		 * allocation failure and adding the hardware queue to the wait +		 * queue. +		 * +		 * Don't clear RESTART here, someone else could have set it. +		 * At most this will cost an extra queue run. +		 */ +		return blk_mq_get_driver_tag(rq, hctx, false); +	} -		ws = bt_wait_ptr(&this_hctx->tags->bitmap_tags, this_hctx); -		add_wait_queue(&ws->wait, wait); +	wait = &this_hctx->dispatch_wait; +	if (!list_empty_careful(&wait->entry)) +		return false; + +	spin_lock(&this_hctx->lock); +	if (!list_empty(&wait->entry)) { +		spin_unlock(&this_hctx->lock); +		return false;  	} +	ws = bt_wait_ptr(&this_hctx->tags->bitmap_tags, this_hctx); +	add_wait_queue(&ws->wait, wait); +  	/*  	 * It's possible that a tag was freed in the window between the  	 * allocation failure and adding the hardware queue to the wait  	 * queue.  	 */  	ret = blk_mq_get_driver_tag(rq, hctx, false); - -	if (!shared_tags) { -		/* -		 * Don't clear RESTART here, someone else could have set it. -		 * At most this will cost an extra queue run. -		 */ -		return ret; -	} else { -		if (!ret) { -			spin_unlock(&this_hctx->lock); -			return false; -		} - -		/* -		 * We got a tag, remove ourselves from the wait queue to ensure -		 * someone else gets the wakeup. -		 */ -		spin_lock_irq(&ws->wait.lock); -		list_del_init(&wait->entry); -		spin_unlock_irq(&ws->wait.lock); +	if (!ret) {  		spin_unlock(&this_hctx->lock); -		return true; +		return false;  	} + +	/* +	 * We got a tag, remove ourselves from the wait queue to ensure +	 * someone else gets the wakeup. +	 */ +	spin_lock_irq(&ws->wait.lock); +	list_del_init(&wait->entry); +	spin_unlock_irq(&ws->wait.lock); +	spin_unlock(&this_hctx->lock); + +	return true;  }  bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, @@ -1206,9 +1296,27 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)  	/*  	 * We should be running this queue from one of the CPUs that  	 * are mapped to it. 
+	 * +	 * There are at least two related races now between setting +	 * hctx->next_cpu from blk_mq_hctx_next_cpu() and running +	 * __blk_mq_run_hw_queue(): +	 * +	 * - hctx->next_cpu is found offline in blk_mq_hctx_next_cpu(), +	 *   but later it becomes online, then this warning is harmless +	 *   at all +	 * +	 * - hctx->next_cpu is found online in blk_mq_hctx_next_cpu(), +	 *   but later it becomes offline, then the warning can't be +	 *   triggered, and we depend on blk-mq timeout handler to +	 *   handle dispatched requests to this hctx  	 */ -	WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) && -		cpu_online(hctx->next_cpu)); +	if (!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) && +		cpu_online(hctx->next_cpu)) { +		printk(KERN_WARNING "run queue from wrong CPU %d, hctx %s\n", +			raw_smp_processor_id(), +			cpumask_empty(hctx->cpumask) ? "inactive": "active"); +		dump_stack(); +	}  	/*  	 * We can't run the queue inline with ints disabled. Ensure that @@ -1216,17 +1324,11 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)  	 */  	WARN_ON_ONCE(in_interrupt()); -	if (!(hctx->flags & BLK_MQ_F_BLOCKING)) { -		rcu_read_lock(); -		blk_mq_sched_dispatch_requests(hctx); -		rcu_read_unlock(); -	} else { -		might_sleep(); +	might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING); -		srcu_idx = srcu_read_lock(hctx->queue_rq_srcu); -		blk_mq_sched_dispatch_requests(hctx); -		srcu_read_unlock(hctx->queue_rq_srcu, srcu_idx); -	} +	hctx_lock(hctx, &srcu_idx); +	blk_mq_sched_dispatch_requests(hctx); +	hctx_unlock(hctx, srcu_idx);  }  /* @@ -1237,20 +1339,47 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)   */  static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)  { +	bool tried = false; +  	if (hctx->queue->nr_hw_queues == 1)  		return WORK_CPU_UNBOUND;  	if (--hctx->next_cpu_batch <= 0) {  		int next_cpu; - -		next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask); +select_cpu: +		next_cpu = cpumask_next_and(hctx->next_cpu, hctx->cpumask, +				cpu_online_mask);  		if (next_cpu >= nr_cpu_ids) -			next_cpu = cpumask_first(hctx->cpumask); +			next_cpu = cpumask_first_and(hctx->cpumask,cpu_online_mask); -		hctx->next_cpu = next_cpu; +		/* +		 * No online CPU is found, so have to make sure hctx->next_cpu +		 * is set correctly for not breaking workqueue. +		 */ +		if (next_cpu >= nr_cpu_ids) +			hctx->next_cpu = cpumask_first(hctx->cpumask); +		else +			hctx->next_cpu = next_cpu;  		hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;  	} +	/* +	 * Do unbound schedule if we can't find a online CPU for this hctx, +	 * and it should only happen in the path of handling CPU DEAD. +	 */ +	if (!cpu_online(hctx->next_cpu)) { +		if (!tried) { +			tried = true; +			goto select_cpu; +		} + +		/* +		 * Make sure to re-select CPU next time once after CPUs +		 * in hctx->cpumask become online again. 
+		 */ +		hctx->next_cpu_batch = 1; +		return WORK_CPU_UNBOUND; +	}  	return hctx->next_cpu;  } @@ -1274,9 +1403,8 @@ static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async,  		put_cpu();  	} -	kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx), -					 &hctx->run_work, -					 msecs_to_jiffies(msecs)); +	kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work, +				    msecs_to_jiffies(msecs));  }  void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) @@ -1287,7 +1415,23 @@ EXPORT_SYMBOL(blk_mq_delay_run_hw_queue);  bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)  { -	if (blk_mq_hctx_has_pending(hctx)) { +	int srcu_idx; +	bool need_run; + +	/* +	 * When queue is quiesced, we may be switching io scheduler, or +	 * updating nr_hw_queues, or other things, and we can't run queue +	 * any more, even __blk_mq_hctx_has_pending() can't be called safely. +	 * +	 * And queue will be rerun in blk_mq_unquiesce_queue() if it is +	 * quiesced. +	 */ +	hctx_lock(hctx, &srcu_idx); +	need_run = !blk_queue_quiesced(hctx->queue) && +		blk_mq_hctx_has_pending(hctx); +	hctx_unlock(hctx, srcu_idx); + +	if (need_run) {  		__blk_mq_delay_run_hw_queue(hctx, async, 0);  		return true;  	} @@ -1595,9 +1739,9 @@ static blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, struct request *rq)  	return blk_tag_to_qc_t(rq->internal_tag, hctx->queue_num, true);  } -static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, -					struct request *rq, -					blk_qc_t *cookie, bool may_sleep) +static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx, +					    struct request *rq, +					    blk_qc_t *cookie)  {  	struct request_queue *q = rq->q;  	struct blk_mq_queue_data bd = { @@ -1606,15 +1750,52 @@ static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,  	};  	blk_qc_t new_cookie;  	blk_status_t ret; + +	new_cookie = request_to_qc_t(hctx, rq); + +	/* +	 * For OK queue, we are done. For error, caller may kill it. +	 * Any other error (busy), just add it to our list as we +	 * previously would have done. +	 */ +	ret = q->mq_ops->queue_rq(hctx, &bd); +	switch (ret) { +	case BLK_STS_OK: +		*cookie = new_cookie; +		break; +	case BLK_STS_RESOURCE: +		__blk_mq_requeue_request(rq); +		break; +	default: +		*cookie = BLK_QC_T_NONE; +		break; +	} + +	return ret; +} + +static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, +						struct request *rq, +						blk_qc_t *cookie, +						bool bypass_insert) +{ +	struct request_queue *q = rq->q;  	bool run_queue = true; -	/* RCU or SRCU read lock is needed before checking quiesced flag */ +	/* +	 * RCU or SRCU read lock is needed before checking quiesced flag. +	 * +	 * When queue is stopped or quiesced, ignore 'bypass_insert' from +	 * blk_mq_request_issue_directly(), and return BLK_STS_OK to caller, +	 * and avoid driver to try to dispatch again. +	 */  	if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)) {  		run_queue = false; +		bypass_insert = false;  		goto insert;  	} -	if (q->elevator) +	if (q->elevator && !bypass_insert)  		goto insert;  	if (!blk_mq_get_driver_tag(rq, NULL, false)) @@ -1625,47 +1806,47 @@ static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,  		goto insert;  	} -	new_cookie = request_to_qc_t(hctx, rq); - -	/* -	 * For OK queue, we are done. For error, kill it. 
Any other -	 * error (busy), just add it to our list as we previously -	 * would have done -	 */ -	ret = q->mq_ops->queue_rq(hctx, &bd); -	switch (ret) { -	case BLK_STS_OK: -		*cookie = new_cookie; -		return; -	case BLK_STS_RESOURCE: -		__blk_mq_requeue_request(rq); -		goto insert; -	default: -		*cookie = BLK_QC_T_NONE; -		blk_mq_end_request(rq, ret); -		return; -	} - +	return __blk_mq_issue_directly(hctx, rq, cookie);  insert: -	blk_mq_sched_insert_request(rq, false, run_queue, false, may_sleep); +	if (bypass_insert) +		return BLK_STS_RESOURCE; + +	blk_mq_sched_insert_request(rq, false, run_queue, false); +	return BLK_STS_OK;  }  static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,  		struct request *rq, blk_qc_t *cookie)  { -	if (!(hctx->flags & BLK_MQ_F_BLOCKING)) { -		rcu_read_lock(); -		__blk_mq_try_issue_directly(hctx, rq, cookie, false); -		rcu_read_unlock(); -	} else { -		unsigned int srcu_idx; +	blk_status_t ret; +	int srcu_idx; -		might_sleep(); +	might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING); -		srcu_idx = srcu_read_lock(hctx->queue_rq_srcu); -		__blk_mq_try_issue_directly(hctx, rq, cookie, true); -		srcu_read_unlock(hctx->queue_rq_srcu, srcu_idx); -	} +	hctx_lock(hctx, &srcu_idx); + +	ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false); +	if (ret == BLK_STS_RESOURCE) +		blk_mq_sched_insert_request(rq, false, true, false); +	else if (ret != BLK_STS_OK) +		blk_mq_end_request(rq, ret); + +	hctx_unlock(hctx, srcu_idx); +} + +blk_status_t blk_mq_request_issue_directly(struct request *rq) +{ +	blk_status_t ret; +	int srcu_idx; +	blk_qc_t unused_cookie; +	struct blk_mq_ctx *ctx = rq->mq_ctx; +	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu); + +	hctx_lock(hctx, &srcu_idx); +	ret = __blk_mq_try_issue_directly(hctx, rq, &unused_cookie, true); +	hctx_unlock(hctx, srcu_idx); + +	return ret;  }  static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) @@ -1776,7 +1957,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)  	} else if (q->elevator) {  		blk_mq_put_ctx(data.ctx);  		blk_mq_bio_to_request(rq, bio); -		blk_mq_sched_insert_request(rq, false, true, true, true); +		blk_mq_sched_insert_request(rq, false, true, true);  	} else {  		blk_mq_put_ctx(data.ctx);  		blk_mq_bio_to_request(rq, bio); @@ -1869,6 +2050,22 @@ static size_t order_to_size(unsigned int order)  	return (size_t)PAGE_SIZE << order;  } +static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq, +			       unsigned int hctx_idx, int node) +{ +	int ret; + +	if (set->ops->init_request) { +		ret = set->ops->init_request(set, rq, hctx_idx, node); +		if (ret) +			return ret; +	} + +	seqcount_init(&rq->gstate_seq); +	u64_stats_init(&rq->aborted_gstate_sync); +	return 0; +} +  int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,  		     unsigned int hctx_idx, unsigned int depth)  { @@ -1930,12 +2127,9 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,  			struct request *rq = p;  			tags->static_rqs[i] = rq; -			if (set->ops->init_request) { -				if (set->ops->init_request(set, rq, hctx_idx, -						node)) { -					tags->static_rqs[i] = NULL; -					goto fail; -				} +			if (blk_mq_init_request(set, rq, hctx_idx, node)) { +				tags->static_rqs[i] = NULL; +				goto fail;  			}  			p += rq_size; @@ -1994,7 +2188,8 @@ static void blk_mq_exit_hctx(struct request_queue *q,  {  	blk_mq_debugfs_unregister_hctx(hctx); -	blk_mq_tag_idle(hctx); +	if (blk_mq_hw_queue_mapped(hctx)) +		
blk_mq_tag_idle(hctx);  	if (set->ops->exit_request)  		set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx); @@ -2005,7 +2200,7 @@ static void blk_mq_exit_hctx(struct request_queue *q,  		set->ops->exit_hctx(hctx, hctx_idx);  	if (hctx->flags & BLK_MQ_F_BLOCKING) -		cleanup_srcu_struct(hctx->queue_rq_srcu); +		cleanup_srcu_struct(hctx->srcu);  	blk_mq_remove_cpuhp(hctx);  	blk_free_flush_queue(hctx->fq); @@ -2074,13 +2269,11 @@ static int blk_mq_init_hctx(struct request_queue *q,  	if (!hctx->fq)  		goto sched_exit_hctx; -	if (set->ops->init_request && -	    set->ops->init_request(set, hctx->fq->flush_rq, hctx_idx, -				   node)) +	if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx, node))  		goto free_fq;  	if (hctx->flags & BLK_MQ_F_BLOCKING) -		init_srcu_struct(hctx->queue_rq_srcu); +		init_srcu_struct(hctx->srcu);  	blk_mq_debugfs_register_hctx(q, hctx); @@ -2116,16 +2309,11 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,  		INIT_LIST_HEAD(&__ctx->rq_list);  		__ctx->queue = q; -		/* If the cpu isn't present, the cpu is mapped to first hctx */ -		if (!cpu_present(i)) -			continue; - -		hctx = blk_mq_map_queue(q, i); -  		/*  		 * Set local node, IFF we have more than one hw queue. If  		 * not, we remain on the home node of the device  		 */ +		hctx = blk_mq_map_queue(q, i);  		if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)  			hctx->numa_node = local_memory_node(cpu_to_node(i));  	} @@ -2182,7 +2370,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)  	 *  	 * If the cpu isn't present, the cpu is mapped to first hctx.  	 */ -	for_each_present_cpu(i) { +	for_each_possible_cpu(i) {  		hctx_idx = q->mq_map[i];  		/* unmapped hw queue can be remapped after CPU topo changed */  		if (!set->tags[hctx_idx] && @@ -2236,7 +2424,8 @@ static void blk_mq_map_swqueue(struct request_queue *q)  		/*  		 * Initialize batch roundrobin counts  		 */ -		hctx->next_cpu = cpumask_first(hctx->cpumask); +		hctx->next_cpu = cpumask_first_and(hctx->cpumask, +				cpu_online_mask);  		hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;  	}  } @@ -2369,7 +2558,7 @@ static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set)  {  	int hw_ctx_size = sizeof(struct blk_mq_hw_ctx); -	BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, queue_rq_srcu), +	BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, srcu),  			   __alignof__(struct blk_mq_hw_ctx)) !=  		     sizeof(struct blk_mq_hw_ctx)); @@ -2386,6 +2575,9 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,  	struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx;  	blk_mq_sysfs_unregister(q); + +	/* protect against switching io scheduler  */ +	mutex_lock(&q->sysfs_lock);  	for (i = 0; i < set->nr_hw_queues; i++) {  		int node; @@ -2430,6 +2622,7 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,  		}  	}  	q->nr_hw_queues = i; +	mutex_unlock(&q->sysfs_lock);  	blk_mq_sysfs_register(q);  } @@ -2601,9 +2794,27 @@ static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)  static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)  { -	if (set->ops->map_queues) +	if (set->ops->map_queues) { +		int cpu; +		/* +		 * transport .map_queues is usually done in the following +		 * way: +		 * +		 * for (queue = 0; queue < set->nr_hw_queues; queue++) { +		 * 	mask = get_cpu_mask(queue) +		 * 	for_each_cpu(cpu, mask) +		 * 		set->mq_map[cpu] = queue; +		 * } +		 * +		 * When we need to remap, the table has to be cleared for +		 * killing stale mapping since one CPU may not be mapped +		 * to any hw queue. 
+		 */ +		for_each_possible_cpu(cpu) +			set->mq_map[cpu] = 0; +  		return set->ops->map_queues(set); -	else +	} else  		return blk_mq_map_queues(set);  } @@ -2712,6 +2923,7 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)  		return -EINVAL;  	blk_mq_freeze_queue(q); +	blk_mq_quiesce_queue(q);  	ret = 0;  	queue_for_each_hw_ctx(q, hctx, i) { @@ -2735,6 +2947,7 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)  	if (!ret)  		q->nr_requests = nr; +	blk_mq_unquiesce_queue(q);  	blk_mq_unfreeze_queue(q);  	return ret; @@ -2850,7 +3063,7 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,  	unsigned int nsecs;  	ktime_t kt; -	if (test_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags)) +	if (rq->rq_flags & RQF_MQ_POLL_SLEPT)  		return false;  	/* @@ -2870,7 +3083,7 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,  	if (!nsecs)  		return false; -	set_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags); +	rq->rq_flags |= RQF_MQ_POLL_SLEPT;  	/*  	 * This will be replaced with the stats tracking code, using @@ -2884,7 +3097,7 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,  	hrtimer_init_sleeper(&hs, current);  	do { -		if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) +		if (blk_mq_rq_state(rq) == MQ_RQ_COMPLETE)  			break;  		set_current_state(TASK_UNINTERRUPTIBLE);  		hrtimer_start_expires(&hs.timer, mode); @@ -2970,12 +3183,6 @@ static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie)  static int __init blk_mq_init(void)  { -	/* -	 * See comment in block/blk.h rq_atomic_flags enum -	 */ -	BUILD_BUG_ON((REQ_ATOM_STARTED / BITS_PER_BYTE) != -			(REQ_ATOM_COMPLETE / BITS_PER_BYTE)); -  	cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL,  				blk_mq_hctx_notify_dead);  	return 0; diff --git a/block/blk-mq.h b/block/blk-mq.h index 6c7c3ff5bf62..88c558f71819 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -27,6 +27,20 @@ struct blk_mq_ctx {  	struct kobject		kobj;  } ____cacheline_aligned_in_smp; +/* + * Bits for request->gstate.  The lower two bits carry MQ_RQ_* state value + * and the upper bits the generation number. + */ +enum mq_rq_state { +	MQ_RQ_IDLE		= 0, +	MQ_RQ_IN_FLIGHT		= 1, +	MQ_RQ_COMPLETE		= 2, + +	MQ_RQ_STATE_BITS	= 2, +	MQ_RQ_STATE_MASK	= (1 << MQ_RQ_STATE_BITS) - 1, +	MQ_RQ_GEN_INC		= 1 << MQ_RQ_STATE_BITS, +}; +  void blk_mq_freeze_queue(struct request_queue *q);  void blk_mq_free_queue(struct request_queue *q);  int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); @@ -60,6 +74,9 @@ void blk_mq_request_bypass_insert(struct request *rq, bool run_queue);  void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,  				struct list_head *list); +/* Used by blk_insert_cloned_request() to issue request directly */ +blk_status_t blk_mq_request_issue_directly(struct request *rq); +  /*   * CPU -> queue mappings   */ @@ -81,10 +98,41 @@ extern int blk_mq_sysfs_register(struct request_queue *q);  extern void blk_mq_sysfs_unregister(struct request_queue *q);  extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx); -extern void blk_mq_rq_timed_out(struct request *req, bool reserved); -  void blk_mq_release(struct request_queue *q); +/** + * blk_mq_rq_state() - read the current MQ_RQ_* state of a request + * @rq: target request. 
+ */ +static inline int blk_mq_rq_state(struct request *rq) +{ +	return READ_ONCE(rq->gstate) & MQ_RQ_STATE_MASK; +} + +/** + * blk_mq_rq_update_state() - set the current MQ_RQ_* state of a request + * @rq: target request. + * @state: new state to set. + * + * Set @rq's state to @state.  The caller is responsible for ensuring that + * there are no other updaters.  A request can transition into IN_FLIGHT + * only from IDLE and doing so increments the generation number. + */ +static inline void blk_mq_rq_update_state(struct request *rq, +					  enum mq_rq_state state) +{ +	u64 old_val = READ_ONCE(rq->gstate); +	u64 new_val = (old_val & ~MQ_RQ_STATE_MASK) | state; + +	if (state == MQ_RQ_IN_FLIGHT) { +		WARN_ON_ONCE((old_val & MQ_RQ_STATE_MASK) != MQ_RQ_IDLE); +		new_val += MQ_RQ_GEN_INC; +	} + +	/* avoid exposing interim values */ +	WRITE_ONCE(rq->gstate, new_val); +} +  static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,  					   unsigned int cpu)  { diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 870484eaed1f..cbea895a5547 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -853,6 +853,10 @@ struct kobj_type blk_queue_ktype = {  	.release	= blk_release_queue,  }; +/** + * blk_register_queue - register a block layer queue with sysfs + * @disk: Disk of which the request queue should be registered with sysfs. + */  int blk_register_queue(struct gendisk *disk)  {  	int ret; @@ -909,11 +913,12 @@ int blk_register_queue(struct gendisk *disk)  	if (q->request_fn || (q->mq_ops && q->elevator)) {  		ret = elv_register_queue(q);  		if (ret) { +			mutex_unlock(&q->sysfs_lock);  			kobject_uevent(&q->kobj, KOBJ_REMOVE);  			kobject_del(&q->kobj);  			blk_trace_remove_sysfs(dev);  			kobject_put(&dev->kobj); -			goto unlock; +			return ret;  		}  	}  	ret = 0; @@ -921,7 +926,15 @@ unlock:  	mutex_unlock(&q->sysfs_lock);  	return ret;  } +EXPORT_SYMBOL_GPL(blk_register_queue); +/** + * blk_unregister_queue - counterpart of blk_register_queue() + * @disk: Disk of which the request queue should be unregistered from sysfs. + * + * Note: the caller is responsible for guaranteeing that this function is called + * after blk_register_queue() has finished. + */  void blk_unregister_queue(struct gendisk *disk)  {  	struct request_queue *q = disk->queue; @@ -929,21 +942,39 @@ void blk_unregister_queue(struct gendisk *disk)  	if (WARN_ON(!q))  		return; -	mutex_lock(&q->sysfs_lock); -	queue_flag_clear_unlocked(QUEUE_FLAG_REGISTERED, q); -	mutex_unlock(&q->sysfs_lock); +	/* Return early if disk->queue was never registered. */ +	if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags)) +		return; -	wbt_exit(q); +	/* +	 * Since sysfs_remove_dir() prevents adding new directory entries +	 * before removal of existing entries starts, protect against +	 * concurrent elv_iosched_store() calls. +	 */ +	mutex_lock(&q->sysfs_lock); +	spin_lock_irq(q->queue_lock); +	queue_flag_clear(QUEUE_FLAG_REGISTERED, q); +	spin_unlock_irq(q->queue_lock); +	/* +	 * Remove the sysfs attributes before unregistering the queue data +	 * structures that can be modified through sysfs. 
+	 */  	if (q->mq_ops)  		blk_mq_unregister_dev(disk_to_dev(disk), q); - -	if (q->request_fn || (q->mq_ops && q->elevator)) -		elv_unregister_queue(q); +	mutex_unlock(&q->sysfs_lock);  	kobject_uevent(&q->kobj, KOBJ_REMOVE);  	kobject_del(&q->kobj);  	blk_trace_remove_sysfs(disk_to_dev(disk)); + +	wbt_exit(q); + +	mutex_lock(&q->sysfs_lock); +	if (q->request_fn || (q->mq_ops && q->elevator)) +		elv_unregister_queue(q); +	mutex_unlock(&q->sysfs_lock); +  	kobject_put(&disk_to_dev(disk)->kobj);  } diff --git a/block/blk-throttle.c b/block/blk-throttle.c index d19f416d6101..c5a131673733 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -216,9 +216,9 @@ struct throtl_data  	unsigned int scale; -	struct latency_bucket tmp_buckets[LATENCY_BUCKET_SIZE]; -	struct avg_latency_bucket avg_buckets[LATENCY_BUCKET_SIZE]; -	struct latency_bucket __percpu *latency_buckets; +	struct latency_bucket tmp_buckets[2][LATENCY_BUCKET_SIZE]; +	struct avg_latency_bucket avg_buckets[2][LATENCY_BUCKET_SIZE]; +	struct latency_bucket __percpu *latency_buckets[2];  	unsigned long last_calculate_time;  	unsigned long filtered_latency; @@ -1511,10 +1511,20 @@ static struct cftype throtl_legacy_files[] = {  		.seq_show = blkg_print_stat_bytes,  	},  	{ +		.name = "throttle.io_service_bytes_recursive", +		.private = (unsigned long)&blkcg_policy_throtl, +		.seq_show = blkg_print_stat_bytes_recursive, +	}, +	{  		.name = "throttle.io_serviced",  		.private = (unsigned long)&blkcg_policy_throtl,  		.seq_show = blkg_print_stat_ios,  	}, +	{ +		.name = "throttle.io_serviced_recursive", +		.private = (unsigned long)&blkcg_policy_throtl, +		.seq_show = blkg_print_stat_ios_recursive, +	},  	{ }	/* terminate */  }; @@ -2040,10 +2050,10 @@ static void blk_throtl_update_idletime(struct throtl_grp *tg)  #ifdef CONFIG_BLK_DEV_THROTTLING_LOW  static void throtl_update_latency_buckets(struct throtl_data *td)  { -	struct avg_latency_bucket avg_latency[LATENCY_BUCKET_SIZE]; -	int i, cpu; -	unsigned long last_latency = 0; -	unsigned long latency; +	struct avg_latency_bucket avg_latency[2][LATENCY_BUCKET_SIZE]; +	int i, cpu, rw; +	unsigned long last_latency[2] = { 0 }; +	unsigned long latency[2];  	if (!blk_queue_nonrot(td->queue))  		return; @@ -2052,56 +2062,67 @@ static void throtl_update_latency_buckets(struct throtl_data *td)  	td->last_calculate_time = jiffies;  	memset(avg_latency, 0, sizeof(avg_latency)); -	for (i = 0; i < LATENCY_BUCKET_SIZE; i++) { -		struct latency_bucket *tmp = &td->tmp_buckets[i]; - -		for_each_possible_cpu(cpu) { -			struct latency_bucket *bucket; - -			/* this isn't race free, but ok in practice */ -			bucket = per_cpu_ptr(td->latency_buckets, cpu); -			tmp->total_latency += bucket[i].total_latency; -			tmp->samples += bucket[i].samples; -			bucket[i].total_latency = 0; -			bucket[i].samples = 0; -		} +	for (rw = READ; rw <= WRITE; rw++) { +		for (i = 0; i < LATENCY_BUCKET_SIZE; i++) { +			struct latency_bucket *tmp = &td->tmp_buckets[rw][i]; + +			for_each_possible_cpu(cpu) { +				struct latency_bucket *bucket; + +				/* this isn't race free, but ok in practice */ +				bucket = per_cpu_ptr(td->latency_buckets[rw], +					cpu); +				tmp->total_latency += bucket[i].total_latency; +				tmp->samples += bucket[i].samples; +				bucket[i].total_latency = 0; +				bucket[i].samples = 0; +			} -		if (tmp->samples >= 32) { -			int samples = tmp->samples; +			if (tmp->samples >= 32) { +				int samples = tmp->samples; -			latency = tmp->total_latency; +				latency[rw] = tmp->total_latency; -			
tmp->total_latency = 0; -			tmp->samples = 0; -			latency /= samples; -			if (latency == 0) -				continue; -			avg_latency[i].latency = latency; +				tmp->total_latency = 0; +				tmp->samples = 0; +				latency[rw] /= samples; +				if (latency[rw] == 0) +					continue; +				avg_latency[rw][i].latency = latency[rw]; +			}  		}  	} -	for (i = 0; i < LATENCY_BUCKET_SIZE; i++) { -		if (!avg_latency[i].latency) { -			if (td->avg_buckets[i].latency < last_latency) -				td->avg_buckets[i].latency = last_latency; -			continue; -		} +	for (rw = READ; rw <= WRITE; rw++) { +		for (i = 0; i < LATENCY_BUCKET_SIZE; i++) { +			if (!avg_latency[rw][i].latency) { +				if (td->avg_buckets[rw][i].latency < last_latency[rw]) +					td->avg_buckets[rw][i].latency = +						last_latency[rw]; +				continue; +			} -		if (!td->avg_buckets[i].valid) -			latency = avg_latency[i].latency; -		else -			latency = (td->avg_buckets[i].latency * 7 + -				avg_latency[i].latency) >> 3; +			if (!td->avg_buckets[rw][i].valid) +				latency[rw] = avg_latency[rw][i].latency; +			else +				latency[rw] = (td->avg_buckets[rw][i].latency * 7 + +					avg_latency[rw][i].latency) >> 3; -		td->avg_buckets[i].latency = max(latency, last_latency); -		td->avg_buckets[i].valid = true; -		last_latency = td->avg_buckets[i].latency; +			td->avg_buckets[rw][i].latency = max(latency[rw], +				last_latency[rw]); +			td->avg_buckets[rw][i].valid = true; +			last_latency[rw] = td->avg_buckets[rw][i].latency; +		}  	}  	for (i = 0; i < LATENCY_BUCKET_SIZE; i++)  		throtl_log(&td->service_queue, -			"Latency bucket %d: latency=%ld, valid=%d", i, -			td->avg_buckets[i].latency, td->avg_buckets[i].valid); +			"Latency bucket %d: read latency=%ld, read valid=%d, " +			"write latency=%ld, write valid=%d", i, +			td->avg_buckets[READ][i].latency, +			td->avg_buckets[READ][i].valid, +			td->avg_buckets[WRITE][i].latency, +			td->avg_buckets[WRITE][i].valid);  }  #else  static inline void throtl_update_latency_buckets(struct throtl_data *td) @@ -2242,16 +2263,17 @@ static void throtl_track_latency(struct throtl_data *td, sector_t size,  	struct latency_bucket *latency;  	int index; -	if (!td || td->limit_index != LIMIT_LOW || op != REQ_OP_READ || +	if (!td || td->limit_index != LIMIT_LOW || +	    !(op == REQ_OP_READ || op == REQ_OP_WRITE) ||  	    !blk_queue_nonrot(td->queue))  		return;  	index = request_bucket_index(size); -	latency = get_cpu_ptr(td->latency_buckets); +	latency = get_cpu_ptr(td->latency_buckets[op]);  	latency[index].total_latency += time;  	latency[index].samples++; -	put_cpu_ptr(td->latency_buckets); +	put_cpu_ptr(td->latency_buckets[op]);  }  void blk_throtl_stat_add(struct request *rq, u64 time_ns) @@ -2270,6 +2292,7 @@ void blk_throtl_bio_endio(struct bio *bio)  	unsigned long finish_time;  	unsigned long start_time;  	unsigned long lat; +	int rw = bio_data_dir(bio);  	tg = bio->bi_cg_private;  	if (!tg) @@ -2298,7 +2321,7 @@ void blk_throtl_bio_endio(struct bio *bio)  		bucket = request_bucket_index(  			blk_stat_size(&bio->bi_issue_stat)); -		threshold = tg->td->avg_buckets[bucket].latency + +		threshold = tg->td->avg_buckets[rw][bucket].latency +  			tg->latency_target;  		if (lat > threshold)  			tg->bad_bio_cnt++; @@ -2391,9 +2414,16 @@ int blk_throtl_init(struct request_queue *q)  	td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);  	if (!td)  		return -ENOMEM; -	td->latency_buckets = __alloc_percpu(sizeof(struct latency_bucket) * +	td->latency_buckets[READ] = __alloc_percpu(sizeof(struct latency_bucket) *  		LATENCY_BUCKET_SIZE, 
__alignof__(u64)); -	if (!td->latency_buckets) { +	if (!td->latency_buckets[READ]) { +		kfree(td); +		return -ENOMEM; +	} +	td->latency_buckets[WRITE] = __alloc_percpu(sizeof(struct latency_bucket) * +		LATENCY_BUCKET_SIZE, __alignof__(u64)); +	if (!td->latency_buckets[WRITE]) { +		free_percpu(td->latency_buckets[READ]);  		kfree(td);  		return -ENOMEM;  	} @@ -2412,7 +2442,8 @@ int blk_throtl_init(struct request_queue *q)  	/* activate policy */  	ret = blkcg_activate_policy(q, &blkcg_policy_throtl);  	if (ret) { -		free_percpu(td->latency_buckets); +		free_percpu(td->latency_buckets[READ]); +		free_percpu(td->latency_buckets[WRITE]);  		kfree(td);  	}  	return ret; @@ -2423,7 +2454,8 @@ void blk_throtl_exit(struct request_queue *q)  	BUG_ON(!q->td);  	throtl_shutdown_wq(q);  	blkcg_deactivate_policy(q, &blkcg_policy_throtl); -	free_percpu(q->td->latency_buckets); +	free_percpu(q->td->latency_buckets[READ]); +	free_percpu(q->td->latency_buckets[WRITE]);  	kfree(q->td);  } @@ -2441,15 +2473,17 @@ void blk_throtl_register_queue(struct request_queue *q)  	} else {  		td->throtl_slice = DFL_THROTL_SLICE_HD;  		td->filtered_latency = LATENCY_FILTERED_HD; -		for (i = 0; i < LATENCY_BUCKET_SIZE; i++) -			td->avg_buckets[i].latency = DFL_HD_BASELINE_LATENCY; +		for (i = 0; i < LATENCY_BUCKET_SIZE; i++) { +			td->avg_buckets[READ][i].latency = DFL_HD_BASELINE_LATENCY; +			td->avg_buckets[WRITE][i].latency = DFL_HD_BASELINE_LATENCY; +		}  	}  #ifndef CONFIG_BLK_DEV_THROTTLING_LOW  	/* if no low limit, use previous default */  	td->throtl_slice = DFL_THROTL_SLICE_HD;  #endif -	td->track_bio_latency = !q->mq_ops && !q->request_fn; +	td->track_bio_latency = !queue_is_rq_based(q);  	if (!td->track_bio_latency)  		blk_stat_enable_accounting(q);  } diff --git a/block/blk-timeout.c b/block/blk-timeout.c index 764ecf9aeb30..a05e3676d24a 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -112,7 +112,9 @@ static void blk_rq_timed_out(struct request *req)  static void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout,  			  unsigned int *next_set)  { -	if (time_after_eq(jiffies, rq->deadline)) { +	const unsigned long deadline = blk_rq_deadline(rq); + +	if (time_after_eq(jiffies, deadline)) {  		list_del_init(&rq->timeout_list);  		/* @@ -120,8 +122,8 @@ static void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout  		 */  		if (!blk_mark_rq_complete(rq))  			blk_rq_timed_out(rq); -	} else if (!*next_set || time_after(*next_timeout, rq->deadline)) { -		*next_timeout = rq->deadline; +	} else if (!*next_set || time_after(*next_timeout, deadline)) { +		*next_timeout = deadline;  		*next_set = 1;  	}  } @@ -156,12 +158,17 @@ void blk_timeout_work(struct work_struct *work)   */  void blk_abort_request(struct request *req)  { -	if (blk_mark_rq_complete(req)) -		return; -  	if (req->q->mq_ops) { -		blk_mq_rq_timed_out(req, false); +		/* +		 * All we need to ensure is that timeout scan takes place +		 * immediately and that scan sees the new timeout value. +		 * No need for fancy synchronizations. 
+		 */ +		blk_rq_set_deadline(req, jiffies); +		mod_timer(&req->q->timeout, 0);  	} else { +		if (blk_mark_rq_complete(req)) +			return;  		blk_delete_timer(req);  		blk_rq_timed_out(req);  	} @@ -208,7 +215,8 @@ void blk_add_timer(struct request *req)  	if (!req->timeout)  		req->timeout = q->rq_timeout; -	WRITE_ONCE(req->deadline, jiffies + req->timeout); +	blk_rq_set_deadline(req, jiffies + req->timeout); +	req->rq_flags &= ~RQF_MQ_TIMEOUT_EXPIRED;  	/*  	 * Only the non-mq case needs to add the request to a protected list. @@ -222,7 +230,7 @@ void blk_add_timer(struct request *req)  	 * than an existing one, modify the timer. Round up to next nearest  	 * second.  	 */ -	expiry = blk_rq_timeout(round_jiffies_up(req->deadline)); +	expiry = blk_rq_timeout(round_jiffies_up(blk_rq_deadline(req)));  	if (!timer_pending(&q->timeout) ||  	    time_before(expiry, q->timeout.expires)) { diff --git a/block/blk-zoned.c b/block/blk-zoned.c index ff57fb51b338..acb7252c7e81 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -22,6 +22,48 @@ static inline sector_t blk_zone_start(struct request_queue *q,  }  /* + * Return true if a request is a write requests that needs zone write locking. + */ +bool blk_req_needs_zone_write_lock(struct request *rq) +{ +	if (!rq->q->seq_zones_wlock) +		return false; + +	if (blk_rq_is_passthrough(rq)) +		return false; + +	switch (req_op(rq)) { +	case REQ_OP_WRITE_ZEROES: +	case REQ_OP_WRITE_SAME: +	case REQ_OP_WRITE: +		return blk_rq_zone_is_seq(rq); +	default: +		return false; +	} +} +EXPORT_SYMBOL_GPL(blk_req_needs_zone_write_lock); + +void __blk_req_zone_write_lock(struct request *rq) +{ +	if (WARN_ON_ONCE(test_and_set_bit(blk_rq_zone_no(rq), +					  rq->q->seq_zones_wlock))) +		return; + +	WARN_ON_ONCE(rq->rq_flags & RQF_ZONE_WRITE_LOCKED); +	rq->rq_flags |= RQF_ZONE_WRITE_LOCKED; +} +EXPORT_SYMBOL_GPL(__blk_req_zone_write_lock); + +void __blk_req_zone_write_unlock(struct request *rq) +{ +	rq->rq_flags &= ~RQF_ZONE_WRITE_LOCKED; +	if (rq->q->seq_zones_wlock) +		WARN_ON_ONCE(!test_and_clear_bit(blk_rq_zone_no(rq), +						 rq->q->seq_zones_wlock)); +} +EXPORT_SYMBOL_GPL(__blk_req_zone_write_unlock); + +/*   * Check that a zone report belongs to the partition.   * If yes, fix its start sector and write pointer, copy it in the   * zone information array and return true. Return false otherwise. diff --git a/block/blk.h b/block/blk.h index 442098aa9463..46db5dc83dcb 100644 --- a/block/blk.h +++ b/block/blk.h @@ -120,33 +120,23 @@ void blk_account_io_completion(struct request *req, unsigned int bytes);  void blk_account_io_done(struct request *req);  /* - * Internal atomic flags for request handling - */ -enum rq_atomic_flags { -	/* -	 * Keep these two bits first - not because we depend on the -	 * value of them, but we do depend on them being in the same -	 * byte of storage to ensure ordering on writes. Keeping them -	 * first will achieve that nicely. -	 */ -	REQ_ATOM_COMPLETE = 0, -	REQ_ATOM_STARTED, - -	REQ_ATOM_POLL_SLEPT, -}; - -/*   * EH timer and IO completion will both attempt to 'grab' the request, make - * sure that only one of them succeeds + * sure that only one of them succeeds. Steal the bottom bit of the + * __deadline field for this.   
*/  static inline int blk_mark_rq_complete(struct request *rq)  { -	return test_and_set_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags); +	return test_and_set_bit(0, &rq->__deadline);  }  static inline void blk_clear_rq_complete(struct request *rq)  { -	clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags); +	clear_bit(0, &rq->__deadline); +} + +static inline bool blk_rq_is_complete(struct request *rq) +{ +	return test_bit(0, &rq->__deadline);  }  /* @@ -172,6 +162,9 @@ static inline void elv_deactivate_rq(struct request_queue *q, struct request *rq  		e->type->ops.sq.elevator_deactivate_req_fn(q, rq);  } +int elv_register_queue(struct request_queue *q); +void elv_unregister_queue(struct request_queue *q); +  struct hd_struct *__disk_get_part(struct gendisk *disk, int partno);  #ifdef CONFIG_FAIL_IO_TIMEOUT @@ -246,6 +239,21 @@ static inline void req_set_nomerge(struct request_queue *q, struct request *req)  }  /* + * Steal a bit from this field for legacy IO path atomic IO marking. Note that + * setting the deadline clears the bottom bit, potentially clearing the + * completed bit. The user has to be OK with this (current ones are fine). + */ +static inline void blk_rq_set_deadline(struct request *rq, unsigned long time) +{ +	rq->__deadline = time & ~0x1UL; +} + +static inline unsigned long blk_rq_deadline(struct request *rq) +{ +	return rq->__deadline & ~0x1UL; +} + +/*   * Internal io_context interface   */  void get_io_context(struct io_context *ioc); diff --git a/block/bounce.c b/block/bounce.c index 1d05c422c932..6a3e68292273 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -113,45 +113,50 @@ int init_emergency_isa_pool(void)  static void copy_to_high_bio_irq(struct bio *to, struct bio *from)  {  	unsigned char *vfrom; -	struct bio_vec tovec, *fromvec = from->bi_io_vec; +	struct bio_vec tovec, fromvec;  	struct bvec_iter iter; +	/* +	 * The bio of @from is created by bounce, so we can iterate +	 * its bvec from start to end, but the @from->bi_iter can't be +	 * trusted because it might be changed by splitting. 
+	 */ +	struct bvec_iter from_iter = BVEC_ITER_ALL_INIT;  	bio_for_each_segment(tovec, to, iter) { -		if (tovec.bv_page != fromvec->bv_page) { +		fromvec = bio_iter_iovec(from, from_iter); +		if (tovec.bv_page != fromvec.bv_page) {  			/*  			 * fromvec->bv_offset and fromvec->bv_len might have  			 * been modified by the block layer, so use the original  			 * copy, bounce_copy_vec already uses tovec->bv_len  			 */ -			vfrom = page_address(fromvec->bv_page) + +			vfrom = page_address(fromvec.bv_page) +  				tovec.bv_offset;  			bounce_copy_vec(&tovec, vfrom);  			flush_dcache_page(tovec.bv_page);  		} - -		fromvec++; +		bio_advance_iter(from, &from_iter, tovec.bv_len);  	}  }  static void bounce_end_io(struct bio *bio, mempool_t *pool)  {  	struct bio *bio_orig = bio->bi_private; -	struct bio_vec *bvec, *org_vec; +	struct bio_vec *bvec, orig_vec;  	int i; -	int start = bio_orig->bi_iter.bi_idx; +	struct bvec_iter orig_iter = bio_orig->bi_iter;  	/*  	 * free up bounce indirect pages used  	 */  	bio_for_each_segment_all(bvec, bio, i) { -		org_vec = bio_orig->bi_io_vec + i + start; - -		if (bvec->bv_page == org_vec->bv_page) -			continue; - -		dec_zone_page_state(bvec->bv_page, NR_BOUNCE); -		mempool_free(bvec->bv_page, pool); +		orig_vec = bio_iter_iovec(bio_orig, orig_iter); +		if (bvec->bv_page != orig_vec.bv_page) { +			dec_zone_page_state(bvec->bv_page, NR_BOUNCE); +			mempool_free(bvec->bv_page, pool); +		} +		bio_advance_iter(bio_orig, &orig_iter, orig_vec.bv_len);  	}  	bio_orig->bi_status = bio->bi_status; diff --git a/block/bsg-lib.c b/block/bsg-lib.c index 15d25ccd51a5..1474153f73e3 100644 --- a/block/bsg-lib.c +++ b/block/bsg-lib.c @@ -30,7 +30,7 @@  /**   * bsg_teardown_job - routine to teardown a bsg job - * @job: bsg_job that is to be torn down + * @kref: kref inside bsg_job that is to be torn down   */  static void bsg_teardown_job(struct kref *kref)  { @@ -251,6 +251,7 @@ static void bsg_exit_rq(struct request_queue *q, struct request *req)   * @name: device to give bsg device   * @job_fn: bsg job handler   * @dd_job_size: size of LLD data needed for each job + * @release: @dev release function   */  struct request_queue *bsg_setup_queue(struct device *dev, const char *name,  		bsg_job_fn *job_fn, int dd_job_size, diff --git a/block/bsg.c b/block/bsg.c index 452f94f1c5d4..a1bcbb6ba50b 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -32,6 +32,9 @@  #define BSG_DESCRIPTION	"Block layer SCSI generic (bsg) driver"  #define BSG_VERSION	"0.4" +#define bsg_dbg(bd, fmt, ...) \ +	pr_debug("%s: " fmt, (bd)->name, ##__VA_ARGS__) +  struct bsg_device {  	struct request_queue *queue;  	spinlock_t lock; @@ -55,14 +58,6 @@ enum {  #define BSG_DEFAULT_CMDS	64  #define BSG_MAX_DEVS		32768 -#undef BSG_DEBUG - -#ifdef BSG_DEBUG -#define dprintk(fmt, args...) printk(KERN_ERR "%s: " fmt, __func__, ##args) -#else -#define dprintk(fmt, args...) 
-#endif -  static DEFINE_MUTEX(bsg_mutex);  static DEFINE_IDR(bsg_minor_idr); @@ -123,7 +118,7 @@ static struct bsg_command *bsg_alloc_command(struct bsg_device *bd)  	bc->bd = bd;  	INIT_LIST_HEAD(&bc->list); -	dprintk("%s: returning free cmd %p\n", bd->name, bc); +	bsg_dbg(bd, "returning free cmd %p\n", bc);  	return bc;  out:  	spin_unlock_irq(&bd->lock); @@ -222,7 +217,8 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t mode)  	if (!bcd->class_dev)  		return ERR_PTR(-ENXIO); -	dprintk("map hdr %llx/%u %llx/%u\n", (unsigned long long) hdr->dout_xferp, +	bsg_dbg(bd, "map hdr %llx/%u %llx/%u\n", +		(unsigned long long) hdr->dout_xferp,  		hdr->dout_xfer_len, (unsigned long long) hdr->din_xferp,  		hdr->din_xfer_len); @@ -299,8 +295,8 @@ static void bsg_rq_end_io(struct request *rq, blk_status_t status)  	struct bsg_device *bd = bc->bd;  	unsigned long flags; -	dprintk("%s: finished rq %p bc %p, bio %p\n", -		bd->name, rq, bc, bc->bio); +	bsg_dbg(bd, "finished rq %p bc %p, bio %p\n", +		rq, bc, bc->bio);  	bc->hdr.duration = jiffies_to_msecs(jiffies - bc->hdr.duration); @@ -333,7 +329,7 @@ static void bsg_add_command(struct bsg_device *bd, struct request_queue *q,  	list_add_tail(&bc->list, &bd->busy_list);  	spin_unlock_irq(&bd->lock); -	dprintk("%s: queueing rq %p, bc %p\n", bd->name, rq, bc); +	bsg_dbg(bd, "queueing rq %p, bc %p\n", rq, bc);  	rq->end_io_data = bc;  	blk_execute_rq_nowait(q, NULL, rq, at_head, bsg_rq_end_io); @@ -379,7 +375,7 @@ static struct bsg_command *bsg_get_done_cmd(struct bsg_device *bd)  		}  	} while (1); -	dprintk("%s: returning done %p\n", bd->name, bc); +	bsg_dbg(bd, "returning done %p\n", bc);  	return bc;  } @@ -390,7 +386,7 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr,  	struct scsi_request *req = scsi_req(rq);  	int ret = 0; -	dprintk("rq %p bio %p 0x%x\n", rq, bio, req->result); +	pr_debug("rq %p bio %p 0x%x\n", rq, bio, req->result);  	/*  	 * fill in all the output members  	 */ @@ -469,7 +465,7 @@ static int bsg_complete_all_commands(struct bsg_device *bd)  	struct bsg_command *bc;  	int ret, tret; -	dprintk("%s: entered\n", bd->name); +	bsg_dbg(bd, "entered\n");  	/*  	 * wait for all commands to complete @@ -572,7 +568,7 @@ bsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)  	int ret;  	ssize_t bytes_read; -	dprintk("%s: read %zd bytes\n", bd->name, count); +	bsg_dbg(bd, "read %zd bytes\n", count);  	bsg_set_block(bd, file); @@ -646,7 +642,7 @@ bsg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)  	ssize_t bytes_written;  	int ret; -	dprintk("%s: write %zd bytes\n", bd->name, count); +	bsg_dbg(bd, "write %zd bytes\n", count);  	if (unlikely(uaccess_kernel()))  		return -EINVAL; @@ -664,7 +660,7 @@ bsg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)  	if (!bytes_written || err_block_err(ret))  		bytes_written = ret; -	dprintk("%s: returning %zd\n", bd->name, bytes_written); +	bsg_dbg(bd, "returning %zd\n", bytes_written);  	return bytes_written;  } @@ -717,7 +713,7 @@ static int bsg_put_device(struct bsg_device *bd)  	hlist_del(&bd->dev_list);  	mutex_unlock(&bsg_mutex); -	dprintk("%s: tearing down\n", bd->name); +	bsg_dbg(bd, "tearing down\n");  	/*  	 * close can always block @@ -744,9 +740,7 @@ static struct bsg_device *bsg_add_device(struct inode *inode,  					 struct file *file)  {  	struct bsg_device *bd; -#ifdef BSG_DEBUG  	unsigned char buf[32]; -#endif  	if (!blk_queue_scsi_passthrough(rq)) {  		
WARN_ONCE(true, "Attempt to register a non-SCSI queue\n"); @@ -771,7 +765,7 @@ static struct bsg_device *bsg_add_device(struct inode *inode,  	hlist_add_head(&bd->dev_list, bsg_dev_idx_hash(iminor(inode)));  	strncpy(bd->name, dev_name(rq->bsg_dev.class_dev), sizeof(bd->name) - 1); -	dprintk("bound to <%s>, max queue %d\n", +	bsg_dbg(bd, "bound to <%s>, max queue %d\n",  		format_dev_t(buf, inode->i_rdev), bd->max_queue);  	mutex_unlock(&bsg_mutex); diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c index b83f77460d28..9de9f156e203 100644 --- a/block/deadline-iosched.c +++ b/block/deadline-iosched.c @@ -50,8 +50,6 @@ struct deadline_data {  	int front_merges;  }; -static void deadline_move_request(struct deadline_data *, struct request *); -  static inline struct rb_root *  deadline_rb_root(struct deadline_data *dd, struct request *rq)  { @@ -100,6 +98,12 @@ deadline_add_request(struct request_queue *q, struct request *rq)  	struct deadline_data *dd = q->elevator->elevator_data;  	const int data_dir = rq_data_dir(rq); +	/* +	 * This may be a requeue of a write request that has locked its +	 * target zone. If it is the case, this releases the zone lock. +	 */ +	blk_req_zone_write_unlock(rq); +  	deadline_add_rq_rb(dd, rq);  	/* @@ -190,6 +194,12 @@ deadline_move_to_dispatch(struct deadline_data *dd, struct request *rq)  {  	struct request_queue *q = rq->q; +	/* +	 * For a zoned block device, write requests must write lock their +	 * target zone. +	 */ +	blk_req_zone_write_lock(rq); +  	deadline_remove_request(q, rq);  	elv_dispatch_add_tail(q, rq);  } @@ -231,6 +241,69 @@ static inline int deadline_check_fifo(struct deadline_data *dd, int ddir)  }  /* + * For the specified data direction, return the next request to dispatch using + * arrival ordered lists. + */ +static struct request * +deadline_fifo_request(struct deadline_data *dd, int data_dir) +{ +	struct request *rq; + +	if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE)) +		return NULL; + +	if (list_empty(&dd->fifo_list[data_dir])) +		return NULL; + +	rq = rq_entry_fifo(dd->fifo_list[data_dir].next); +	if (data_dir == READ || !blk_queue_is_zoned(rq->q)) +		return rq; + +	/* +	 * Look for a write request that can be dispatched, that is one with +	 * an unlocked target zone. +	 */ +	list_for_each_entry(rq, &dd->fifo_list[WRITE], queuelist) { +		if (blk_req_can_dispatch_to_zone(rq)) +			return rq; +	} + +	return NULL; +} + +/* + * For the specified data direction, return the next request to dispatch using + * sector position sorted lists. + */ +static struct request * +deadline_next_request(struct deadline_data *dd, int data_dir) +{ +	struct request *rq; + +	if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE)) +		return NULL; + +	rq = dd->next_rq[data_dir]; +	if (!rq) +		return NULL; + +	if (data_dir == READ || !blk_queue_is_zoned(rq->q)) +		return rq; + +	/* +	 * Look for a write request that can be dispatched, that is one with +	 * an unlocked target zone. 
+	 */ +	while (rq) { +		if (blk_req_can_dispatch_to_zone(rq)) +			return rq; +		rq = deadline_latter_request(rq); +	} + +	return NULL; +} + +/*   * deadline_dispatch_requests selects the best request according to   * read/write expire, fifo_batch, etc   */ @@ -239,16 +312,15 @@ static int deadline_dispatch_requests(struct request_queue *q, int force)  	struct deadline_data *dd = q->elevator->elevator_data;  	const int reads = !list_empty(&dd->fifo_list[READ]);  	const int writes = !list_empty(&dd->fifo_list[WRITE]); -	struct request *rq; +	struct request *rq, *next_rq;  	int data_dir;  	/*  	 * batches are currently reads XOR writes  	 */ -	if (dd->next_rq[WRITE]) -		rq = dd->next_rq[WRITE]; -	else -		rq = dd->next_rq[READ]; +	rq = deadline_next_request(dd, WRITE); +	if (!rq) +		rq = deadline_next_request(dd, READ);  	if (rq && dd->batching < dd->fifo_batch)  		/* we have a next request are still entitled to batch */ @@ -262,7 +334,8 @@ static int deadline_dispatch_requests(struct request_queue *q, int force)  	if (reads) {  		BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ])); -		if (writes && (dd->starved++ >= dd->writes_starved)) +		if (deadline_fifo_request(dd, WRITE) && +		    (dd->starved++ >= dd->writes_starved))  			goto dispatch_writes;  		data_dir = READ; @@ -291,21 +364,29 @@ dispatch_find_request:  	/*  	 * we are not running a batch, find best request for selected data_dir  	 */ -	if (deadline_check_fifo(dd, data_dir) || !dd->next_rq[data_dir]) { +	next_rq = deadline_next_request(dd, data_dir); +	if (deadline_check_fifo(dd, data_dir) || !next_rq) {  		/*  		 * A deadline has expired, the last request was in the other  		 * direction, or we have run out of higher-sectored requests.  		 * Start again from the request with the earliest expiry time.  		 */ -		rq = rq_entry_fifo(dd->fifo_list[data_dir].next); +		rq = deadline_fifo_request(dd, data_dir);  	} else {  		/*  		 * The last req was the same dir and we have a next request in  		 * sort order. No expired requests so continue on from here.  		 */ -		rq = dd->next_rq[data_dir]; +		rq = next_rq;  	} +	/* +	 * For a zoned block device, if we only have writes queued and none of +	 * them can be dispatched, rq will be NULL. +	 */ +	if (!rq) +		return 0; +  	dd->batching = 0;  dispatch_request: @@ -318,6 +399,16 @@ dispatch_request:  	return 1;  } +/* + * For zoned block devices, write unlock the target zone of completed + * write requests. 
+ */ +static void +deadline_completed_request(struct request_queue *q, struct request *rq) +{ +	blk_req_zone_write_unlock(rq); +} +  static void deadline_exit_queue(struct elevator_queue *e)  {  	struct deadline_data *dd = e->elevator_data; @@ -439,6 +530,7 @@ static struct elevator_type iosched_deadline = {  		.elevator_merged_fn =		deadline_merged_request,  		.elevator_merge_req_fn =	deadline_merged_requests,  		.elevator_dispatch_fn =		deadline_dispatch_requests, +		.elevator_completed_req_fn =	deadline_completed_request,  		.elevator_add_req_fn =		deadline_add_request,  		.elevator_former_req_fn =	elv_rb_former_request,  		.elevator_latter_req_fn =	elv_rb_latter_request, diff --git a/block/elevator.c b/block/elevator.c index 7bda083d5968..e87e9b43aba0 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -869,6 +869,8 @@ int elv_register_queue(struct request_queue *q)  	struct elevator_queue *e = q->elevator;  	int error; +	lockdep_assert_held(&q->sysfs_lock); +  	error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched");  	if (!error) {  		struct elv_fs_entry *attr = e->type->elevator_attrs; @@ -886,10 +888,11 @@ int elv_register_queue(struct request_queue *q)  	}  	return error;  } -EXPORT_SYMBOL(elv_register_queue);  void elv_unregister_queue(struct request_queue *q)  { +	lockdep_assert_held(&q->sysfs_lock); +  	if (q) {  		struct elevator_queue *e = q->elevator; @@ -900,7 +903,6 @@ void elv_unregister_queue(struct request_queue *q)  		wbt_enable_default(q);  	}  } -EXPORT_SYMBOL(elv_unregister_queue);  int elv_register(struct elevator_type *e)  { @@ -967,7 +969,10 @@ static int elevator_switch_mq(struct request_queue *q,  {  	int ret; +	lockdep_assert_held(&q->sysfs_lock); +  	blk_mq_freeze_queue(q); +	blk_mq_quiesce_queue(q);  	if (q->elevator) {  		if (q->elevator->registered) @@ -994,6 +999,7 @@ static int elevator_switch_mq(struct request_queue *q,  		blk_add_trace_msg(q, "elv switch: none");  out: +	blk_mq_unquiesce_queue(q);  	blk_mq_unfreeze_queue(q);  	return ret;  } @@ -1010,6 +1016,8 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)  	bool old_registered = false;  	int err; +	lockdep_assert_held(&q->sysfs_lock); +  	if (q->mq_ops)  		return elevator_switch_mq(q, new_e); diff --git a/block/genhd.c b/block/genhd.c index 96a66f671720..88a53c188cb7 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -629,16 +629,18 @@ exit:  }  /** - * device_add_disk - add partitioning information to kernel list + * __device_add_disk - add disk information to kernel list   * @parent: parent device for the disk   * @disk: per-device partitioning information + * @register_queue: register the queue if set to true   *   * This function registers the partitioning information in @disk   * with the kernel.   
*   * FIXME: error handling   */ -void device_add_disk(struct device *parent, struct gendisk *disk) +static void __device_add_disk(struct device *parent, struct gendisk *disk, +			      bool register_queue)  {  	dev_t devt;  	int retval; @@ -682,7 +684,8 @@ void device_add_disk(struct device *parent, struct gendisk *disk)  				    exact_match, exact_lock, disk);  	}  	register_disk(parent, disk); -	blk_register_queue(disk); +	if (register_queue) +		blk_register_queue(disk);  	/*  	 * Take an extra ref on queue which will be put on disk_release() @@ -693,8 +696,19 @@ void device_add_disk(struct device *parent, struct gendisk *disk)  	disk_add_events(disk);  	blk_integrity_add(disk);  } + +void device_add_disk(struct device *parent, struct gendisk *disk) +{ +	__device_add_disk(parent, disk, true); +}  EXPORT_SYMBOL(device_add_disk); +void device_add_disk_no_queue_reg(struct device *parent, struct gendisk *disk) +{ +	__device_add_disk(parent, disk, false); +} +EXPORT_SYMBOL(device_add_disk_no_queue_reg); +  void del_gendisk(struct gendisk *disk)  {  	struct disk_part_iter piter; @@ -725,7 +739,8 @@ void del_gendisk(struct gendisk *disk)  		 * Unregister bdi before releasing device numbers (as they can  		 * get reused and we'd get clashes in sysfs).  		 */ -		bdi_unregister(disk->queue->backing_dev_info); +		if (!(disk->flags & GENHD_FL_HIDDEN)) +			bdi_unregister(disk->queue->backing_dev_info);  		blk_unregister_queue(disk);  	} else {  		WARN_ON(1); diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 0179e484ec98..c56f211c8440 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -59,6 +59,7 @@ struct deadline_data {  	int front_merges;  	spinlock_t lock; +	spinlock_t zone_lock;  	struct list_head dispatch;  }; @@ -192,13 +193,83 @@ static inline int deadline_check_fifo(struct deadline_data *dd, int ddir)  }  /* + * For the specified data direction, return the next request to + * dispatch using arrival ordered lists. + */ +static struct request * +deadline_fifo_request(struct deadline_data *dd, int data_dir) +{ +	struct request *rq; +	unsigned long flags; + +	if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE)) +		return NULL; + +	if (list_empty(&dd->fifo_list[data_dir])) +		return NULL; + +	rq = rq_entry_fifo(dd->fifo_list[data_dir].next); +	if (data_dir == READ || !blk_queue_is_zoned(rq->q)) +		return rq; + +	/* +	 * Look for a write request that can be dispatched, that is one with +	 * an unlocked target zone. +	 */ +	spin_lock_irqsave(&dd->zone_lock, flags); +	list_for_each_entry(rq, &dd->fifo_list[WRITE], queuelist) { +		if (blk_req_can_dispatch_to_zone(rq)) +			goto out; +	} +	rq = NULL; +out: +	spin_unlock_irqrestore(&dd->zone_lock, flags); + +	return rq; +} + +/* + * For the specified data direction, return the next request to + * dispatch using sector position sorted lists. + */ +static struct request * +deadline_next_request(struct deadline_data *dd, int data_dir) +{ +	struct request *rq; +	unsigned long flags; + +	if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE)) +		return NULL; + +	rq = dd->next_rq[data_dir]; +	if (!rq) +		return NULL; + +	if (data_dir == READ || !blk_queue_is_zoned(rq->q)) +		return rq; + +	/* +	 * Look for a write request that can be dispatched, that is one with +	 * an unlocked target zone. 
+	 */ +	spin_lock_irqsave(&dd->zone_lock, flags); +	while (rq) { +		if (blk_req_can_dispatch_to_zone(rq)) +			break; +		rq = deadline_latter_request(rq); +	} +	spin_unlock_irqrestore(&dd->zone_lock, flags); + +	return rq; +} + +/*   * deadline_dispatch_requests selects the best request according to   * read/write expire, fifo_batch, etc   */ -static struct request *__dd_dispatch_request(struct blk_mq_hw_ctx *hctx) +static struct request *__dd_dispatch_request(struct deadline_data *dd)  { -	struct deadline_data *dd = hctx->queue->elevator->elevator_data; -	struct request *rq; +	struct request *rq, *next_rq;  	bool reads, writes;  	int data_dir; @@ -214,10 +285,9 @@ static struct request *__dd_dispatch_request(struct blk_mq_hw_ctx *hctx)  	/*  	 * batches are currently reads XOR writes  	 */ -	if (dd->next_rq[WRITE]) -		rq = dd->next_rq[WRITE]; -	else -		rq = dd->next_rq[READ]; +	rq = deadline_next_request(dd, WRITE); +	if (!rq) +		rq = deadline_next_request(dd, READ);  	if (rq && dd->batching < dd->fifo_batch)  		/* we have a next request are still entitled to batch */ @@ -231,7 +301,8 @@ static struct request *__dd_dispatch_request(struct blk_mq_hw_ctx *hctx)  	if (reads) {  		BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ])); -		if (writes && (dd->starved++ >= dd->writes_starved)) +		if (deadline_fifo_request(dd, WRITE) && +		    (dd->starved++ >= dd->writes_starved))  			goto dispatch_writes;  		data_dir = READ; @@ -260,21 +331,29 @@ dispatch_find_request:  	/*  	 * we are not running a batch, find best request for selected data_dir  	 */ -	if (deadline_check_fifo(dd, data_dir) || !dd->next_rq[data_dir]) { +	next_rq = deadline_next_request(dd, data_dir); +	if (deadline_check_fifo(dd, data_dir) || !next_rq) {  		/*  		 * A deadline has expired, the last request was in the other  		 * direction, or we have run out of higher-sectored requests.  		 * Start again from the request with the earliest expiry time.  		 */ -		rq = rq_entry_fifo(dd->fifo_list[data_dir].next); +		rq = deadline_fifo_request(dd, data_dir);  	} else {  		/*  		 * The last req was the same dir and we have a next request in  		 * sort order. No expired requests so continue on from here.  		 */ -		rq = dd->next_rq[data_dir]; +		rq = next_rq;  	} +	/* +	 * For a zoned block device, if we only have writes queued and none of +	 * them can be dispatched, rq will be NULL. +	 */ +	if (!rq) +		return NULL; +  	dd->batching = 0;  dispatch_request: @@ -284,17 +363,27 @@ dispatch_request:  	dd->batching++;  	deadline_move_request(dd, rq);  done: +	/* +	 * If the request needs its target zone locked, do it. +	 */ +	blk_req_zone_write_lock(rq);  	rq->rq_flags |= RQF_STARTED;  	return rq;  } +/* + * One confusing aspect here is that we get called for a specific + * hardware queue, but we return a request that may not be for a + * different hardware queue. This is because mq-deadline has shared + * state for all hardware queues, in terms of sorting, FIFOs, etc. 
+ */  static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)  {  	struct deadline_data *dd = hctx->queue->elevator->elevator_data;  	struct request *rq;  	spin_lock(&dd->lock); -	rq = __dd_dispatch_request(hctx); +	rq = __dd_dispatch_request(dd);  	spin_unlock(&dd->lock);  	return rq; @@ -339,6 +428,7 @@ static int dd_init_queue(struct request_queue *q, struct elevator_type *e)  	dd->front_merges = 1;  	dd->fifo_batch = fifo_batch;  	spin_lock_init(&dd->lock); +	spin_lock_init(&dd->zone_lock);  	INIT_LIST_HEAD(&dd->dispatch);  	q->elevator = eq; @@ -395,6 +485,12 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,  	struct deadline_data *dd = q->elevator->elevator_data;  	const int data_dir = rq_data_dir(rq); +	/* +	 * This may be a requeue of a write request that has locked its +	 * target zone. If it is the case, this releases the zone lock. +	 */ +	blk_req_zone_write_unlock(rq); +  	if (blk_mq_sched_try_insert_merge(q, rq))  		return; @@ -439,6 +535,26 @@ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx,  	spin_unlock(&dd->lock);  } +/* + * For zoned block devices, write unlock the target zone of + * completed write requests. Do this while holding the zone lock + * spinlock so that the zone is never unlocked while deadline_fifo_request() + * while deadline_next_request() are executing. + */ +static void dd_completed_request(struct request *rq) +{ +	struct request_queue *q = rq->q; + +	if (blk_queue_is_zoned(q)) { +		struct deadline_data *dd = q->elevator->elevator_data; +		unsigned long flags; + +		spin_lock_irqsave(&dd->zone_lock, flags); +		blk_req_zone_write_unlock(rq); +		spin_unlock_irqrestore(&dd->zone_lock, flags); +	} +} +  static bool dd_has_work(struct blk_mq_hw_ctx *hctx)  {  	struct deadline_data *dd = hctx->queue->elevator->elevator_data; @@ -640,6 +756,7 @@ static struct elevator_type mq_deadline = {  	.ops.mq = {  		.insert_requests	= dd_insert_requests,  		.dispatch_request	= dd_dispatch_request, +		.completed_request	= dd_completed_request,  		.next_request		= elv_rb_latter_request,  		.former_request		= elv_rb_former_request,  		.bio_merge		= dd_bio_merge, diff --git a/block/partitions/msdos.c b/block/partitions/msdos.c index 0af3a3db6fb0..82c44f7df911 100644 --- a/block/partitions/msdos.c +++ b/block/partitions/msdos.c @@ -301,7 +301,9 @@ static void parse_bsd(struct parsed_partitions *state,  			continue;  		bsd_start = le32_to_cpu(p->p_offset);  		bsd_size = le32_to_cpu(p->p_size); -		if (memcmp(flavour, "bsd\0", 4) == 0) +		/* FreeBSD has relative offset if C partition offset is zero */ +		if (memcmp(flavour, "bsd\0", 4) == 0 && +		    le32_to_cpu(l->d_partitions[2].p_offset) == 0)  			bsd_start += offset;  		if (offset == bsd_start && size == bsd_size)  			/* full parent partition, we have it already */ diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index edcfff974527..60b471f8621b 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -384,9 +384,10 @@ out_put_request:  /**   * sg_scsi_ioctl  --  handle deprecated SCSI_IOCTL_SEND_COMMAND ioctl - * @file:	file this ioctl operates on (optional)   * @q:		request queue to send scsi commands down   * @disk:	gendisk to operate on (option) + * @mode:	mode used to open the file through which the ioctl has been + *		submitted   * @sic:	userspace structure describing the command to perform   *   * Send down the scsi command described by @sic to the device below @@ -415,10 +416,10 @@ out_put_request:   *      Positive numbers returned are the compacted SCSI 
error codes (4   *      bytes in one int) where the lowest byte is the SCSI status.   */ -#define OMAX_SB_LEN 16          /* For backward compatibility */  int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode,  		struct scsi_ioctl_command __user *sic)  { +	enum { OMAX_SB_LEN = 16 };	/* For backward compatibility */  	struct request *rq;  	struct scsi_request *req;  	int err; @@ -692,38 +693,9 @@ int scsi_verify_blk_ioctl(struct block_device *bd, unsigned int cmd)  	if (bd && bd == bd->bd_contains)  		return 0; -	/* Actually none of these is particularly useful on a partition, -	 * but they are safe. -	 */ -	switch (cmd) { -	case SCSI_IOCTL_GET_IDLUN: -	case SCSI_IOCTL_GET_BUS_NUMBER: -	case SCSI_IOCTL_GET_PCI: -	case SCSI_IOCTL_PROBE_HOST: -	case SG_GET_VERSION_NUM: -	case SG_SET_TIMEOUT: -	case SG_GET_TIMEOUT: -	case SG_GET_RESERVED_SIZE: -	case SG_SET_RESERVED_SIZE: -	case SG_EMULATED_HOST: -		return 0; -	case CDROM_GET_CAPABILITY: -		/* Keep this until we remove the printk below.  udev sends it -		 * and we do not want to spam dmesg about it.   CD-ROMs do -		 * not have partitions, so we get here only for disks. -		 */ -		return -ENOIOCTLCMD; -	default: -		break; -	} -  	if (capable(CAP_SYS_RAWIO))  		return 0; -	/* In particular, rule out all resets and host-specific ioctls.  */ -	printk_ratelimited(KERN_WARNING -			   "%s: sending ioctl %x to a partition!\n", current->comm, cmd); -  	return -ENOIOCTLCMD;  }  EXPORT_SYMBOL(scsi_verify_blk_ioctl);  |
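
A minimal sketch of the bit-stealing scheme used above for the legacy I/O path, where the completed flag is folded into bit 0 of the request's __deadline word: deadlines are jiffies values, so masking off the low bit on set/get costs nothing, and setting a new deadline implicitly clears the completed marker. The struct and helper names below are illustrative stand-ins (a plain field named deadline takes the place of rq->__deadline), and unlike the kernel's test_and_set_bit() the mock is not atomic.

/*
 * Standalone model of stealing bit 0 of a deadline word for a
 * "complete" flag. mock_request and the helpers are illustrative
 * only; the real code uses atomic bit ops on rq->__deadline.
 */
#include <stdbool.h>
#include <stdio.h>

struct mock_request {
	unsigned long deadline;		/* jiffies-style value, bit 0 borrowed */
};

/* Setting a deadline always clears bit 0, i.e. the completed marker. */
static inline void mock_rq_set_deadline(struct mock_request *rq, unsigned long time)
{
	rq->deadline = time & ~0x1UL;
}

/* Reading the deadline masks the stolen bit back out. */
static inline unsigned long mock_rq_deadline(struct mock_request *rq)
{
	return rq->deadline & ~0x1UL;
}

/* Mark complete; returns the previous state (non-atomic here). */
static inline bool mock_mark_rq_complete(struct mock_request *rq)
{
	bool was_set = rq->deadline & 0x1UL;

	rq->deadline |= 0x1UL;
	return was_set;
}

static inline void mock_clear_rq_complete(struct mock_request *rq)
{
	rq->deadline &= ~0x1UL;
}

int main(void)
{
	struct mock_request rq = { 0 };

	mock_rq_set_deadline(&rq, 1000);
	printf("deadline=%lu was_complete=%d\n", mock_rq_deadline(&rq),
	       mock_mark_rq_complete(&rq));		/* 1000, 0 */
	printf("already_complete=%d\n", mock_mark_rq_complete(&rq));	/* 1 */

	/*
	 * Re-arming the deadline silently drops the completed bit,
	 * which is the caveat the patch comment spells out.
	 */
	mock_rq_set_deadline(&rq, 2000);
	printf("after_rearm_complete=%d\n", !!(rq.deadline & 0x1UL));	/* 0 */
	return 0;
}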
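
Both deadline schedulers gain the same dispatch rule for zoned devices: reads dispatch as before, but a write is only eligible when its target zone is not already write-locked, candidates in FIFO or sector order are simply skipped while locked, and completion releases the zone so the next write to it becomes dispatchable. The sketch below models that selection loop over a plain array; zone_locked[], pick_next_write() and the mock_write layout are assumptions for illustration, not kernel interfaces, and the real code protects the walk with dd->zone_lock.

/*
 * Simplified model of the zoned-device dispatch rule: walk write
 * candidates in FIFO order and pick the first whose target zone is
 * not write-locked. All names are illustrative, not kernel APIs.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define NR_ZONES 4

struct mock_write {
	int id;
	int zone;		/* target zone of this write */
	bool dispatched;	/* already handed out to the driver */
};

static bool zone_locked[NR_ZONES];

/* Return the first dispatchable write, locking its zone, or NULL. */
static struct mock_write *pick_next_write(struct mock_write *fifo, size_t n)
{
	for (size_t i = 0; i < n; i++) {
		if (!fifo[i].dispatched && !zone_locked[fifo[i].zone]) {
			fifo[i].dispatched = true;
			zone_locked[fifo[i].zone] = true;	/* write lock the zone */
			return &fifo[i];
		}
	}
	return NULL;	/* only blocked writes queued: dispatch nothing */
}

/* Completion unlocks the zone so the next write to it becomes eligible. */
static void complete_write(struct mock_write *w)
{
	zone_locked[w->zone] = false;
}

int main(void)
{
	struct mock_write fifo[] = {
		{ .id = 1, .zone = 2 },
		{ .id = 2, .zone = 2 },	/* same zone: must wait for id 1 */
		{ .id = 3, .zone = 0 },
	};

	struct mock_write *w1 = pick_next_write(fifo, 3);	/* id 1 */
	struct mock_write *w2 = pick_next_write(fifo, 3);	/* id 3; zone 2 is locked */
	printf("dispatched %d then %d\n", w1->id, w2->id);

	complete_write(w1);					/* unlocks zone 2 */
	struct mock_write *w3 = pick_next_write(fifo, 3);
	printf("after completion: %d\n", w3 ? w3->id : -1);	/* id 2 */
	return 0;
}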
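
The bsg changes replace the driver's hand-rolled BSG_DEBUG/dprintk() machinery with a thin bsg_dbg() wrapper around pr_debug(), so the messages take part in dynamic debug instead of requiring a recompile, and every message is automatically prefixed with the device name. A userspace analogue of that macro pattern is sketched below; DBG_ENABLED and struct demo_device are stand-ins, not kernel interfaces.

/*
 * Userspace analogue of a per-device debug print wrapped in one
 * variadic macro that prefixes the device name. The ##__VA_ARGS__
 * comma-swallowing is a GNU extension, as used throughout the kernel.
 */
#include <stdio.h>

#ifndef DBG_ENABLED
#define DBG_ENABLED 1	/* stand-in for dynamic debug being enabled */
#endif

struct demo_device {
	const char *name;
};

#define demo_dbg(dev, fmt, ...)						\
	do {								\
		if (DBG_ENABLED)					\
			fprintf(stderr, "%s: " fmt, (dev)->name,	\
				##__VA_ARGS__);				\
	} while (0)

int main(void)
{
	struct demo_device dev = { .name = "bsg0" };

	demo_dbg(&dev, "queueing rq %p\n", (void *)&dev);
	demo_dbg(&dev, "entered\n");	/* also works with no extra arguments */
	return 0;
}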