summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2022-03-21 16:48:55 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2022-03-21 16:48:55 -0700
commit616355cc818c6ddadc393fdfd4491f94458cb715 (patch)
treea96907b179ccbeb84cd1df5515cd10c90eb5bd59
parentb080cee72ef355669cbc52ff55dc513d37433600 (diff)
parent8f9e7b65f833cb9a4b2e2f54a049d74df394d906 (diff)
downloadlinux-616355cc818c6ddadc393fdfd4491f94458cb715.tar.bz2
Merge tag 'for-5.18/block-2022-03-18' of git://git.kernel.dk/linux-block
Pull block updates from Jens Axboe: - BFQ cleanups and fixes (Yu, Zhang, Yahu, Paolo) - blk-rq-qos completion fix (Tejun) - blk-cgroup merge fix (Tejun) - Add offline error return value to distinguish it from an IO error on the device (Song) - IO stats fixes (Zhang, Christoph) - blkcg refcount fixes (Ming, Yu) - Fix for indefinite dispatch loop softlockup (Shin'ichiro) - blk-mq hardware queue management improvements (Ming) - sbitmap dead code removal (Ming, John) - Plugging merge improvements (me) - Show blk-crypto capabilities in sysfs (Eric) - Multiple delayed queue run improvement (David) - Block throttling fixes (Ming) - Start deprecating auto module loading based on dev_t (Christoph) - bio allocation improvements (Christoph, Chaitanya) - Get rid of bio_devname (Christoph) - bio clone improvements (Christoph) - Block plugging improvements (Christoph) - Get rid of genhd.h header (Christoph) - Ensure drivers use appropriate flush helpers (Christoph) - Refcounting improvements (Christoph) - Queue initialization and teardown improvements (Ming, Christoph) - Misc fixes/improvements (Barry, Chaitanya, Colin, Dan, Jiapeng, Lukas, Nian, Yang, Eric, Chengming) * tag 'for-5.18/block-2022-03-18' of git://git.kernel.dk/linux-block: (127 commits) block: cancel all throttled bios in del_gendisk() block: let blkcg_gq grab request queue's refcnt block: avoid use-after-free on throttle data block: limit request dispatch loop duration block/bfq-iosched: Fix spelling mistake "tenative" -> "tentative" sr: simplify the local variable initialization in sr_block_open() block: don't merge across cgroup boundaries if blkcg is enabled block: fix rq-qos breakage from skipping rq_qos_done_bio() block: flush plug based on hardware and software queue order block: ensure plug merging checks the correct queue at least once block: move rq_qos_exit() into disk_release() block: do more work in elevator_exit block: move blk_exit_queue into disk_release block: move q_usage_counter release into blk_queue_release block: don't remove hctx debugfs dir from blk_mq_exit_queue block: move blkcg initialization/destroy into disk allocation/release handler sr: implement ->free_disk to simplify refcounting sd: implement ->free_disk to simplify refcounting sd: delay calling free_opal_dev sd: call sd_zbc_release_disk before releasing the scsi_device reference ...
-rw-r--r--Documentation/ABI/stable/sysfs-block49
-rw-r--r--Documentation/block/biodoc.rst1164
-rw-r--r--Documentation/block/capability.rst2
-rw-r--r--Documentation/block/index.rst1
-rw-r--r--MAINTAINERS1
-rw-r--r--arch/m68k/atari/stdma.c1
-rw-r--r--arch/m68k/bvme6000/config.c1
-rw-r--r--arch/m68k/emu/nfblock.c1
-rw-r--r--arch/m68k/kernel/setup_mm.c1
-rw-r--r--arch/m68k/mvme147/config.c1
-rw-r--r--arch/m68k/mvme16x/config.c1
-rw-r--r--block/Kconfig13
-rw-r--r--block/Makefile3
-rw-r--r--block/bdev.c11
-rw-r--r--block/bfq-cgroup.c16
-rw-r--r--block/bfq-iosched.c37
-rw-r--r--block/bfq-iosched.h2
-rw-r--r--block/bfq-wf2q.c17
-rw-r--r--block/bio-integrity.c1
-rw-r--r--block/bio.c190
-rw-r--r--block/blk-cgroup-rwstat.h2
-rw-r--r--block/blk-cgroup.c20
-rw-r--r--block/blk-cgroup.h494
-rw-r--r--block/blk-core.c293
-rw-r--r--block/blk-crypto-fallback.c2
-rw-r--r--block/blk-crypto-internal.h12
-rw-r--r--block/blk-crypto-sysfs.c172
-rw-r--r--block/blk-crypto.c4
-rw-r--r--block/blk-flush.c4
-rw-r--r--block/blk-iocost.c2
-rw-r--r--block/blk-iolatency.c4
-rw-r--r--block/blk-ioprio.c2
-rw-r--r--block/blk-lib.c46
-rw-r--r--block/blk-merge.c33
-rw-r--r--block/blk-mq-debugfs.c6
-rw-r--r--block/blk-mq-debugfs.h2
-rw-r--r--block/blk-mq-sched.c18
-rw-r--r--block/blk-mq-sysfs.c16
-rw-r--r--block/blk-mq-tag.c6
-rw-r--r--block/blk-mq.c303
-rw-r--r--block/blk-mq.h2
-rw-r--r--block/blk-rq-qos.h20
-rw-r--r--block/blk-sysfs.c44
-rw-r--r--block/blk-throttle.c110
-rw-r--r--block/blk-throttle.h19
-rw-r--r--block/blk-zoned.c14
-rw-r--r--block/blk.h10
-rw-r--r--block/bounce.c11
-rw-r--r--block/disk-events.c2
-rw-r--r--block/elevator.c16
-rw-r--r--block/fops.c35
-rw-r--r--block/genhd.c67
-rw-r--r--block/holder.c2
-rw-r--r--block/partitions/check.h1
-rw-r--r--block/partitions/core.c1
-rw-r--r--block/partitions/efi.h1
-rw-r--r--block/partitions/ldm.h1
-rw-r--r--block/sed-opal.c2
-rw-r--r--drivers/base/class.c2
-rw-r--r--drivers/base/core.c2
-rw-r--r--drivers/base/devtmpfs.c2
-rw-r--r--drivers/block/aoe/aoeblk.c1
-rw-r--r--drivers/block/aoe/aoecmd.c1
-rw-r--r--drivers/block/drbd/drbd_actlog.c5
-rw-r--r--drivers/block/drbd/drbd_bitmap.c7
-rw-r--r--drivers/block/drbd/drbd_int.h1
-rw-r--r--drivers/block/drbd/drbd_receiver.c32
-rw-r--r--drivers/block/drbd/drbd_req.c5
-rw-r--r--drivers/block/drbd/drbd_worker.c4
-rw-r--r--drivers/block/floppy.c4
-rw-r--r--drivers/block/mtip32xx/mtip32xx.c5
-rw-r--r--drivers/block/mtip32xx/mtip32xx.h1
-rw-r--r--drivers/block/pktcdvd.c21
-rw-r--r--drivers/block/rnbd/rnbd-clt.c2
-rw-r--r--drivers/block/rnbd/rnbd-srv-dev.c61
-rw-r--r--drivers/block/rnbd/rnbd-srv-dev.h18
-rw-r--r--drivers/block/rnbd/rnbd-srv-sysfs.c1
-rw-r--r--drivers/block/rnbd/rnbd-srv.c45
-rw-r--r--drivers/block/rnbd/rnbd-srv.h1
-rw-r--r--drivers/block/sunvdc.c1
-rw-r--r--drivers/block/virtio_blk.c66
-rw-r--r--drivers/block/xen-blkback/blkback.c25
-rw-r--r--drivers/block/zram/zram_drv.c17
-rw-r--r--drivers/cdrom/gdrom.c1
-rw-r--r--drivers/char/random.c2
-rw-r--r--drivers/md/Kconfig1
-rw-r--r--drivers/md/bcache/io.c3
-rw-r--r--drivers/md/bcache/journal.c16
-rw-r--r--drivers/md/bcache/movinggc.c4
-rw-r--r--drivers/md/bcache/request.c22
-rw-r--r--drivers/md/bcache/super.c9
-rw-r--r--drivers/md/bcache/writeback.c4
-rw-r--r--drivers/md/dm-cache-target.c26
-rw-r--r--drivers/md/dm-core.h1
-rw-r--r--drivers/md/dm-crypt.c46
-rw-r--r--drivers/md/dm-integrity.c5
-rw-r--r--drivers/md/dm-io.c5
-rw-r--r--drivers/md/dm-log-writes.c39
-rw-r--r--drivers/md/dm-rq.c26
-rw-r--r--drivers/md/dm-snap.c21
-rw-r--r--drivers/md/dm-thin.c41
-rw-r--r--drivers/md/dm-writecache.c7
-rw-r--r--drivers/md/dm-zoned-metadata.c26
-rw-r--r--drivers/md/dm-zoned-target.c3
-rw-r--r--drivers/md/dm.c172
-rw-r--r--drivers/md/md-faulty.c4
-rw-r--r--drivers/md/md-multipath.c13
-rw-r--r--drivers/md/md.c29
-rw-r--r--drivers/md/raid1.c47
-rw-r--r--drivers/md/raid10.c30
-rw-r--r--drivers/md/raid5-cache.c19
-rw-r--r--drivers/md/raid5-ppl.c26
-rw-r--r--drivers/md/raid5.c16
-rw-r--r--drivers/memstick/core/ms_block.c64
-rw-r--r--drivers/memstick/core/ms_block.h1
-rw-r--r--drivers/memstick/core/mspro_block.c57
-rw-r--r--drivers/mtd/mtdswap.c2
-rw-r--r--drivers/mtd/nand/raw/sharpsl.c1
-rw-r--r--drivers/nvdimm/blk.c1
-rw-r--r--drivers/nvdimm/btt.c1
-rw-r--r--drivers/nvdimm/btt_devs.c1
-rw-r--r--drivers/nvdimm/bus.c1
-rw-r--r--drivers/nvdimm/nd_virtio.c6
-rw-r--r--drivers/nvdimm/pfn_devs.c1
-rw-r--r--drivers/nvme/target/io-cmd-bdev.c18
-rw-r--r--drivers/nvme/target/passthru.c7
-rw-r--r--drivers/nvme/target/zns.c14
-rw-r--r--drivers/s390/block/dasd_int.h1
-rw-r--r--drivers/s390/block/scm_blk.c1
-rw-r--r--drivers/s390/block/scm_blk.h1
-rw-r--r--drivers/scsi/scsi_debug.c1
-rw-r--r--drivers/scsi/scsi_lib.c2
-rw-r--r--drivers/scsi/scsicam.c1
-rw-r--r--drivers/scsi/sd.c115
-rw-r--r--drivers/scsi/sd.h12
-rw-r--r--drivers/scsi/sr.c131
-rw-r--r--drivers/scsi/sr.h6
-rw-r--r--drivers/scsi/st.c1
-rw-r--r--drivers/scsi/st.h1
-rw-r--r--drivers/scsi/ufs/ufshpb.c4
-rw-r--r--drivers/target/target_core_iblock.c12
-rw-r--r--drivers/target/target_core_pscsi.c1
-rw-r--r--fs/btrfs/check-integrity.c1
-rw-r--r--fs/btrfs/disk-io.c10
-rw-r--r--fs/btrfs/extent_io.c6
-rw-r--r--fs/buffer.c14
-rw-r--r--fs/crypto/bio.c13
-rw-r--r--fs/dax.c1
-rw-r--r--fs/direct-io.c5
-rw-r--r--fs/erofs/zdata.c5
-rw-r--r--fs/ext4/page-io.c8
-rw-r--r--fs/ext4/readpage.c8
-rw-r--r--fs/f2fs/data.c7
-rw-r--r--fs/fs-writeback.c6
-rw-r--r--fs/gfs2/lops.c8
-rw-r--r--fs/gfs2/meta_io.c4
-rw-r--r--fs/gfs2/ops_fstype.c4
-rw-r--r--fs/gfs2/sys.c2
-rw-r--r--fs/hfs/mdb.c2
-rw-r--r--fs/hfsplus/wrapper.c5
-rw-r--r--fs/iomap/buffered-io.c26
-rw-r--r--fs/iomap/direct-io.c8
-rw-r--r--fs/jfs/jfs_logmgr.c11
-rw-r--r--fs/jfs/jfs_metapage.c9
-rw-r--r--fs/ksmbd/vfs.c1
-rw-r--r--fs/mpage.c34
-rw-r--r--fs/nfs/blocklayout/blocklayout.c26
-rw-r--r--fs/nfs/blocklayout/rpc_pipefs.c1
-rw-r--r--fs/nfsd/blocklayout.c1
-rw-r--r--fs/nilfs2/segbuf.c31
-rw-r--r--fs/ntfs3/fsntfs.c36
-rw-r--r--fs/ocfs2/cluster/heartbeat.c4
-rw-r--r--fs/squashfs/block.c11
-rw-r--r--fs/xfs/xfs_bio_io.c14
-rw-r--r--fs/xfs/xfs_buf.c4
-rw-r--r--fs/xfs/xfs_log.c14
-rw-r--r--fs/zonefs/super.c9
-rw-r--r--include/linux/bio.h38
-rw-r--r--include/linux/blk-cgroup.h461
-rw-r--r--include/linux/blk-mq.h6
-rw-r--r--include/linux/blk_types.h10
-rw-r--r--include/linux/blkdev.h296
-rw-r--r--include/linux/genhd.h291
-rw-r--r--include/linux/part_stat.h2
-rw-r--r--include/linux/sbitmap.h51
-rw-r--r--include/scsi/scsi_cmnd.h9
-rw-r--r--include/scsi/scsi_driver.h9
-rw-r--r--include/trace/events/block.h49
-rw-r--r--init/do_mounts.c1
-rw-r--r--kernel/exit.c2
-rw-r--r--kernel/power/hibernate.c1
-rw-r--r--kernel/power/swap.c6
-rw-r--r--kernel/sched/core.c7
-rw-r--r--lib/sbitmap.c40
-rw-r--r--mm/page_io.c10
-rw-r--r--security/integrity/ima/ima_policy.c1
196 files changed, 2412 insertions, 3998 deletions
diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block
index 8dd3e84a8aad..e8797cd09aff 100644
--- a/Documentation/ABI/stable/sysfs-block
+++ b/Documentation/ABI/stable/sysfs-block
@@ -155,6 +155,55 @@ Description:
last zone of the device which may be smaller.
+What: /sys/block/<disk>/queue/crypto/
+Date: February 2022
+Contact: linux-block@vger.kernel.org
+Description:
+ The presence of this subdirectory of /sys/block/<disk>/queue/
+ indicates that the device supports inline encryption. This
+ subdirectory contains files which describe the inline encryption
+ capabilities of the device. For more information about inline
+ encryption, refer to Documentation/block/inline-encryption.rst.
+
+
+What: /sys/block/<disk>/queue/crypto/max_dun_bits
+Date: February 2022
+Contact: linux-block@vger.kernel.org
+Description:
+ [RO] This file shows the maximum length, in bits, of data unit
+ numbers accepted by the device in inline encryption requests.
+
+
+What: /sys/block/<disk>/queue/crypto/modes/<mode>
+Date: February 2022
+Contact: linux-block@vger.kernel.org
+Description:
+ [RO] For each crypto mode (i.e., encryption/decryption
+ algorithm) the device supports with inline encryption, a file
+ will exist at this location. It will contain a hexadecimal
+ number that is a bitmask of the supported data unit sizes, in
+ bytes, for that crypto mode.
+
+ Currently, the crypto modes that may be supported are:
+
+ * AES-256-XTS
+ * AES-128-CBC-ESSIV
+ * Adiantum
+
+ For example, if a device supports AES-256-XTS inline encryption
+ with data unit sizes of 512 and 4096 bytes, the file
+ /sys/block/<disk>/queue/crypto/modes/AES-256-XTS will exist and
+ will contain "0x1200".
+
+
+What: /sys/block/<disk>/queue/crypto/num_keyslots
+Date: February 2022
+Contact: linux-block@vger.kernel.org
+Description:
+ [RO] This file shows the number of keyslots the device has for
+ use with inline encryption.
+
+
What: /sys/block/<disk>/queue/dax
Date: June 2016
Contact: linux-block@vger.kernel.org
diff --git a/Documentation/block/biodoc.rst b/Documentation/block/biodoc.rst
deleted file mode 100644
index 2098477851a4..000000000000
--- a/Documentation/block/biodoc.rst
+++ /dev/null
@@ -1,1164 +0,0 @@
-=====================================================
-Notes on the Generic Block Layer Rewrite in Linux 2.5
-=====================================================
-
-.. note::
-
- It seems that there are lot of outdated stuff here. This seems
- to be written somewhat as a task list. Yet, eventually, something
- here might still be useful.
-
-Notes Written on Jan 15, 2002:
-
- - Jens Axboe <jens.axboe@oracle.com>
- - Suparna Bhattacharya <suparna@in.ibm.com>
-
-Last Updated May 2, 2002
-
-September 2003: Updated I/O Scheduler portions
- - Nick Piggin <npiggin@kernel.dk>
-
-Introduction
-============
-
-These are some notes describing some aspects of the 2.5 block layer in the
-context of the bio rewrite. The idea is to bring out some of the key
-changes and a glimpse of the rationale behind those changes.
-
-Please mail corrections & suggestions to suparna@in.ibm.com.
-
-Credits
-=======
-
-2.5 bio rewrite:
- - Jens Axboe <jens.axboe@oracle.com>
-
-Many aspects of the generic block layer redesign were driven by and evolved
-over discussions, prior patches and the collective experience of several
-people. See sections 8 and 9 for a list of some related references.
-
-The following people helped with review comments and inputs for this
-document:
-
- - Christoph Hellwig <hch@infradead.org>
- - Arjan van de Ven <arjanv@redhat.com>
- - Randy Dunlap <rdunlap@xenotime.net>
- - Andre Hedrick <andre@linux-ide.org>
-
-The following people helped with fixes/contributions to the bio patches
-while it was still work-in-progress:
-
- - David S. Miller <davem@redhat.com>
-
-
-.. Description of Contents:
-
- 1. Scope for tuning of logic to various needs
- 1.1 Tuning based on device or low level driver capabilities
- - Per-queue parameters
- - Highmem I/O support
- - I/O scheduler modularization
- 1.2 Tuning based on high level requirements/capabilities
- 1.2.1 Request Priority/Latency
- 1.3 Direct access/bypass to lower layers for diagnostics and special
- device operations
- 1.3.1 Pre-built commands
- 2. New flexible and generic but minimalist i/o structure or descriptor
- (instead of using buffer heads at the i/o layer)
- 2.1 Requirements/Goals addressed
- 2.2 The bio struct in detail (multi-page io unit)
- 2.3 Changes in the request structure
- 3. Using bios
- 3.1 Setup/teardown (allocation, splitting)
- 3.2 Generic bio helper routines
- 3.2.1 Traversing segments and completion units in a request
- 3.2.2 Setting up DMA scatterlists
- 3.2.3 I/O completion
- 3.2.4 Implications for drivers that do not interpret bios (don't handle
- multiple segments)
- 3.3 I/O submission
- 4. The I/O scheduler
- 5. Scalability related changes
- 5.1 Granular locking: Removal of io_request_lock
- 5.2 Prepare for transition to 64 bit sector_t
- 6. Other Changes/Implications
- 6.1 Partition re-mapping handled by the generic block layer
- 7. A few tips on migration of older drivers
- 8. A list of prior/related/impacted patches/ideas
- 9. Other References/Discussion Threads
-
-
-Bio Notes
-=========
-
-Let us discuss the changes in the context of how some overall goals for the
-block layer are addressed.
-
-1. Scope for tuning the generic logic to satisfy various requirements
-=====================================================================
-
-The block layer design supports adaptable abstractions to handle common
-processing with the ability to tune the logic to an appropriate extent
-depending on the nature of the device and the requirements of the caller.
-One of the objectives of the rewrite was to increase the degree of tunability
-and to enable higher level code to utilize underlying device/driver
-capabilities to the maximum extent for better i/o performance. This is
-important especially in the light of ever improving hardware capabilities
-and application/middleware software designed to take advantage of these
-capabilities.
-
-1.1 Tuning based on low level device / driver capabilities
-----------------------------------------------------------
-
-Sophisticated devices with large built-in caches, intelligent i/o scheduling
-optimizations, high memory DMA support, etc may find some of the
-generic processing an overhead, while for less capable devices the
-generic functionality is essential for performance or correctness reasons.
-Knowledge of some of the capabilities or parameters of the device should be
-used at the generic block layer to take the right decisions on
-behalf of the driver.
-
-How is this achieved ?
-
-Tuning at a per-queue level:
-
-i. Per-queue limits/values exported to the generic layer by the driver
-
-Various parameters that the generic i/o scheduler logic uses are set at
-a per-queue level (e.g maximum request size, maximum number of segments in
-a scatter-gather list, logical block size)
-
-Some parameters that were earlier available as global arrays indexed by
-major/minor are now directly associated with the queue. Some of these may
-move into the block device structure in the future. Some characteristics
-have been incorporated into a queue flags field rather than separate fields
-in themselves. There are blk_queue_xxx functions to set the parameters,
-rather than update the fields directly
-
-Some new queue property settings:
-
- blk_queue_bounce_limit(q, u64 dma_address)
- Enable I/O to highmem pages, dma_address being the
- limit. No highmem default.
-
- blk_queue_max_sectors(q, max_sectors)
- Sets two variables that limit the size of the request.
-
- - The request queue's max_sectors, which is a soft size in
- units of 512 byte sectors, and could be dynamically varied
- by the core kernel.
-
- - The request queue's max_hw_sectors, which is a hard limit
- and reflects the maximum size request a driver can handle
- in units of 512 byte sectors.
-
- The default for both max_sectors and max_hw_sectors is
- 255. The upper limit of max_sectors is 1024.
-
- blk_queue_max_phys_segments(q, max_segments)
- Maximum physical segments you can handle in a request. 128
- default (driver limit). (See 3.2.2)
-
- blk_queue_max_hw_segments(q, max_segments)
- Maximum dma segments the hardware can handle in a request. 128
- default (host adapter limit, after dma remapping).
- (See 3.2.2)
-
- blk_queue_max_segment_size(q, max_seg_size)
- Maximum size of a clustered segment, 64kB default.
-
- blk_queue_logical_block_size(q, logical_block_size)
- Lowest possible sector size that the hardware can operate
- on, 512 bytes default.
-
-New queue flags:
-
- - QUEUE_FLAG_CLUSTER (see 3.2.2)
- - QUEUE_FLAG_QUEUED (see 3.2.4)
-
-
-ii. High-mem i/o capabilities are now considered the default
-
-The generic bounce buffer logic, present in 2.4, where the block layer would
-by default copyin/out i/o requests on high-memory buffers to low-memory buffers
-assuming that the driver wouldn't be able to handle it directly, has been
-changed in 2.5. The bounce logic is now applied only for memory ranges
-for which the device cannot handle i/o. A driver can specify this by
-setting the queue bounce limit for the request queue for the device
-(blk_queue_bounce_limit()). This avoids the inefficiencies of the copyin/out
-where a device is capable of handling high memory i/o.
-
-In order to enable high-memory i/o where the device is capable of supporting
-it, the pci dma mapping routines and associated data structures have now been
-modified to accomplish a direct page -> bus translation, without requiring
-a virtual address mapping (unlike the earlier scheme of virtual address
--> bus translation). So this works uniformly for high-memory pages (which
-do not have a corresponding kernel virtual address space mapping) and
-low-memory pages.
-
-Note: Please refer to Documentation/core-api/dma-api-howto.rst for a discussion
-on PCI high mem DMA aspects and mapping of scatter gather lists, and support
-for 64 bit PCI.
-
-Special handling is required only for cases where i/o needs to happen on
-pages at physical memory addresses beyond what the device can support. In these
-cases, a bounce bio representing a buffer from the supported memory range
-is used for performing the i/o with copyin/copyout as needed depending on
-the type of the operation. For example, in case of a read operation, the
-data read has to be copied to the original buffer on i/o completion, so a
-callback routine is set up to do this, while for write, the data is copied
-from the original buffer to the bounce buffer prior to issuing the
-operation. Since an original buffer may be in a high memory area that's not
-mapped in kernel virtual addr, a kmap operation may be required for
-performing the copy, and special care may be needed in the completion path
-as it may not be in irq context. Special care is also required (by way of
-GFP flags) when allocating bounce buffers, to avoid certain highmem
-deadlock possibilities.
-
-It is also possible that a bounce buffer may be allocated from high-memory
-area that's not mapped in kernel virtual addr, but within the range that the
-device can use directly; so the bounce page may need to be kmapped during
-copy operations. [Note: This does not hold in the current implementation,
-though]
-
-There are some situations when pages from high memory may need to
-be kmapped, even if bounce buffers are not necessary. For example a device
-may need to abort DMA operations and revert to PIO for the transfer, in
-which case a virtual mapping of the page is required. For SCSI it is also
-done in some scenarios where the low level driver cannot be trusted to
-handle a single sg entry correctly. The driver is expected to perform the
-kmaps as needed on such occasions as appropriate. A driver could also use
-the blk_queue_bounce() routine on its own to bounce highmem i/o to low
-memory for specific requests if so desired.
-
-iii. The i/o scheduler algorithm itself can be replaced/set as appropriate
-
-As in 2.4, it is possible to plugin a brand new i/o scheduler for a particular
-queue or pick from (copy) existing generic schedulers and replace/override
-certain portions of it. The 2.5 rewrite provides improved modularization
-of the i/o scheduler. There are more pluggable callbacks, e.g for init,
-add request, extract request, which makes it possible to abstract specific
-i/o scheduling algorithm aspects and details outside of the generic loop.
-It also makes it possible to completely hide the implementation details of
-the i/o scheduler from block drivers.
-
-I/O scheduler wrappers are to be used instead of accessing the queue directly.
-See section 4. The I/O scheduler for details.
-
-1.2 Tuning Based on High level code capabilities
-------------------------------------------------
-
-i. Application capabilities for raw i/o
-
-This comes from some of the high-performance database/middleware
-requirements where an application prefers to make its own i/o scheduling
-decisions based on an understanding of the access patterns and i/o
-characteristics
-
-ii. High performance filesystems or other higher level kernel code's
-capabilities
-
-Kernel components like filesystems could also take their own i/o scheduling
-decisions for optimizing performance. Journalling filesystems may need
-some control over i/o ordering.
-
-What kind of support exists at the generic block layer for this ?
-
-The flags and rw fields in the bio structure can be used for some tuning
-from above e.g indicating that an i/o is just a readahead request, or priority
-settings (currently unused). As far as user applications are concerned they
-would need an additional mechanism either via open flags or ioctls, or some
-other upper level mechanism to communicate such settings to block.
-
-1.2.1 Request Priority/Latency
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-Todo/Under discussion::
-
- Arjan's proposed request priority scheme allows higher levels some broad
- control (high/med/low) over the priority of an i/o request vs other pending
- requests in the queue. For example it allows reads for bringing in an
- executable page on demand to be given a higher priority over pending write
- requests which haven't aged too much on the queue. Potentially this priority
- could even be exposed to applications in some manner, providing higher level
- tunability. Time based aging avoids starvation of lower priority
- requests. Some bits in the bi_opf flags field in the bio structure are
- intended to be used for this priority information.
-
-
-1.3 Direct Access to Low level Device/Driver Capabilities (Bypass mode)
------------------------------------------------------------------------
-
-(e.g Diagnostics, Systems Management)
-
-There are situations where high-level code needs to have direct access to
-the low level device capabilities or requires the ability to issue commands
-to the device bypassing some of the intermediate i/o layers.
-These could, for example, be special control commands issued through ioctl
-interfaces, or could be raw read/write commands that stress the drive's
-capabilities for certain kinds of fitness tests. Having direct interfaces at
-multiple levels without having to pass through upper layers makes
-it possible to perform bottom up validation of the i/o path, layer by
-layer, starting from the media.
-
-The normal i/o submission interfaces, e.g submit_bio, could be bypassed
-for specially crafted requests which such ioctl or diagnostics
-interfaces would typically use, and the elevator add_request routine
-can instead be used to directly insert such requests in the queue or preferably
-the blk_do_rq routine can be used to place the request on the queue and
-wait for completion. Alternatively, sometimes the caller might just
-invoke a lower level driver specific interface with the request as a
-parameter.
-
-If the request is a means for passing on special information associated with
-the command, then such information is associated with the request->special
-field (rather than misuse the request->buffer field which is meant for the
-request data buffer's virtual mapping).
-
-For passing request data, the caller must build up a bio descriptor
-representing the concerned memory buffer if the underlying driver interprets
-bio segments or uses the block layer end*request* functions for i/o
-completion. Alternatively one could directly use the request->buffer field to
-specify the virtual address of the buffer, if the driver expects buffer
-addresses passed in this way and ignores bio entries for the request type
-involved. In the latter case, the driver would modify and manage the
-request->buffer, request->sector and request->nr_sectors or
-request->current_nr_sectors fields itself rather than using the block layer
-end_request or end_that_request_first completion interfaces.
-(See 2.3 or Documentation/block/request.rst for a brief explanation of
-the request structure fields)
-
-::
-
- [TBD: end_that_request_last should be usable even in this case;
- Perhaps an end_that_direct_request_first routine could be implemented to make
- handling direct requests easier for such drivers; Also for drivers that
- expect bios, a helper function could be provided for setting up a bio
- corresponding to a data buffer]
-
- <JENS: I dont understand the above, why is end_that_request_first() not
- usable? Or _last for that matter. I must be missing something>
-
- <SUP: What I meant here was that if the request doesn't have a bio, then
- end_that_request_first doesn't modify nr_sectors or current_nr_sectors,
- and hence can't be used for advancing request state settings on the
- completion of partial transfers. The driver has to modify these fields
- directly by hand.
- This is because end_that_request_first only iterates over the bio list,
- and always returns 0 if there are none associated with the request.
- _last works OK in this case, and is not a problem, as I mentioned earlier
- >
-
-1.3.1 Pre-built Commands
-^^^^^^^^^^^^^^^^^^^^^^^^
-
-A request can be created with a pre-built custom command to be sent directly
-to the device. The cmd block in the request structure has room for filling
-in the command bytes. (i.e rq->cmd is now 16 bytes in size, and meant for
-command pre-building, and the type of the request is now indicated
-through rq->flags instead of via rq->cmd)
-
-The request structure flags can be set up to indicate the type of request
-in such cases (REQ_PC: direct packet command passed to driver, REQ_BLOCK_PC:
-packet command issued via blk_do_rq, REQ_SPECIAL: special request).
-
-It can help to pre-build device commands for requests in advance.
-Drivers can now specify a request prepare function (q->prep_rq_fn) that the
-block layer would invoke to pre-build device commands for a given request,
-or perform other preparatory processing for the request. This is routine is
-called by elv_next_request(), i.e. typically just before servicing a request.
-(The prepare function would not be called for requests that have RQF_DONTPREP
-enabled)
-
-Aside:
- Pre-building could possibly even be done early, i.e before placing the
- request on the queue, rather than construct the command on the fly in the
- driver while servicing the request queue when it may affect latencies in
- interrupt context or responsiveness in general. One way to add early
- pre-building would be to do it whenever we fail to merge on a request.
- Now REQ_NOMERGE is set in the request flags to skip this one in the future,
- which means that it will not change before we feed it to the device. So
- the pre-builder hook can be invoked there.
-
-
-2. Flexible and generic but minimalist i/o structure/descriptor
-===============================================================
-
-2.1 Reason for a new structure and requirements addressed
----------------------------------------------------------
-
-Prior to 2.5, buffer heads were used as the unit of i/o at the generic block
-layer, and the low level request structure was associated with a chain of
-buffer heads for a contiguous i/o request. This led to certain inefficiencies
-when it came to large i/o requests and readv/writev style operations, as it
-forced such requests to be broken up into small chunks before being passed
-on to the generic block layer, only to be merged by the i/o scheduler
-when the underlying device was capable of handling the i/o in one shot.
-Also, using the buffer head as an i/o structure for i/os that didn't originate
-from the buffer cache unnecessarily added to the weight of the descriptors
-which were generated for each such chunk.
-
-The following were some of the goals and expectations considered in the
-redesign of the block i/o data structure in 2.5.
-
-1. Should be appropriate as a descriptor for both raw and buffered i/o -
- avoid cache related fields which are irrelevant in the direct/page i/o path,
- or filesystem block size alignment restrictions which may not be relevant
- for raw i/o.
-2. Ability to represent high-memory buffers (which do not have a virtual
- address mapping in kernel address space).
-3. Ability to represent large i/os w/o unnecessarily breaking them up (i.e
- greater than PAGE_SIZE chunks in one shot)
-4. At the same time, ability to retain independent identity of i/os from
- different sources or i/o units requiring individual completion (e.g. for
- latency reasons)
-5. Ability to represent an i/o involving multiple physical memory segments
- (including non-page aligned page fragments, as specified via readv/writev)
- without unnecessarily breaking it up, if the underlying device is capable of
- handling it.
-6. Preferably should be based on a memory descriptor structure that can be
- passed around different types of subsystems or layers, maybe even
- networking, without duplication or extra copies of data/descriptor fields
- themselves in the process
-7. Ability to handle the possibility of splits/merges as the structure passes
- through layered drivers (lvm, md, evms), with minimal overhead.
-
-The solution was to define a new structure (bio) for the block layer,
-instead of using the buffer head structure (bh) directly, the idea being
-avoidance of some associated baggage and limitations. The bio structure
-is uniformly used for all i/o at the block layer ; it forms a part of the
-bh structure for buffered i/o, and in the case of raw/direct i/o kiobufs are
-mapped to bio structures.
-
-2.2 The bio struct
-------------------
-
-The bio structure uses a vector representation pointing to an array of tuples
-of <page, offset, len> to describe the i/o buffer, and has various other
-fields describing i/o parameters and state that needs to be maintained for
-performing the i/o.
-
-Notice that this representation means that a bio has no virtual address
-mapping at all (unlike buffer heads).
-
-::
-
- struct bio_vec {
- struct page *bv_page;
- unsigned short bv_len;
- unsigned short bv_offset;
- };
-
- /*
- * main unit of I/O for the block layer and lower layers (ie drivers)
- */
- struct bio {
- struct bio *bi_next; /* request queue link */
- struct block_device *bi_bdev; /* target device */
- unsigned long bi_flags; /* status, command, etc */
- unsigned long bi_opf; /* low bits: r/w, high: priority */
-
- unsigned int bi_vcnt; /* how may bio_vec's */
- struct bvec_iter bi_iter; /* current index into bio_vec array */
-
- unsigned int bi_size; /* total size in bytes */
- unsigned short bi_hw_segments; /* segments after DMA remapping */
- unsigned int bi_max; /* max bio_vecs we can hold
- used as index into pool */
- struct bio_vec *bi_io_vec; /* the actual vec list */
- bio_end_io_t *bi_end_io; /* bi_end_io (bio) */
- atomic_t bi_cnt; /* pin count: free when it hits zero */
- void *bi_private;
- };
-
-With this multipage bio design:
-
-- Large i/os can be sent down in one go using a bio_vec list consisting
- of an array of <page, offset, len> fragments (similar to the way fragments
- are represented in the zero-copy network code)
-- Splitting of an i/o request across multiple devices (as in the case of
- lvm or raid) is achieved by cloning the bio (where the clone points to
- the same bi_io_vec array, but with the index and size accordingly modified)
-- A linked list of bios is used as before for unrelated merges [#]_ - this
- avoids reallocs and makes independent completions easier to handle.
-- Code that traverses the req list can find all the segments of a bio
- by using rq_for_each_segment. This handles the fact that a request
- has multiple bios, each of which can have multiple segments.
-- Drivers which can't process a large bio in one shot can use the bi_iter
- field to keep track of the next bio_vec entry to process.
- (e.g a 1MB bio_vec needs to be handled in max 128kB chunks for IDE)
- [TBD: Should preferably also have a bi_voffset and bi_vlen to avoid modifying
- bi_offset an len fields]
-
-.. [#]
-
- unrelated merges -- a request ends up containing two or more bios that
- didn't originate from the same place.
-
-bi_end_io() i/o callback gets called on i/o completion of the entire bio.
-
-At a lower level, drivers build a scatter gather list from the merged bios.
-The scatter gather list is in the form of an array of <page, offset, len>
-entries with their corresponding dma address mappings filled in at the
-appropriate time. As an optimization, contiguous physical pages can be
-covered by a single entry where <page> refers to the first page and <len>
-covers the range of pages (up to 16 contiguous pages could be covered this
-way). There is a helper routine (blk_rq_map_sg) which drivers can use to build
-the sg list.
-
-Note: Right now the only user of bios with more than one page is ll_rw_kio,
-which in turn means that only raw I/O uses it (direct i/o may not work
-right now). The intent however is to enable clustering of pages etc to
-become possible. The pagebuf abstraction layer from SGI also uses multi-page
-bios, but that is currently not included in the stock development kernels.
-The same is true of Andrew Morton's work-in-progress multipage bio writeout
-and readahead patches.
-
-2.3 Changes in the Request Structure
-------------------------------------
-
-The request structure is the structure that gets passed down to low level
-drivers. The block layer make_request function builds up a request structure,
-places it on the queue and invokes the drivers request_fn. The driver makes
-use of block layer helper routine elv_next_request to pull the next request
-off the queue. Control or diagnostic functions might bypass block and directly
-invoke underlying driver entry points passing in a specially constructed
-request structure.
-
-Only some relevant fields (mainly those which changed or may be referred
-to in some of the discussion here) are listed below, not necessarily in
-the order in which they occur in the structure (see include/linux/blkdev.h)
-Refer to Documentation/block/request.rst for details about all the request
-structure fields and a quick reference about the layers which are
-supposed to use or modify those fields::
-
- struct request {
- struct list_head queuelist; /* Not meant to be directly accessed by
- the driver.
- Used by q->elv_next_request_fn
- rq->queue is gone
- */
- .
- .
- unsigned char cmd[16]; /* prebuilt command data block */
- unsigned long flags; /* also includes earlier rq->cmd settings */
- .
- .
- sector_t sector; /* this field is now of type sector_t instead of int
- preparation for 64 bit sectors */
- .
- .
-
- /* Number of scatter-gather DMA addr+len pairs after
- * physical address coalescing is performed.
- */
- unsigned short nr_phys_segments;
-
- /* Number of scatter-gather addr+len pairs after
- * physical and DMA remapping hardware coalescing is performed.
- * This is the number of scatter-gather entries the driver
- * will actually have to deal with after DMA mapping is done.
- */
- unsigned short nr_hw_segments;
-
- /* Various sector counts */
- unsigned long nr_sectors; /* no. of sectors left: driver modifiable */
- unsigned long hard_nr_sectors; /* block internal copy of above */
- unsigned int current_nr_sectors; /* no. of sectors left in the
- current segment:driver modifiable */
- unsigned long hard_cur_sectors; /* block internal copy of the above */
- .
- .
- int tag; /* command tag associated with request */
- void *special; /* same as before */
- char *buffer; /* valid only for low memory buffers up to
- current_nr_sectors */
- .
- .
- struct bio *bio, *biotail; /* bio list instead of bh */
- struct request_list *rl;
- }
-
-See the req_ops and req_flag_bits definitions for an explanation of the various
-flags available. Some bits are used by the block layer or i/o scheduler.
-
-The behaviour of the various sector counts are almost the same as before,
-except that since we have multi-segment bios, current_nr_sectors refers
-to the numbers of sectors in the current segment being processed which could
-be one of the many segments in the current bio (i.e i/o completion unit).
-The nr_sectors value refers to the total number of sectors in the whole
-request that remain to be transferred (no change). The purpose of the
-hard_xxx values is for block to remember these counts every time it hands
-over the request to the driver. These values are updated by block on
-end_that_request_first, i.e. every time the driver completes a part of the
-transfer and invokes block end*request helpers to mark this. The
-driver should not modify these values. The block layer sets up the
-nr_sectors and current_nr_sectors fields (based on the corresponding
-hard_xxx values and the number of bytes transferred) and updates it on
-every transfer that invokes end_that_request_first. It does the same for the
-buffer, bio, bio->bi_iter fields too.
-
-The buffer field is just a virtual address mapping of the current segment
-of the i/o buffer in cases where the buffer resides in low-memory. For high
-memory i/o, this field is not valid and must not be used by drivers.
-
-Code that sets up its own request structures and passes them down to
-a driver needs to be careful about interoperation with the block layer helper
-functions which the driver uses. (Section 1.3)
-
-3. Using bios
-=============
-
-3.1 Setup/Teardown
-------------------
-
-There are routines for managing the allocation, and reference counting, and
-freeing of bios (bio_alloc, bio_get, bio_put).
-
-This makes use of Ingo Molnar's mempool implementation, which enables
-subsystems like bio to maintain their own reserve memory pools for guaranteed
-deadlock-free allocations during extreme VM load. For example, the VM
-subsystem makes use of the block layer to writeout dirty pages in order to be
-able to free up memory space, a case which needs careful handling. The
-allocation logic draws from the preallocated emergency reserve in situations
-where it cannot allocate through normal means. If the pool is empty and it
-can wait, then it would trigger action that would help free up memory or
-replenish the pool (without deadlocking) and wait for availability in the pool.
-If it is in IRQ context, and hence not in a position to do this, allocation
-could fail if the pool is empty. In general mempool always first tries to
-perform allocation without having to wait, even if it means digging into the
-pool as long it is not less that 50% full.
-
-On a free, memory is released to the pool or directly freed depending on
-the current availability in the pool. The mempool interface lets the
-subsystem specify the routines to be used for normal alloc and free. In the
-case of bio, these routines make use of the standard slab allocator.
-
-The caller of bio_alloc is expected to taken certain steps to avoid
-deadlocks, e.g. avoid trying to allocate more memory from the pool while
-already holding memory obtained from the pool.
-
-::
-
- [TBD: This is a potential issue, though a rare possibility
- in the bounce bio allocation that happens in the current code, since
- it ends up allocating a second bio from the same pool while
- holding the original bio ]
-
-Memory allocated from the pool should be released back within a limited
-amount of time (in the case of bio, that would be after the i/o is completed).
-This ensures that if part of the pool has been used up, some work (in this
-case i/o) must already be in progress and memory would be available when it
-is over. If allocating from multiple pools in the same code path, the order
-or hierarchy of allocation needs to be consistent, just the way one deals
-with multiple locks.
-
-The bio_alloc routine also needs to allocate the bio_vec_list (bvec_alloc())
-for a non-clone bio. There are the 6 pools setup for different size biovecs,
-so bio_alloc(gfp_mask, nr_iovecs) will allocate a vec_list of the
-given size from these slabs.
-
-The bio_get() routine may be used to hold an extra reference on a bio prior
-to i/o submission, if the bio fields are likely to be accessed after the
-i/o is issued (since the bio may otherwise get freed in case i/o completion
-happens in the meantime).
-
-The bio_clone_fast() routine may be used to duplicate a bio, where the clone
-shares the bio_vec_list with the original bio (i.e. both point to the
-same bio_vec_list). This would typically be used for splitting i/o requests
-in lvm or md.
-
-3.2 Generic bio helper Routines
--------------------------------
-
-3.2.1 Traversing segments and completion units in a request
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-The macro rq_for_each_segment() should be used for traversing the bios
-in the request list (drivers should avoid directly trying to do it
-themselves). Using these helpers should also make it easier to cope
-with block changes in the future.
-
-::
-
- struct req_iterator iter;
- rq_for_each_segment(bio_vec, rq, iter)
- /* bio_vec is now current segment */
-
-I/O completion callbacks are per-bio rather than per-segment, so drivers
-that traverse bio chains on completion need to keep that in mind. Drivers
-which don't make a distinction between segments and completion units would
-need to be reorganized to support multi-segment bios.
-
-3.2.2 Setting up DMA scatterlists
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-The blk_rq_map_sg() helper routine would be used for setting up scatter
-gather lists from a request, so a driver need not do it on its own.
-
- nr_segments = blk_rq_map_sg(q, rq, scatterlist);
-
-The helper routine provides a level of abstraction which makes it easier
-to modify the internals of request to scatterlist conversion down the line
-without breaking drivers. The blk_rq_map_sg routine takes care of several
-things like collapsing physically contiguous segments (if QUEUE_FLAG_CLUSTER
-is set) and correct segment accounting to avoid exceeding the limits which
-the i/o hardware can handle, based on various queue properties.
-
-- Prevents a clustered segment from crossing a 4GB mem boundary
-- Avoids building segments that would exceed the number of physical
- memory segments that the driver can handle (phys_segments) and the
- number that the underlying hardware can handle at once, accounting for
- DMA remapping (hw_segments) (i.e. IOMMU aware limits).
-
-Routines which the low level driver can use to set up the segment limits:
-
-blk_queue_max_hw_segments() : Sets an upper limit of the maximum number of
-hw data segments in a request (i.e. the maximum number of address/length
-pairs the host adapter can actually hand to the device at once)
-
-blk_queue_max_phys_segments() : Sets an upper limit on the maximum number
-of physical data segments in a request (i.e. the largest sized scatter list
-a driver could handle)
-
-3.2.3 I/O completion
-^^^^^^^^^^^^^^^^^^^^
-
-The existing generic block layer helper routines end_request,
-end_that_request_first and end_that_request_last can be used for i/o
-completion (and setting things up so the rest of the i/o or the next
-request can be kicked of) as before. With the introduction of multi-page
-bio support, end_that_request_first requires an additional argument indicating
-the number of sectors completed.
-
-3.2.4 Implications for drivers that do not interpret bios
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-(don't handle multiple segments)
-
-Drivers that do not interpret bios e.g those which do not handle multiple
-segments and do not support i/o into high memory addresses (require bounce
-buffers) and expect only virtually mapped buffers, can access the rq->buffer
-field. As before the driver should use current_nr_sectors to determine the
-size of remaining data in the current segment (that is the maximum it can
-transfer in one go unless it interprets segments), and rely on the block layer
-end_request, or end_that_request_first/last to take care of all accounting
-and transparent mapping of the next bio segment when a segment boundary
-is crossed on completion of a transfer. (The end*request* functions should
-be used if only if the request has come down from block/bio path, not for
-direct access requests which only specify rq->buffer without a valid rq->bio)
-
-3.3 I/O Submission
-------------------
-
-The routine submit_bio() is used to submit a single io. Higher level i/o
-routines make use of this:
-
-(a) Buffered i/o:
-
-The routine submit_bh() invokes submit_bio() on a bio corresponding to the
-bh, allocating the bio if required. ll_rw_block() uses submit_bh() as before.
-
-(b) Kiobuf i/o (for raw/direct i/o):
-
-The ll_rw_kio() routine breaks up the kiobuf into page sized chunks and
-maps the array to one or more multi-page bios, issuing submit_bio() to
-perform the i/o on each of these.
-
-The embedded bh array in the kiobuf structure has been removed and no
-preallocation of bios is done for kiobufs. [The intent is to remove the
-blocks array as well, but it's currently in there to kludge around direct i/o.]
-Thus kiobuf allocation has switched back to using kmalloc rather than vmalloc.
-
-Todo/Observation:
-
- A single kiobuf structure is assumed to correspond to a contiguous range
- of data, so brw_kiovec() invokes ll_rw_kio for each kiobuf in a kiovec.
- So right now it wouldn't work for direct i/o on non-contiguous blocks.
- This is to be resolved. The eventual direction is to replace kiobuf
- by kvec's.
-
- Badari Pulavarty has a patch to implement direct i/o correctly using
- bio and kvec.
-
-
-(c) Page i/o:
-
-Todo/Under discussion:
-
- Andrew Morton's multi-page bio patches attempt to issue multi-page
- writeouts (and reads) from the page cache, by directly building up
- large bios for submission completely bypassing the usage of buffer
- heads. This work is still in progress.
-
- Christoph Hellwig had some code that uses bios for page-io (rather than
- bh). This isn't included in bio as yet. Christoph was also working on a
- design for representing virtual/real extents as an entity and modifying
- some of the address space ops interfaces to utilize this abstraction rather
- than buffer_heads. (This is somewhat along the lines of the SGI XFS pagebuf
- abstraction, but intended to be as lightweight as possible).
-
-(d) Direct access i/o:
-
-Direct access requests that do not contain bios would be submitted differently
-as discussed earlier in section 1.3.
-
-Aside:
-
- Kvec i/o:
-
- Ben LaHaise's aio code uses a slightly different structure instead
- of kiobufs, called a kvec_cb. This contains an array of <page, offset, len>
- tuples (very much like the networking code), together with a callback function
- and data pointer. This is embedded into a brw_cb structure when passed
- to brw_kvec_async().
-
- Now it should be possible to directly map these kvecs to a bio. Just as while
- cloning, in this case rather than PRE_BUILT bio_vecs, we set the bi_io_vec
- array pointer to point to the veclet array in kvecs.
-
- TBD: In order for this to work, some changes are needed in the way multi-page
- bios are handled today. The values of the tuples in such a vector passed in
- from higher level code should not be modified by the block layer in the course
- of its request processing, since that would make it hard for the higher layer
- to continue to use the vector descriptor (kvec) after i/o completes. Instead,
- all such transient state should either be maintained in the request structure,
- and passed on in some way to the endio completion routine.
-
-
-4. The I/O scheduler
-====================
-
-I/O scheduler, a.k.a. elevator, is implemented in two layers. Generic dispatch
-queue and specific I/O schedulers. Unless stated otherwise, elevator is used
-to refer to both parts and I/O scheduler to specific I/O schedulers.
-
-Block layer implements generic dispatch queue in `block/*.c`.
-The generic dispatch queue is responsible for requeueing, handling non-fs
-requests and all other subtleties.
-
-Specific I/O schedulers are responsible for ordering normal filesystem
-requests. They can also choose to delay certain requests to improve
-throughput or whatever purpose. As the plural form indicates, there are
-multiple I/O schedulers. They can be built as modules but at least one should
-be built inside the kernel. Each queue can choose different one and can also
-change to another one dynamically.
-
-A block layer call to the i/o scheduler follows the convention elv_xxx(). This
-calls elevator_xxx_fn in the elevator switch (block/elevator.c). Oh, xxx
-and xxx might not match exactly, but use your imagination. If an elevator
-doesn't implement a function, the switch does nothing or some minimal house
-keeping work.
-
-4.1. I/O scheduler API
-----------------------
-
-The functions an elevator may implement are: (* are mandatory)
-
-=============================== ================================================
-elevator_merge_fn called to query requests for merge with a bio
-
-elevator_merge_req_fn called when two requests get merged. the one
- which gets merged into the other one will be
- never seen by I/O scheduler again. IOW, after
- being merged, the request is gone.
-
-elevator_merged_fn called when a request in the scheduler has been
- involved in a merge. It is used in the deadline
- scheduler for example, to reposition the request
- if its sorting order has changed.
-
-elevator_allow_merge_fn called whenever the block layer determines
- that a bio can be merged into an existing
- request safely. The io scheduler may still
- want to stop a merge at this point if it
- results in some sort of conflict internally,
- this hook allows it to do that. Note however
- that two *requests* can still be merged at later
- time. Currently the io scheduler has no way to
- prevent that. It can only learn about the fact
- from elevator_merge_req_fn callback.
-
-elevator_dispatch_fn* fills the dispatch queue with ready requests.
- I/O schedulers are free to postpone requests by
- not filling the dispatch queue unless @force
- is non-zero. Once dispatched, I/O schedulers
- are not allowed to manipulate the requests -
- they belong to generic dispatch queue.
-
-elevator_add_req_fn* called to add a new request into the scheduler
-
-elevator_former_req_fn
-elevator_latter_req_fn These return the request before or after the
- one specified in disk sort order. Used by the
- block layer to find merge possibilities.
-
-elevator_completed_req_fn called when a request is completed.
-
-elevator_set_req_fn
-elevator_put_req_fn Must be used to allocate and free any elevator
- specific storage for a request.
-
-elevator_activate_req_fn Called when device driver first sees a request.
- I/O schedulers can use this callback to
- determine when actual execution of a request
- starts.
-elevator_deactivate_req_fn Called when device driver decides to delay
- a request by requeueing it.
-
-elevator_init_fn*
-elevator_exit_fn Allocate and free any elevator specific storage
- for a queue.
-=============================== ================================================
-
-4.2 Request flows seen by I/O schedulers
-----------------------------------------
-
-All requests seen by I/O schedulers strictly follow one of the following three
-flows.
-
- set_req_fn ->
-
- i. add_req_fn -> (merged_fn ->)* -> dispatch_fn -> activate_req_fn ->
- (deactivate_req_fn -> activate_req_fn ->)* -> completed_req_fn
- ii. add_req_fn -> (merged_fn ->)* -> merge_req_fn
- iii. [none]
-
- -> put_req_fn
-
-4.3 I/O scheduler implementation
---------------------------------
-
-The generic i/o scheduler algorithm attempts to sort/merge/batch requests for
-optimal disk scan and request servicing performance (based on generic
-principles and device capabilities), optimized for:
-
-i. improved throughput
-ii. improved latency
-iii. better utilization of h/w & CPU time
-
-Characteristics:
-
-i. Binary tree
-AS and deadline i/o schedulers use red black binary trees for disk position
-sorting and searching, and a fifo linked list for time-based searching. This
-gives good scalability and good availability of information. Requests are
-almost always dispatched in disk sort order, so a cache is kept of the next
-request in sort order to prevent binary tree lookups.
-
-This arrangement is not a generic block layer characteristic however, so
-elevators may implement queues as they please.
-
-ii. Merge hash
-AS and deadline use a hash table indexed by the last sector of a request. This
-enables merging code to quickly look up "back merge" candidates, even when
-multiple I/O streams are being performed at once on one disk.
-
-"Front merges", a new request being merged at the front of an existing request,
-are far less common than "back merges" due to the nature of most I/O patterns.
-Front merges are handled by the binary trees in AS and deadline schedulers.
-
-iii. Plugging the queue to batch requests in anticipation of opportunities for
- merge/sort optimizations
-
-Plugging is an approach that the current i/o scheduling algorithm resorts to so
-that it collects up enough requests in the queue to be able to take
-advantage of the sorting/merging logic in the elevator. If the
-queue is empty when a request comes in, then it plugs the request queue
-(sort of like plugging the bath tub of a vessel to get fluid to build up)
-till it fills up with a few more requests, before starting to service
-the requests. This provides an opportunity to merge/sort the requests before
-passing them down to the device. There are various conditions when the queue is
-unplugged (to open up the flow again), either through a scheduled task or
-could be on demand. For example wait_on_buffer sets the unplugging going
-through sync_buffer() running blk_run_address_space(mapping). Or the caller
-can do it explicity through blk_unplug(bdev). So in the read case,
-the queue gets explicitly unplugged as part of waiting for completion on that
-buffer.
-
-Aside:
- This is kind of controversial territory, as it's not clear if plugging is
- always the right thing to do. Devices typically have their own queues,
- and allowing a big queue to build up in software, while letting the device be
- idle for a while may not always make sense. The trick is to handle the fine
- balance between when to plug and when to open up. Also now that we have
- multi-page bios being queued in one shot, we may not need to wait to merge
- a big request from the broken up pieces coming by.
-
-4.4 I/O contexts
-----------------
-
-I/O contexts provide a dynamically allocated per process data area. They may
-be used in I/O schedulers, and in the block layer (could be used for IO statis,
-priorities for example). See `*io_context` in block/ll_rw_blk.c, and as-iosched.c
-for an example of usage in an i/o scheduler.
-
-
-5. Scalability related changes
-==============================
-
-5.1 Granular Locking: io_request_lock replaced by a per-queue lock
-------------------------------------------------------------------
-
-The global io_request_lock has been removed as of 2.5, to avoid
-the scalability bottleneck it was causing, and has been replaced by more
-granular locking. The request queue structure has a pointer to the
-lock to be used for that queue. As a result, locking can now be
-per-queue, with a provision for sharing a lock across queues if
-necessary (e.g the scsi layer sets the queue lock pointers to the
-corresponding adapter lock, which results in a per host locking
-granularity). The locking semantics are the same, i.e. locking is
-still imposed by the block layer, grabbing the lock before
-request_fn execution which it means that lots of older drivers
-should still be SMP safe. Drivers are free to drop the queue
-lock themselves, if required. Drivers that explicitly used the
-io_request_lock for serialization need to be modified accordingly.
-Usually it's as easy as adding a global lock::
-
- static DEFINE_SPINLOCK(my_driver_lock);
-
-and passing the address to that lock to blk_init_queue().
-
-5.2 64 bit sector numbers (sector_t prepares for 64 bit support)
-----------------------------------------------------------------
-
-The sector number used in the bio structure has been changed to sector_t,
-which could be defined as 64 bit in preparation for 64 bit sector support.
-
-6. Other Changes/Implications
-=============================
-
-6.1 Partition re-mapping handled by the generic block layer
------------------------------------------------------------
-
-In 2.5 some of the gendisk/partition related code has been reorganized.
-Now the generic block layer performs partition-remapping early and thus
-provides drivers with a sector number relative to whole device, rather than
-having to take partition number into account in order to arrive at the true
-sector number. The routine blk_partition_remap() is invoked by
-submit_bio_noacct even before invoking the queue specific ->submit_bio,
-so the i/o scheduler also gets to operate on whole disk sector numbers. This
-should typically not require changes to block drivers, it just never gets
-to invoke its own partition sector offset calculations since all bios
-sent are offset from the beginning of the device.
-
-
-7. A Few Tips on Migration of older drivers
-===========================================
-
-Old-style drivers that just use CURRENT and ignores clustered requests,
-may not need much change. The generic layer will automatically handle
-clustered requests, multi-page bios, etc for the driver.
-
-For a low performance driver or hardware that is PIO driven or just doesn't
-support scatter-gather changes should be minimal too.
-
-The following are some points to keep in mind when converting old drivers
-to bio.
-
-Drivers should use elv_next_request to pick up requests and are no longer
-supposed to handle looping directly over the request list.
-(struct request->queue has been removed)
-
-Now end_that_request_first takes an additional number_of_sectors argument.
-It used to handle always just the first buffer_head in a request, now
-it will loop and handle as many sectors (on a bio-segment granularity)
-as specified.
-
-Now bh->b_end_io is replaced by bio->bi_end_io, but most of the time the
-right thing to use is bio_endio(bio) instead.
-
-If the driver is dropping the io_request_lock from its request_fn strategy,
-then it just needs to replace that with q->queue_lock instead.
-
-As described in Sec 1.1, drivers can set max sector size, max segment size
-etc per queue now. Drivers that used to define their own merge functions i
-to handle things like this can now just use the blk_queue_* functions at
-blk_init_queue time.
-
-Drivers no longer have to map a {partition, sector offset} into the
-correct absolute location anymore, this is done by the block layer, so
-where a driver received a request ala this before::
-
- rq->rq_dev = mk_kdev(3, 5); /* /dev/hda5 */
- rq->sector = 0; /* first sector on hda5 */
-
-it will now see::
-
- rq->rq_dev = mk_kdev(3, 0); /* /dev/hda */
- rq->sector = 123128; /* offset from start of disk */
-
-As mentioned, there is no virtual mapping of a bio. For DMA, this is
-not a problem as the driver probably never will need a virtual mapping.
-Instead it needs a bus mapping (dma_map_page for a single segment or
-use dma_map_sg for scatter gather) to be able to ship it to the driver. For
-PIO drivers (or drivers that need to revert to PIO transfer once in a
-while (IDE for example)), where the CPU is doing the actual data
-transfer a virtual mapping is needed. If the driver supports highmem I/O,
-(Sec 1.1, (ii) ) it needs to use kmap_atomic or similar to temporarily map
-a bio into the virtual address space.
-
-
-8. Prior/Related/Impacted patches
-=================================
-
-8.1. Earlier kiobuf patches (sct/axboe/chait/hch/mkp)
------------------------------------------------------
-
-- orig kiobuf & raw i/o patches (now in 2.4 tree)
-- direct kiobuf based i/o to devices (no intermediate bh's)
-- page i/o using kiobuf
-- kiobuf splitting for lvm (mkp)
-- elevator support for kiobuf request merging (axboe)
-
-8.2. Zero-copy networking (Dave Miller)
----------------------------------------
-
-8.3. SGI XFS - pagebuf patches - use of kiobufs
------------------------------------------------
-8.4. Multi-page pioent patch for bio (Christoph Hellwig)
---------------------------------------------------------
-8.5. Direct i/o implementation (Andrea Arcangeli) since 2.4.10-pre11
---------------------------------------------------------------------
-8.6. Async i/o implementation patch (Ben LaHaise)
--------------------------------------------------
-8.7. EVMS layering design (IBM EVMS team)
------------------------------------------
-8.8. Larger page cache size patch (Ben LaHaise) and Large page size (Daniel Phillips)
--------------------------------------------------------------------------------------
-
- => larger contiguous physical memory buffers
-
-8.9. VM reservations patch (Ben LaHaise)
-----------------------------------------
-8.10. Write clustering patches ? (Marcelo/Quintela/Riel ?)
-----------------------------------------------------------
-8.11. Block device in page cache patch (Andrea Archangeli) - now in 2.4.10+
----------------------------------------------------------------------------
-8.12. Multiple block-size transfers for faster raw i/o (Shailabh Nagar, Badari)
--------------------------------------------------------------------------------
-8.13 Priority based i/o scheduler - prepatches (Arjan van de Ven)
-------------------------------------------------------------------
-8.14 IDE Taskfile i/o patch (Andre Hedrick)
---------------------------------------------
-8.15 Multi-page writeout and readahead patches (Andrew Morton)
----------------------------------------------------------------
-8.16 Direct i/o patches for 2.5 using kvec and bio (Badari Pulavarthy)
------------------------------------------------------------------------
-
-9. Other References
-===================
-
-9.1 The Splice I/O Model
-------------------------
-
-Larry McVoy (and subsequent discussions on lkml, and Linus' comments - Jan 2001
-
-9.2 Discussions about kiobuf and bh design
-------------------------------------------
-
-On lkml between sct, linus, alan et al - Feb-March 2001 (many of the
-initial thoughts that led to bio were brought up in this discussion thread)
-
-9.3 Discussions on mempool on lkml - Dec 2001.
-----------------------------------------------
diff --git a/Documentation/block/capability.rst b/Documentation/block/capability.rst
index 160a5148b915..2ae7f064736a 100644
--- a/Documentation/block/capability.rst
+++ b/Documentation/block/capability.rst
@@ -7,4 +7,4 @@ This file documents the sysfs file ``block/<disk>/capability``.
``capability`` is a bitfield, printed in hexadecimal, indicating which
capabilities a specific block device supports:
-.. kernel-doc:: include/linux/genhd.h
+.. kernel-doc:: include/linux/blkdev.h
diff --git a/Documentation/block/index.rst b/Documentation/block/index.rst
index 3a41495dd77b..68f115f2b1c6 100644
--- a/Documentation/block/index.rst
+++ b/Documentation/block/index.rst
@@ -8,7 +8,6 @@ Block
:maxdepth: 1
bfq-iosched
- biodoc
biovecs
blk-mq
capability
diff --git a/MAINTAINERS b/MAINTAINERS
index f39be89f9128..8b145d8c3249 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3441,6 +3441,7 @@ F: Documentation/ABI/stable/sysfs-block
F: Documentation/block/
F: block/
F: drivers/block/
+F: include/linux/bio.h
F: include/linux/blk*
F: kernel/trace/blktrace.c
F: lib/sbitmap.c
diff --git a/arch/m68k/atari/stdma.c b/arch/m68k/atari/stdma.c
index ba65f942d0c7..ce6818eff75e 100644
--- a/arch/m68k/atari/stdma.c
+++ b/arch/m68k/atari/stdma.c
@@ -30,7 +30,6 @@
#include <linux/types.h>
#include <linux/kdev_t.h>
-#include <linux/genhd.h>
#include <linux/sched.h>
#include <linux/init.h>
#include <linux/interrupt.h>
diff --git a/arch/m68k/bvme6000/config.c b/arch/m68k/bvme6000/config.c
index 9b060d466e03..3a1d90e399e0 100644
--- a/arch/m68k/bvme6000/config.c
+++ b/arch/m68k/bvme6000/config.c
@@ -23,7 +23,6 @@
#include <linux/linkage.h>
#include <linux/init.h>
#include <linux/major.h>
-#include <linux/genhd.h>
#include <linux/rtc.h>
#include <linux/interrupt.h>
#include <linux/bcd.h>
diff --git a/arch/m68k/emu/nfblock.c b/arch/m68k/emu/nfblock.c
index 9c57b245dc12..267b02cc5655 100644
--- a/arch/m68k/emu/nfblock.c
+++ b/arch/m68k/emu/nfblock.c
@@ -13,7 +13,6 @@
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/types.h>
-#include <linux/genhd.h>
#include <linux/blkdev.h>
#include <linux/hdreg.h>
#include <linux/slab.h>
diff --git a/arch/m68k/kernel/setup_mm.c b/arch/m68k/kernel/setup_mm.c
index 8228275aae3e..8f94feed969c 100644
--- a/arch/m68k/kernel/setup_mm.c
+++ b/arch/m68k/kernel/setup_mm.c
@@ -16,7 +16,6 @@
#include <linux/interrupt.h>
#include <linux/fs.h>
#include <linux/console.h>
-#include <linux/genhd.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/init.h>
diff --git a/arch/m68k/mvme147/config.c b/arch/m68k/mvme147/config.c
index b96ea7c76a19..4e6218115f43 100644
--- a/arch/m68k/mvme147/config.c
+++ b/arch/m68k/mvme147/config.c
@@ -22,7 +22,6 @@
#include <linux/linkage.h>
#include <linux/init.h>
#include <linux/major.h>
-#include <linux/genhd.h>
#include <linux/rtc.h>
#include <linux/interrupt.h>
diff --git a/arch/m68k/mvme16x/config.c b/arch/m68k/mvme16x/config.c
index 88cbdc10925b..f00c7aa058de 100644
--- a/arch/m68k/mvme16x/config.c
+++ b/arch/m68k/mvme16x/config.c
@@ -24,7 +24,6 @@
#include <linux/linkage.h>
#include <linux/init.h>
#include <linux/major.h>
-#include <linux/genhd.h>
#include <linux/rtc.h>
#include <linux/interrupt.h>
#include <linux/module.h>
diff --git a/block/Kconfig b/block/Kconfig
index d5d4197b7ed2..7eb5d6d53b3f 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -26,6 +26,16 @@ menuconfig BLOCK
if BLOCK
+config BLOCK_LEGACY_AUTOLOAD
+ bool "Legacy autoloading support"
+ default y
+ help
+ Enable loading modules and creating block device instances based on
+ accesses through their device special file. This is a historic Linux
+ feature and makes no sense in a udev world where device files are
+ created on demand, but scripts that manually create device nodes and
+ then call losetup might rely on this behavior.
+
config BLK_RQ_ALLOC_TIME
bool
@@ -218,6 +228,9 @@ config BLK_PM
config BLOCK_HOLDER_DEPRECATED
bool
+config BLK_MQ_STACKING
+ bool
+
source "block/Kconfig.iosched"
endif # BLOCK
diff --git a/block/Makefile b/block/Makefile
index f38eaa612929..3950ecbc5c26 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -36,6 +36,7 @@ obj-$(CONFIG_BLK_DEBUG_FS) += blk-mq-debugfs.o
obj-$(CONFIG_BLK_DEBUG_FS_ZONED)+= blk-mq-debugfs-zoned.o
obj-$(CONFIG_BLK_SED_OPAL) += sed-opal.o
obj-$(CONFIG_BLK_PM) += blk-pm.o
-obj-$(CONFIG_BLK_INLINE_ENCRYPTION) += blk-crypto.o blk-crypto-profile.o
+obj-$(CONFIG_BLK_INLINE_ENCRYPTION) += blk-crypto.o blk-crypto-profile.o \
+ blk-crypto-sysfs.o
obj-$(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) += blk-crypto-fallback.o
obj-$(CONFIG_BLOCK_HOLDER_DEPRECATED) += holder.o
diff --git a/block/bdev.c b/block/bdev.c
index 102837a37051..ce8de42a89be 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -678,7 +678,7 @@ static int blkdev_get_whole(struct block_device *bdev, fmode_t mode)
if (test_bit(GD_NEED_PART_SCAN, &disk->state))
bdev_disk_changed(disk, false);
bdev->bd_openers++;
- return 0;;
+ return 0;
}
static void blkdev_put_whole(struct block_device *bdev, fmode_t mode)
@@ -733,12 +733,15 @@ struct block_device *blkdev_get_no_open(dev_t dev)
struct inode *inode;
inode = ilookup(blockdev_superblock, dev);
- if (!inode) {
+ if (!inode && IS_ENABLED(CONFIG_BLOCK_LEGACY_AUTOLOAD)) {
blk_request_module(dev);
inode = ilookup(blockdev_superblock, dev);
- if (!inode)
- return NULL;
+ if (inode)
+ pr_warn_ratelimited(
+"block device autoloading is deprecated and will be removed.\n");
}
+ if (!inode)
+ return NULL;
/* switch from the inode reference to a device mode one: */
bdev = &BDEV_I(inode)->bdev;
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index 24a5c5329bcd..420eda2589c0 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -645,8 +645,22 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
struct bfq_group *bfqg)
{
struct bfq_entity *entity = &bfqq->entity;
+ struct bfq_group *old_parent = bfqq_group(bfqq);
/*
+ * No point to move bfqq to the same group, which can happen when
+ * root group is offlined
+ */
+ if (old_parent == bfqg)
+ return;
+
+ /*
+ * oom_bfqq is not allowed to move, oom_bfqq will hold ref to root_group
+ * until elevator exit.
+ */
+ if (bfqq == &bfqd->oom_bfqq)
+ return;
+ /*
* Get extra reference to prevent bfqq from being freed in
* next possible expire or deactivate.
*/
@@ -666,7 +680,7 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
bfq_deactivate_bfqq(bfqd, bfqq, false, false);
else if (entity->on_st_or_in_serv)
bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);
- bfqg_and_blkg_put(bfqq_group(bfqq));
+ bfqg_and_blkg_put(old_parent);
if (entity->parent &&
entity->parent->last_bfqq_created == bfqq)
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 36a66e97e3c2..29d2635a2a63 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -774,7 +774,7 @@ bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)
if (!bfqq->next_rq)
return;
- bfqq->pos_root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree;
+ bfqq->pos_root = &bfqq_group(bfqq)->rq_pos_tree;
__bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root,
blk_rq_pos(bfqq->next_rq), &parent, &p);
if (!__bfqq) {
@@ -2153,7 +2153,7 @@ static void bfq_check_waker(struct bfq_data *bfqd, struct bfq_queue *bfqq,
bfqq->waker_detection_started = now_ns;
bfq_bfqq_name(bfqq->tentative_waker_bfqq, waker_name,
MAX_BFQQ_NAME_LENGTH);
- bfq_log_bfqq(bfqd, bfqq, "set tenative waker %s", waker_name);
+ bfq_log_bfqq(bfqd, bfqq, "set tentative waker %s", waker_name);
} else /* Same tentative waker queue detected again */
bfqq->num_waker_detections++;
@@ -2669,7 +2669,7 @@ static struct bfq_queue *bfqq_find_close(struct bfq_data *bfqd,
struct bfq_queue *bfqq,
sector_t sector)
{
- struct rb_root *root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree;
+ struct rb_root *root = &bfqq_group(bfqq)->rq_pos_tree;
struct rb_node *parent, *node;
struct bfq_queue *__bfqq;
@@ -2782,6 +2782,15 @@ bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
* are likely to increase the throughput.
*/
bfqq->new_bfqq = new_bfqq;
+ /*
+ * The above assignment schedules the following redirections:
+ * each time some I/O for bfqq arrives, the process that
+ * generated that I/O is disassociated from bfqq and
+ * associated with new_bfqq. Here we increases new_bfqq->ref
+ * in advance, adding the number of processes that are
+ * expected to be associated with new_bfqq as they happen to
+ * issue I/O.
+ */
new_bfqq->ref += process_refs;
return new_bfqq;
}
@@ -2844,6 +2853,10 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
{
struct bfq_queue *in_service_bfqq, *new_bfqq;
+ /* if a merge has already been setup, then proceed with that first */
+ if (bfqq->new_bfqq)
+ return bfqq->new_bfqq;
+
/*
* Check delayed stable merge for rotational or non-queueing
* devs. For this branch to be executed, bfqq must not be
@@ -2945,9 +2958,6 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
if (bfq_too_late_for_merging(bfqq))
return NULL;
- if (bfqq->new_bfqq)
- return bfqq->new_bfqq;
-
if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq))
return NULL;
@@ -5181,7 +5191,7 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
struct request *rq;
struct bfq_queue *in_serv_queue;
- bool waiting_rq, idle_timer_disabled;
+ bool waiting_rq, idle_timer_disabled = false;
spin_lock_irq(&bfqd->lock);
@@ -5189,14 +5199,15 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
waiting_rq = in_serv_queue && bfq_bfqq_wait_request(in_serv_queue);
rq = __bfq_dispatch_request(hctx);
-
- idle_timer_disabled =
- waiting_rq && !bfq_bfqq_wait_request(in_serv_queue);
+ if (in_serv_queue == bfqd->in_service_queue) {
+ idle_timer_disabled =
+ waiting_rq && !bfq_bfqq_wait_request(in_serv_queue);
+ }
spin_unlock_irq(&bfqd->lock);
-
- bfq_update_dispatch_stats(hctx->queue, rq, in_serv_queue,
- idle_timer_disabled);
+ bfq_update_dispatch_stats(hctx->queue, rq,
+ idle_timer_disabled ? in_serv_queue : NULL,
+ idle_timer_disabled);
return rq;
}
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
index 07288b9da389..3b83e3d1c2e5 100644
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@ -8,7 +8,6 @@
#include <linux/blktrace_api.h>
#include <linux/hrtimer.h>
-#include <linux/blk-cgroup.h>
#include "blk-cgroup-rwstat.h"
@@ -1051,7 +1050,6 @@ extern struct blkcg_policy blkcg_policy_bfq;
for (parent = NULL; entity ; entity = parent)
#endif /* CONFIG_BFQ_GROUP_IOSCHED */
-struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq);
struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity);
unsigned int bfq_tot_busy_queues(struct bfq_data *bfqd);
struct bfq_service_tree *bfq_entity_service_tree(struct bfq_entity *entity);
diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c
index b74cc0da118e..f8eb340381cf 100644
--- a/block/bfq-wf2q.c
+++ b/block/bfq-wf2q.c
@@ -142,16 +142,6 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd,
#ifdef CONFIG_BFQ_GROUP_IOSCHED
-struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq)
-{
- struct bfq_entity *group_entity = bfqq->entity.parent;
-
- if (!group_entity)
- group_entity = &bfqq->bfqd->root_group->entity;
-
- return container_of(group_entity, struct bfq_group, entity);
-}
-
/*
* Returns true if this budget changes may let next_in_service->parent
* become the next_in_service entity for its parent entity.
@@ -230,11 +220,6 @@ static bool bfq_no_longer_next_in_service(struct bfq_entity *entity)
#else /* CONFIG_BFQ_GROUP_IOSCHED */
-struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq)
-{
- return bfqq->bfqd->root_group;
-}
-
static bool bfq_update_parent_budget(struct bfq_entity *next_in_service)
{
return false;
@@ -519,7 +504,7 @@ unsigned short bfq_ioprio_to_weight(int ioprio)
static unsigned short bfq_weight_to_ioprio(int weight)
{
return max_t(int, 0,
- IOPRIO_NR_LEVELS * BFQ_WEIGHT_CONVERSION_COEFF - weight);
+ IOPRIO_NR_LEVELS - weight / BFQ_WEIGHT_CONVERSION_COEFF);
}
static void bfq_get_entity(struct bfq_entity *entity)
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 0827b19820c5..6996e7bd66e9 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -420,7 +420,6 @@ int bio_integrity_clone(struct bio *bio, struct bio *bio_src,
return 0;
}
-EXPORT_SYMBOL(bio_integrity_clone);
int bioset_integrity_create(struct bio_set *bs, int pool_size)
{
diff --git a/block/bio.c b/block/bio.c
index 4312a8085396..33979f306e9e 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -15,7 +15,6 @@
#include <linux/mempool.h>
#include <linux/workqueue.h>
#include <linux/cgroup.h>
-#include <linux/blk-cgroup.h>
#include <linux/highmem.h>
#include <linux/sched/sysctl.h>
#include <linux/blk-crypto.h>
@@ -24,6 +23,7 @@
#include <trace/events/block.h>
#include "blk.h"
#include "blk-rq-qos.h"
+#include "blk-cgroup.h"
struct bio_alloc_cache {
struct bio *free_list;
@@ -249,12 +249,12 @@ static void bio_free(struct bio *bio)
* they must remember to pair any call to bio_init() with bio_uninit()
* when IO has completed, or when the bio is released.
*/
-void bio_init(struct bio *bio, struct bio_vec *table,
- unsigned short max_vecs)
+void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table,
+ unsigned short max_vecs, unsigned int opf)
{
bio->bi_next = NULL;
- bio->bi_bdev = NULL;
- bio->bi_opf = 0;
+ bio->bi_bdev = bdev;
+ bio->bi_opf = opf;
bio->bi_flags = 0;
bio->bi_ioprio = 0;
bio->bi_write_hint = 0;
@@ -268,6 +268,8 @@ void bio_init(struct bio *bio, struct bio_vec *table,
#ifdef CONFIG_BLK_CGROUP
bio->bi_blkg = NULL;
bio->bi_issue.value = 0;
+ if (bdev)
+ bio_associate_blkg(bio);
#ifdef CONFIG_BLK_CGROUP_IOCOST
bio->bi_iocost_cost = 0;
#endif
@@ -293,6 +295,8 @@ EXPORT_SYMBOL(bio_init);
/**
* bio_reset - reinitialize a bio
* @bio: bio to reset
+ * @bdev: block device to use the bio for
+ * @opf: operation and flags for bio
*
* Description:
* After calling bio_reset(), @bio will be in the same state as a freshly
@@ -300,11 +304,15 @@ EXPORT_SYMBOL(bio_init);
* preserved are the ones that are initialized by bio_alloc_bioset(). See
* comment in struct bio.
*/
-void bio_reset(struct bio *bio)
+void bio_reset(struct bio *bio, struct block_device *bdev, unsigned int opf)
{
bio_uninit(bio);
memset(bio, 0, BIO_RESET_BYTES);
atomic_set(&bio->__bi_remaining, 1);
+ bio->bi_bdev = bdev;
+ if (bio->bi_bdev)
+ bio_associate_blkg(bio);
+ bio->bi_opf = opf;
}
EXPORT_SYMBOL(bio_reset);
@@ -344,6 +352,20 @@ void bio_chain(struct bio *bio, struct bio *parent)
}
EXPORT_SYMBOL(bio_chain);
+struct bio *blk_next_bio(struct bio *bio, struct block_device *bdev,
+ unsigned int nr_pages, unsigned int opf, gfp_t gfp)
+{
+ struct bio *new = bio_alloc(bdev, nr_pages, opf, gfp);
+
+ if (bio) {
+ bio_chain(bio, new);
+ submit_bio(bio);
+ }
+
+ return new;
+}
+EXPORT_SYMBOL_GPL(blk_next_bio);
+
static void bio_alloc_rescue(struct work_struct *work)
{
struct bio_set *bs = container_of(work, struct bio_set, rescue_work);
@@ -400,8 +422,10 @@ static void punt_bios_to_rescuer(struct bio_set *bs)
/**
* bio_alloc_bioset - allocate a bio for I/O
+ * @bdev: block device to allocate the bio for (can be %NULL)
+ * @nr_vecs: number of bvecs to pre-allocate
+ * @opf: operation and flags for bio
* @gfp_mask: the GFP_* mask given to the slab allocator
- * @nr_iovecs: number of iovecs to pre-allocate
* @bs: the bio_set to allocate from.
*
* Allocate a bio from the mempools in @bs.
@@ -430,15 +454,16 @@ static void punt_bios_to_rescuer(struct bio_set *bs)
*
* Returns: Pointer to new bio on success, NULL on failure.
*/
-struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned short nr_iovecs,
+struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs,
+ unsigned int opf, gfp_t gfp_mask,
struct bio_set *bs)
{
gfp_t saved_gfp = gfp_mask;
struct bio *bio;
void *p;
- /* should not use nobvec bioset for nr_iovecs > 0 */
- if (WARN_ON_ONCE(!mempool_initialized(&bs->bvec_pool) && nr_iovecs > 0))
+ /* should not use nobvec bioset for nr_vecs > 0 */
+ if (WARN_ON_ONCE(!mempool_initialized(&bs->bvec_pool) && nr_vecs > 0))
return NULL;
/*
@@ -475,23 +500,23 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned short nr_iovecs,
return NULL;
bio = p + bs->front_pad;
- if (nr_iovecs > BIO_INLINE_VECS) {
+ if (nr_vecs > BIO_INLINE_VECS) {
struct bio_vec *bvl = NULL;
- bvl = bvec_alloc(&bs->bvec_pool, &nr_iovecs, gfp_mask);
+ bvl = bvec_alloc(&bs->bvec_pool, &nr_vecs, gfp_mask);
if (!bvl && gfp_mask != saved_gfp) {
punt_bios_to_rescuer(bs);
gfp_mask = saved_gfp;
- bvl = bvec_alloc(&bs->bvec_pool, &nr_iovecs, gfp_mask);
+ bvl = bvec_alloc(&bs->bvec_pool, &nr_vecs, gfp_mask);
}
if (unlikely(!bvl))
goto err_free;
- bio_init(bio, bvl, nr_iovecs);
- } else if (nr_iovecs) {
- bio_init(bio, bio->bi_inline_vecs, BIO_INLINE_VECS);
+ bio_init(bio, bdev, bvl, nr_vecs, opf);
+ } else if (nr_vecs) {
+ bio_init(bio, bdev, bio->bi_inline_vecs, BIO_INLINE_VECS, opf);
} else {
- bio_init(bio, NULL, 0);
+ bio_init(bio, bdev, NULL, 0, opf);
}
bio->bi_pool = bs;
@@ -522,7 +547,8 @@ struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned short nr_iovecs)
bio = kmalloc(struct_size(bio, bi_inline_vecs, nr_iovecs), gfp_mask);
if (unlikely(!bio))
return NULL;
- bio_init(bio, nr_iovecs ? bio->bi_inline_vecs : NULL, nr_iovecs);
+ bio_init(bio, NULL, nr_iovecs ? bio->bi_inline_vecs : NULL, nr_iovecs,
+ 0);
bio->bi_pool = NULL;
return bio;
}
@@ -702,80 +728,84 @@ void bio_put(struct bio *bio)
}
EXPORT_SYMBOL(bio_put);
-/**
- * __bio_clone_fast - clone a bio that shares the original bio's biovec
- * @bio: destination bio
- * @bio_src: bio to clone
- *
- * Clone a &bio. Caller will own the returned bio, but not
- * the actual data it points to. Reference count of returned
- * bio will be one.
- *
- * Caller must ensure that @bio_src is not freed before @bio.
- */
-void __bio_clone_fast(struct bio *bio, struct bio *bio_src)
+static int __bio_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp)
{
- WARN_ON_ONCE(bio->bi_pool && bio->bi_max_vecs);
-
- /*
- * most users will be overriding ->bi_bdev with a new target,
- * so we don't set nor calculate new physical/hw segment counts here
- */
- bio->bi_bdev = bio_src->bi_bdev;
bio_set_flag(bio, BIO_CLONED);
if (bio_flagged(bio_src, BIO_THROTTLED))
bio_set_flag(bio, BIO_THROTTLED);
- if (bio_flagged(bio_src, BIO_REMAPPED))
+ if (bio->bi_bdev == bio_src->bi_bdev &&
+ bio_flagged(bio_src, BIO_REMAPPED))
bio_set_flag(bio, BIO_REMAPPED);
- bio->bi_opf = bio_src->bi_opf;
bio->bi_ioprio = bio_src->bi_ioprio;
bio->bi_write_hint = bio_src->bi_write_hint;
bio->bi_iter = bio_src->bi_iter;
- bio->bi_io_vec = bio_src->bi_io_vec;
bio_clone_blkg_association(bio, bio_src);
blkcg_bio_issue_init(bio);
+
+ if (bio_crypt_clone(bio, bio_src, gfp) < 0)
+ return -ENOMEM;
+ if (bio_integrity(bio_src) &&
+ bio_integrity_clone(bio, bio_src, gfp) < 0)
+ return -ENOMEM;
+ return 0;
}
-EXPORT_SYMBOL(__bio_clone_fast);
/**
- * bio_clone_fast - clone a bio that shares the original bio's biovec
- * @bio: bio to clone
- * @gfp_mask: allocation priority
- * @bs: bio_set to allocate from
+ * bio_alloc_clone - clone a bio that shares the original bio's biovec
+ * @bdev: block_device to clone onto
+ * @bio_src: bio to clone from
+ * @gfp: allocation priority
+ * @bs: bio_set to allocate from
+ *
+ * Allocate a new bio that is a clone of @bio_src. The caller owns the returned
+ * bio, but not the actual data it points to.
*
- * Like __bio_clone_fast, only also allocates the returned bio
+ * The caller must ensure that the return bio is not freed before @bio_src.
*/
-struct bio *bio_clone_fast(struct bio *bio, gfp_t gfp_mask, struct bio_set *bs)
+struct bio *bio_alloc_clone(struct block_device *bdev, struct bio *bio_src,
+ gfp_t gfp, struct bio_set *bs)
{
- struct bio *b;
+ struct bio *bio;
- b = bio_alloc_bioset(gfp_mask, 0, bs);
- if (!b)
+ bio = bio_alloc_bioset(bdev, 0, bio_src->bi_opf, gfp, bs);
+ if (!bio)
return NULL;
- __bio_clone_fast(b, bio);
-
- if (bio_crypt_clone(b, bio, gfp_mask) < 0)
- goto err_put;
-
- if (bio_integrity(bio) &&
- bio_integrity_clone(b, bio, gfp_mask) < 0)
- goto err_put;
-
- return b;
+ if (__bio_clone(bio, bio_src, gfp) < 0) {
+ bio_put(bio);
+ return NULL;
+ }
+ bio->bi_io_vec = bio_src->bi_io_vec;
-err_put:
- bio_put(b);
- return NULL;
+ return bio;
}
-EXPORT_SYMBOL(bio_clone_fast);
+EXPORT_SYMBOL(bio_alloc_clone);
-const char *bio_devname(struct bio *bio, char *buf)
+/**
+ * bio_init_clone - clone a bio that shares the original bio's biovec
+ * @bdev: block_device to clone onto
+ * @bio: bio to clone into
+ * @bio_src: bio to clone from
+ * @gfp: allocation priority
+ *
+ * Initialize a new bio in caller provided memory that is a clone of @bio_src.
+ * The caller owns the returned bio, but not the actual data it points to.
+ *
+ * The caller must ensure that @bio_src is not freed before @bio.
+ */
+int bio_init_clone(struct block_device *bdev, struct bio *bio,
+ struct bio *bio_src, gfp_t gfp)
{
- return bdevname(bio->bi_bdev, buf);
+ int ret;
+
+ bio_init(bio, bdev, bio_src->bi_io_vec, 0, bio_src->bi_opf);
+ ret = __bio_clone(bio, bio_src, gfp);
+ if (ret)
+ bio_uninit(bio);
+ return ret;
}
-EXPORT_SYMBOL(bio_devname);
+EXPORT_SYMBOL(bio_init_clone);
/**
* bio_full - check if the bio is full
@@ -1054,7 +1084,7 @@ bool bio_add_folio(struct bio *bio, struct folio *folio, size_t len,
size_t off)
{
if (len > UINT_MAX || off > UINT_MAX)
- return 0;
+ return false;
return bio_add_page(bio, &folio->page, len, off) > 0;
}
@@ -1486,8 +1516,7 @@ again:
if (!bio_integrity_endio(bio))
return;
- if (bio->bi_bdev && bio_flagged(bio, BIO_TRACKED))
- rq_qos_done_bio(bdev_get_queue(bio->bi_bdev), bio);
+ rq_qos_done_bio(bio);
if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) {
trace_block_bio_complete(bdev_get_queue(bio->bi_bdev), bio);
@@ -1541,7 +1570,7 @@ struct bio *bio_split(struct bio *bio, int sectors,
if (WARN_ON_ONCE(bio_op(bio) == REQ_OP_ZONE_APPEND))
return NULL;
- split = bio_clone_fast(bio, gfp, bs);
+ split = bio_alloc_clone(bio->bi_bdev, bio, gfp, bs);
if (!split)
return NULL;
@@ -1636,9 +1665,9 @@ EXPORT_SYMBOL(bioset_exit);
* Note that the bio must be embedded at the END of that structure always,
* or things will break badly.
* If %BIOSET_NEED_BVECS is set in @flags, a separate pool will be allocated
- * for allocating iovecs. This pool is not needed e.g. for bio_clone_fast().
- * If %BIOSET_NEED_RESCUER is set, a workqueue is created which can be used to
- * dispatch queued requests when the mempool runs out of space.
+ * for allocating iovecs. This pool is not needed e.g. for bio_init_clone().
+ * If %BIOSET_NEED_RESCUER is set, a workqueue is created which can be used
+ * to dispatch queued requests when the mempool runs out of space.
*
*/
int bioset_init(struct bio_set *bs,
@@ -1708,7 +1737,9 @@ EXPORT_SYMBOL(bioset_init_from_src);
/**
* bio_alloc_kiocb - Allocate a bio from bio_set based on kiocb
* @kiocb: kiocb describing the IO
+ * @bdev: block device to allocate the bio for (can be %NULL)
* @nr_vecs: number of iovecs to pre-allocate
+ * @opf: operation and flags for bio
* @bs: bio_set to allocate from
*
* Description:
@@ -1719,14 +1750,14 @@ EXPORT_SYMBOL(bioset_init_from_src);
* MUST be done from process context, not hard/soft IRQ.
*
*/
-struct bio *bio_alloc_kiocb(struct kiocb *kiocb, unsigned short nr_vecs,
- struct bio_set *bs)
+struct bio *bio_alloc_kiocb(struct kiocb *kiocb, struct block_device *bdev,
+ unsigned short nr_vecs, unsigned int opf, struct bio_set *bs)
{
struct bio_alloc_cache *cache;
struct bio *bio;
if (!(kiocb->ki_flags & IOCB_ALLOC_CACHE) || nr_vecs > BIO_INLINE_VECS)
- return bio_alloc_bioset(GFP_KERNEL, nr_vecs, bs);
+ return bio_alloc_bioset(bdev, nr_vecs, opf, GFP_KERNEL, bs);
cache = per_cpu_ptr(bs->cache, get_cpu());
if (cache->free_list) {
@@ -1734,13 +1765,14 @@ struct bio *bio_alloc_kiocb(struct kiocb *kiocb, unsigned short nr_vecs,
cache->free_list = bio->bi_next;
cache->nr--;
put_cpu();
- bio_init(bio, nr_vecs ? bio->bi_inline_vecs : NULL, nr_vecs);
+ bio_init(bio, bdev, nr_vecs ? bio->bi_inline_vecs : NULL,
+ nr_vecs, opf);
bio->bi_pool = bs;
bio_set_flag(bio, BIO_PERCPU_CACHE);
return bio;
}
put_cpu();
- bio = bio_alloc_bioset(GFP_KERNEL, nr_vecs, bs);
+ bio = bio_alloc_bioset(bdev, nr_vecs, opf, GFP_KERNEL, bs);
bio_set_flag(bio, BIO_PERCPU_CACHE);
return bio;
}
diff --git a/block/blk-cgroup-rwstat.h b/block/blk-cgroup-rwstat.h
index ee746919c41f..9f2723b34b75 100644
--- a/block/blk-cgroup-rwstat.h
+++ b/block/blk-cgroup-rwstat.h
@@ -6,7 +6,7 @@
#ifndef _BLK_CGROUP_RWSTAT_H
#define _BLK_CGROUP_RWSTAT_H
-#include <linux/blk-cgroup.h>
+#include "blk-cgroup.h"
enum blkg_rwstat_type {
BLKG_RWSTAT_READ,
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 650f7e27989f..d53b0d69dd73 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -23,15 +23,14 @@
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/slab.h>
-#include <linux/genhd.h>
#include <linux/delay.h>
#include <linux/atomic.h>
#include <linux/ctype.h>
-#include <linux/blk-cgroup.h>
#include <linux/tracehook.h>
#include <linux/psi.h>
#include <linux/part_stat.h>
#include "blk.h"
+#include "blk-cgroup.h"
#include "blk-ioprio.h"
#include "blk-throttle.h"
@@ -83,6 +82,8 @@ static void blkg_free(struct blkcg_gq *blkg)
if (blkg->pd[i])
blkcg_policy[i]->pd_free_fn(blkg->pd[i]);
+ if (blkg->q)
+ blk_put_queue(blkg->q);
free_percpu(blkg->iostat_cpu);
percpu_ref_exit(&blkg->refcnt);
kfree(blkg);
@@ -168,6 +169,9 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
if (!blkg->iostat_cpu)
goto err_free;
+ if (!blk_get_queue(q))
+ goto err_free;
+
blkg->q = q;
INIT_LIST_HEAD(&blkg->q_node);
spin_lock_init(&blkg->async_bio_lock);
@@ -857,11 +861,11 @@ static void blkcg_fill_root_iostats(void)
blk_queue_root_blkg(bdev_get_queue(bdev));
struct blkg_iostat tmp;
int cpu;
+ unsigned long flags;
memset(&tmp, 0, sizeof(tmp));
for_each_possible_cpu(cpu) {
struct disk_stats *cpu_dkstats;
- unsigned long flags;
cpu_dkstats = per_cpu_ptr(bdev->bd_stats, cpu);
tmp.ios[BLKG_IOSTAT_READ] +=
@@ -877,11 +881,11 @@ static void blkcg_fill_root_iostats(void)
cpu_dkstats->sectors[STAT_WRITE] << 9;
tmp.bytes[BLKG_IOSTAT_DISCARD] +=
cpu_dkstats->sectors[STAT_DISCARD] << 9;
-
- flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync);
- blkg_iostat_set(&blkg->iostat.cur, &tmp);
- u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags);
}
+
+ flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync);
+ blkg_iostat_set(&blkg->iostat.cur, &tmp);
+ u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags);
}
}
@@ -1176,6 +1180,8 @@ int blkcg_init_queue(struct request_queue *q)
bool preloaded;
int ret;
+ INIT_LIST_HEAD(&q->blkg_list);
+
new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL);
if (!new_blkg)
return -ENOMEM;
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
new file mode 100644
index 000000000000..47e1e38390c9
--- /dev/null
+++ b/block/blk-cgroup.h
@@ -0,0 +1,494 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BLK_CGROUP_PRIVATE_H
+#define _BLK_CGROUP_PRIVATE_H
+/*
+ * block cgroup private header
+ *
+ * Based on ideas and code from CFQ, CFS and BFQ:
+ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
+ *
+ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
+ * Paolo Valente <paolo.valente@unimore.it>
+ *
+ * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
+ * Nauman Rafique <nauman@google.com>
+ */
+
+#include <linux/blk-cgroup.h>
+#include <linux/blk-mq.h>
+
+/* percpu_counter batch for blkg_[rw]stats, per-cpu drift doesn't matter */
+#define BLKG_STAT_CPU_BATCH (INT_MAX / 2)
+
+#ifdef CONFIG_BLK_CGROUP
+
+/*
+ * A blkcg_gq (blkg) is association between a block cgroup (blkcg) and a
+ * request_queue (q). This is used by blkcg policies which need to track
+ * information per blkcg - q pair.
+ *
+ * There can be multiple active blkcg policies and each blkg:policy pair is
+ * represented by a blkg_policy_data which is allocated and freed by each
+ * policy's pd_alloc/free_fn() methods. A policy can allocate private data
+ * area by allocating larger data structure which embeds blkg_policy_data
+ * at the beginning.
+ */
+struct blkg_policy_data {
+ /* the blkg and policy id this per-policy data belongs to */
+ struct blkcg_gq *blkg;
+ int plid;
+};
+
+/*
+ * Policies that need to keep per-blkcg data which is independent from any
+ * request_queue associated to it should implement cpd_alloc/free_fn()
+ * methods. A policy can allocate private data area by allocating larger
+ * data structure which embeds blkcg_policy_data at the beginning.
+ * cpd_init() is invoked to let each policy handle per-blkcg data.
+ */
+struct blkcg_policy_data {
+ /* the blkcg and policy id this per-policy data belongs to */
+ struct blkcg *blkcg;
+ int plid;
+};
+
+typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp);
+typedef void (blkcg_pol_init_cpd_fn)(struct blkcg_policy_data *cpd);
+typedef void (blkcg_pol_free_cpd_fn)(struct blkcg_policy_data *cpd);
+typedef void (blkcg_pol_bind_cpd_fn)(struct blkcg_policy_data *cpd);
+typedef struct blkg_policy_data *(blkcg_pol_alloc_pd_fn)(gfp_t gfp,
+ struct request_queue *q, struct blkcg *blkcg);
+typedef void (blkcg_pol_init_pd_fn)(struct blkg_policy_data *pd);
+typedef void (blkcg_pol_online_pd_fn)(struct blkg_policy_data *pd);
+typedef void (blkcg_pol_offline_pd_fn)(struct blkg_policy_data *pd);
+typedef void (blkcg_pol_free_pd_fn)(struct blkg_policy_data *pd);
+typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkg_policy_data *pd);
+typedef bool (blkcg_pol_stat_pd_fn)(struct blkg_policy_data *pd,
+ struct seq_file *s);
+
+struct blkcg_policy {
+ int plid;
+ /* cgroup files for the policy */
+ struct cftype *dfl_cftypes;
+ struct cftype *legacy_cftypes;
+
+ /* operations */
+ blkcg_pol_alloc_cpd_fn *cpd_alloc_fn;
+ blkcg_pol_init_cpd_fn *cpd_init_fn;
+ blkcg_pol_free_cpd_fn *cpd_free_fn;
+ blkcg_pol_bind_cpd_fn *cpd_bind_fn;
+
+ blkcg_pol_alloc_pd_fn *pd_alloc_fn;
+ blkcg_pol_init_pd_fn *pd_init_fn;
+ blkcg_pol_online_pd_fn *pd_online_fn;
+ blkcg_pol_offline_pd_fn *pd_offline_fn;
+ blkcg_pol_free_pd_fn *pd_free_fn;
+ blkcg_pol_reset_pd_stats_fn *pd_reset_stats_fn;
+ blkcg_pol_stat_pd_fn *pd_stat_fn;
+};
+
+extern struct blkcg blkcg_root;
+extern bool blkcg_debug_stats;
+
+struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg,
+ struct request_queue *q, bool update_hint);
+int blkcg_init_queue(struct request_queue *q);
+void blkcg_exit_queue(struct request_queue *q);
+
+/* Blkio controller policy registration */
+int blkcg_policy_register(struct blkcg_policy *pol);
+void blkcg_policy_unregister(struct blkcg_policy *pol);
+int blkcg_activate_policy(struct request_queue *q,
+ const struct blkcg_policy *pol);
+void blkcg_deactivate_policy(struct request_queue *q,
+ const struct blkcg_policy *pol);
+
+const char *blkg_dev_name(struct blkcg_gq *blkg);
+void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
+ u64 (*prfill)(struct seq_file *,
+ struct blkg_policy_data *, int),
+ const struct blkcg_policy *pol, int data,
+ bool show_total);
+u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v);
+
+struct blkg_conf_ctx {
+ struct block_device *bdev;
+ struct blkcg_gq *blkg;
+ char *body;
+};
+
+struct block_device *blkcg_conf_open_bdev(char **inputp);
+int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
+ char *input, struct blkg_conf_ctx *ctx);
+void blkg_conf_finish(struct blkg_conf_ctx *ctx);
+
+/**
+ * blkcg_css - find the current css
+ *
+ * Find the css associated with either the kthread or the current task.
+ * This may return a dying css, so it is up to the caller to use tryget logic
+ * to confirm it is alive and well.
+ */
+static inline struct cgroup_subsys_state *blkcg_css(void)
+{
+ struct cgroup_subsys_state *css;
+
+ css = kthread_blkcg();
+ if (css)
+ return css;
+ return task_css(current, io_cgrp_id);
+}
+
+/**
+ * __bio_blkcg - internal, inconsistent version to get blkcg
+ *
+ * DO NOT USE.
+ * This function is inconsistent and consequently is dangerous to use. The
+ * first part of the function returns a blkcg where a reference is owned by the
+ * bio. This means it does not need to be rcu protected as it cannot go away
+ * with the bio owning a reference to it. However, the latter potentially gets
+ * it from task_css(). This can race against task migration and the cgroup
+ * dying. It is also semantically different as it must be called rcu protected
+ * and is susceptible to failure when trying to get a reference to it.
+ * Therefore, it is not ok to assume that *_get() will always succeed on the
+ * blkcg returned here.
+ */
+static inline struct blkcg *__bio_blkcg(struct bio *bio)
+{
+ if (bio && bio->bi_blkg)
+ return bio->bi_blkg->blkcg;
+ return css_to_blkcg(blkcg_css());
+}
+
+/**
+ * bio_issue_as_root_blkg - see if this bio needs to be issued as root blkg
+ * @return: true if this bio needs to be submitted with the root blkg context.
+ *
+ * In order to avoid priority inversions we sometimes need to issue a bio as if
+ * it were attached to the root blkg, and then backcharge to the actual owning
+ * blkg. The idea is we do bio_blkcg() to look up the actual context for the
+ * bio and attach the appropriate blkg to the bio. Then we call this helper and
+ * if it is true run with the root blkg for that queue and then do any
+ * backcharging to the originating cgroup once the io is complete.
+ */
+static inline bool bio_issue_as_root_blkg(struct bio *bio)
+{
+ return (bio->bi_opf & (REQ_META | REQ_SWAP)) != 0;
+}
+
+/**
+ * __blkg_lookup - internal version of blkg_lookup()
+ * @blkcg: blkcg of interest
+ * @q: request_queue of interest
+ * @update_hint: whether to update lookup hint with the result or not
+ *
+ * This is internal version and shouldn't be used by policy
+ * implementations. Looks up blkgs for the @blkcg - @q pair regardless of
+ * @q's bypass state. If @update_hint is %true, the caller should be
+ * holding @q->queue_lock and lookup hint is updated on success.
+ */
+static inline struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
+ struct request_queue *q,
+ bool update_hint)
+{
+ struct blkcg_gq *blkg;
+
+ if (blkcg == &blkcg_root)
+ return q->root_blkg;
+
+ blkg = rcu_dereference(blkcg->blkg_hint);
+ if (blkg && blkg->q == q)
+ return blkg;
+
+ return blkg_lookup_slowpath(blkcg, q, update_hint);
+}
+
+/**
+ * blkg_lookup - lookup blkg for the specified blkcg - q pair
+ * @blkcg: blkcg of interest
+ * @q: request_queue of interest
+ *
+ * Lookup blkg for the @blkcg - @q pair. This function should be called
+ * under RCU read lock.
+ */
+static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg,
+ struct request_queue *q)
+{
+ WARN_ON_ONCE(!rcu_read_lock_held());
+ return __blkg_lookup(blkcg, q, false);
+}
+
+/**
+ * blk_queue_root_blkg - return blkg for the (blkcg_root, @q) pair
+ * @q: request_queue of interest
+ *
+ * Lookup blkg for @q at the root level. See also blkg_lookup().
+ */
+static inline struct blkcg_gq *blk_queue_root_blkg(struct request_queue *q)
+{
+ return q->root_blkg;
+}
+
+/**
+ * blkg_to_pdata - get policy private data
+ * @blkg: blkg of interest
+ * @pol: policy of interest
+ *
+ * Return pointer to private data associated with the @blkg-@pol pair.
+ */
+static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
+ struct blkcg_policy *pol)
+{
+ return blkg ? blkg->pd[pol->plid] : NULL;
+}
+
+static inline struct blkcg_policy_data *blkcg_to_cpd(struct blkcg *blkcg,
+ struct blkcg_policy *pol)
+{
+ return blkcg ? blkcg->cpd[pol->plid] : NULL;
+}
+
+/**
+ * pdata_to_blkg - get blkg associated with policy private data
+ * @pd: policy private data of interest
+ *
+ * @pd is policy private data. Determine the blkg it's associated with.
+ */
+static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd)
+{
+ return pd ? pd->blkg : NULL;
+}
+
+static inline struct blkcg *cpd_to_blkcg(struct blkcg_policy_data *cpd)
+{
+ return cpd ? cpd->blkcg : NULL;
+}
+
+/**
+ * blkg_path - format cgroup path of blkg
+ * @blkg: blkg of interest
+ * @buf: target buffer
+ * @buflen: target buffer length
+ *
+ * Format the path of the cgroup of @blkg into @buf.
+ */
+static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen)
+{
+ return cgroup_path(blkg->blkcg->css.cgroup, buf, buflen);
+}
+
+/**
+ * blkg_get - get a blkg reference
+ * @blkg: blkg to get
+ *
+ * The caller should be holding an existing reference.
+ */
+static inline void blkg_get(struct blkcg_gq *blkg)
+{
+ percpu_ref_get(&blkg->refcnt);
+}
+
+/**
+ * blkg_tryget - try and get a blkg reference
+ * @blkg: blkg to get
+ *
+ * This is for use when doing an RCU lookup of the blkg. We may be in the midst
+ * of freeing this blkg, so we can only use it if the refcnt is not zero.
+ */
+static inline bool blkg_tryget(struct blkcg_gq *blkg)
+{
+ return blkg && percpu_ref_tryget(&blkg->refcnt);
+}
+
+/**
+ * blkg_put - put a blkg reference
+ * @blkg: blkg to put
+ */
+static inline void blkg_put(struct blkcg_gq *blkg)
+{
+ percpu_ref_put(&blkg->refcnt);
+}
+
+/**
+ * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants
+ * @d_blkg: loop cursor pointing to the current descendant
+ * @pos_css: used for iteration
+ * @p_blkg: target blkg to walk descendants of
+ *
+ * Walk @c_blkg through the descendants of @p_blkg. Must be used with RCU
+ * read locked. If called under either blkcg or queue lock, the iteration
+ * is guaranteed to include all and only online blkgs. The caller may
+ * update @pos_css by calling css_rightmost_descendant() to skip subtree.
+ * @p_blkg is included in the iteration and the first node to be visited.
+ */
+#define blkg_for_each_descendant_pre(d_blkg, pos_css, p_blkg) \
+ css_for_each_descendant_pre((pos_css), &(p_blkg)->blkcg->css) \
+ if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \
+ (p_blkg)->q, false)))
+
+/**
+ * blkg_for_each_descendant_post - post-order walk of a blkg's descendants
+ * @d_blkg: loop cursor pointing to the current descendant
+ * @pos_css: used for iteration
+ * @p_blkg: target blkg to walk descendants of
+ *
+ * Similar to blkg_for_each_descendant_pre() but performs post-order
+ * traversal instead. Synchronization rules are the same. @p_blkg is
+ * included in the iteration and the last node to be visited.
+ */
+#define blkg_for_each_descendant_post(d_blkg, pos_css, p_blkg) \
+ css_for_each_descendant_post((pos_css), &(p_blkg)->blkcg->css) \
+ if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \
+ (p_blkg)->q, false)))
+
+bool __blkcg_punt_bio_submit(struct bio *bio);
+
+static inline bool blkcg_punt_bio_submit(struct bio *bio)
+{
+ if (bio->bi_opf & REQ_CGROUP_PUNT)
+ return __blkcg_punt_bio_submit(bio);
+ else
+ return false;
+}
+
+static inline void blkcg_bio_issue_init(struct bio *bio)
+{
+ bio_issue_init(&bio->bi_issue, bio_sectors(bio));
+}
+
+static inline void blkcg_use_delay(struct blkcg_gq *blkg)
+{
+ if (WARN_ON_ONCE(atomic_read(&blkg->use_delay) < 0))
+ return;
+ if (atomic_add_return(1, &blkg->use_delay) == 1)
+ atomic_inc(&blkg->blkcg->css.cgroup->congestion_count);
+}
+
+static inline int blkcg_unuse_delay(struct blkcg_gq *blkg)
+{
+ int old = atomic_read(&blkg->use_delay);
+
+ if (WARN_ON_ONCE(old < 0))
+ return 0;
+ if (old == 0)
+ return 0;
+
+ /*
+ * We do this song and dance because we can race with somebody else
+ * adding or removing delay. If we just did an atomic_dec we'd end up
+ * negative and we'd already be in trouble. We need to subtract 1 and
+ * then check to see if we were the last delay so we can drop the
+ * congestion count on the cgroup.
+ */
+ while (old) {
+ int cur = atomic_cmpxchg(&blkg->use_delay, old, old - 1);
+ if (cur == old)
+ break;
+ old = cur;
+ }
+
+ if (old == 0)
+ return 0;
+ if (old == 1)
+ atomic_dec(&blkg->blkcg->css.cgroup->congestion_count);
+ return 1;
+}
+
+/**
+ * blkcg_set_delay - Enable allocator delay mechanism with the specified delay amount
+ * @blkg: target blkg
+ * @delay: delay duration in nsecs
+ *
+ * When enabled with this function, the delay is not decayed and must be
+ * explicitly cleared with blkcg_clear_delay(). Must not be mixed with
+ * blkcg_[un]use_delay() and blkcg_add_delay() usages.
+ */
+static inline void blkcg_set_delay(struct blkcg_gq *blkg, u64 delay)
+{
+ int old = atomic_read(&blkg->use_delay);
+
+ /* We only want 1 person setting the congestion count for this blkg. */
+ if (!old && atomic_cmpxchg(&blkg->use_delay, old, -1) == old)
+ atomic_inc(&blkg->blkcg->css.cgroup->congestion_count);
+
+ atomic64_set(&blkg->delay_nsec, delay);
+}
+
+/**
+ * blkcg_clear_delay - Disable allocator delay mechanism
+ * @blkg: target blkg
+ *
+ * Disable use_delay mechanism. See blkcg_set_delay().
+ */
+static inline void blkcg_clear_delay(struct blkcg_gq *blkg)
+{
+ int old = atomic_read(&blkg->use_delay);
+
+ /* We only want 1 person clearing the congestion count for this blkg. */
+ if (old && atomic_cmpxchg(&blkg->use_delay, old, 0) == old)
+ atomic_dec(&blkg->blkcg->css.cgroup->congestion_count);
+}
+
+/**
+ * blk_cgroup_mergeable - Determine whether to allow or disallow merges
+ * @rq: request to merge into
+ * @bio: bio to merge
+ *
+ * @bio and @rq should belong to the same cgroup and their issue_as_root should
+ * match. The latter is necessary as we don't want to throttle e.g. a metadata
+ * update because it happens to be next to a regular IO.
+ */
+static inline bool blk_cgroup_mergeable(struct request *rq, struct bio *bio)
+{
+ return rq->bio->bi_blkg == bio->bi_blkg &&
+ bio_issue_as_root_blkg(rq->bio) == bio_issue_as_root_blkg(bio);
+}
+
+void blk_cgroup_bio_start(struct bio *bio);
+void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta);
+#else /* CONFIG_BLK_CGROUP */
+
+struct blkg_policy_data {
+};
+
+struct blkcg_policy_data {
+};
+
+struct blkcg_policy {
+};
+
+#ifdef CONFIG_BLOCK
+
+static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; }
+static inline struct blkcg_gq *blk_queue_root_blkg(struct request_queue *q)
+{ return NULL; }
+static inline int blkcg_init_queue(struct request_queue *q) { return 0; }
+static inline void blkcg_exit_queue(struct request_queue *q) { }
+static inline int blkcg_policy_register(struct blkcg_policy *pol) { return 0; }
+static inline void blkcg_policy_unregister(struct blkcg_policy *pol) { }
+static inline int blkcg_activate_policy(struct request_queue *q,
+ const struct blkcg_policy *pol) { return 0; }
+static inline void blkcg_deactivate_policy(struct request_queue *q,
+ const struct blkcg_policy *pol) { }
+
+static inline struct blkcg *__bio_blkcg(struct bio *bio) { return NULL; }
+
+static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
+ struct blkcg_policy *pol) { return NULL; }
+static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; }
+static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; }
+static inline void blkg_get(struct blkcg_gq *blkg) { }
+static inline void blkg_put(struct blkcg_gq *blkg) { }
+
+static inline bool blkcg_punt_bio_submit(struct bio *bio) { return false; }
+static inline void blkcg_bio_issue_init(struct bio *bio) { }
+static inline void blk_cgroup_bio_start(struct bio *bio) { }
+static inline bool blk_cgroup_mergeable(struct request *rq, struct bio *bio) { return true; }
+
+#define blk_queue_for_each_rl(rl, q) \
+ for ((rl) = &(q)->root_rl; (rl); (rl) = NULL)
+
+#endif /* CONFIG_BLOCK */
+#endif /* CONFIG_BLK_CGROUP */
+
+#endif /* _BLK_CGROUP_PRIVATE_H */
diff --git a/block/blk-core.c b/block/blk-core.c
index 779b4a1f66ac..9c14deab3af4 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -34,7 +34,6 @@
#include <linux/delay.h>
#include <linux/ratelimit.h>
#include <linux/pm_runtime.h>
-#include <linux/blk-cgroup.h>
#include <linux/t10-pi.h>
#include <linux/debugfs.h>
#include <linux/bpf.h>
@@ -49,6 +48,7 @@
#include "blk.h"
#include "blk-mq-sched.h"
#include "blk-pm.h"
+#include "blk-cgroup.h"
#include "blk-throttle.h"
#include "blk-rq-qos.h"
@@ -165,6 +165,7 @@ static const struct {
[BLK_STS_RESOURCE] = { -ENOMEM, "kernel resource" },
[BLK_STS_DEV_RESOURCE] = { -EBUSY, "device resource" },
[BLK_STS_AGAIN] = { -EAGAIN, "nonblocking retry" },
+ [BLK_STS_OFFLINE] = { -ENODEV, "device offline" },
/* device mapper special case, should not leak out: */
[BLK_STS_DM_REQUEUE] = { -EREMCHG, "dm internal retry" },
@@ -339,8 +340,6 @@ void blk_cleanup_queue(struct request_queue *q)
blk_mq_sched_free_rqs(q);
mutex_unlock(&q->sysfs_lock);
- percpu_ref_exit(&q->q_usage_counter);
-
/* @q is and will stay empty, shutdown and put */
blk_put_queue(q);
}
@@ -473,9 +472,6 @@ struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu)
timer_setup(&q->timeout, blk_rq_timed_out_timer, 0);
INIT_WORK(&q->timeout_work, blk_timeout_work);
INIT_LIST_HEAD(&q->icq_list);
-#ifdef CONFIG_BLK_CGROUP
- INIT_LIST_HEAD(&q->blkg_list);
-#endif
kobject_init(&q->kobj, &blk_queue_ktype);
@@ -496,17 +492,12 @@ struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu)
PERCPU_REF_INIT_ATOMIC, GFP_KERNEL))
goto fail_stats;
- if (blkcg_init_queue(q))
- goto fail_ref;
-
blk_queue_dma_alignment(q, 511);
blk_set_default_limits(&q->limits);
q->nr_requests = BLKDEV_DEFAULT_RQ;
return q;
-fail_ref:
- percpu_ref_exit(&q->q_usage_counter);
fail_stats:
blk_free_queue_stats(q->stats);
fail_split:
@@ -540,17 +531,6 @@ bool blk_get_queue(struct request_queue *q)
}
EXPORT_SYMBOL(blk_get_queue);
-static void handle_bad_sector(struct bio *bio, sector_t maxsector)
-{
- char b[BDEVNAME_SIZE];
-
- pr_info_ratelimited("%s: attempt to access beyond end of device\n"
- "%s: rw=%d, want=%llu, limit=%llu\n",
- current->comm,
- bio_devname(bio, b), bio->bi_opf,
- bio_end_sector(bio), maxsector);
-}
-
#ifdef CONFIG_FAIL_MAKE_REQUEST
static DECLARE_FAULT_ATTR(fail_make_request);
@@ -580,14 +560,10 @@ late_initcall(fail_make_request_debugfs);
static inline bool bio_check_ro(struct bio *bio)
{
if (op_is_write(bio_op(bio)) && bdev_read_only(bio->bi_bdev)) {
- char b[BDEVNAME_SIZE];
-
if (op_is_flush(bio->bi_opf) && !bio_sectors(bio))
return false;
-
- WARN_ONCE(1,
- "Trying to write to read-only block-device %s (partno %d)\n",
- bio_devname(bio, b), bio->bi_bdev->bd_partno);
+ pr_warn("Trying to write to read-only block-device %pg\n",
+ bio->bi_bdev);
/* Older lvm-tools actually trigger this */
return false;
}
@@ -616,7 +592,11 @@ static inline int bio_check_eod(struct bio *bio)
if (nr_sectors && maxsector &&
(nr_sectors > maxsector ||
bio->bi_iter.bi_sector > maxsector - nr_sectors)) {
- handle_bad_sector(bio, maxsector);
+ pr_info_ratelimited("%s: attempt to access beyond end of device\n"
+ "%pg: rw=%d, want=%llu, limit=%llu\n",
+ current->comm,
+ bio->bi_bdev, bio->bi_opf,
+ bio_end_sector(bio), maxsector);
return -EIO;
}
return 0;
@@ -676,7 +656,123 @@ static inline blk_status_t blk_check_zone_append(struct request_queue *q,
return BLK_STS_OK;
}
-noinline_for_stack bool submit_bio_checks(struct bio *bio)
+static void __submit_bio(struct bio *bio)
+{
+ struct gendisk *disk = bio->bi_bdev->bd_disk;
+
+ if (unlikely(!blk_crypto_bio_prep(&bio)))
+ return;
+
+ if (!disk->fops->submit_bio) {
+ blk_mq_submit_bio(bio);
+ } else if (likely(bio_queue_enter(bio) == 0)) {
+ disk->fops->submit_bio(bio);
+ blk_queue_exit(disk->queue);
+ }
+}
+
+/*
+ * The loop in this function may be a bit non-obvious, and so deserves some
+ * explanation:
+ *
+ * - Before entering the loop, bio->bi_next is NULL (as all callers ensure
+ * that), so we have a list with a single bio.
+ * - We pretend that we have just taken it off a longer list, so we assign
+ * bio_list to a pointer to the bio_list_on_stack, thus initialising the
+ * bio_list of new bios to be added. ->submit_bio() may indeed add some more
+ * bios through a recursive call to submit_bio_noacct. If it did, we find a
+ * non-NULL value in bio_list and re-enter the loop from the top.
+ * - In this case we really did just take the bio of the top of the list (no
+ * pretending) and so remove it from bio_list, and call into ->submit_bio()
+ * again.
+ *
+ * bio_list_on_stack[0] contains bios submitted by the current ->submit_bio.
+ * bio_list_on_stack[1] contains bios that were submitted before the current
+ * ->submit_bio_bio, but that haven't been processed yet.
+ */
+static void __submit_bio_noacct(struct bio *bio)
+{
+ struct bio_list bio_list_on_stack[2];
+
+ BUG_ON(bio->bi_next);
+
+ bio_list_init(&bio_list_on_stack[0]);
+ current->bio_list = bio_list_on_stack;
+
+ do {
+ struct request_queue *q = bdev_get_queue(bio->bi_bdev);
+ struct bio_list lower, same;
+
+ /*
+ * Create a fresh bio_list for all subordinate requests.
+ */
+ bio_list_on_stack[1] = bio_list_on_stack[0];
+ bio_list_init(&bio_list_on_stack[0]);
+
+ __submit_bio(bio);
+
+ /*
+ * Sort new bios into those for a lower level and those for the
+ * same level.
+ */
+ bio_list_init(&lower);
+ bio_list_init(&same);
+ while ((bio = bio_list_pop(&bio_list_on_stack[0])) != NULL)
+ if (q == bdev_get_queue(bio->bi_bdev))
+ bio_list_add(&same, bio);
+ else
+ bio_list_add(&lower, bio);
+
+ /*
+ * Now assemble so we handle the lowest level first.
+ */
+ bio_list_merge(&bio_list_on_stack[0], &lower);
+ bio_list_merge(&bio_list_on_stack[0], &same);
+ bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]);
+ } while ((bio = bio_list_pop(&bio_list_on_stack[0])));
+
+ current->bio_list = NULL;
+}
+
+static void __submit_bio_noacct_mq(struct bio *bio)
+{
+ struct bio_list bio_list[2] = { };
+
+ current->bio_list = bio_list;
+
+ do {
+ __submit_bio(bio);
+ } while ((bio = bio_list_pop(&bio_list[0])));
+
+ current->bio_list = NULL;
+}
+
+void submit_bio_noacct_nocheck(struct bio *bio)
+{
+ /*
+ * We only want one ->submit_bio to be active at a time, else stack
+ * usage with stacked devices could be a problem. Use current->bio_list
+ * to collect a list of requests submited by a ->submit_bio method while
+ * it is active, and then process them after it returned.
+ */
+ if (current->bio_list)
+ bio_list_add(&current->bio_list[0], bio);
+ else if (!bio->bi_bdev->bd_disk->fops->submit_bio)
+ __submit_bio_noacct_mq(bio);
+ else
+ __submit_bio_noacct(bio);
+}
+
+/**
+ * submit_bio_noacct - re-submit a bio to the block device layer for I/O
+ * @bio: The bio describing the location in memory and on the device.
+ *
+ * This is a version of submit_bio() that shall only be used for I/O that is
+ * resubmitted to lower level drivers by stacking block drivers. All file
+ * systems and other upper level users of the block layer should use
+ * submit_bio() instead.
+ */
+void submit_bio_noacct(struct bio *bio)
{
struct block_device *bdev = bio->bi_bdev;
struct request_queue *q = bdev_get_queue(bdev);
@@ -761,7 +857,7 @@ noinline_for_stack bool submit_bio_checks(struct bio *bio)
}
if (blk_throtl_bio(bio))
- return false;
+ return;
blk_cgroup_bio_start(bio);
blkcg_bio_issue_init(bio);
@@ -773,138 +869,14 @@ noinline_for_stack bool submit_bio_checks(struct bio *bio)
*/
bio_set_flag(bio, BIO_TRACE_COMPLETION);
}
- return true;
+ submit_bio_noacct_nocheck(bio);
+ return;
not_supported:
status = BLK_STS_NOTSUPP;
end_io:
bio->bi_status = status;
bio_endio(bio);
- return false;
-}
-
-static void __submit_bio_fops(struct gendisk *disk, struct bio *bio)
-{
- if (blk_crypto_bio_prep(&bio)) {
- if (likely(bio_queue_enter(bio) == 0)) {
- disk->fops->submit_bio(bio);
- blk_queue_exit(disk->queue);
- }
- }
-}
-
-static void __submit_bio(struct bio *bio)
-{
- struct gendisk *disk = bio->bi_bdev->bd_disk;
-
- if (unlikely(!submit_bio_checks(bio)))
- return;
-
- if (!disk->fops->submit_bio)
- blk_mq_submit_bio(bio);
- else
- __submit_bio_fops(disk, bio);
-}
-
-/*
- * The loop in this function may be a bit non-obvious, and so deserves some
- * explanation:
- *
- * - Before entering the loop, bio->bi_next is NULL (as all callers ensure
- * that), so we have a list with a single bio.
- * - We pretend that we have just taken it off a longer list, so we assign
- * bio_list to a pointer to the bio_list_on_stack, thus initialising the
- * bio_list of new bios to be added. ->submit_bio() may indeed add some more
- * bios through a recursive call to submit_bio_noacct. If it did, we find a
- * non-NULL value in bio_list and re-enter the loop from the top.
- * - In this case we really did just take the bio of the top of the list (no
- * pretending) and so remove it from bio_list, and call into ->submit_bio()
- * again.
- *
- * bio_list_on_stack[0] contains bios submitted by the current ->submit_bio.
- * bio_list_on_stack[1] contains bios that were submitted before the current
- * ->submit_bio_bio, but that haven't been processed yet.
- */
-static void __submit_bio_noacct(struct bio *bio)
-{
- struct bio_list bio_list_on_stack[2];
-
- BUG_ON(bio->bi_next);
-
- bio_list_init(&bio_list_on_stack[0]);
- current->bio_list = bio_list_on_stack;
-
- do {
- struct request_queue *q = bdev_get_queue(bio->bi_bdev);
- struct bio_list lower, same;
-
- /*
- * Create a fresh bio_list for all subordinate requests.
- */
- bio_list_on_stack[1] = bio_list_on_stack[0];
- bio_list_init(&bio_list_on_stack[0]);
-
- __submit_bio(bio);
-
- /*
- * Sort new bios into those for a lower level and those for the
- * same level.
- */
- bio_list_init(&lower);
- bio_list_init(&same);
- while ((bio = bio_list_pop(&bio_list_on_stack[0])) != NULL)
- if (q == bdev_get_queue(bio->bi_bdev))
- bio_list_add(&same, bio);
- else
- bio_list_add(&lower, bio);
-
- /*
- * Now assemble so we handle the lowest level first.
- */
- bio_list_merge(&bio_list_on_stack[0], &lower);
- bio_list_merge(&bio_list_on_stack[0], &same);
- bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]);
- } while ((bio = bio_list_pop(&bio_list_on_stack[0])));
-
- current->bio_list = NULL;
-}
-
-static void __submit_bio_noacct_mq(struct bio *bio)
-{
- struct bio_list bio_list[2] = { };
-
- current->bio_list = bio_list;
-
- do {
- __submit_bio(bio);
- } while ((bio = bio_list_pop(&bio_list[0])));
-
- current->bio_list = NULL;
-}
-
-/**
- * submit_bio_noacct - re-submit a bio to the block device layer for I/O
- * @bio: The bio describing the location in memory and on the device.
- *
- * This is a version of submit_bio() that shall only be used for I/O that is
- * resubmitted to lower level drivers by stacking block drivers. All file
- * systems and other upper level users of the block layer should use
- * submit_bio() instead.
- */
-void submit_bio_noacct(struct bio *bio)
-{
- /*
- * We only want one ->submit_bio to be active at a time, else stack
- * usage with stacked devices could be a problem. Use current->bio_list
- * to collect a list of requests submited by a ->submit_bio method while
- * it is active, and then process them after it returned.
- */
- if (current->bio_list)
- bio_list_add(&current->bio_list[0], bio);
- else if (!bio->bi_bdev->bd_disk->fops->submit_bio)
- __submit_bio_noacct_mq(bio);
- else
- __submit_bio_noacct(bio);
}
EXPORT_SYMBOL(submit_bio_noacct);
@@ -989,8 +961,7 @@ int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags)
!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
return 0;
- if (current->plug)
- blk_flush_plug(current->plug, false);
+ blk_flush_plug(current->plug, false);
if (blk_queue_enter(q, BLK_MQ_REQ_NOWAIT))
return 0;
@@ -1272,7 +1243,7 @@ struct blk_plug_cb *blk_check_plugged(blk_plug_cb_fn unplug, void *data,
}
EXPORT_SYMBOL(blk_check_plugged);
-void blk_flush_plug(struct blk_plug *plug, bool from_schedule)
+void __blk_flush_plug(struct blk_plug *plug, bool from_schedule)
{
if (!list_empty(&plug->cb_list))
flush_plug_callbacks(plug, from_schedule);
@@ -1301,7 +1272,7 @@ void blk_flush_plug(struct blk_plug *plug, bool from_schedule)
void blk_finish_plug(struct blk_plug *plug)
{
if (plug == current->plug) {
- blk_flush_plug(plug, false);
+ __blk_flush_plug(plug, false);
current->plug = NULL;
}
}
diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c
index c87aba8584c6..18c8eafe20b9 100644
--- a/block/blk-crypto-fallback.c
+++ b/block/blk-crypto-fallback.c
@@ -10,7 +10,6 @@
#define pr_fmt(fmt) "blk-crypto-fallback: " fmt
#include <crypto/skcipher.h>
-#include <linux/blk-cgroup.h>
#include <linux/blk-crypto.h>
#include <linux/blk-crypto-profile.h>
#include <linux/blkdev.h>
@@ -20,6 +19,7 @@
#include <linux/random.h>
#include <linux/scatterlist.h>
+#include "blk-cgroup.h"
#include "blk-crypto-internal.h"
static unsigned int num_prealloc_bounce_pg = 32;
diff --git a/block/blk-crypto-internal.h b/block/blk-crypto-internal.h
index 2fb0d65a464c..e6818ffaddbf 100644
--- a/block/blk-crypto-internal.h
+++ b/block/blk-crypto-internal.h
@@ -11,6 +11,7 @@
/* Represents a crypto mode supported by blk-crypto */
struct blk_crypto_mode {
+ const char *name; /* name of this mode, shown in sysfs */
const char *cipher_str; /* crypto API name (for fallback case) */
unsigned int keysize; /* key size in bytes */
unsigned int ivsize; /* iv size in bytes */
@@ -20,6 +21,10 @@ extern const struct blk_crypto_mode blk_crypto_modes[];
#ifdef CONFIG_BLK_INLINE_ENCRYPTION
+int blk_crypto_sysfs_register(struct request_queue *q);
+
+void blk_crypto_sysfs_unregister(struct request_queue *q);
+
void bio_crypt_dun_increment(u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE],
unsigned int inc);
@@ -62,6 +67,13 @@ static inline bool blk_crypto_rq_is_encrypted(struct request *rq)
#else /* CONFIG_BLK_INLINE_ENCRYPTION */
+static inline int blk_crypto_sysfs_register(struct request_queue *q)
+{
+ return 0;
+}
+
+static inline void blk_crypto_sysfs_unregister(struct request_queue *q) { }
+
static inline bool bio_crypt_rq_ctx_compatible(struct request *rq,
struct bio *bio)
{
diff --git a/block/blk-crypto-sysfs.c b/block/blk-crypto-sysfs.c
new file mode 100644
index 000000000000..fd93bd2f33b7
--- /dev/null
+++ b/block/blk-crypto-sysfs.c
@@ -0,0 +1,172 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2021 Google LLC
+ *
+ * sysfs support for blk-crypto. This file contains the code which exports the
+ * crypto capabilities of devices via /sys/block/$disk/queue/crypto/.
+ */
+
+#include <linux/blk-crypto-profile.h>
+
+#include "blk-crypto-internal.h"
+
+struct blk_crypto_kobj {
+ struct kobject kobj;
+ struct blk_crypto_profile *profile;
+};
+
+struct blk_crypto_attr {
+ struct attribute attr;
+ ssize_t (*show)(struct blk_crypto_profile *profile,
+ struct blk_crypto_attr *attr, char *page);
+};
+
+static struct blk_crypto_profile *kobj_to_crypto_profile(struct kobject *kobj)
+{
+ return container_of(kobj, struct blk_crypto_kobj, kobj)->profile;
+}
+
+static struct blk_crypto_attr *attr_to_crypto_attr(struct attribute *attr)
+{
+ return container_of(attr, struct blk_crypto_attr, attr);
+}
+
+static ssize_t max_dun_bits_show(struct blk_crypto_profile *profile,
+ struct blk_crypto_attr *attr, char *page)
+{
+ return sysfs_emit(page, "%u\n", 8 * profile->max_dun_bytes_supported);
+}
+
+static ssize_t num_keyslots_show(struct blk_crypto_profile *profile,
+ struct blk_crypto_attr *attr, char *page)
+{
+ return sysfs_emit(page, "%u\n", profile->num_slots);
+}
+
+#define BLK_CRYPTO_RO_ATTR(_name) \
+ static struct blk_crypto_attr _name##_attr = __ATTR_RO(_name)
+
+BLK_CRYPTO_RO_ATTR(max_dun_bits);
+BLK_CRYPTO_RO_ATTR(num_keyslots);
+
+static struct attribute *blk_crypto_attrs[] = {
+ &max_dun_bits_attr.attr,
+ &num_keyslots_attr.attr,
+ NULL,
+};
+
+static const struct attribute_group blk_crypto_attr_group = {
+ .attrs = blk_crypto_attrs,
+};
+
+/*
+ * The encryption mode attributes. To avoid hard-coding the list of encryption
+ * modes, these are initialized at boot time by blk_crypto_sysfs_init().
+ */
+static struct blk_crypto_attr __blk_crypto_mode_attrs[BLK_ENCRYPTION_MODE_MAX];
+static struct attribute *blk_crypto_mode_attrs[BLK_ENCRYPTION_MODE_MAX + 1];
+
+static umode_t blk_crypto_mode_is_visible(struct kobject *kobj,
+ struct attribute *attr, int n)
+{
+ struct blk_crypto_profile *profile = kobj_to_crypto_profile(kobj);
+ struct blk_crypto_attr *a = attr_to_crypto_attr(attr);
+ int mode_num = a - __blk_crypto_mode_attrs;
+
+ if (profile->modes_supported[mode_num])
+ return 0444;
+ return 0;
+}
+
+static ssize_t blk_crypto_mode_show(struct blk_crypto_profile *profile,
+ struct blk_crypto_attr *attr, char *page)
+{
+ int mode_num = attr - __blk_crypto_mode_attrs;
+
+ return sysfs_emit(page, "0x%x\n", profile->modes_supported[mode_num]);
+}
+
+static const struct attribute_group blk_crypto_modes_attr_group = {
+ .name = "modes",
+ .attrs = blk_crypto_mode_attrs,
+ .is_visible = blk_crypto_mode_is_visible,
+};
+
+static const struct attribute_group *blk_crypto_attr_groups[] = {
+ &blk_crypto_attr_group,
+ &blk_crypto_modes_attr_group,
+ NULL,
+};
+
+static ssize_t blk_crypto_attr_show(struct kobject *kobj,
+ struct attribute *attr, char *page)
+{
+ struct blk_crypto_profile *profile = kobj_to_crypto_profile(kobj);
+ struct blk_crypto_attr *a = attr_to_crypto_attr(attr);
+
+ return a->show(profile, a, page);
+}
+
+static const struct sysfs_ops blk_crypto_attr_ops = {
+ .show = blk_crypto_attr_show,
+};
+
+static void blk_crypto_release(struct kobject *kobj)
+{
+ kfree(container_of(kobj, struct blk_crypto_kobj, kobj));
+}
+
+static struct kobj_type blk_crypto_ktype = {
+ .default_groups = blk_crypto_attr_groups,
+ .sysfs_ops = &blk_crypto_attr_ops,
+ .release = blk_crypto_release,
+};
+
+/*
+ * If the request_queue has a blk_crypto_profile, create the "crypto"
+ * subdirectory in sysfs (/sys/block/$disk/queue/crypto/).
+ */
+int blk_crypto_sysfs_register(struct request_queue *q)
+{
+ struct blk_crypto_kobj *obj;
+ int err;
+
+ if (!q->crypto_profile)
+ return 0;
+
+ obj = kzalloc(sizeof(*obj), GFP_KERNEL);
+ if (!obj)
+ return -ENOMEM;
+ obj->profile = q->crypto_profile;
+
+ err = kobject_init_and_add(&obj->kobj, &blk_crypto_ktype, &q->kobj,
+ "crypto");
+ if (err) {
+ kobject_put(&obj->kobj);
+ return err;
+ }
+ q->crypto_kobject = &obj->kobj;
+ return 0;
+}
+
+void blk_crypto_sysfs_unregister(struct request_queue *q)
+{
+ kobject_put(q->crypto_kobject);
+}
+
+static int __init blk_crypto_sysfs_init(void)
+{
+ int i;
+
+ BUILD_BUG_ON(BLK_ENCRYPTION_MODE_INVALID != 0);
+ for (i = 1; i < BLK_ENCRYPTION_MODE_MAX; i++) {
+ struct blk_crypto_attr *attr = &__blk_crypto_mode_attrs[i];
+
+ attr->attr.name = blk_crypto_modes[i].name;
+ attr->attr.mode = 0444;
+ attr->show = blk_crypto_mode_show;
+ blk_crypto_mode_attrs[i - 1] = &attr->attr;
+ }
+ return 0;
+}
+subsys_initcall(blk_crypto_sysfs_init);
diff --git a/block/blk-crypto.c b/block/blk-crypto.c
index ec9efeeeca91..a496aaef85ba 100644
--- a/block/blk-crypto.c
+++ b/block/blk-crypto.c
@@ -19,16 +19,19 @@
const struct blk_crypto_mode blk_crypto_modes[] = {
[BLK_ENCRYPTION_MODE_AES_256_XTS] = {
+ .name = "AES-256-XTS",
.cipher_str = "xts(aes)",
.keysize = 64,
.ivsize = 16,
},
[BLK_ENCRYPTION_MODE_AES_128_CBC_ESSIV] = {
+ .name = "AES-128-CBC-ESSIV",
.cipher_str = "essiv(cbc(aes),sha256)",
.keysize = 16,
.ivsize = 16,
},
[BLK_ENCRYPTION_MODE_ADIANTUM] = {
+ .name = "Adiantum",
.cipher_str = "adiantum(xchacha12,aes)",
.keysize = 32,
.ivsize = 32,
@@ -111,7 +114,6 @@ int __bio_crypt_clone(struct bio *dst, struct bio *src, gfp_t gfp_mask)
*dst->bi_crypt_context = *src->bi_crypt_context;
return 0;
}
-EXPORT_SYMBOL_GPL(__bio_crypt_clone);
/* Increments @dun by @inc, treating @dun as a multi-limb integer. */
void bio_crypt_dun_increment(u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE],
diff --git a/block/blk-flush.c b/block/blk-flush.c
index e4df894189ce..c68968724870 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -460,9 +460,7 @@ int blkdev_issue_flush(struct block_device *bdev)
{
struct bio bio;
- bio_init(&bio, NULL, 0);
- bio_set_dev(&bio, bdev);
- bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
+ bio_init(&bio, bdev, NULL, 0, REQ_OP_WRITE | REQ_PREFLUSH);
return submit_bio_wait(&bio);
}
EXPORT_SYMBOL(blkdev_issue_flush);
diff --git a/block/blk-iocost.c b/block/blk-iocost.c
index 769b64394298..70a0a3d680a3 100644
--- a/block/blk-iocost.c
+++ b/block/blk-iocost.c
@@ -178,12 +178,12 @@
#include <linux/time64.h>
#include <linux/parser.h>
#include <linux/sched/signal.h>
-#include <linux/blk-cgroup.h>
#include <asm/local.h>
#include <asm/local64.h>
#include "blk-rq-qos.h"
#include "blk-stat.h"
#include "blk-wbt.h"
+#include "blk-cgroup.h"
#ifdef CONFIG_TRACEPOINTS
diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c
index 6593c7123b97..2f33932e72e3 100644
--- a/block/blk-iolatency.c
+++ b/block/blk-iolatency.c
@@ -74,9 +74,9 @@
#include <linux/sched/signal.h>
#include <trace/events/block.h>
#include <linux/blk-mq.h>
-#include <linux/blk-cgroup.h>
#include "blk-rq-qos.h"
#include "blk-stat.h"
+#include "blk-cgroup.h"
#include "blk.h"
#define DEFAULT_SCALE_COOKIE 1000000U
@@ -598,7 +598,7 @@ static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio)
int inflight = 0;
blkg = bio->bi_blkg;
- if (!blkg || !bio_flagged(bio, BIO_TRACKED))
+ if (!blkg || !bio_flagged(bio, BIO_QOS_THROTTLED))
return;
iolat = blkg_to_lat(bio->bi_blkg);
diff --git a/block/blk-ioprio.c b/block/blk-ioprio.c
index 2e7f10e1c03f..79e797f5d194 100644
--- a/block/blk-ioprio.c
+++ b/block/blk-ioprio.c
@@ -12,11 +12,11 @@
* Documentation/admin-guide/cgroup-v2.rst.
*/
-#include <linux/blk-cgroup.h>
#include <linux/blk-mq.h>
#include <linux/blk_types.h>
#include <linux/kernel.h>
#include <linux/module.h>
+#include "blk-cgroup.h"
#include "blk-ioprio.h"
#include "blk-rq-qos.h"
diff --git a/block/blk-lib.c b/block/blk-lib.c
index 9f09beadcbe3..fc6ea52e7482 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -10,19 +10,6 @@
#include "blk.h"
-struct bio *blk_next_bio(struct bio *bio, unsigned int nr_pages, gfp_t gfp)
-{
- struct bio *new = bio_alloc(gfp, nr_pages);
-
- if (bio) {
- bio_chain(bio, new);
- submit_bio(bio);
- }
-
- return new;
-}
-EXPORT_SYMBOL_GPL(blk_next_bio);
-
int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
sector_t nr_sects, gfp_t gfp_mask, int flags,
struct bio **biop)
@@ -32,9 +19,6 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
unsigned int op;
sector_t bs_mask, part_offset = 0;
- if (!q)
- return -ENXIO;
-
if (bdev_read_only(bdev))
return -EPERM;
@@ -95,11 +79,8 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
WARN_ON_ONCE((req_sects << 9) > UINT_MAX);
- bio = blk_next_bio(bio, 0, gfp_mask);
+ bio = blk_next_bio(bio, bdev, 0, op, gfp_mask);
bio->bi_iter.bi_sector = sector;
- bio_set_dev(bio, bdev);
- bio_set_op_attrs(bio, op, 0);
-
bio->bi_iter.bi_size = req_sects << 9;
sector += req_sects;
nr_sects -= req_sects;
@@ -172,9 +153,6 @@ static int __blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
struct bio *bio = *biop;
sector_t bs_mask;
- if (!q)
- return -ENXIO;
-
if (bdev_read_only(bdev))
return -EPERM;
@@ -189,14 +167,12 @@ static int __blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
max_write_same_sectors = bio_allowed_max_sectors(q);
while (nr_sects) {
- bio = blk_next_bio(bio, 1, gfp_mask);
+ bio = blk_next_bio(bio, bdev, 1, REQ_OP_WRITE_SAME, gfp_mask);
bio->bi_iter.bi_sector = sector;
- bio_set_dev(bio, bdev);
bio->bi_vcnt = 1;
bio->bi_io_vec->bv_page = page;
bio->bi_io_vec->bv_offset = 0;
bio->bi_io_vec->bv_len = bdev_logical_block_size(bdev);
- bio_set_op_attrs(bio, REQ_OP_WRITE_SAME, 0);
if (nr_sects > max_write_same_sectors) {
bio->bi_iter.bi_size = max_write_same_sectors << 9;
@@ -250,10 +226,6 @@ static int __blkdev_issue_write_zeroes(struct block_device *bdev,
{
struct bio *bio = *biop;
unsigned int max_write_zeroes_sectors;
- struct request_queue *q = bdev_get_queue(bdev);
-
- if (!q)
- return -ENXIO;
if (bdev_read_only(bdev))
return -EPERM;
@@ -265,10 +237,8 @@ static int __blkdev_issue_write_zeroes(struct block_device *bdev,
return -EOPNOTSUPP;
while (nr_sects) {
- bio = blk_next_bio(bio, 0, gfp_mask);
+ bio = blk_next_bio(bio, bdev, 0, REQ_OP_WRITE_ZEROES, gfp_mask);
bio->bi_iter.bi_sector = sector;
- bio_set_dev(bio, bdev);
- bio->bi_opf = REQ_OP_WRITE_ZEROES;
if (flags & BLKDEV_ZERO_NOUNMAP)
bio->bi_opf |= REQ_NOUNMAP;
@@ -304,23 +274,17 @@ static int __blkdev_issue_zero_pages(struct block_device *bdev,
sector_t sector, sector_t nr_sects, gfp_t gfp_mask,
struct bio **biop)
{
- struct request_queue *q = bdev_get_queue(bdev);
struct bio *bio = *biop;
int bi_size = 0;
unsigned int sz;
- if (!q)
- return -ENXIO;
-
if (bdev_read_only(bdev))
return -EPERM;
while (nr_sects != 0) {
- bio = blk_next_bio(bio, __blkdev_sectors_to_bio_pages(nr_sects),
- gfp_mask);
+ bio = blk_next_bio(bio, bdev, __blkdev_sectors_to_bio_pages(nr_sects),
+ REQ_OP_WRITE, gfp_mask);
bio->bi_iter.bi_sector = sector;
- bio_set_dev(bio, bdev);
- bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
while (nr_sects != 0) {
sz = min((sector_t) PAGE_SIZE, nr_sects << 9);
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 4de34a332c9f..ea6968313b4a 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -9,6 +9,7 @@
#include <linux/blk-integrity.h>
#include <linux/scatterlist.h>
#include <linux/part_stat.h>
+#include <linux/blk-cgroup.h>
#include <trace/events/block.h>
@@ -368,8 +369,6 @@ void __blk_queue_split(struct request_queue *q, struct bio **bio,
trace_block_split(split, (*bio)->bi_iter.bi_sector);
submit_bio_noacct(*bio);
*bio = split;
-
- blk_throtl_charge_bio_split(*bio);
}
}
@@ -600,6 +599,9 @@ static inline unsigned int blk_rq_get_max_sectors(struct request *rq,
static inline int ll_new_hw_segment(struct request *req, struct bio *bio,
unsigned int nr_phys_segs)
{
+ if (!blk_cgroup_mergeable(req, bio))
+ goto no_merge;
+
if (blk_integrity_merge_bio(req->q, req, bio) == false)
goto no_merge;
@@ -696,6 +698,9 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
if (total_phys_segments > blk_rq_get_max_segments(req))
return 0;
+ if (!blk_cgroup_mergeable(req, next->bio))
+ return 0;
+
if (blk_integrity_merge_rq(q, req, next) == false)
return 0;
@@ -904,6 +909,10 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
if (bio_data_dir(bio) != rq_data_dir(rq))
return false;
+ /* don't merge across cgroup boundaries */
+ if (!blk_cgroup_mergeable(rq, bio))
+ return false;
+
/* only merge integrity protected bio into ditto rq */
if (blk_integrity_merge_bio(rq->q, rq, bio) == false)
return false;
@@ -1089,12 +1098,20 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
if (!plug || rq_list_empty(plug->mq_list))
return false;
- /* check the previously added entry for a quick merge attempt */
- rq = rq_list_peek(&plug->mq_list);
- if (rq->q == q) {
- if (blk_attempt_bio_merge(q, rq, bio, nr_segs, false) ==
- BIO_MERGE_OK)
- return true;
+ rq_list_for_each(&plug->mq_list, rq) {
+ if (rq->q == q) {
+ if (blk_attempt_bio_merge(q, rq, bio, nr_segs, false) ==
+ BIO_MERGE_OK)
+ return true;
+ break;
+ }
+
+ /*
+ * Only keep iterating plug list for merges if we have multiple
+ * queues
+ */
+ if (!plug->multiple_queues)
+ break;
}
return false;
}
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 3a790eb4995c..e2880f6deb34 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -707,7 +707,7 @@ static void debugfs_create_files(struct dentry *parent, void *data,
void blk_mq_debugfs_register(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
- int i;
+ unsigned long i;
debugfs_create_files(q->debugfs_dir, q, blk_mq_debugfs_queue_attrs);
@@ -780,7 +780,7 @@ void blk_mq_debugfs_unregister_hctx(struct blk_mq_hw_ctx *hctx)
void blk_mq_debugfs_register_hctxs(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
- int i;
+ unsigned long i;
queue_for_each_hw_ctx(q, hctx, i)
blk_mq_debugfs_register_hctx(q, hctx);
@@ -789,7 +789,7 @@ void blk_mq_debugfs_register_hctxs(struct request_queue *q)
void blk_mq_debugfs_unregister_hctxs(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
- int i;
+ unsigned long i;
queue_for_each_hw_ctx(q, hctx, i)
blk_mq_debugfs_unregister_hctx(hctx);
diff --git a/block/blk-mq-debugfs.h b/block/blk-mq-debugfs.h
index a68aa6041a10..69918f4170d6 100644
--- a/block/blk-mq-debugfs.h
+++ b/block/blk-mq-debugfs.h
@@ -6,6 +6,8 @@
#include <linux/seq_file.h>
+struct blk_mq_hw_ctx;
+
struct blk_mq_debugfs_attr {
const char *name;
umode_t mode;
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 55488ba97823..9e56a69422b6 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -180,11 +180,18 @@ static int __blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
static int blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
{
+ unsigned long end = jiffies + HZ;
int ret;
do {
ret = __blk_mq_do_dispatch_sched(hctx);
- } while (ret == 1);
+ if (ret != 1)
+ break;
+ if (need_resched() || time_is_before_jiffies(end)) {
+ blk_mq_delay_run_hw_queue(hctx, 0);
+ break;
+ }
+ } while (1);
return ret;
}
@@ -515,7 +522,7 @@ static void blk_mq_exit_sched_shared_tags(struct request_queue *queue)
static void blk_mq_sched_tags_teardown(struct request_queue *q, unsigned int flags)
{
struct blk_mq_hw_ctx *hctx;
- int i;
+ unsigned long i;
queue_for_each_hw_ctx(q, hctx, i) {
if (hctx->sched_tags) {
@@ -550,9 +557,10 @@ static int blk_mq_init_sched_shared_tags(struct request_queue *queue)
int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
{
- unsigned int i, flags = q->tag_set->flags;
+ unsigned int flags = q->tag_set->flags;
struct blk_mq_hw_ctx *hctx;
struct elevator_queue *eq;
+ unsigned long i;
int ret;
if (!e) {
@@ -618,7 +626,7 @@ err_free_map_and_rqs:
void blk_mq_sched_free_rqs(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
- int i;
+ unsigned long i;
if (blk_mq_is_shared_tags(q->tag_set->flags)) {
blk_mq_free_rqs(q->tag_set, q->sched_shared_tags,
@@ -635,7 +643,7 @@ void blk_mq_sched_free_rqs(struct request_queue *q)
void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
{
struct blk_mq_hw_ctx *hctx;
- unsigned int i;
+ unsigned long i;
unsigned int flags = 0;
queue_for_each_hw_ctx(q, hctx, i) {
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index 674786574075..c08426975856 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -206,7 +206,7 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx)
void blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
- int i;
+ unsigned long i;
lockdep_assert_held(&q->sysfs_dir_lock);
@@ -255,7 +255,8 @@ void blk_mq_sysfs_init(struct request_queue *q)
int __blk_mq_register_dev(struct device *dev, struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
- int ret, i;
+ unsigned long i, j;
+ int ret;
WARN_ON_ONCE(!q->kobj.parent);
lockdep_assert_held(&q->sysfs_dir_lock);
@@ -278,8 +279,10 @@ out:
return ret;
unreg:
- while (--i >= 0)
- blk_mq_unregister_hctx(q->queue_hw_ctx[i]);
+ queue_for_each_hw_ctx(q, hctx, j) {
+ if (j < i)
+ blk_mq_unregister_hctx(hctx);
+ }
kobject_uevent(q->mq_kobj, KOBJ_REMOVE);
kobject_del(q->mq_kobj);
@@ -290,7 +293,7 @@ unreg:
void blk_mq_sysfs_unregister(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
- int i;
+ unsigned long i;
mutex_lock(&q->sysfs_dir_lock);
if (!q->mq_sysfs_init_done)
@@ -306,7 +309,8 @@ unlock:
int blk_mq_sysfs_register(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
- int i, ret = 0;
+ unsigned long i;
+ int ret = 0;
mutex_lock(&q->sysfs_dir_lock);
if (!q->mq_sysfs_init_done)
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 845f74e8dd7b..68ac23d0b640 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -107,7 +107,7 @@ static int __blk_mq_get_tag(struct blk_mq_alloc_data *data,
return BLK_MQ_NO_TAG;
if (data->shallow_depth)
- return __sbitmap_queue_get_shallow(bt, data->shallow_depth);
+ return sbitmap_queue_get_shallow(bt, data->shallow_depth);
else
return __sbitmap_queue_get(bt);
}
@@ -498,7 +498,7 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn,
void *priv)
{
/*
- * __blk_mq_update_nr_hw_queues() updates nr_hw_queues and queue_hw_ctx
+ * __blk_mq_update_nr_hw_queues() updates nr_hw_queues and hctx_table
* while the queue is frozen. So we can use q_usage_counter to avoid
* racing with it.
*/
@@ -515,7 +515,7 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn,
bt_for_each(NULL, q, btags, fn, priv, false);
} else {
struct blk_mq_hw_ctx *hctx;
- int i;
+ unsigned long i;
queue_for_each_hw_ctx(q, hctx, i) {
struct blk_mq_tags *tags = hctx->tags;
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 9a9185a0a2d1..8e659dc5fcf3 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -71,7 +71,8 @@ static int blk_mq_poll_stats_bkt(const struct request *rq)
static inline struct blk_mq_hw_ctx *blk_qc_to_hctx(struct request_queue *q,
blk_qc_t qc)
{
- return q->queue_hw_ctx[(qc & ~BLK_QC_T_INTERNAL) >> BLK_QC_T_SHIFT];
+ return xa_load(&q->hctx_table,
+ (qc & ~BLK_QC_T_INTERNAL) >> BLK_QC_T_SHIFT);
}
static inline struct request *blk_qc_to_rq(struct blk_mq_hw_ctx *hctx,
@@ -312,7 +313,7 @@ EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue);
void blk_mq_wake_waiters(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
- unsigned int i;
+ unsigned long i;
queue_for_each_hw_ctx(q, hctx, i)
if (blk_mq_hw_queue_mapped(hctx))
@@ -573,7 +574,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
* If not tell the caller that it should skip this queue.
*/
ret = -EXDEV;
- data.hctx = q->queue_hw_ctx[hctx_idx];
+ data.hctx = xa_load(&q->hctx_table, hctx_idx);
if (!blk_mq_hw_queue_mapped(data.hctx))
goto out_queue_exit;
cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask);
@@ -793,8 +794,10 @@ bool blk_update_request(struct request *req, blk_status_t error,
#endif
if (unlikely(error && !blk_rq_is_passthrough(req) &&
- !(req->rq_flags & RQF_QUIET)))
+ !(req->rq_flags & RQF_QUIET))) {
blk_print_req_error(req, error);
+ trace_block_rq_error(req, error, nr_bytes);
+ }
blk_account_io_completion(req, nr_bytes);
@@ -885,10 +888,15 @@ static inline void blk_account_io_done(struct request *req, u64 now)
static void __blk_account_io_start(struct request *rq)
{
- /* passthrough requests can hold bios that do not have ->bi_bdev set */
- if (rq->bio && rq->bio->bi_bdev)
+ /*
+ * All non-passthrough requests are created from a bio with one
+ * exception: when a flush command that is part of a flush sequence
+ * generated by the state machine in blk-flush.c is cloned onto the
+ * lower device by dm-multipath we can get here without a bio.
+ */
+ if (rq->bio)
rq->part = rq->bio->bi_bdev;
- else if (rq->q->disk)
+ else
rq->part = rq->q->disk->part0;
part_stat_lock();
@@ -1444,7 +1452,7 @@ static void blk_mq_timeout_work(struct work_struct *work)
container_of(work, struct request_queue, timeout_work);
unsigned long next = 0;
struct blk_mq_hw_ctx *hctx;
- int i;
+ unsigned long i;
/* A deadlock might occur if a request is stuck requiring a
* timeout at the same time a queue freeze is waiting
@@ -2145,7 +2153,7 @@ static struct blk_mq_hw_ctx *blk_mq_get_sq_hctx(struct request_queue *q)
void blk_mq_run_hw_queues(struct request_queue *q, bool async)
{
struct blk_mq_hw_ctx *hctx, *sq_hctx;
- int i;
+ unsigned long i;
sq_hctx = NULL;
if (blk_mq_has_sqsched(q))
@@ -2173,7 +2181,7 @@ EXPORT_SYMBOL(blk_mq_run_hw_queues);
void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs)
{
struct blk_mq_hw_ctx *hctx, *sq_hctx;
- int i;
+ unsigned long i;
sq_hctx = NULL;
if (blk_mq_has_sqsched(q))
@@ -2182,6 +2190,14 @@ void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs)
if (blk_mq_hctx_stopped(hctx))
continue;
/*
+ * If there is already a run_work pending, leave the
+ * pending delay untouched. Otherwise, a hctx can stall
+ * if another hctx is re-delaying the other's work
+ * before the work executes.
+ */
+ if (delayed_work_pending(&hctx->run_work))
+ continue;
+ /*
* Dispatch from this hctx either if there's no hctx preferred
* by IO scheduler or if it has requests that bypass the
* scheduler.
@@ -2203,7 +2219,7 @@ EXPORT_SYMBOL(blk_mq_delay_run_hw_queues);
bool blk_mq_queue_stopped(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
- int i;
+ unsigned long i;
queue_for_each_hw_ctx(q, hctx, i)
if (blk_mq_hctx_stopped(hctx))
@@ -2242,7 +2258,7 @@ EXPORT_SYMBOL(blk_mq_stop_hw_queue);
void blk_mq_stop_hw_queues(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
- int i;
+ unsigned long i;
queue_for_each_hw_ctx(q, hctx, i)
blk_mq_stop_hw_queue(hctx);
@@ -2260,7 +2276,7 @@ EXPORT_SYMBOL(blk_mq_start_hw_queue);
void blk_mq_start_hw_queues(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
- int i;
+ unsigned long i;
queue_for_each_hw_ctx(q, hctx, i)
blk_mq_start_hw_queue(hctx);
@@ -2280,7 +2296,7 @@ EXPORT_SYMBOL_GPL(blk_mq_start_stopped_hw_queue);
void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async)
{
struct blk_mq_hw_ctx *hctx;
- int i;
+ unsigned long i;
queue_for_each_hw_ctx(q, hctx, i)
blk_mq_start_stopped_hw_queue(hctx, async);
@@ -2561,13 +2577,36 @@ static void __blk_mq_flush_plug_list(struct request_queue *q,
q->mq_ops->queue_rqs(&plug->mq_list);
}
+static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched)
+{
+ struct blk_mq_hw_ctx *this_hctx = NULL;
+ struct blk_mq_ctx *this_ctx = NULL;
+ struct request *requeue_list = NULL;
+ unsigned int depth = 0;
+ LIST_HEAD(list);
+
+ do {
+ struct request *rq = rq_list_pop(&plug->mq_list);
+
+ if (!this_hctx) {
+ this_hctx = rq->mq_hctx;
+ this_ctx = rq->mq_ctx;
+ } else if (this_hctx != rq->mq_hctx || this_ctx != rq->mq_ctx) {
+ rq_list_add(&requeue_list, rq);
+ continue;
+ }
+ list_add_tail(&rq->queuelist, &list);
+ depth++;
+ } while (!rq_list_empty(plug->mq_list));
+
+ plug->mq_list = requeue_list;
+ trace_block_unplug(this_hctx->queue, depth, !from_sched);
+ blk_mq_sched_insert_requests(this_hctx, this_ctx, &list, from_sched);
+}
+
void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
{
- struct blk_mq_hw_ctx *this_hctx;
- struct blk_mq_ctx *this_ctx;
struct request *rq;
- unsigned int depth;
- LIST_HEAD(list);
if (rq_list_empty(plug->mq_list))
return;
@@ -2603,35 +2642,9 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
return;
}
- this_hctx = NULL;
- this_ctx = NULL;
- depth = 0;
do {
- rq = rq_list_pop(&plug->mq_list);
-
- if (!this_hctx) {
- this_hctx = rq->mq_hctx;
- this_ctx = rq->mq_ctx;
- } else if (this_hctx != rq->mq_hctx || this_ctx != rq->mq_ctx) {
- trace_block_unplug(this_hctx->queue, depth,
- !from_schedule);
- blk_mq_sched_insert_requests(this_hctx, this_ctx,
- &list, from_schedule);
- depth = 0;
- this_hctx = rq->mq_hctx;
- this_ctx = rq->mq_ctx;
-
- }
-
- list_add(&rq->queuelist, &list);
- depth++;
+ blk_mq_dispatch_plug_list(plug, from_schedule);
} while (!rq_list_empty(plug->mq_list));
-
- if (!list_empty(&list)) {
- trace_block_unplug(this_hctx->queue, depth, !from_schedule);
- blk_mq_sched_insert_requests(this_hctx, this_ctx, &list,
- from_schedule);
- }
}
void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
@@ -2804,9 +2817,6 @@ void blk_mq_submit_bio(struct bio *bio)
unsigned int nr_segs = 1;
blk_status_t ret;
- if (unlikely(!blk_crypto_bio_prep(&bio)))
- return;
-
blk_queue_bounce(q, &bio);
if (blk_may_split(q, bio))
__blk_queue_split(q, &bio, &nr_segs);
@@ -2853,27 +2863,16 @@ void blk_mq_submit_bio(struct bio *bio)
blk_mq_try_issue_directly(rq->mq_hctx, rq));
}
+#ifdef CONFIG_BLK_MQ_STACKING
/**
- * blk_cloned_rq_check_limits - Helper function to check a cloned request
- * for the new queue limits
- * @q: the queue
- * @rq: the request being checked
- *
- * Description:
- * @rq may have been made based on weaker limitations of upper-level queues
- * in request stacking drivers, and it may violate the limitation of @q.
- * Since the block layer and the underlying device driver trust @rq
- * after it is inserted to @q, it should be checked against @q before
- * the insertion using this generic function.
- *
- * Request stacking drivers like request-based dm may change the queue
- * limits when retrying requests on other queues. Those requests need
- * to be checked against the new queue limits again during dispatch.
+ * blk_insert_cloned_request - Helper for stacking drivers to submit a request
+ * @rq: the request being queued
*/
-static blk_status_t blk_cloned_rq_check_limits(struct request_queue *q,
- struct request *rq)
+blk_status_t blk_insert_cloned_request(struct request *rq)
{
+ struct request_queue *q = rq->q;
unsigned int max_sectors = blk_queue_get_max_sectors(q, req_op(rq));
+ blk_status_t ret;
if (blk_rq_sectors(rq) > max_sectors) {
/*
@@ -2905,24 +2904,7 @@ static blk_status_t blk_cloned_rq_check_limits(struct request_queue *q,
return BLK_STS_IOERR;
}
- return BLK_STS_OK;
-}
-
-/**
- * blk_insert_cloned_request - Helper for stacking drivers to submit a request
- * @q: the queue to submit the request
- * @rq: the request being queued
- */
-blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq)
-{
- blk_status_t ret;
-
- ret = blk_cloned_rq_check_limits(q, rq);
- if (ret != BLK_STS_OK)
- return ret;
-
- if (rq->q->disk &&
- should_fail_request(rq->q->disk->part0, blk_rq_bytes(rq)))
+ if (q->disk && should_fail_request(q->disk->part0, blk_rq_bytes(rq)))
return BLK_STS_IOERR;
if (blk_crypto_insert_cloned_request(rq))
@@ -2935,7 +2917,7 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *
* bypass a potential scheduler on the bottom device for
* insert.
*/
- blk_mq_run_dispatch_ops(rq->q,
+ blk_mq_run_dispatch_ops(q,
ret = blk_mq_request_issue_directly(rq, true));
if (ret)
blk_account_io_done(rq, ktime_get_ns());
@@ -2990,10 +2972,10 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
bs = &fs_bio_set;
__rq_for_each_bio(bio_src, rq_src) {
- bio = bio_clone_fast(bio_src, gfp_mask, bs);
+ bio = bio_alloc_clone(rq->q->disk->part0, bio_src, gfp_mask,
+ bs);
if (!bio)
goto free_and_out;
- bio->bi_bdev = rq->q->disk->part0;
if (bio_ctr && bio_ctr(bio, bio_src, data))
goto free_and_out;
@@ -3030,6 +3012,7 @@ free_and_out:
return -ENOMEM;
}
EXPORT_SYMBOL_GPL(blk_rq_prep_clone);
+#endif /* CONFIG_BLK_MQ_STACKING */
/*
* Steal bios from a request and add them to a bio list.
@@ -3100,6 +3083,9 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
struct blk_mq_tags *drv_tags;
struct page *page;
+ if (list_empty(&tags->page_list))
+ return;
+
if (blk_mq_is_shared_tags(set->flags))
drv_tags = set->shared_tags;
else
@@ -3142,15 +3128,41 @@ void blk_mq_free_rq_map(struct blk_mq_tags *tags)
blk_mq_free_tags(tags);
}
+static enum hctx_type hctx_idx_to_type(struct blk_mq_tag_set *set,
+ unsigned int hctx_idx)
+{
+ int i;
+
+ for (i = 0; i < set->nr_maps; i++) {
+ unsigned int start = set->map[i].queue_offset;
+ unsigned int end = start + set->map[i].nr_queues;
+
+ if (hctx_idx >= start && hctx_idx < end)
+ break;
+ }
+
+ if (i >= set->nr_maps)
+ i = HCTX_TYPE_DEFAULT;
+
+ return i;
+}
+
+static int blk_mq_get_hctx_node(struct blk_mq_tag_set *set,
+ unsigned int hctx_idx)
+{
+ enum hctx_type type = hctx_idx_to_type(set, hctx_idx);
+
+ return blk_mq_hw_queue_to_node(&set->map[type], hctx_idx);
+}
+
static struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
unsigned int hctx_idx,
unsigned int nr_tags,
unsigned int reserved_tags)
{
+ int node = blk_mq_get_hctx_node(set, hctx_idx);
struct blk_mq_tags *tags;
- int node;
- node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx);
if (node == NUMA_NO_NODE)
node = set->numa_node;
@@ -3199,10 +3211,9 @@ static int blk_mq_alloc_rqs(struct blk_mq_tag_set *set,
unsigned int hctx_idx, unsigned int depth)
{
unsigned int i, j, entries_per_page, max_order = 4;
+ int node = blk_mq_get_hctx_node(set, hctx_idx);
size_t rq_size, left;
- int node;
- node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx);
if (node == NUMA_NO_NODE)
node = set->numa_node;
@@ -3447,6 +3458,8 @@ static void blk_mq_exit_hctx(struct request_queue *q,
blk_mq_remove_cpuhp(hctx);
+ xa_erase(&q->hctx_table, hctx_idx);
+
spin_lock(&q->unused_hctx_lock);
list_add(&hctx->hctx_list, &q->unused_hctx_list);
spin_unlock(&q->unused_hctx_lock);
@@ -3456,12 +3469,11 @@ static void blk_mq_exit_hw_queues(struct request_queue *q,
struct blk_mq_tag_set *set, int nr_queue)
{
struct blk_mq_hw_ctx *hctx;
- unsigned int i;
+ unsigned long i;
queue_for_each_hw_ctx(q, hctx, i) {
if (i == nr_queue)
break;
- blk_mq_debugfs_unregister_hctx(hctx);
blk_mq_exit_hctx(q, set, hctx, i);
}
}
@@ -3486,8 +3498,15 @@ static int blk_mq_init_hctx(struct request_queue *q,
if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx,
hctx->numa_node))
goto exit_hctx;
+
+ if (xa_insert(&q->hctx_table, hctx_idx, hctx, GFP_KERNEL))
+ goto exit_flush_rq;
+
return 0;
+ exit_flush_rq:
+ if (set->ops->exit_request)
+ set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx);
exit_hctx:
if (set->ops->exit_hctx)
set->ops->exit_hctx(hctx, hctx_idx);
@@ -3647,7 +3666,8 @@ static void __blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set,
static void blk_mq_map_swqueue(struct request_queue *q)
{
- unsigned int i, j, hctx_idx;
+ unsigned int j, hctx_idx;
+ unsigned long i;
struct blk_mq_hw_ctx *hctx;
struct blk_mq_ctx *ctx;
struct blk_mq_tag_set *set = q->tag_set;
@@ -3754,7 +3774,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
static void queue_set_hctx_shared(struct request_queue *q, bool shared)
{
struct blk_mq_hw_ctx *hctx;
- int i;
+ unsigned long i;
queue_for_each_hw_ctx(q, hctx, i) {
if (shared) {
@@ -3854,7 +3874,7 @@ static int blk_mq_alloc_ctxs(struct request_queue *q)
void blk_mq_release(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx, *next;
- int i;
+ unsigned long i;
queue_for_each_hw_ctx(q, hctx, i)
WARN_ON_ONCE(hctx && list_empty(&hctx->hctx_list));
@@ -3865,7 +3885,7 @@ void blk_mq_release(struct request_queue *q)
kobject_put(&hctx->kobj);
}
- kfree(q->queue_hw_ctx);
+ xa_destroy(&q->hctx_table);
/*
* release .mq_kobj and sw queue's kobject now because
@@ -3954,52 +3974,28 @@ static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx(
static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
struct request_queue *q)
{
- int i, j, end;
- struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx;
-
- if (q->nr_hw_queues < set->nr_hw_queues) {
- struct blk_mq_hw_ctx **new_hctxs;
-
- new_hctxs = kcalloc_node(set->nr_hw_queues,
- sizeof(*new_hctxs), GFP_KERNEL,
- set->numa_node);
- if (!new_hctxs)
- return;
- if (hctxs)
- memcpy(new_hctxs, hctxs, q->nr_hw_queues *
- sizeof(*hctxs));
- q->queue_hw_ctx = new_hctxs;
- kfree(hctxs);
- hctxs = new_hctxs;
- }
+ struct blk_mq_hw_ctx *hctx;
+ unsigned long i, j;
/* protect against switching io scheduler */
mutex_lock(&q->sysfs_lock);
for (i = 0; i < set->nr_hw_queues; i++) {
- int node;
- struct blk_mq_hw_ctx *hctx;
+ int old_node;
+ int node = blk_mq_get_hctx_node(set, i);
+ struct blk_mq_hw_ctx *old_hctx = xa_load(&q->hctx_table, i);
- node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], i);
- /*
- * If the hw queue has been mapped to another numa node,
- * we need to realloc the hctx. If allocation fails, fallback
- * to use the previous one.
- */
- if (hctxs[i] && (hctxs[i]->numa_node == node))
- continue;
+ if (old_hctx) {
+ old_node = old_hctx->numa_node;
+ blk_mq_exit_hctx(q, set, old_hctx, i);
+ }
- hctx = blk_mq_alloc_and_init_hctx(set, q, i, node);
- if (hctx) {
- if (hctxs[i])
- blk_mq_exit_hctx(q, set, hctxs[i], i);
- hctxs[i] = hctx;
- } else {
- if (hctxs[i])
- pr_warn("Allocate new hctx on node %d fails,\
- fallback to previous one on node %d\n",
- node, hctxs[i]->numa_node);
- else
+ if (!blk_mq_alloc_and_init_hctx(set, q, i, node)) {
+ if (!old_hctx)
break;
+ pr_warn("Allocate new hctx on node %d fails, fallback to previous one on node %d\n",
+ node, old_node);
+ hctx = blk_mq_alloc_and_init_hctx(set, q, i, old_node);
+ WARN_ON_ONCE(!hctx);
}
}
/*
@@ -4008,24 +4004,27 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
*/
if (i != set->nr_hw_queues) {
j = q->nr_hw_queues;
- end = i;
} else {
j = i;
- end = q->nr_hw_queues;
q->nr_hw_queues = set->nr_hw_queues;
}
- for (; j < end; j++) {
- struct blk_mq_hw_ctx *hctx = hctxs[j];
-
- if (hctx) {
- blk_mq_exit_hctx(q, set, hctx, j);
- hctxs[j] = NULL;
- }
- }
+ xa_for_each_start(&q->hctx_table, j, hctx, j)
+ blk_mq_exit_hctx(q, set, hctx, j);
mutex_unlock(&q->sysfs_lock);
}
+static void blk_mq_update_poll_flag(struct request_queue *q)
+{
+ struct blk_mq_tag_set *set = q->tag_set;
+
+ if (set->nr_maps > HCTX_TYPE_POLL &&
+ set->map[HCTX_TYPE_POLL].nr_queues)
+ blk_queue_flag_set(QUEUE_FLAG_POLL, q);
+ else
+ blk_queue_flag_clear(QUEUE_FLAG_POLL, q);
+}
+
int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
struct request_queue *q)
{
@@ -4050,6 +4049,8 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
INIT_LIST_HEAD(&q->unused_hctx_list);
spin_lock_init(&q->unused_hctx_lock);
+ xa_init(&q->hctx_table);
+
blk_mq_realloc_hw_ctxs(set, q);
if (!q->nr_hw_queues)
goto err_hctxs;
@@ -4060,9 +4061,7 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
q->tag_set = set;
q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
- if (set->nr_maps > HCTX_TYPE_POLL &&
- set->map[HCTX_TYPE_POLL].nr_queues)
- blk_queue_flag_set(QUEUE_FLAG_POLL, q);
+ blk_mq_update_poll_flag(q);
INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work);
INIT_LIST_HEAD(&q->requeue_list);
@@ -4081,7 +4080,7 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
return 0;
err_hctxs:
- kfree(q->queue_hw_ctx);
+ xa_destroy(&q->hctx_table);
q->nr_hw_queues = 0;
blk_mq_sysfs_deinit(q);
err_poll:
@@ -4369,7 +4368,8 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
{
struct blk_mq_tag_set *set = q->tag_set;
struct blk_mq_hw_ctx *hctx;
- int i, ret;
+ int ret;
+ unsigned long i;
if (!set)
return -EINVAL;
@@ -4528,6 +4528,7 @@ fallback:
blk_mq_update_queue_map(set);
list_for_each_entry(q, &set->tag_list, tag_set_list) {
blk_mq_realloc_hw_ctxs(set, q);
+ blk_mq_update_poll_flag(q);
if (q->nr_hw_queues != set->nr_hw_queues) {
int i = prev_nr_hw_queues;
@@ -4744,7 +4745,7 @@ void blk_mq_cancel_work_sync(struct request_queue *q)
{
if (queue_is_mq(q)) {
struct blk_mq_hw_ctx *hctx;
- int i;
+ unsigned long i;
cancel_delayed_work_sync(&q->requeue_work);
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 948791ea2a3e..2615bd58bad3 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -83,7 +83,7 @@ static inline struct blk_mq_hw_ctx *blk_mq_map_queue_type(struct request_queue *
enum hctx_type type,
unsigned int cpu)
{
- return q->queue_hw_ctx[q->tag_set->map[type].mq_map[cpu]];
+ return xa_load(&q->hctx_table, q->tag_set->map[type].mq_map[cpu]);
}
static inline enum hctx_type blk_mq_get_hctx_type(unsigned int flags)
diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h
index 3cfbc8668cba..68267007da1c 100644
--- a/block/blk-rq-qos.h
+++ b/block/blk-rq-qos.h
@@ -177,20 +177,20 @@ static inline void rq_qos_requeue(struct request_queue *q, struct request *rq)
__rq_qos_requeue(q->rq_qos, rq);
}
-static inline void rq_qos_done_bio(struct request_queue *q, struct bio *bio)
+static inline void rq_qos_done_bio(struct bio *bio)
{
- if (q->rq_qos)
- __rq_qos_done_bio(q->rq_qos, bio);
+ if (bio->bi_bdev && (bio_flagged(bio, BIO_QOS_THROTTLED) ||
+ bio_flagged(bio, BIO_QOS_MERGED))) {
+ struct request_queue *q = bdev_get_queue(bio->bi_bdev);
+ if (q->rq_qos)
+ __rq_qos_done_bio(q->rq_qos, bio);
+ }
}
static inline void rq_qos_throttle(struct request_queue *q, struct bio *bio)
{
- /*
- * BIO_TRACKED lets controllers know that a bio went through the
- * normal rq_qos path.
- */
if (q->rq_qos) {
- bio_set_flag(bio, BIO_TRACKED);
+ bio_set_flag(bio, BIO_QOS_THROTTLED);
__rq_qos_throttle(q->rq_qos, bio);
}
}
@@ -205,8 +205,10 @@ static inline void rq_qos_track(struct request_queue *q, struct request *rq,
static inline void rq_qos_merge(struct request_queue *q, struct request *rq,
struct bio *bio)
{
- if (q->rq_qos)
+ if (q->rq_qos) {
+ bio_set_flag(bio, BIO_QOS_MERGED);
__rq_qos_merge(q->rq_qos, rq, bio);
+ }
}
static inline void rq_qos_queue_depth_changed(struct request_queue *q)
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 9f32882ceb2f..85c4ba006671 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -10,7 +10,6 @@
#include <linux/backing-dev.h>
#include <linux/blktrace_api.h>
#include <linux/blk-mq.h>
-#include <linux/blk-cgroup.h>
#include <linux/debugfs.h>
#include "blk.h"
@@ -18,6 +17,7 @@
#include "blk-mq-debugfs.h"
#include "blk-mq-sched.h"
#include "blk-wbt.h"
+#include "blk-cgroup.h"
#include "blk-throttle.h"
struct queue_sysfs_entry {
@@ -739,27 +739,6 @@ static void blk_free_queue_rcu(struct rcu_head *rcu_head)
kmem_cache_free(blk_get_queue_kmem_cache(blk_queue_has_srcu(q)), q);
}
-/* Unconfigure the I/O scheduler and dissociate from the cgroup controller. */
-static void blk_exit_queue(struct request_queue *q)
-{
- /*
- * Since the I/O scheduler exit code may access cgroup information,
- * perform I/O scheduler exit before disassociating from the block
- * cgroup controller.
- */
- if (q->elevator) {
- ioc_clear_queue(q);
- elevator_exit(q);
- }
-
- /*
- * Remove all references to @q from the block cgroup controller before
- * restoring @q->queue_lock to avoid that restoring this pointer causes
- * e.g. blkcg_print_blkgs() to crash.
- */
- blkcg_exit_queue(q);
-}
-
/**
* blk_release_queue - releases all allocated resources of the request_queue
* @kobj: pointer to a kobject, whose container is a request_queue
@@ -787,12 +766,12 @@ static void blk_release_queue(struct kobject *kobj)
might_sleep();
+ percpu_ref_exit(&q->q_usage_counter);
+
if (q->poll_stat)
blk_stat_remove_callback(q, q->poll_cb);
blk_stat_free_callback(q->poll_cb);
- blk_exit_queue(q);
-
blk_free_queue_stats(q->stats);
kfree(q->poll_stat);
@@ -880,6 +859,10 @@ int blk_register_queue(struct gendisk *disk)
goto put_dev;
}
+ ret = blk_crypto_sysfs_register(q);
+ if (ret)
+ goto put_dev;
+
blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q);
wbt_enable_default(q);
blk_throtl_register_queue(q);
@@ -910,6 +893,7 @@ unlock:
return ret;
put_dev:
+ elv_unregister_queue(q);
disk_unregister_independent_access_ranges(disk);
mutex_unlock(&q->sysfs_lock);
mutex_unlock(&q->sysfs_dir_lock);
@@ -954,16 +938,18 @@ void blk_unregister_queue(struct gendisk *disk)
*/
if (queue_is_mq(q))
blk_mq_unregister_dev(disk_to_dev(disk), q);
-
- kobject_uevent(&q->kobj, KOBJ_REMOVE);
- kobject_del(&q->kobj);
+ blk_crypto_sysfs_unregister(q);
blk_trace_remove_sysfs(disk_to_dev(disk));
mutex_lock(&q->sysfs_lock);
- if (q->elevator)
- elv_unregister_queue(q);
+ elv_unregister_queue(q);
disk_unregister_independent_access_ranges(disk);
mutex_unlock(&q->sysfs_lock);
+
+ /* Now that we've deleted all child objects, we can delete the queue. */
+ kobject_uevent(&q->kobj, KOBJ_REMOVE);
+ kobject_del(&q->kobj);
+
mutex_unlock(&q->sysfs_dir_lock);
kobject_put(&disk_to_dev(disk)->kobj);
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 7c462c006b26..469c483719be 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -10,7 +10,6 @@
#include <linux/blkdev.h>
#include <linux/bio.h>
#include <linux/blktrace_api.h>
-#include <linux/blk-cgroup.h>
#include "blk.h"
#include "blk-cgroup-rwstat.h"
#include "blk-stat.h"
@@ -42,11 +41,6 @@
/* A workqueue to queue throttle related work */
static struct workqueue_struct *kthrotld_workqueue;
-enum tg_state_flags {
- THROTL_TG_PENDING = 1 << 0, /* on parent's pending tree */
- THROTL_TG_WAS_EMPTY = 1 << 1, /* bio_lists[] became non-empty */
-};
-
#define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node)
/* We measure latency for request size from <= 4k to >= 1M */
@@ -426,12 +420,24 @@ static void tg_update_has_rules(struct throtl_grp *tg)
struct throtl_grp *parent_tg = sq_to_tg(tg->service_queue.parent_sq);
struct throtl_data *td = tg->td;
int rw;
+ int has_iops_limit = 0;
+
+ for (rw = READ; rw <= WRITE; rw++) {
+ unsigned int iops_limit = tg_iops_limit(tg, rw);
- for (rw = READ; rw <= WRITE; rw++)
tg->has_rules[rw] = (parent_tg && parent_tg->has_rules[rw]) ||
(td->limit_valid[td->limit_index] &&
(tg_bps_limit(tg, rw) != U64_MAX ||
- tg_iops_limit(tg, rw) != UINT_MAX));
+ iops_limit != UINT_MAX));
+
+ if (iops_limit != UINT_MAX)
+ has_iops_limit = 1;
+ }
+
+ if (has_iops_limit)
+ tg->flags |= THROTL_TG_HAS_IOPS_LIMIT;
+ else
+ tg->flags &= ~THROTL_TG_HAS_IOPS_LIMIT;
}
static void throtl_pd_online(struct blkg_policy_data *pd)
@@ -634,8 +640,6 @@ static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg,
tg->bytes_disp[rw] = 0;
tg->io_disp[rw] = 0;
- atomic_set(&tg->io_split_cnt[rw], 0);
-
/*
* Previous slice has expired. We must have trimmed it after last
* bio dispatch. That means since start of last slice, we never used
@@ -659,8 +663,6 @@ static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw)
tg->slice_start[rw] = jiffies;
tg->slice_end[rw] = jiffies + tg->td->throtl_slice;
- atomic_set(&tg->io_split_cnt[rw], 0);
-
throtl_log(&tg->service_queue,
"[%c] new slice start=%lu end=%lu jiffies=%lu",
rw == READ ? 'R' : 'W', tg->slice_start[rw],
@@ -808,7 +810,8 @@ static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio,
unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
unsigned int bio_size = throtl_bio_data_size(bio);
- if (bps_limit == U64_MAX) {
+ /* no need to throttle if this bio's bytes have been accounted */
+ if (bps_limit == U64_MAX || bio_flagged(bio, BIO_THROTTLED)) {
if (wait)
*wait = 0;
return true;
@@ -871,7 +874,8 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
bio != throtl_peek_queued(&tg->service_queue.queued[rw]));
/* If tg->bps = -1, then BW is unlimited */
- if (bps_limit == U64_MAX && iops_limit == UINT_MAX) {
+ if ((bps_limit == U64_MAX && iops_limit == UINT_MAX) ||
+ tg->flags & THROTL_TG_CANCELING) {
if (wait)
*wait = 0;
return true;
@@ -893,9 +897,6 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
jiffies + tg->td->throtl_slice);
}
- if (iops_limit != UINT_MAX)
- tg->io_disp[rw] += atomic_xchg(&tg->io_split_cnt[rw], 0);
-
if (tg_with_in_bps_limit(tg, bio, bps_limit, &bps_wait) &&
tg_with_in_iops_limit(tg, bio, iops_limit, &iops_wait)) {
if (wait)
@@ -920,9 +921,12 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
unsigned int bio_size = throtl_bio_data_size(bio);
/* Charge the bio to the group */
- tg->bytes_disp[rw] += bio_size;
+ if (!bio_flagged(bio, BIO_THROTTLED)) {
+ tg->bytes_disp[rw] += bio_size;
+ tg->last_bytes_disp[rw] += bio_size;
+ }
+
tg->io_disp[rw]++;
- tg->last_bytes_disp[rw] += bio_size;
tg->last_io_disp[rw]++;
/*
@@ -1134,12 +1138,22 @@ static void throtl_pending_timer_fn(struct timer_list *t)
struct throtl_service_queue *sq = from_timer(sq, t, pending_timer);
struct throtl_grp *tg = sq_to_tg(sq);
struct throtl_data *td = sq_to_td(sq);
- struct request_queue *q = td->queue;
struct throtl_service_queue *parent_sq;
+ struct request_queue *q;
bool dispatched;
int ret;
+ /* throtl_data may be gone, so figure out request queue by blkg */
+ if (tg)
+ q = tg->pd.blkg->q;
+ else
+ q = td->queue;
+
spin_lock_irq(&q->queue_lock);
+
+ if (!q->root_blkg)
+ goto out_unlock;
+
if (throtl_can_upgrade(td, NULL))
throtl_upgrade_state(td);
@@ -1219,7 +1233,7 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work)
if (!bio_list_empty(&bio_list_on_stack)) {
blk_start_plug(&plug);
while ((bio = bio_list_pop(&bio_list_on_stack)))
- submit_bio_noacct(bio);
+ submit_bio_noacct_nocheck(bio);
blk_finish_plug(&plug);
}
}
@@ -1763,6 +1777,39 @@ static bool throtl_hierarchy_can_upgrade(struct throtl_grp *tg)
return false;
}
+void blk_throtl_cancel_bios(struct request_queue *q)
+{
+ struct cgroup_subsys_state *pos_css;
+ struct blkcg_gq *blkg;
+
+ spin_lock_irq(&q->queue_lock);
+ /*
+ * queue_lock is held, rcu lock is not needed here technically.
+ * However, rcu lock is still held to emphasize that following
+ * path need RCU protection and to prevent warning from lockdep.
+ */
+ rcu_read_lock();
+ blkg_for_each_descendant_post(blkg, pos_css, q->root_blkg) {
+ struct throtl_grp *tg = blkg_to_tg(blkg);
+ struct throtl_service_queue *sq = &tg->service_queue;
+
+ /*
+ * Set the flag to make sure throtl_pending_timer_fn() won't
+ * stop until all throttled bios are dispatched.
+ */
+ blkg_to_tg(blkg)->flags |= THROTL_TG_CANCELING;
+ /*
+ * Update disptime after setting the above flag to make sure
+ * throtl_select_dispatch() won't exit without dispatching.
+ */
+ tg_update_disptime(tg);
+
+ throtl_schedule_pending_timer(sq, jiffies + 1);
+ }
+ rcu_read_unlock();
+ spin_unlock_irq(&q->queue_lock);
+}
+
static bool throtl_can_upgrade(struct throtl_data *td,
struct throtl_grp *this_tg)
{
@@ -1917,14 +1964,12 @@ static void throtl_downgrade_check(struct throtl_grp *tg)
}
if (tg->iops[READ][LIMIT_LOW]) {
- tg->last_io_disp[READ] += atomic_xchg(&tg->last_io_split_cnt[READ], 0);
iops = tg->last_io_disp[READ] * HZ / elapsed_time;
if (iops >= tg->iops[READ][LIMIT_LOW])
tg->last_low_overflow_time[READ] = now;
}
if (tg->iops[WRITE][LIMIT_LOW]) {
- tg->last_io_disp[WRITE] += atomic_xchg(&tg->last_io_split_cnt[WRITE], 0);
iops = tg->last_io_disp[WRITE] * HZ / elapsed_time;
if (iops >= tg->iops[WRITE][LIMIT_LOW])
tg->last_low_overflow_time[WRITE] = now;
@@ -2043,25 +2088,6 @@ static inline void throtl_update_latency_buckets(struct throtl_data *td)
}
#endif
-void blk_throtl_charge_bio_split(struct bio *bio)
-{
- struct blkcg_gq *blkg = bio->bi_blkg;
- struct throtl_grp *parent = blkg_to_tg(blkg);
- struct throtl_service_queue *parent_sq;
- bool rw = bio_data_dir(bio);
-
- do {
- if (!parent->has_rules[rw])
- break;
-
- atomic_inc(&parent->io_split_cnt[rw]);
- atomic_inc(&parent->last_io_split_cnt[rw]);
-
- parent_sq = parent->service_queue.parent_sq;
- parent = sq_to_tg(parent_sq);
- } while (parent);
-}
-
bool __blk_throtl_bio(struct bio *bio)
{
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
diff --git a/block/blk-throttle.h b/block/blk-throttle.h
index 175f03abd9e4..c1b602996127 100644
--- a/block/blk-throttle.h
+++ b/block/blk-throttle.h
@@ -52,6 +52,13 @@ struct throtl_service_queue {
struct timer_list pending_timer; /* fires on first_pending_disptime */
};
+enum tg_state_flags {
+ THROTL_TG_PENDING = 1 << 0, /* on parent's pending tree */
+ THROTL_TG_WAS_EMPTY = 1 << 1, /* bio_lists[] became non-empty */
+ THROTL_TG_HAS_IOPS_LIMIT = 1 << 2, /* tg has iops limit */
+ THROTL_TG_CANCELING = 1 << 3, /* starts to cancel bio */
+};
+
enum {
LIMIT_LOW,
LIMIT_MAX,
@@ -132,9 +139,6 @@ struct throtl_grp {
unsigned int bad_bio_cnt; /* bios exceeding latency threshold */
unsigned long bio_cnt_reset_time;
- atomic_t io_split_cnt[2];
- atomic_t last_io_split_cnt[2];
-
struct blkg_rwstat stat_bytes;
struct blkg_rwstat stat_ios;
};
@@ -158,20 +162,23 @@ static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg)
static inline int blk_throtl_init(struct request_queue *q) { return 0; }
static inline void blk_throtl_exit(struct request_queue *q) { }
static inline void blk_throtl_register_queue(struct request_queue *q) { }
-static inline void blk_throtl_charge_bio_split(struct bio *bio) { }
static inline bool blk_throtl_bio(struct bio *bio) { return false; }
+static inline void blk_throtl_cancel_bios(struct request_queue *q) { }
#else /* CONFIG_BLK_DEV_THROTTLING */
int blk_throtl_init(struct request_queue *q);
void blk_throtl_exit(struct request_queue *q);
void blk_throtl_register_queue(struct request_queue *q);
-void blk_throtl_charge_bio_split(struct bio *bio);
bool __blk_throtl_bio(struct bio *bio);
+void blk_throtl_cancel_bios(struct request_queue *q);
static inline bool blk_throtl_bio(struct bio *bio)
{
struct throtl_grp *tg = blkg_to_tg(bio->bi_blkg);
- if (bio_flagged(bio, BIO_THROTTLED))
+ /* no need to throttle bps any more if the bio has been throttled */
+ if (bio_flagged(bio, BIO_THROTTLED) &&
+ !(tg->flags & THROTL_TG_HAS_IOPS_LIMIT))
return false;
+
if (!tg->has_rules[bio_data_dir(bio)])
return false;
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 774ecc598bee..602bef54c813 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -215,9 +215,8 @@ static int blkdev_zone_reset_all_emulated(struct block_device *bdev,
continue;
}
- bio = blk_next_bio(bio, 0, gfp_mask);
- bio_set_dev(bio, bdev);
- bio->bi_opf = REQ_OP_ZONE_RESET | REQ_SYNC;
+ bio = blk_next_bio(bio, bdev, 0, REQ_OP_ZONE_RESET | REQ_SYNC,
+ gfp_mask);
bio->bi_iter.bi_sector = sector;
sector += zone_sectors;
@@ -239,10 +238,7 @@ static int blkdev_zone_reset_all(struct block_device *bdev, gfp_t gfp_mask)
{
struct bio bio;
- bio_init(&bio, NULL, 0);
- bio_set_dev(&bio, bdev);
- bio.bi_opf = REQ_OP_ZONE_RESET_ALL | REQ_SYNC;
-
+ bio_init(&bio, bdev, NULL, 0, REQ_OP_ZONE_RESET_ALL | REQ_SYNC);
return submit_bio_wait(&bio);
}
@@ -306,9 +302,7 @@ int blkdev_zone_mgmt(struct block_device *bdev, enum req_opf op,
}
while (sector < end_sector) {
- bio = blk_next_bio(bio, 0, gfp_mask);
- bio_set_dev(bio, bdev);
- bio->bi_opf = op | REQ_SYNC;
+ bio = blk_next_bio(bio, bdev, 0, op | REQ_SYNC, gfp_mask);
bio->bi_iter.bi_sector = sector;
sector += zone_sectors;
diff --git a/block/blk.h b/block/blk.h
index 8bd43b3ad33d..6f21859c7f0f 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -46,7 +46,7 @@ void blk_freeze_queue(struct request_queue *q);
void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic);
void blk_queue_start_drain(struct request_queue *q);
int __bio_queue_enter(struct request_queue *q, struct bio *bio);
-bool submit_bio_checks(struct bio *bio);
+void submit_bio_noacct_nocheck(struct bio *bio);
static inline bool blk_try_enter_queue(struct request_queue *q, bool pm)
{
@@ -325,7 +325,7 @@ int blk_dev_init(void);
*/
static inline bool blk_do_io_stat(struct request *rq)
{
- return (rq->rq_flags & RQF_IO_STAT) && rq->q->disk;
+ return (rq->rq_flags & RQF_IO_STAT) && !blk_rq_is_passthrough(rq);
}
void update_io_ticks(struct block_device *part, unsigned long now, bool end);
@@ -406,8 +406,6 @@ extern int blk_iolatency_init(struct request_queue *q);
static inline int blk_iolatency_init(struct request_queue *q) { return 0; }
#endif
-struct bio *blk_next_bio(struct bio *bio, unsigned int nr_pages, gfp_t gfp);
-
#ifdef CONFIG_BLK_DEV_ZONED
void blk_queue_free_zone_bitmaps(struct request_queue *q);
void blk_queue_clear_zone_settings(struct request_queue *q);
@@ -426,6 +424,7 @@ int bdev_add_partition(struct gendisk *disk, int partno, sector_t start,
int bdev_del_partition(struct gendisk *disk, int partno);
int bdev_resize_partition(struct gendisk *disk, int partno, sector_t start,
sector_t length);
+void blk_drop_partitions(struct gendisk *disk);
int bio_add_hw_page(struct request_queue *q, struct bio *bio,
struct page *page, unsigned int len, unsigned int offset,
@@ -445,6 +444,9 @@ int disk_alloc_events(struct gendisk *disk);
void disk_add_events(struct gendisk *disk);
void disk_del_events(struct gendisk *disk);
void disk_release_events(struct gendisk *disk);
+void disk_block_events(struct gendisk *disk);
+void disk_unblock_events(struct gendisk *disk);
+void disk_flush_events(struct gendisk *disk, unsigned int mask);
extern struct device_attribute dev_attr_events;
extern struct device_attribute dev_attr_events_async;
extern struct device_attribute dev_attr_events_poll_msecs;
diff --git a/block/bounce.c b/block/bounce.c
index 7af1a72835b9..3d50d19cde72 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -14,7 +14,6 @@
#include <linux/pagemap.h>
#include <linux/mempool.h>
#include <linux/blkdev.h>
-#include <linux/blk-cgroup.h>
#include <linux/backing-dev.h>
#include <linux/init.h>
#include <linux/hash.h>
@@ -24,6 +23,7 @@
#include <trace/events/block.h>
#include "blk.h"
+#include "blk-cgroup.h"
#define POOL_SIZE 64
#define ISA_POOL_SIZE 16
@@ -162,15 +162,12 @@ static struct bio *bounce_clone_bio(struct bio *bio_src)
* that does not own the bio - reason being drivers don't use it for
* iterating over the biovec anymore, so expecting it to be kept up
* to date (i.e. for clones that share the parent biovec) is just
- * asking for trouble and would force extra work on
- * __bio_clone_fast() anyways.
+ * asking for trouble and would force extra work.
*/
- bio = bio_alloc_bioset(GFP_NOIO, bio_segments(bio_src),
- &bounce_bio_set);
- bio->bi_bdev = bio_src->bi_bdev;
+ bio = bio_alloc_bioset(bio_src->bi_bdev, bio_segments(bio_src),
+ bio_src->bi_opf, GFP_NOIO, &bounce_bio_set);
if (bio_flagged(bio_src, BIO_REMAPPED))
bio_set_flag(bio, BIO_REMAPPED);
- bio->bi_opf = bio_src->bi_opf;
bio->bi_ioprio = bio_src->bi_ioprio;
bio->bi_write_hint = bio_src->bi_write_hint;
bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector;
diff --git a/block/disk-events.c b/block/disk-events.c
index 8d5496e7592a..aee25a7e1ab7 100644
--- a/block/disk-events.c
+++ b/block/disk-events.c
@@ -4,7 +4,7 @@
*/
#include <linux/export.h>
#include <linux/moduleparam.h>
-#include <linux/genhd.h>
+#include <linux/blkdev.h>
#include "blk.h"
struct disk_events {
diff --git a/block/elevator.c b/block/elevator.c
index 482df2a350fc..c319765892bb 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -35,7 +35,6 @@
#include <linux/hash.h>
#include <linux/uaccess.h>
#include <linux/pm_runtime.h>
-#include <linux/blk-cgroup.h>
#include <trace/events/block.h>
@@ -44,6 +43,7 @@
#include "blk-mq-sched.h"
#include "blk-pm.h"
#include "blk-wbt.h"
+#include "blk-cgroup.h"
static DEFINE_SPINLOCK(elv_list_lock);
static LIST_HEAD(elv_list);
@@ -192,6 +192,9 @@ void elevator_exit(struct request_queue *q)
{
struct elevator_queue *e = q->elevator;
+ ioc_clear_queue(q);
+ blk_mq_sched_free_rqs(q);
+
mutex_lock(&e->sysfs_lock);
blk_mq_exit_sched(q, e);
mutex_unlock(&e->sysfs_lock);
@@ -516,9 +519,11 @@ int elv_register_queue(struct request_queue *q, bool uevent)
void elv_unregister_queue(struct request_queue *q)
{
+ struct elevator_queue *e = q->elevator;
+
lockdep_assert_held(&q->sysfs_lock);
- if (q) {
+ if (e && e->registered) {
struct elevator_queue *e = q->elevator;
kobject_uevent(&e->kobj, KOBJ_REMOVE);
@@ -591,11 +596,7 @@ int elevator_switch_mq(struct request_queue *q,
lockdep_assert_held(&q->sysfs_lock);
if (q->elevator) {
- if (q->elevator->registered)
- elv_unregister_queue(q);
-
- ioc_clear_queue(q);
- blk_mq_sched_free_rqs(q);
+ elv_unregister_queue(q);
elevator_exit(q);
}
@@ -606,7 +607,6 @@ int elevator_switch_mq(struct request_queue *q,
if (new_e) {
ret = elv_register_queue(q, true);
if (ret) {
- blk_mq_sched_free_rqs(q);
elevator_exit(q);
goto out;
}
diff --git a/block/fops.c b/block/fops.c
index a18e7fbd97b8..7ccc4ff109ce 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -75,8 +75,13 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
return -ENOMEM;
}
- bio_init(&bio, vecs, nr_pages);
- bio_set_dev(&bio, bdev);
+ if (iov_iter_rw(iter) == READ) {
+ bio_init(&bio, bdev, vecs, nr_pages, REQ_OP_READ);
+ if (iter_is_iovec(iter))
+ should_dirty = true;
+ } else {
+ bio_init(&bio, bdev, vecs, nr_pages, dio_bio_write_op(iocb));
+ }
bio.bi_iter.bi_sector = pos >> SECTOR_SHIFT;
bio.bi_write_hint = iocb->ki_hint;
bio.bi_private = current;
@@ -88,14 +93,9 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
goto out;
ret = bio.bi_iter.bi_size;
- if (iov_iter_rw(iter) == READ) {
- bio.bi_opf = REQ_OP_READ;
- if (iter_is_iovec(iter))
- should_dirty = true;
- } else {
- bio.bi_opf = dio_bio_write_op(iocb);
+ if (iov_iter_rw(iter) == WRITE)
task_io_account_write(ret);
- }
+
if (iocb->ki_flags & IOCB_NOWAIT)
bio.bi_opf |= REQ_NOWAIT;
if (iocb->ki_flags & IOCB_HIPRI)
@@ -190,6 +190,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
struct blkdev_dio *dio;
struct bio *bio;
bool is_read = (iov_iter_rw(iter) == READ), is_sync;
+ unsigned int opf = is_read ? REQ_OP_READ : dio_bio_write_op(iocb);
loff_t pos = iocb->ki_pos;
int ret = 0;
@@ -197,7 +198,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
(bdev_logical_block_size(bdev) - 1))
return -EINVAL;
- bio = bio_alloc_kiocb(iocb, nr_pages, &blkdev_dio_pool);
+ bio = bio_alloc_kiocb(iocb, bdev, nr_pages, opf, &blkdev_dio_pool);
dio = container_of(bio, struct blkdev_dio, bio);
atomic_set(&dio->ref, 1);
@@ -223,7 +224,6 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
blk_start_plug(&plug);
for (;;) {
- bio_set_dev(bio, bdev);
bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT;
bio->bi_write_hint = iocb->ki_hint;
bio->bi_private = dio;
@@ -238,11 +238,9 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
}
if (is_read) {
- bio->bi_opf = REQ_OP_READ;
if (dio->flags & DIO_SHOULD_DIRTY)
bio_set_pages_dirty(bio);
} else {
- bio->bi_opf = dio_bio_write_op(iocb);
task_io_account_write(bio->bi_iter.bi_size);
}
if (iocb->ki_flags & IOCB_NOWAIT)
@@ -258,7 +256,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
}
atomic_inc(&dio->ref);
submit_bio(bio);
- bio = bio_alloc(GFP_KERNEL, nr_pages);
+ bio = bio_alloc(bdev, nr_pages, opf, GFP_KERNEL);
}
blk_finish_plug(&plug);
@@ -313,6 +311,8 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
unsigned int nr_pages)
{
struct block_device *bdev = iocb->ki_filp->private_data;
+ bool is_read = iov_iter_rw(iter) == READ;
+ unsigned int opf = is_read ? REQ_OP_READ : dio_bio_write_op(iocb);
struct blkdev_dio *dio;
struct bio *bio;
loff_t pos = iocb->ki_pos;
@@ -322,11 +322,10 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
(bdev_logical_block_size(bdev) - 1))
return -EINVAL;
- bio = bio_alloc_kiocb(iocb, nr_pages, &blkdev_dio_pool);
+ bio = bio_alloc_kiocb(iocb, bdev, nr_pages, opf, &blkdev_dio_pool);
dio = container_of(bio, struct blkdev_dio, bio);
dio->flags = 0;
dio->iocb = iocb;
- bio_set_dev(bio, bdev);
bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT;
bio->bi_write_hint = iocb->ki_hint;
bio->bi_end_io = blkdev_bio_end_io_async;
@@ -349,14 +348,12 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
}
dio->size = bio->bi_iter.bi_size;
- if (iov_iter_rw(iter) == READ) {
- bio->bi_opf = REQ_OP_READ;
+ if (is_read) {
if (iter_is_iovec(iter)) {
dio->flags |= DIO_SHOULD_DIRTY;
bio_set_pages_dirty(bio);
}
} else {
- bio->bi_opf = dio_bio_write_op(iocb);
task_io_account_write(bio->bi_iter.bi_size);
}
diff --git a/block/genhd.c b/block/genhd.c
index 9eca1f7d35c9..37eb41ee4086 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -8,7 +8,6 @@
#include <linux/module.h>
#include <linux/ctype.h>
#include <linux/fs.h>
-#include <linux/genhd.h>
#include <linux/kdev_t.h>
#include <linux/kernel.h>
#include <linux/blkdev.h>
@@ -26,10 +25,12 @@
#include <linux/pm_runtime.h>
#include <linux/badblocks.h>
#include <linux/part_stat.h>
+#include "blk-throttle.h"
#include "blk.h"
#include "blk-mq-sched.h"
#include "blk-rq-qos.h"
+#include "blk-cgroup.h"
static struct kobject *block_depr;
@@ -185,7 +186,9 @@ static struct blk_major_name {
struct blk_major_name *next;
int major;
char name[16];
+#ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD
void (*probe)(dev_t devt);
+#endif
} *major_names[BLKDEV_MAJOR_HASH_SIZE];
static DEFINE_MUTEX(major_names_lock);
static DEFINE_SPINLOCK(major_names_spinlock);
@@ -275,7 +278,9 @@ int __register_blkdev(unsigned int major, const char *name,
}
p->major = major;
+#ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD
p->probe = probe;
+#endif
strlcpy(p->name, name, sizeof(p->name));
p->next = NULL;
index = major_to_index(major);
@@ -523,6 +528,7 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk,
disk_update_readahead(disk);
disk_add_events(disk);
+ set_bit(GD_ADDED, &disk->state);
return 0;
out_unregister_bdi:
@@ -636,7 +642,8 @@ void del_gendisk(struct gendisk *disk)
blk_mq_freeze_queue_wait(q);
- rq_qos_exit(q);
+ blk_throtl_cancel_bios(disk->queue);
+
blk_sync_queue(q);
blk_flush_integrity();
/*
@@ -693,6 +700,7 @@ static ssize_t disk_badblocks_store(struct device *dev,
return badblocks_store(disk->bb, page, len, 0);
}
+#ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD
void blk_request_module(dev_t devt)
{
unsigned int major = MAJOR(devt);
@@ -712,6 +720,7 @@ void blk_request_module(dev_t devt)
/* Make old-style 2.4 aliases work */
request_module("block-major-%d", MAJOR(devt));
}
+#endif /* CONFIG_BLOCK_LEGACY_AUTOLOAD */
/*
* print a full list of all partitions - intended for places where the root
@@ -927,12 +936,17 @@ ssize_t part_stat_show(struct device *dev,
struct disk_stats stat;
unsigned int inflight;
- part_stat_read_all(bdev, &stat);
if (queue_is_mq(q))
inflight = blk_mq_in_flight(q, bdev);
else
inflight = part_in_flight(bdev);
+ if (inflight) {
+ part_stat_lock();
+ update_io_ticks(bdev, jiffies, true);
+ part_stat_unlock();
+ }
+ part_stat_read_all(bdev, &stat);
return sprintf(buf,
"%8lu %8lu %8llu %8u "
"%8lu %8lu %8llu %8u "
@@ -1100,6 +1114,31 @@ static const struct attribute_group *disk_attr_groups[] = {
NULL
};
+static void disk_release_mq(struct request_queue *q)
+{
+ blk_mq_cancel_work_sync(q);
+
+ /*
+ * There can't be any non non-passthrough bios in flight here, but
+ * requests stay around longer, including passthrough ones so we
+ * still need to freeze the queue here.
+ */
+ blk_mq_freeze_queue(q);
+
+ /*
+ * Since the I/O scheduler exit code may access cgroup information,
+ * perform I/O scheduler exit before disassociating from the block
+ * cgroup controller.
+ */
+ if (q->elevator) {
+ mutex_lock(&q->sysfs_lock);
+ elevator_exit(q);
+ mutex_unlock(&q->sysfs_lock);
+ }
+ rq_qos_exit(q);
+ __blk_mq_unfreeze_queue(q, true);
+}
+
/**
* disk_release - releases all allocated resources of the gendisk
* @dev: the device representing this disk
@@ -1121,13 +1160,21 @@ static void disk_release(struct device *dev)
might_sleep();
WARN_ON_ONCE(disk_live(disk));
- blk_mq_cancel_work_sync(disk->queue);
+ if (queue_is_mq(disk->queue))
+ disk_release_mq(disk->queue);
+
+ blkcg_exit_queue(disk->queue);
disk_release_events(disk);
kfree(disk->random);
xa_destroy(&disk->part_tbl);
+
disk->queue->disk = NULL;
blk_put_queue(disk->queue);
+
+ if (test_bit(GD_ADDED, &disk->state) && disk->fops->free_disk)
+ disk->fops->free_disk(disk);
+
iput(disk->part0->bd_inode); /* frees the disk */
}
@@ -1188,12 +1235,17 @@ static int diskstats_show(struct seq_file *seqf, void *v)
xa_for_each(&gp->part_tbl, idx, hd) {
if (bdev_is_partition(hd) && !bdev_nr_sectors(hd))
continue;
- part_stat_read_all(hd, &stat);
if (queue_is_mq(gp->queue))
inflight = blk_mq_in_flight(gp->queue, hd);
else
inflight = part_in_flight(hd);
+ if (inflight) {
+ part_stat_lock();
+ update_io_ticks(hd, jiffies, true);
+ part_stat_unlock();
+ }
+ part_stat_read_all(hd, &stat);
seq_printf(seqf, "%4d %7d %pg "
"%lu %lu %lu %u "
"%lu %lu %lu %u "
@@ -1322,6 +1374,9 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
if (xa_insert(&disk->part_tbl, 0, disk->part0, GFP_KERNEL))
goto out_destroy_part_tbl;
+ if (blkcg_init_queue(q))
+ goto out_erase_part0;
+
rand_initialize_disk(disk);
disk_to_dev(disk)->class = &block_class;
disk_to_dev(disk)->type = &disk_type;
@@ -1334,6 +1389,8 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
#endif
return disk;
+out_erase_part0:
+ xa_erase(&disk->part_tbl, 0);
out_destroy_part_tbl:
xa_destroy(&disk->part_tbl);
disk->part0->bd_disk = NULL;
diff --git a/block/holder.c b/block/holder.c
index 27cddce1b446..8d750281a1cd 100644
--- a/block/holder.c
+++ b/block/holder.c
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
-#include <linux/genhd.h>
+#include <linux/blkdev.h>
#include <linux/slab.h>
struct bd_holder_disk {
diff --git a/block/partitions/check.h b/block/partitions/check.h
index d5b28e309d64..4ffa2359b1a3 100644
--- a/block/partitions/check.h
+++ b/block/partitions/check.h
@@ -1,7 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/pagemap.h>
#include <linux/blkdev.h>
-#include <linux/genhd.h>
#include "../blk.h"
/*
diff --git a/block/partitions/core.c b/block/partitions/core.c
index c2a1635922b1..2ef8dfa1e5c8 100644
--- a/block/partitions/core.c
+++ b/block/partitions/core.c
@@ -8,7 +8,6 @@
#include <linux/major.h>
#include <linux/slab.h>
#include <linux/ctype.h>
-#include <linux/genhd.h>
#include <linux/vmalloc.h>
#include <linux/blktrace_api.h>
#include <linux/raid/detect.h>
diff --git a/block/partitions/efi.h b/block/partitions/efi.h
index 8cc2b88d0aa8..84b9f36b9e47 100644
--- a/block/partitions/efi.h
+++ b/block/partitions/efi.h
@@ -13,7 +13,6 @@
#include <linux/types.h>
#include <linux/fs.h>
-#include <linux/genhd.h>
#include <linux/kernel.h>
#include <linux/major.h>
#include <linux/string.h>
diff --git a/block/partitions/ldm.h b/block/partitions/ldm.h
index 8693704dcf5e..0a747a0c782d 100644
--- a/block/partitions/ldm.h
+++ b/block/partitions/ldm.h
@@ -14,7 +14,6 @@
#include <linux/types.h>
#include <linux/list.h>
-#include <linux/genhd.h>
#include <linux/fs.h>
#include <asm/unaligned.h>
#include <asm/byteorder.h>
diff --git a/block/sed-opal.c b/block/sed-opal.c
index daafadbb88ca..9700197000f2 100644
--- a/block/sed-opal.c
+++ b/block/sed-opal.c
@@ -13,7 +13,7 @@
#include <linux/device.h>
#include <linux/kernel.h>
#include <linux/list.h>
-#include <linux/genhd.h>
+#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <uapi/linux/sed-opal.h>
diff --git a/drivers/base/class.c b/drivers/base/class.c
index 7476f393df97..8feb85e186e3 100644
--- a/drivers/base/class.c
+++ b/drivers/base/class.c
@@ -16,7 +16,7 @@
#include <linux/kdev_t.h>
#include <linux/err.h>
#include <linux/slab.h>
-#include <linux/genhd.h>
+#include <linux/blkdev.h>
#include <linux/mutex.h>
#include "base.h"
diff --git a/drivers/base/core.c b/drivers/base/core.c
index 7bb957b11861..3d6430eb0c6a 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -21,7 +21,7 @@
#include <linux/notifier.h>
#include <linux/of.h>
#include <linux/of_device.h>
-#include <linux/genhd.h>
+#include <linux/blkdev.h>
#include <linux/mutex.h>
#include <linux/pm_runtime.h>
#include <linux/netdevice.h>
diff --git a/drivers/base/devtmpfs.c b/drivers/base/devtmpfs.c
index f41063ac1aee..db5a03a0618e 100644
--- a/drivers/base/devtmpfs.c
+++ b/drivers/base/devtmpfs.c
@@ -17,7 +17,7 @@
#include <linux/syscalls.h>
#include <linux/mount.h>
#include <linux/device.h>
-#include <linux/genhd.h>
+#include <linux/blkdev.h>
#include <linux/namei.h>
#include <linux/fs.h>
#include <linux/shmem_fs.h>
diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c
index 52484bcdedb9..8a91fcac6f82 100644
--- a/drivers/block/aoe/aoeblk.c
+++ b/drivers/block/aoe/aoeblk.c
@@ -12,7 +12,6 @@
#include <linux/ioctl.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
-#include <linux/genhd.h>
#include <linux/netdevice.h>
#include <linux/mutex.h>
#include <linux/export.h>
diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c
index 6af111f568e4..cc11f89a0928 100644
--- a/drivers/block/aoe/aoecmd.c
+++ b/drivers/block/aoe/aoecmd.c
@@ -10,7 +10,6 @@
#include <linux/blk-mq.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
-#include <linux/genhd.h>
#include <linux/moduleparam.h>
#include <linux/workqueue.h>
#include <linux/kthread.h>
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index 72cf7603d51f..f5bcded3640d 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -138,15 +138,14 @@ static int _drbd_md_sync_page_io(struct drbd_device *device,
op_flags |= REQ_FUA | REQ_PREFLUSH;
op_flags |= REQ_SYNC;
- bio = bio_alloc_bioset(GFP_NOIO, 1, &drbd_md_io_bio_set);
- bio_set_dev(bio, bdev->md_bdev);
+ bio = bio_alloc_bioset(bdev->md_bdev, 1, op | op_flags, GFP_NOIO,
+ &drbd_md_io_bio_set);
bio->bi_iter.bi_sector = sector;
err = -EIO;
if (bio_add_page(bio, device->md_io.page, size, 0) != size)
goto out;
bio->bi_private = device;
bio->bi_end_io = drbd_md_endio;
- bio_set_op_attrs(bio, op, op_flags);
if (op != REQ_OP_WRITE && device->state.disk == D_DISKLESS && device->ldev == NULL)
/* special case, drbd_md_read() during drbd_adm_attach(): no get_ldev */
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
index c1f816f896a8..df25eecf80af 100644
--- a/drivers/block/drbd/drbd_bitmap.c
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -976,12 +976,13 @@ static void drbd_bm_endio(struct bio *bio)
static void bm_page_io_async(struct drbd_bm_aio_ctx *ctx, int page_nr) __must_hold(local)
{
- struct bio *bio = bio_alloc_bioset(GFP_NOIO, 1, &drbd_md_io_bio_set);
struct drbd_device *device = ctx->device;
+ unsigned int op = (ctx->flags & BM_AIO_READ) ? REQ_OP_READ : REQ_OP_WRITE;
+ struct bio *bio = bio_alloc_bioset(device->ldev->md_bdev, 1, op,
+ GFP_NOIO, &drbd_md_io_bio_set);
struct drbd_bitmap *b = device->bitmap;
struct page *page;
unsigned int len;
- unsigned int op = (ctx->flags & BM_AIO_READ) ? REQ_OP_READ : REQ_OP_WRITE;
sector_t on_disk_sector =
device->ldev->md.md_offset + device->ldev->md.bm_offset;
@@ -1006,14 +1007,12 @@ static void bm_page_io_async(struct drbd_bm_aio_ctx *ctx, int page_nr) __must_ho
bm_store_page_idx(page, page_nr);
} else
page = b->bm_pages[page_nr];
- bio_set_dev(bio, device->ldev->md_bdev);
bio->bi_iter.bi_sector = on_disk_sector;
/* bio_add_page of a single page to an empty bio will always succeed,
* according to api. Do we want to assert that? */
bio_add_page(bio, page, len, 0);
bio->bi_private = ctx;
bio->bi_end_io = drbd_bm_endio;
- bio_set_op_attrs(bio, op, 0);
if (drbd_insert_fault(device, (op == REQ_OP_WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) {
bio_io_error(bio);
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index f27d5b0f9a0b..acb1ad3c0603 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -27,7 +27,6 @@
#include <linux/major.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
-#include <linux/genhd.h>
#include <linux/idr.h>
#include <linux/dynamic_debug.h>
#include <net/tcp.h>
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 6df2539e215b..04e3ec12d8b4 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -1279,16 +1279,16 @@ static void one_flush_endio(struct bio *bio)
static void submit_one_flush(struct drbd_device *device, struct issue_flush_context *ctx)
{
- struct bio *bio = bio_alloc(GFP_NOIO, 0);
+ struct bio *bio = bio_alloc(device->ldev->backing_bdev, 0,
+ REQ_OP_FLUSH | REQ_PREFLUSH, GFP_NOIO);
struct one_flush_context *octx = kmalloc(sizeof(*octx), GFP_NOIO);
- if (!bio || !octx) {
- drbd_warn(device, "Could not allocate a bio, CANNOT ISSUE FLUSH\n");
+
+ if (!octx) {
+ drbd_warn(device, "Could not allocate a octx, CANNOT ISSUE FLUSH\n");
/* FIXME: what else can I do now? disconnecting or detaching
* really does not help to improve the state of the world, either.
*/
- kfree(octx);
- if (bio)
- bio_put(bio);
+ bio_put(bio);
ctx->error = -ENOMEM;
put_ldev(device);
@@ -1298,10 +1298,8 @@ static void submit_one_flush(struct drbd_device *device, struct issue_flush_cont
octx->device = device;
octx->ctx = ctx;
- bio_set_dev(bio, device->ldev->backing_bdev);
bio->bi_private = octx;
bio->bi_end_io = one_flush_endio;
- bio->bi_opf = REQ_OP_FLUSH | REQ_PREFLUSH;
device->flush_jif = jiffies;
set_bit(FLUSH_PENDING, &device->flags);
@@ -1646,7 +1644,6 @@ int drbd_submit_peer_request(struct drbd_device *device,
unsigned data_size = peer_req->i.size;
unsigned n_bios = 0;
unsigned nr_pages = (data_size + PAGE_SIZE -1) >> PAGE_SHIFT;
- int err = -ENOMEM;
/* TRIM/DISCARD: for now, always use the helper function
* blkdev_issue_zeroout(..., discard=true).
@@ -1687,15 +1684,10 @@ int drbd_submit_peer_request(struct drbd_device *device,
* generated bio, but a bio allocated on behalf of the peer.
*/
next_bio:
- bio = bio_alloc(GFP_NOIO, nr_pages);
- if (!bio) {
- drbd_err(device, "submit_ee: Allocation of a bio failed (nr_pages=%u)\n", nr_pages);
- goto fail;
- }
+ bio = bio_alloc(device->ldev->backing_bdev, nr_pages, op | op_flags,
+ GFP_NOIO);
/* > peer_req->i.sector, unless this is the first bio */
bio->bi_iter.bi_sector = sector;
- bio_set_dev(bio, device->ldev->backing_bdev);
- bio_set_op_attrs(bio, op, op_flags);
bio->bi_private = peer_req;
bio->bi_end_io = drbd_peer_request_endio;
@@ -1726,14 +1718,6 @@ next_bio:
drbd_submit_bio_noacct(device, fault_type, bio);
} while (bios);
return 0;
-
-fail:
- while (bios) {
- bio = bios;
- bios = bios->bi_next;
- bio_put(bio);
- }
- return err;
}
static void drbd_remove_epoch_entry_interval(struct drbd_device *device,
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 3235532ae077..c00ae8619519 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -30,7 +30,8 @@ static struct drbd_request *drbd_req_new(struct drbd_device *device, struct bio
return NULL;
memset(req, 0, sizeof(*req));
- req->private_bio = bio_clone_fast(bio_src, GFP_NOIO, &drbd_io_bio_set);
+ req->private_bio = bio_alloc_clone(device->ldev->backing_bdev, bio_src,
+ GFP_NOIO, &drbd_io_bio_set);
req->private_bio->bi_private = req;
req->private_bio->bi_end_io = drbd_request_endio;
@@ -1151,8 +1152,6 @@ drbd_submit_req_private_bio(struct drbd_request *req)
else
type = DRBD_FAULT_DT_RD;
- bio_set_dev(bio, device->ldev->backing_bdev);
-
/* State may have changed since we grabbed our reference on the
* ->ldev member. Double check, and short-circuit to endio.
* In case the last activity log transaction failed to get on
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 64563bfdf0da..a5e04b38006b 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -1523,9 +1523,9 @@ int w_restart_disk_io(struct drbd_work *w, int cancel)
if (bio_data_dir(req->master_bio) == WRITE && req->rq_state & RQ_IN_ACT_LOG)
drbd_al_begin_io(device, &req->i);
- req->private_bio = bio_clone_fast(req->master_bio, GFP_NOIO,
+ req->private_bio = bio_alloc_clone(device->ldev->backing_bdev,
+ req->master_bio, GFP_NOIO,
&drbd_io_bio_set);
- bio_set_dev(req->private_bio, device->ldev->backing_bdev);
req->private_bio->bi_private = req;
req->private_bio->bi_end_io = drbd_request_endio;
submit_bio_noacct(req->private_bio);
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index e611411a934c..19c2d0327e15 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -4129,15 +4129,13 @@ static int __floppy_read_block_0(struct block_device *bdev, int drive)
cbdata.drive = drive;
- bio_init(&bio, &bio_vec, 1);
- bio_set_dev(&bio, bdev);
+ bio_init(&bio, bdev, &bio_vec, 1, REQ_OP_READ);
bio_add_page(&bio, page, block_size(bdev), 0);
bio.bi_iter.bi_sector = 0;
bio.bi_flags |= (1 << BIO_QUIET);
bio.bi_private = &cbdata;
bio.bi_end_io = floppy_rb0_cb;
- bio_set_op_attrs(&bio, REQ_OP_READ, 0);
init_completion(&cbdata.complete);
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index 2b588b62cbbb..4fbaf0b4958b 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -19,7 +19,6 @@
#include <linux/compat.h>
#include <linux/fs.h>
#include <linux/module.h>
-#include <linux/genhd.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/bio.h>
@@ -161,9 +160,7 @@ static bool mtip_check_surprise_removal(struct driver_data *dd)
static struct mtip_cmd *mtip_cmd_from_tag(struct driver_data *dd,
unsigned int tag)
{
- struct blk_mq_hw_ctx *hctx = dd->queue->queue_hw_ctx[0];
-
- return blk_mq_rq_to_pdu(blk_mq_tag_to_rq(hctx->tags, tag));
+ return blk_mq_rq_to_pdu(blk_mq_tag_to_rq(dd->tags.tags[0], tag));
}
/*
diff --git a/drivers/block/mtip32xx/mtip32xx.h b/drivers/block/mtip32xx/mtip32xx.h
index 88f4206310e4..6816beb45352 100644
--- a/drivers/block/mtip32xx/mtip32xx.h
+++ b/drivers/block/mtip32xx/mtip32xx.h
@@ -15,7 +15,6 @@
#include <linux/rwsem.h>
#include <linux/ata.h>
#include <linux/interrupt.h>
-#include <linux/genhd.h>
/* Offset of Subsystem Device ID in pci confoguration space */
#define PCI_SUBSYSTEM_DEVICEID 0x2E
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 2b6b70a39e76..e745fc29e55d 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -1020,9 +1020,8 @@ static void pkt_gather_data(struct pktcdvd_device *pd, struct packet_data *pkt)
continue;
bio = pkt->r_bios[f];
- bio_reset(bio);
+ bio_reset(bio, pd->bdev, REQ_OP_READ);
bio->bi_iter.bi_sector = pkt->sector + f * (CD_FRAMESIZE >> 9);
- bio_set_dev(bio, pd->bdev);
bio->bi_end_io = pkt_end_io_read;
bio->bi_private = pkt;
@@ -1034,7 +1033,6 @@ static void pkt_gather_data(struct pktcdvd_device *pd, struct packet_data *pkt)
BUG();
atomic_inc(&pkt->io_wait);
- bio_set_op_attrs(bio, REQ_OP_READ, 0);
pkt_queue_bio(pd, bio);
frames_read++;
}
@@ -1235,9 +1233,8 @@ static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt)
{
int f;
- bio_reset(pkt->w_bio);
+ bio_reset(pkt->w_bio, pd->bdev, REQ_OP_WRITE);
pkt->w_bio->bi_iter.bi_sector = pkt->sector;
- bio_set_dev(pkt->w_bio, pd->bdev);
pkt->w_bio->bi_end_io = pkt_end_io_packet_write;
pkt->w_bio->bi_private = pkt;
@@ -1270,7 +1267,6 @@ static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt)
/* Start the write request */
atomic_set(&pkt->io_wait, 1);
- bio_set_op_attrs(pkt->w_bio, REQ_OP_WRITE, 0);
pkt_queue_bio(pd, pkt->w_bio);
}
@@ -2298,12 +2294,12 @@ static void pkt_end_io_read_cloned(struct bio *bio)
static void pkt_make_request_read(struct pktcdvd_device *pd, struct bio *bio)
{
- struct bio *cloned_bio = bio_clone_fast(bio, GFP_NOIO, &pkt_bio_set);
+ struct bio *cloned_bio =
+ bio_alloc_clone(pd->bdev, bio, GFP_NOIO, &pkt_bio_set);
struct packet_stacked_data *psd = mempool_alloc(&psd_pool, GFP_NOIO);
psd->pd = pd;
psd->bio = bio;
- bio_set_dev(cloned_bio, pd->bdev);
cloned_bio->bi_private = psd;
cloned_bio->bi_end_io = pkt_end_io_read_cloned;
pd->stats.secs_r += bio_sectors(bio);
@@ -2404,18 +2400,11 @@ static void pkt_make_request_write(struct request_queue *q, struct bio *bio)
static void pkt_submit_bio(struct bio *bio)
{
- struct pktcdvd_device *pd;
- char b[BDEVNAME_SIZE];
+ struct pktcdvd_device *pd = bio->bi_bdev->bd_disk->queue->queuedata;
struct bio *split;
blk_queue_split(&bio);
- pd = bio->bi_bdev->bd_disk->queue->queuedata;
- if (!pd) {
- pr_err("%s incorrect request queue\n", bio_devname(bio, b));
- goto end_io;
- }
-
pkt_dbg(2, pd, "start = %6llx stop = %6llx\n",
(unsigned long long)bio->bi_iter.bi_sector,
(unsigned long long)bio_end_sector(bio));
diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c
index c08971de369f..58304f978e10 100644
--- a/drivers/block/rnbd/rnbd-clt.c
+++ b/drivers/block/rnbd/rnbd-clt.c
@@ -1343,7 +1343,7 @@ static inline void rnbd_init_hw_queue(struct rnbd_clt_dev *dev,
static void rnbd_init_mq_hw_queues(struct rnbd_clt_dev *dev)
{
- int i;
+ unsigned long i;
struct blk_mq_hw_ctx *hctx;
struct rnbd_queue *q;
diff --git a/drivers/block/rnbd/rnbd-srv-dev.c b/drivers/block/rnbd/rnbd-srv-dev.c
index b241a099aeae..c5d0a0391165 100644
--- a/drivers/block/rnbd/rnbd-srv-dev.c
+++ b/drivers/block/rnbd/rnbd-srv-dev.c
@@ -12,8 +12,7 @@
#include "rnbd-srv-dev.h"
#include "rnbd-log.h"
-struct rnbd_dev *rnbd_dev_open(const char *path, fmode_t flags,
- struct bio_set *bs)
+struct rnbd_dev *rnbd_dev_open(const char *path, fmode_t flags)
{
struct rnbd_dev *dev;
int ret;
@@ -30,7 +29,6 @@ struct rnbd_dev *rnbd_dev_open(const char *path, fmode_t flags,
dev->blk_open_flags = flags;
bdevname(dev->bdev, dev->name);
- dev->ibd_bio_set = bs;
return dev;
@@ -44,60 +42,3 @@ void rnbd_dev_close(struct rnbd_dev *dev)
blkdev_put(dev->bdev, dev->blk_open_flags);
kfree(dev);
}
-
-void rnbd_dev_bi_end_io(struct bio *bio)
-{
- struct rnbd_dev_blk_io *io = bio->bi_private;
-
- rnbd_endio(io->priv, blk_status_to_errno(bio->bi_status));
- bio_put(bio);
-}
-
-/**
- * rnbd_bio_map_kern - map kernel address into bio
- * @data: pointer to buffer to map
- * @bs: bio_set to use.
- * @len: length in bytes
- * @gfp_mask: allocation flags for bio allocation
- *
- * Map the kernel address into a bio suitable for io to a block
- * device. Returns an error pointer in case of error.
- */
-struct bio *rnbd_bio_map_kern(void *data, struct bio_set *bs,
- unsigned int len, gfp_t gfp_mask)
-{
- unsigned long kaddr = (unsigned long)data;
- unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
- unsigned long start = kaddr >> PAGE_SHIFT;
- const int nr_pages = end - start;
- int offset, i;
- struct bio *bio;
-
- bio = bio_alloc_bioset(gfp_mask, nr_pages, bs);
- if (!bio)
- return ERR_PTR(-ENOMEM);
-
- offset = offset_in_page(kaddr);
- for (i = 0; i < nr_pages; i++) {
- unsigned int bytes = PAGE_SIZE - offset;
-
- if (len <= 0)
- break;
-
- if (bytes > len)
- bytes = len;
-
- if (bio_add_page(bio, virt_to_page(data), bytes,
- offset) < bytes) {
- /* we don't support partial mappings */
- bio_put(bio);
- return ERR_PTR(-EINVAL);
- }
-
- data += bytes;
- len -= bytes;
- offset = 0;
- }
-
- return bio;
-}
diff --git a/drivers/block/rnbd/rnbd-srv-dev.h b/drivers/block/rnbd/rnbd-srv-dev.h
index 0eb23850afb9..2c3df02b5e8e 100644
--- a/drivers/block/rnbd/rnbd-srv-dev.h
+++ b/drivers/block/rnbd/rnbd-srv-dev.h
@@ -14,25 +14,16 @@
struct rnbd_dev {
struct block_device *bdev;
- struct bio_set *ibd_bio_set;
fmode_t blk_open_flags;
char name[BDEVNAME_SIZE];
};
-struct rnbd_dev_blk_io {
- struct rnbd_dev *dev;
- void *priv;
- /* have to be last member for front_pad usage of bioset_init */
- struct bio bio;
-};
-
/**
* rnbd_dev_open() - Open a device
+ * @path: path to open
* @flags: open flags
- * @bs: bio_set to use during block io,
*/
-struct rnbd_dev *rnbd_dev_open(const char *path, fmode_t flags,
- struct bio_set *bs);
+struct rnbd_dev *rnbd_dev_open(const char *path, fmode_t flags);
/**
* rnbd_dev_close() - Close a device
@@ -41,11 +32,6 @@ void rnbd_dev_close(struct rnbd_dev *dev);
void rnbd_endio(void *priv, int error);
-void rnbd_dev_bi_end_io(struct bio *bio);
-
-struct bio *rnbd_bio_map_kern(void *data, struct bio_set *bs,
- unsigned int len, gfp_t gfp_mask);
-
static inline int rnbd_dev_get_max_segs(const struct rnbd_dev *dev)
{
return queue_max_segments(bdev_get_queue(dev->bdev));
diff --git a/drivers/block/rnbd/rnbd-srv-sysfs.c b/drivers/block/rnbd/rnbd-srv-sysfs.c
index 4db98e0e76f0..feaa76c5a342 100644
--- a/drivers/block/rnbd/rnbd-srv-sysfs.c
+++ b/drivers/block/rnbd/rnbd-srv-sysfs.c
@@ -13,7 +13,6 @@
#include <linux/kobject.h>
#include <linux/sysfs.h>
#include <linux/stat.h>
-#include <linux/genhd.h>
#include <linux/list.h>
#include <linux/moduleparam.h>
#include <linux/device.h>
diff --git a/drivers/block/rnbd/rnbd-srv.c b/drivers/block/rnbd/rnbd-srv.c
index 1ee808fc600c..132e950685d5 100644
--- a/drivers/block/rnbd/rnbd-srv.c
+++ b/drivers/block/rnbd/rnbd-srv.c
@@ -114,6 +114,12 @@ rnbd_get_sess_dev(int dev_id, struct rnbd_srv_session *srv_sess)
return sess_dev;
}
+static void rnbd_dev_bi_end_io(struct bio *bio)
+{
+ rnbd_endio(bio->bi_private, blk_status_to_errno(bio->bi_status));
+ bio_put(bio);
+}
+
static int process_rdma(struct rnbd_srv_session *srv_sess,
struct rtrs_srv_op *id, void *data, u32 datalen,
const void *usr, size_t usrlen)
@@ -123,7 +129,6 @@ static int process_rdma(struct rnbd_srv_session *srv_sess,
struct rnbd_srv_sess_dev *sess_dev;
u32 dev_id;
int err;
- struct rnbd_dev_blk_io *io;
struct bio *bio;
short prio;
@@ -144,33 +149,29 @@ static int process_rdma(struct rnbd_srv_session *srv_sess,
priv->sess_dev = sess_dev;
priv->id = id;
- /* Generate bio with pages pointing to the rdma buffer */
- bio = rnbd_bio_map_kern(data, sess_dev->rnbd_dev->ibd_bio_set, datalen, GFP_KERNEL);
- if (IS_ERR(bio)) {
- err = PTR_ERR(bio);
- rnbd_srv_err(sess_dev, "Failed to generate bio, err: %d\n", err);
- goto sess_dev_put;
+ bio = bio_alloc(sess_dev->rnbd_dev->bdev, 1,
+ rnbd_to_bio_flags(le32_to_cpu(msg->rw)), GFP_KERNEL);
+ if (bio_add_page(bio, virt_to_page(data), datalen,
+ offset_in_page(data)) != datalen) {
+ rnbd_srv_err(sess_dev, "Failed to map data to bio\n");
+ err = -EINVAL;
+ goto bio_put;
}
- io = container_of(bio, struct rnbd_dev_blk_io, bio);
- io->dev = sess_dev->rnbd_dev;
- io->priv = priv;
-
bio->bi_end_io = rnbd_dev_bi_end_io;
- bio->bi_private = io;
- bio->bi_opf = rnbd_to_bio_flags(le32_to_cpu(msg->rw));
+ bio->bi_private = priv;
bio->bi_iter.bi_sector = le64_to_cpu(msg->sector);
bio->bi_iter.bi_size = le32_to_cpu(msg->bi_size);
prio = srv_sess->ver < RNBD_PROTO_VER_MAJOR ||
usrlen < sizeof(*msg) ? 0 : le16_to_cpu(msg->prio);
bio_set_prio(bio, prio);
- bio_set_dev(bio, sess_dev->rnbd_dev->bdev);
submit_bio(bio);
return 0;
-sess_dev_put:
+bio_put:
+ bio_put(bio);
rnbd_put_sess_dev(sess_dev);
err:
kfree(priv);
@@ -251,7 +252,6 @@ static void destroy_sess(struct rnbd_srv_session *srv_sess)
out:
xa_destroy(&srv_sess->index_idr);
- bioset_exit(&srv_sess->sess_bio_set);
pr_info("RTRS Session %s disconnected\n", srv_sess->sessname);
@@ -280,16 +280,6 @@ static int create_sess(struct rtrs_srv_sess *rtrs)
return -ENOMEM;
srv_sess->queue_depth = rtrs_srv_get_queue_depth(rtrs);
- err = bioset_init(&srv_sess->sess_bio_set, srv_sess->queue_depth,
- offsetof(struct rnbd_dev_blk_io, bio),
- BIOSET_NEED_BVECS);
- if (err) {
- pr_err("Allocating srv_session for path %s failed\n",
- pathname);
- kfree(srv_sess);
- return err;
- }
-
xa_init_flags(&srv_sess->index_idr, XA_FLAGS_ALLOC);
INIT_LIST_HEAD(&srv_sess->sess_dev_list);
mutex_init(&srv_sess->lock);
@@ -738,8 +728,7 @@ static int process_msg_open(struct rnbd_srv_session *srv_sess,
goto reject;
}
- rnbd_dev = rnbd_dev_open(full_path, open_flags,
- &srv_sess->sess_bio_set);
+ rnbd_dev = rnbd_dev_open(full_path, open_flags);
if (IS_ERR(rnbd_dev)) {
pr_err("Opening device '%s' on session %s failed, failed to open the block device, err: %ld\n",
full_path, srv_sess->sessname, PTR_ERR(rnbd_dev));
diff --git a/drivers/block/rnbd/rnbd-srv.h b/drivers/block/rnbd/rnbd-srv.h
index e5604bce123a..be2ae486d407 100644
--- a/drivers/block/rnbd/rnbd-srv.h
+++ b/drivers/block/rnbd/rnbd-srv.h
@@ -23,7 +23,6 @@ struct rnbd_srv_session {
struct rtrs_srv_sess *rtrs;
char sessname[NAME_MAX];
int queue_depth;
- struct bio_set sess_bio_set;
struct xarray index_idr;
/* List of struct rnbd_srv_sess_dev */
diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c
index 146d85d80e0e..dd0a1a6fed29 100644
--- a/drivers/block/sunvdc.c
+++ b/drivers/block/sunvdc.c
@@ -9,7 +9,6 @@
#include <linux/types.h>
#include <linux/blk-mq.h>
#include <linux/hdreg.h>
-#include <linux/genhd.h>
#include <linux/cdrom.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 8c415be86732..35399a9eae0e 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -69,13 +69,6 @@ struct virtio_blk {
/* Process context for config space updates */
struct work_struct config_work;
- /*
- * Tracks references from block_device_operations open/release and
- * virtio_driver probe/remove so this object can be freed once no
- * longer in use.
- */
- refcount_t refs;
-
/* Ida index - used to track minor number allocations. */
int index;
@@ -386,43 +379,6 @@ out:
return err;
}
-static void virtblk_get(struct virtio_blk *vblk)
-{
- refcount_inc(&vblk->refs);
-}
-
-static void virtblk_put(struct virtio_blk *vblk)
-{
- if (refcount_dec_and_test(&vblk->refs)) {
- ida_simple_remove(&vd_index_ida, vblk->index);
- mutex_destroy(&vblk->vdev_mutex);
- kfree(vblk);
- }
-}
-
-static int virtblk_open(struct block_device *bd, fmode_t mode)
-{
- struct virtio_blk *vblk = bd->bd_disk->private_data;
- int ret = 0;
-
- mutex_lock(&vblk->vdev_mutex);
-
- if (vblk->vdev)
- virtblk_get(vblk);
- else
- ret = -ENXIO;
-
- mutex_unlock(&vblk->vdev_mutex);
- return ret;
-}
-
-static void virtblk_release(struct gendisk *disk, fmode_t mode)
-{
- struct virtio_blk *vblk = disk->private_data;
-
- virtblk_put(vblk);
-}
-
/* We provide getgeo only to please some old bootloader/partitioning tools */
static int virtblk_getgeo(struct block_device *bd, struct hd_geometry *geo)
{
@@ -455,11 +411,19 @@ out:
return ret;
}
+static void virtblk_free_disk(struct gendisk *disk)
+{
+ struct virtio_blk *vblk = disk->private_data;
+
+ ida_simple_remove(&vd_index_ida, vblk->index);
+ mutex_destroy(&vblk->vdev_mutex);
+ kfree(vblk);
+}
+
static const struct block_device_operations virtblk_fops = {
- .owner = THIS_MODULE,
- .open = virtblk_open,
- .release = virtblk_release,
- .getgeo = virtblk_getgeo,
+ .owner = THIS_MODULE,
+ .getgeo = virtblk_getgeo,
+ .free_disk = virtblk_free_disk,
};
static int index_to_minor(int index)
@@ -784,8 +748,6 @@ static int virtblk_probe(struct virtio_device *vdev)
goto out_free_index;
}
- /* This reference is dropped in virtblk_remove(). */
- refcount_set(&vblk->refs, 1);
mutex_init(&vblk->vdev_mutex);
vblk->vdev = vdev;
@@ -968,7 +930,7 @@ static void virtblk_remove(struct virtio_device *vdev)
flush_work(&vblk->config_work);
del_gendisk(vblk->disk);
- blk_cleanup_disk(vblk->disk);
+ blk_cleanup_queue(vblk->disk->queue);
blk_mq_free_tag_set(&vblk->tag_set);
mutex_lock(&vblk->vdev_mutex);
@@ -984,7 +946,7 @@ static void virtblk_remove(struct virtio_device *vdev)
mutex_unlock(&vblk->vdev_mutex);
- virtblk_put(vblk);
+ put_disk(vblk->disk);
}
#ifdef CONFIG_PM_SLEEP
diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c
index 14e452896d04..d1e26461a64e 100644
--- a/drivers/block/xen-blkback/blkback.c
+++ b/drivers/block/xen-blkback/blkback.c
@@ -1326,16 +1326,13 @@ static int dispatch_rw_block_io(struct xen_blkif_ring *ring,
pages[i]->page,
seg[i].nsec << 9,
seg[i].offset) == 0)) {
- bio = bio_alloc(GFP_KERNEL, bio_max_segs(nseg - i));
- if (unlikely(bio == NULL))
- goto fail_put_bio;
-
+ bio = bio_alloc(preq.bdev, bio_max_segs(nseg - i),
+ operation | operation_flags,
+ GFP_KERNEL);
biolist[nbio++] = bio;
- bio_set_dev(bio, preq.bdev);
bio->bi_private = pending_req;
bio->bi_end_io = end_block_io_op;
bio->bi_iter.bi_sector = preq.sector_number;
- bio_set_op_attrs(bio, operation, operation_flags);
}
preq.sector_number += seg[i].nsec;
@@ -1345,15 +1342,11 @@ static int dispatch_rw_block_io(struct xen_blkif_ring *ring,
if (!bio) {
BUG_ON(operation_flags != REQ_PREFLUSH);
- bio = bio_alloc(GFP_KERNEL, 0);
- if (unlikely(bio == NULL))
- goto fail_put_bio;
-
+ bio = bio_alloc(preq.bdev, 0, operation | operation_flags,
+ GFP_KERNEL);
biolist[nbio++] = bio;
- bio_set_dev(bio, preq.bdev);
bio->bi_private = pending_req;
bio->bi_end_io = end_block_io_op;
- bio_set_op_attrs(bio, operation, operation_flags);
}
atomic_set(&pending_req->pendcnt, nbio);
@@ -1381,14 +1374,6 @@ static int dispatch_rw_block_io(struct xen_blkif_ring *ring,
free_req(ring, pending_req);
msleep(1); /* back off a bit */
return -EIO;
-
- fail_put_bio:
- for (i = 0; i < nbio; i++)
- bio_put(biolist[i]);
- atomic_set(&pending_req->pendcnt, 1);
- __end_block_io_op(pending_req, BLK_STS_RESOURCE);
- msleep(1); /* back off a bit */
- return -EIO;
}
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index cb253d80d72b..a3a5e1e71326 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -22,7 +22,6 @@
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/device.h>
-#include <linux/genhd.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/backing-dev.h>
@@ -617,24 +616,21 @@ static int read_from_bdev_async(struct zram *zram, struct bio_vec *bvec,
{
struct bio *bio;
- bio = bio_alloc(GFP_NOIO, 1);
+ bio = bio_alloc(zram->bdev, 1, parent ? parent->bi_opf : REQ_OP_READ,
+ GFP_NOIO);
if (!bio)
return -ENOMEM;
bio->bi_iter.bi_sector = entry * (PAGE_SIZE >> 9);
- bio_set_dev(bio, zram->bdev);
if (!bio_add_page(bio, bvec->bv_page, bvec->bv_len, bvec->bv_offset)) {
bio_put(bio);
return -EIO;
}
- if (!parent) {
- bio->bi_opf = REQ_OP_READ;
+ if (!parent)
bio->bi_end_io = zram_page_end_io;
- } else {
- bio->bi_opf = parent->bi_opf;
+ else
bio_chain(bio, parent);
- }
submit_bio(bio);
return 1;
@@ -747,10 +743,9 @@ static ssize_t writeback_store(struct device *dev,
continue;
}
- bio_init(&bio, &bio_vec, 1);
- bio_set_dev(&bio, zram->bdev);
+ bio_init(&bio, zram->bdev, &bio_vec, 1,
+ REQ_OP_WRITE | REQ_SYNC);
bio.bi_iter.bi_sector = blk_idx * (PAGE_SIZE >> 9);
- bio.bi_opf = REQ_OP_WRITE | REQ_SYNC;
bio_add_page(&bio, bvec.bv_page, bvec.bv_len,
bvec.bv_offset);
diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c
index faead41709bc..8e78b37d0f6a 100644
--- a/drivers/cdrom/gdrom.c
+++ b/drivers/cdrom/gdrom.c
@@ -15,7 +15,6 @@
#include <linux/slab.h>
#include <linux/dma-mapping.h>
#include <linux/cdrom.h>
-#include <linux/genhd.h>
#include <linux/bio.h>
#include <linux/blk-mq.h>
#include <linux/interrupt.h>
diff --git a/drivers/char/random.c b/drivers/char/random.c
index 0bdefada7453..85f4cca6e707 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -38,7 +38,7 @@
#include <linux/poll.h>
#include <linux/init.h>
#include <linux/fs.h>
-#include <linux/genhd.h>
+#include <linux/blkdev.h>
#include <linux/interrupt.h>
#include <linux/mm.h>
#include <linux/nodemask.h>
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index b5ea378e66cb..998a5cfdbc4e 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -204,6 +204,7 @@ config BLK_DEV_DM
tristate "Device mapper support"
select BLOCK_HOLDER_DEPRECATED if SYSFS
select BLK_DEV_DM_BUILTIN
+ select BLK_MQ_STACKING
depends on DAX || DAX=n
help
Device-mapper is a low level volume manager. It works by allowing
diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c
index 9c6f9ec55b72..020712c5203f 100644
--- a/drivers/md/bcache/io.c
+++ b/drivers/md/bcache/io.c
@@ -26,7 +26,8 @@ struct bio *bch_bbio_alloc(struct cache_set *c)
struct bbio *b = mempool_alloc(&c->bio_meta, GFP_NOIO);
struct bio *bio = &b->bio;
- bio_init(bio, bio->bi_inline_vecs, meta_bucket_pages(&c->cache->sb));
+ bio_init(bio, NULL, bio->bi_inline_vecs,
+ meta_bucket_pages(&c->cache->sb), 0);
return bio;
}
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 61bd79babf7a..7c2ca52ca3e4 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -53,14 +53,12 @@ static int journal_read_bucket(struct cache *ca, struct list_head *list,
reread: left = ca->sb.bucket_size - offset;
len = min_t(unsigned int, left, PAGE_SECTORS << JSET_BITS);
- bio_reset(bio);
+ bio_reset(bio, ca->bdev, REQ_OP_READ);
bio->bi_iter.bi_sector = bucket + offset;
- bio_set_dev(bio, ca->bdev);
bio->bi_iter.bi_size = len << 9;
bio->bi_end_io = journal_read_endio;
bio->bi_private = &cl;
- bio_set_op_attrs(bio, REQ_OP_READ, 0);
bch_bio_map(bio, data);
closure_bio_submit(ca->set, bio, &cl);
@@ -611,11 +609,9 @@ static void do_journal_discard(struct cache *ca)
atomic_set(&ja->discard_in_flight, DISCARD_IN_FLIGHT);
- bio_init(bio, bio->bi_inline_vecs, 1);
- bio_set_op_attrs(bio, REQ_OP_DISCARD, 0);
+ bio_init(bio, ca->bdev, bio->bi_inline_vecs, 1, REQ_OP_DISCARD);
bio->bi_iter.bi_sector = bucket_to_sector(ca->set,
ca->sb.d[ja->discard_idx]);
- bio_set_dev(bio, ca->bdev);
bio->bi_iter.bi_size = bucket_bytes(ca);
bio->bi_end_io = journal_discard_endio;
@@ -773,16 +769,14 @@ static void journal_write_unlocked(struct closure *cl)
atomic_long_add(sectors, &ca->meta_sectors_written);
- bio_reset(bio);
+ bio_reset(bio, ca->bdev, REQ_OP_WRITE |
+ REQ_SYNC | REQ_META | REQ_PREFLUSH | REQ_FUA);
+ bch_bio_map(bio, w->data);
bio->bi_iter.bi_sector = PTR_OFFSET(k, i);
- bio_set_dev(bio, ca->bdev);
bio->bi_iter.bi_size = sectors << 9;
bio->bi_end_io = journal_write_endio;
bio->bi_private = w;
- bio_set_op_attrs(bio, REQ_OP_WRITE,
- REQ_SYNC|REQ_META|REQ_PREFLUSH|REQ_FUA);
- bch_bio_map(bio, w->data);
trace_bcache_journal_write(bio, w->data->keys);
bio_list_add(&list, bio);
diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c
index b9c3d27ec093..99499d1f6e66 100644
--- a/drivers/md/bcache/movinggc.c
+++ b/drivers/md/bcache/movinggc.c
@@ -79,8 +79,8 @@ static void moving_init(struct moving_io *io)
{
struct bio *bio = &io->bio.bio;
- bio_init(bio, bio->bi_inline_vecs,
- DIV_ROUND_UP(KEY_SIZE(&io->w->key), PAGE_SECTORS));
+ bio_init(bio, NULL, bio->bi_inline_vecs,
+ DIV_ROUND_UP(KEY_SIZE(&io->w->key), PAGE_SECTORS), 0);
bio_get(bio);
bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index d15aae6c51c1..6869e010475a 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -685,8 +685,7 @@ static void do_bio_hook(struct search *s,
{
struct bio *bio = &s->bio.bio;
- bio_init(bio, NULL, 0);
- __bio_clone_fast(bio, orig_bio);
+ bio_init_clone(bio->bi_bdev, bio, orig_bio, GFP_NOIO);
/*
* bi_end_io can be set separately somewhere else, e.g. the
* variants in,
@@ -831,11 +830,11 @@ static void cached_dev_read_done(struct closure *cl)
*/
if (s->iop.bio) {
- bio_reset(s->iop.bio);
+ bio_reset(s->iop.bio, s->cache_miss->bi_bdev, REQ_OP_READ);
s->iop.bio->bi_iter.bi_sector =
s->cache_miss->bi_iter.bi_sector;
- bio_copy_dev(s->iop.bio, s->cache_miss);
s->iop.bio->bi_iter.bi_size = s->insert_bio_sectors << 9;
+ bio_clone_blkg_association(s->iop.bio, s->cache_miss);
bch_bio_map(s->iop.bio, NULL);
bio_copy_data(s->cache_miss, s->iop.bio);
@@ -913,14 +912,13 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
/* btree_search_recurse()'s btree iterator is no good anymore */
ret = miss == bio ? MAP_DONE : -EINTR;
- cache_bio = bio_alloc_bioset(GFP_NOWAIT,
+ cache_bio = bio_alloc_bioset(miss->bi_bdev,
DIV_ROUND_UP(s->insert_bio_sectors, PAGE_SECTORS),
- &dc->disk.bio_split);
+ 0, GFP_NOWAIT, &dc->disk.bio_split);
if (!cache_bio)
goto out_submit;
cache_bio->bi_iter.bi_sector = miss->bi_iter.bi_sector;
- bio_copy_dev(cache_bio, miss);
cache_bio->bi_iter.bi_size = s->insert_bio_sectors << 9;
cache_bio->bi_end_io = backing_request_endio;
@@ -1025,21 +1023,21 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s)
*/
struct bio *flush;
- flush = bio_alloc_bioset(GFP_NOIO, 0,
- &dc->disk.bio_split);
+ flush = bio_alloc_bioset(bio->bi_bdev, 0,
+ REQ_OP_WRITE | REQ_PREFLUSH,
+ GFP_NOIO, &dc->disk.bio_split);
if (!flush) {
s->iop.status = BLK_STS_RESOURCE;
goto insert_data;
}
- bio_copy_dev(flush, bio);
flush->bi_end_io = backing_request_endio;
flush->bi_private = cl;
- flush->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
/* I/O request sent to backing device */
closure_bio_submit(s->iop.c, flush, cl);
}
} else {
- s->iop.bio = bio_clone_fast(bio, GFP_NOIO, &dc->disk.bio_split);
+ s->iop.bio = bio_alloc_clone(bio->bi_bdev, bio, GFP_NOIO,
+ &dc->disk.bio_split);
/* I/O request sent to backing device */
bio->bi_end_io = backing_request_endio;
closure_bio_submit(s->iop.c, bio, cl);
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 140f35dc0c45..bf3de149d3c9 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -18,7 +18,6 @@
#include <linux/blkdev.h>
#include <linux/pagemap.h>
#include <linux/debugfs.h>
-#include <linux/genhd.h>
#include <linux/idr.h>
#include <linux/kthread.h>
#include <linux/workqueue.h>
@@ -343,8 +342,7 @@ void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent)
down(&dc->sb_write_mutex);
closure_init(cl, parent);
- bio_init(bio, dc->sb_bv, 1);
- bio_set_dev(bio, dc->bdev);
+ bio_init(bio, dc->bdev, dc->sb_bv, 1, 0);
bio->bi_end_io = write_bdev_super_endio;
bio->bi_private = dc;
@@ -387,8 +385,7 @@ void bcache_write_super(struct cache_set *c)
if (ca->sb.version < version)
ca->sb.version = version;
- bio_init(bio, ca->sb_bv, 1);
- bio_set_dev(bio, ca->bdev);
+ bio_init(bio, ca->bdev, ca->sb_bv, 1, 0);
bio->bi_end_io = write_super_endio;
bio->bi_private = ca;
@@ -2240,7 +2237,7 @@ static int cache_alloc(struct cache *ca)
__module_get(THIS_MODULE);
kobject_init(&ca->kobj, &bch_cache_ktype);
- bio_init(&ca->journal.bio, ca->journal.bio.bi_inline_vecs, 8);
+ bio_init(&ca->journal.bio, NULL, ca->journal.bio.bi_inline_vecs, 8, 0);
/*
* when ca->sb.njournal_buckets is not zero, journal exists,
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index c7560f66dca8..d42301e6309d 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -292,8 +292,8 @@ static void dirty_init(struct keybuf_key *w)
struct dirty_io *io = w->private;
struct bio *bio = &io->bio;
- bio_init(bio, bio->bi_inline_vecs,
- DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS));
+ bio_init(bio, NULL, bio->bi_inline_vecs,
+ DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), 0);
if (!io->dc->writeback_percent)
bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 447d030036d1..89fdfb49d564 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -744,21 +744,14 @@ static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio)
spin_unlock_irq(&cache->lock);
}
-static void __remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
- dm_oblock_t oblock, bool bio_has_pbd)
-{
- if (bio_has_pbd)
- check_if_tick_bio_needed(cache, bio);
- remap_to_origin(cache, bio);
- if (bio_data_dir(bio) == WRITE)
- clear_discard(cache, oblock_to_dblock(cache, oblock));
-}
-
static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
dm_oblock_t oblock)
{
// FIXME: check_if_tick_bio_needed() is called way too much through this interface
- __remap_to_origin_clear_discard(cache, bio, oblock, true);
+ check_if_tick_bio_needed(cache, bio);
+ remap_to_origin(cache, bio);
+ if (bio_data_dir(bio) == WRITE)
+ clear_discard(cache, oblock_to_dblock(cache, oblock));
}
static void remap_to_cache_dirty(struct cache *cache, struct bio *bio,
@@ -826,16 +819,15 @@ static void issue_op(struct bio *bio, void *context)
static void remap_to_origin_and_cache(struct cache *cache, struct bio *bio,
dm_oblock_t oblock, dm_cblock_t cblock)
{
- struct bio *origin_bio = bio_clone_fast(bio, GFP_NOIO, &cache->bs);
+ struct bio *origin_bio = bio_alloc_clone(cache->origin_dev->bdev, bio,
+ GFP_NOIO, &cache->bs);
BUG_ON(!origin_bio);
bio_chain(origin_bio, bio);
- /*
- * Passing false to __remap_to_origin_clear_discard() skips
- * all code that might use per_bio_data (since clone doesn't have it)
- */
- __remap_to_origin_clear_discard(cache, origin_bio, oblock, false);
+
+ if (bio_data_dir(origin_bio) == WRITE)
+ clear_discard(cache, oblock_to_dblock(cache, oblock));
submit_bio(origin_bio);
remap_to_cache(cache, bio, cblock);
diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h
index b855fef4f38a..72d18c3fbf1f 100644
--- a/drivers/md/dm-core.h
+++ b/drivers/md/dm-core.h
@@ -11,7 +11,6 @@
#include <linux/kthread.h>
#include <linux/ktime.h>
-#include <linux/genhd.h>
#include <linux/blk-mq.h>
#include <linux/blk-crypto-profile.h>
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index d4ae31558826..e2b0af4a2ee8 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -234,7 +234,7 @@ static volatile unsigned long dm_crypt_pages_per_client;
#define DM_CRYPT_MEMORY_PERCENT 2
#define DM_CRYPT_MIN_PAGES_PER_CLIENT (BIO_MAX_VECS * 16)
-static void clone_init(struct dm_crypt_io *, struct bio *);
+static void crypt_endio(struct bio *clone);
static void kcryptd_queue_crypt(struct dm_crypt_io *io);
static struct scatterlist *crypt_get_sg_data(struct crypt_config *cc,
struct scatterlist *sg);
@@ -1364,11 +1364,10 @@ static int crypt_convert_block_aead(struct crypt_config *cc,
}
if (r == -EBADMSG) {
- char b[BDEVNAME_SIZE];
sector_t s = le64_to_cpu(*sector);
- DMERR_LIMIT("%s: INTEGRITY AEAD ERROR, sector %llu",
- bio_devname(ctx->bio_in, b), s);
+ DMERR_LIMIT("%pg: INTEGRITY AEAD ERROR, sector %llu",
+ ctx->bio_in->bi_bdev, s);
dm_audit_log_bio(DM_MSG_PREFIX, "integrity-aead",
ctx->bio_in, s, 0);
}
@@ -1672,11 +1671,10 @@ retry:
if (unlikely(gfp_mask & __GFP_DIRECT_RECLAIM))
mutex_lock(&cc->bio_alloc_lock);
- clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, &cc->bs);
- if (!clone)
- goto out;
-
- clone_init(io, clone);
+ clone = bio_alloc_bioset(cc->dev->bdev, nr_iovecs, io->base_bio->bi_opf,
+ GFP_NOIO, &cc->bs);
+ clone->bi_private = io;
+ clone->bi_end_io = crypt_endio;
remaining_size = size;
@@ -1702,7 +1700,7 @@ retry:
bio_put(clone);
clone = NULL;
}
-out:
+
if (unlikely(gfp_mask & __GFP_DIRECT_RECLAIM))
mutex_unlock(&cc->bio_alloc_lock);
@@ -1829,34 +1827,25 @@ static void crypt_endio(struct bio *clone)
crypt_dec_pending(io);
}
-static void clone_init(struct dm_crypt_io *io, struct bio *clone)
-{
- struct crypt_config *cc = io->cc;
-
- clone->bi_private = io;
- clone->bi_end_io = crypt_endio;
- bio_set_dev(clone, cc->dev->bdev);
- clone->bi_opf = io->base_bio->bi_opf;
-}
-
static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp)
{
struct crypt_config *cc = io->cc;
struct bio *clone;
/*
- * We need the original biovec array in order to decrypt
- * the whole bio data *afterwards* -- thanks to immutable
- * biovecs we don't need to worry about the block layer
- * modifying the biovec array; so leverage bio_clone_fast().
+ * We need the original biovec array in order to decrypt the whole bio
+ * data *afterwards* -- thanks to immutable biovecs we don't need to
+ * worry about the block layer modifying the biovec array; so leverage
+ * bio_alloc_clone().
*/
- clone = bio_clone_fast(io->base_bio, gfp, &cc->bs);
+ clone = bio_alloc_clone(cc->dev->bdev, io->base_bio, gfp, &cc->bs);
if (!clone)
return 1;
+ clone->bi_private = io;
+ clone->bi_end_io = crypt_endio;
crypt_inc_pending(io);
- clone_init(io, clone);
clone->bi_iter.bi_sector = cc->start + io->sector;
if (dm_crypt_integrity_io_alloc(io, clone)) {
@@ -2179,11 +2168,10 @@ static void kcryptd_async_done(struct crypto_async_request *async_req,
error = cc->iv_gen_ops->post(cc, org_iv_of_dmreq(cc, dmreq), dmreq);
if (error == -EBADMSG) {
- char b[BDEVNAME_SIZE];
sector_t s = le64_to_cpu(*org_sector_of_dmreq(cc, dmreq));
- DMERR_LIMIT("%s: INTEGRITY AEAD ERROR, sector %llu",
- bio_devname(ctx->bio_in, b), s);
+ DMERR_LIMIT("%pg: INTEGRITY AEAD ERROR, sector %llu",
+ ctx->bio_in->bi_bdev, s);
dm_audit_log_bio(DM_MSG_PREFIX, "integrity-aead",
ctx->bio_in, s, 0);
io->error = BLK_STS_PROTECTION;
diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c
index eb4b5e52bd6f..c58a5111cb57 100644
--- a/drivers/md/dm-integrity.c
+++ b/drivers/md/dm-integrity.c
@@ -1788,12 +1788,11 @@ again:
checksums_ptr - checksums, dio->op == REQ_OP_READ ? TAG_CMP : TAG_WRITE);
if (unlikely(r)) {
if (r > 0) {
- char b[BDEVNAME_SIZE];
sector_t s;
s = sector - ((r + ic->tag_size - 1) / ic->tag_size);
- DMERR_LIMIT("%s: Checksum failed at sector 0x%llx",
- bio_devname(bio, b), s);
+ DMERR_LIMIT("%pg: Checksum failed at sector 0x%llx",
+ bio->bi_bdev, s);
r = -EILSEQ;
atomic64_inc(&ic->number_of_mismatches);
dm_audit_log_bio(DM_MSG_PREFIX, "integrity-checksum",
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 2d3cda0acacb..23e038f8dc84 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -345,11 +345,10 @@ static void do_region(int op, int op_flags, unsigned region,
(PAGE_SIZE >> SECTOR_SHIFT)));
}
- bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, &io->client->bios);
+ bio = bio_alloc_bioset(where->bdev, num_bvecs, op | op_flags,
+ GFP_NOIO, &io->client->bios);
bio->bi_iter.bi_sector = where->sector + (where->count - remaining);
- bio_set_dev(bio, where->bdev);
bio->bi_end_io = endio;
- bio_set_op_attrs(bio, op, op_flags);
store_io_and_region_in_bio(bio, io, region);
if (op == REQ_OP_DISCARD || op == REQ_OP_WRITE_ZEROES) {
diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c
index 139b09b06eda..c9d036d6bb2e 100644
--- a/drivers/md/dm-log-writes.c
+++ b/drivers/md/dm-log-writes.c
@@ -217,18 +217,12 @@ static int write_metadata(struct log_writes_c *lc, void *entry,
void *ptr;
size_t ret;
- bio = bio_alloc(GFP_KERNEL, 1);
- if (!bio) {
- DMERR("Couldn't alloc log bio");
- goto error;
- }
+ bio = bio_alloc(lc->logdev->bdev, 1, REQ_OP_WRITE, GFP_KERNEL);
bio->bi_iter.bi_size = 0;
bio->bi_iter.bi_sector = sector;
- bio_set_dev(bio, lc->logdev->bdev);
bio->bi_end_io = (sector == WRITE_LOG_SUPER_SECTOR) ?
log_end_super : log_end_io;
bio->bi_private = lc;
- bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
page = alloc_page(GFP_KERNEL);
if (!page) {
@@ -275,18 +269,12 @@ static int write_inline_data(struct log_writes_c *lc, void *entry,
atomic_inc(&lc->io_blocks);
- bio = bio_alloc(GFP_KERNEL, bio_pages);
- if (!bio) {
- DMERR("Couldn't alloc inline data bio");
- goto error;
- }
-
+ bio = bio_alloc(lc->logdev->bdev, bio_pages, REQ_OP_WRITE,
+ GFP_KERNEL);
bio->bi_iter.bi_size = 0;
bio->bi_iter.bi_sector = sector;
- bio_set_dev(bio, lc->logdev->bdev);
bio->bi_end_io = log_end_io;
bio->bi_private = lc;
- bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
for (i = 0; i < bio_pages; i++) {
pg_datalen = min_t(int, datalen, PAGE_SIZE);
@@ -322,7 +310,6 @@ static int write_inline_data(struct log_writes_c *lc, void *entry,
error_bio:
bio_free_pages(bio);
bio_put(bio);
-error:
put_io_block(lc);
return -1;
}
@@ -363,17 +350,12 @@ static int log_one_block(struct log_writes_c *lc,
goto out;
atomic_inc(&lc->io_blocks);
- bio = bio_alloc(GFP_KERNEL, bio_max_segs(block->vec_cnt));
- if (!bio) {
- DMERR("Couldn't alloc log bio");
- goto error;
- }
+ bio = bio_alloc(lc->logdev->bdev, bio_max_segs(block->vec_cnt),
+ REQ_OP_WRITE, GFP_KERNEL);
bio->bi_iter.bi_size = 0;
bio->bi_iter.bi_sector = sector;
- bio_set_dev(bio, lc->logdev->bdev);
bio->bi_end_io = log_end_io;
bio->bi_private = lc;
- bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
for (i = 0; i < block->vec_cnt; i++) {
/*
@@ -385,18 +367,13 @@ static int log_one_block(struct log_writes_c *lc,
if (ret != block->vecs[i].bv_len) {
atomic_inc(&lc->io_blocks);
submit_bio(bio);
- bio = bio_alloc(GFP_KERNEL,
- bio_max_segs(block->vec_cnt - i));
- if (!bio) {
- DMERR("Couldn't alloc log bio");
- goto error;
- }
+ bio = bio_alloc(lc->logdev->bdev,
+ bio_max_segs(block->vec_cnt - i),
+ REQ_OP_WRITE, GFP_KERNEL);
bio->bi_iter.bi_size = 0;
bio->bi_iter.bi_sector = sector;
- bio_set_dev(bio, lc->logdev->bdev);
bio->bi_end_io = log_end_io;
bio->bi_private = lc;
- bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
ret = bio_add_page(bio, block->vecs[i].bv_page,
block->vecs[i].bv_len, 0);
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index 579ab6183d4d..6948d5db9092 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -303,21 +303,6 @@ static void end_clone_request(struct request *clone, blk_status_t error)
dm_complete_request(tio->orig, error);
}
-static blk_status_t dm_dispatch_clone_request(struct request *clone, struct request *rq)
-{
- blk_status_t r;
-
- if (blk_queue_io_stat(clone->q))
- clone->rq_flags |= RQF_IO_STAT;
-
- clone->start_time_ns = ktime_get_ns();
- r = blk_insert_cloned_request(clone->q, clone);
- if (r != BLK_STS_OK && r != BLK_STS_RESOURCE && r != BLK_STS_DEV_RESOURCE)
- /* must complete clone in terms of original request */
- dm_complete_request(rq, r);
- return r;
-}
-
static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig,
void *data)
{
@@ -398,13 +383,20 @@ static int map_request(struct dm_rq_target_io *tio)
/* The target has remapped the I/O so dispatch it */
trace_block_rq_remap(clone, disk_devt(dm_disk(md)),
blk_rq_pos(rq));
- ret = dm_dispatch_clone_request(clone, rq);
- if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) {
+ ret = blk_insert_cloned_request(clone);
+ switch (ret) {
+ case BLK_STS_OK:
+ break;
+ case BLK_STS_RESOURCE:
+ case BLK_STS_DEV_RESOURCE:
blk_rq_unprep_clone(clone);
blk_mq_cleanup_rq(clone);
tio->ti->type->release_clone_rq(clone, &tio->info);
tio->clone = NULL;
return DM_MAPIO_REQUEUE;
+ default:
+ /* must complete clone in terms of original request */
+ dm_complete_request(rq, ret);
}
break;
case DM_MAPIO_REQUEUE:
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index dcf34c6b05ad..0d336b5ec571 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -141,11 +141,6 @@ struct dm_snapshot {
* for them to be committed.
*/
struct bio_list bios_queued_during_merge;
-
- /*
- * Flush data after merge.
- */
- struct bio flush_bio;
};
/*
@@ -1127,17 +1122,6 @@ shut:
static void error_bios(struct bio *bio);
-static int flush_data(struct dm_snapshot *s)
-{
- struct bio *flush_bio = &s->flush_bio;
-
- bio_reset(flush_bio);
- bio_set_dev(flush_bio, s->origin->bdev);
- flush_bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
-
- return submit_bio_wait(flush_bio);
-}
-
static void merge_callback(int read_err, unsigned long write_err, void *context)
{
struct dm_snapshot *s = context;
@@ -1151,7 +1135,7 @@ static void merge_callback(int read_err, unsigned long write_err, void *context)
goto shut;
}
- if (flush_data(s) < 0) {
+ if (blkdev_issue_flush(s->origin->bdev) < 0) {
DMERR("Flush after merge failed: shutting down merge");
goto shut;
}
@@ -1340,7 +1324,6 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
s->first_merging_chunk = 0;
s->num_merging_chunks = 0;
bio_list_init(&s->bios_queued_during_merge);
- bio_init(&s->flush_bio, NULL, 0);
/* Allocate hash table for COW data */
if (init_hash_tables(s)) {
@@ -1528,8 +1511,6 @@ static void snapshot_dtr(struct dm_target *ti)
dm_exception_store_destroy(s->store);
- bio_uninit(&s->flush_bio);
-
dm_put_device(ti, s->cow);
dm_put_device(ti, s->origin);
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index ec119d2422d5..f4234d615aa1 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -282,8 +282,6 @@ struct pool {
struct dm_bio_prison_cell **cell_sort_array;
mempool_t mapping_pool;
-
- struct bio flush_bio;
};
static void metadata_operation_failed(struct pool *pool, const char *op, int r);
@@ -1179,25 +1177,17 @@ static void process_prepared_discard_passdown_pt1(struct dm_thin_new_mapping *m)
return;
}
- discard_parent = bio_alloc(GFP_NOIO, 1);
- if (!discard_parent) {
- DMWARN("%s: unable to allocate top level discard bio for passdown. Skipping passdown.",
- dm_device_name(tc->pool->pool_md));
- queue_passdown_pt2(m);
-
- } else {
- discard_parent->bi_end_io = passdown_endio;
- discard_parent->bi_private = m;
-
- if (m->maybe_shared)
- passdown_double_checking_shared_status(m, discard_parent);
- else {
- struct discard_op op;
-
- begin_discard(&op, tc, discard_parent);
- r = issue_discard(&op, m->data_block, data_end);
- end_discard(&op, r);
- }
+ discard_parent = bio_alloc(NULL, 1, 0, GFP_NOIO);
+ discard_parent->bi_end_io = passdown_endio;
+ discard_parent->bi_private = m;
+ if (m->maybe_shared)
+ passdown_double_checking_shared_status(m, discard_parent);
+ else {
+ struct discard_op op;
+
+ begin_discard(&op, tc, discard_parent);
+ r = issue_discard(&op, m->data_block, data_end);
+ end_discard(&op, r);
}
}
@@ -2913,7 +2903,6 @@ static void __pool_destroy(struct pool *pool)
if (pool->next_mapping)
mempool_free(pool->next_mapping, &pool->mapping_pool);
mempool_exit(&pool->mapping_pool);
- bio_uninit(&pool->flush_bio);
dm_deferred_set_destroy(pool->shared_read_ds);
dm_deferred_set_destroy(pool->all_io_ds);
kfree(pool);
@@ -2994,7 +2983,6 @@ static struct pool *pool_create(struct mapped_device *pool_md,
pool->low_water_triggered = false;
pool->suspended = true;
pool->out_of_data_space = false;
- bio_init(&pool->flush_bio, NULL, 0);
pool->shared_read_ds = dm_deferred_set_create();
if (!pool->shared_read_ds) {
@@ -3201,13 +3189,8 @@ static void metadata_low_callback(void *context)
static int metadata_pre_commit_callback(void *context)
{
struct pool *pool = context;
- struct bio *flush_bio = &pool->flush_bio;
-
- bio_reset(flush_bio);
- bio_set_dev(flush_bio, pool->data_dev);
- flush_bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
- return submit_bio_wait(flush_bio);
+ return blkdev_issue_flush(pool->data_dev);
}
static sector_t get_dev_size(struct block_device *bdev)
diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c
index 4f31591d2d25..5630b470ba42 100644
--- a/drivers/md/dm-writecache.c
+++ b/drivers/md/dm-writecache.c
@@ -1821,11 +1821,11 @@ static void __writecache_writeback_pmem(struct dm_writecache *wc, struct writeba
max_pages = e->wc_list_contiguous;
- bio = bio_alloc_bioset(GFP_NOIO, max_pages, &wc->bio_set);
+ bio = bio_alloc_bioset(wc->dev->bdev, max_pages, REQ_OP_WRITE,
+ GFP_NOIO, &wc->bio_set);
wb = container_of(bio, struct writeback_struct, bio);
wb->wc = wc;
bio->bi_end_io = writecache_writeback_endio;
- bio_set_dev(bio, wc->dev->bdev);
bio->bi_iter.bi_sector = read_original_sector(wc, e);
if (max_pages <= WB_LIST_INLINE ||
unlikely(!(wb->wc_list = kmalloc_array(max_pages, sizeof(struct wc_entry *),
@@ -1852,7 +1852,8 @@ static void __writecache_writeback_pmem(struct dm_writecache *wc, struct writeba
wb->wc_list[wb->wc_list_n++] = f;
e = f;
}
- bio_set_op_attrs(bio, REQ_OP_WRITE, WC_MODE_FUA(wc) * REQ_FUA);
+ if (WC_MODE_FUA(wc))
+ bio->bi_opf |= REQ_FUA;
if (writecache_has_error(wc)) {
bio->bi_status = BLK_STS_IOERR;
bio_endio(bio);
diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c
index ee4626d08557..e5f1eb27ce2e 100644
--- a/drivers/md/dm-zoned-metadata.c
+++ b/drivers/md/dm-zoned-metadata.c
@@ -550,11 +550,8 @@ static struct dmz_mblock *dmz_get_mblock_slow(struct dmz_metadata *zmd,
if (!mblk)
return ERR_PTR(-ENOMEM);
- bio = bio_alloc(GFP_NOIO, 1);
- if (!bio) {
- dmz_free_mblock(zmd, mblk);
- return ERR_PTR(-ENOMEM);
- }
+ bio = bio_alloc(dev->bdev, 1, REQ_OP_READ | REQ_META | REQ_PRIO,
+ GFP_NOIO);
spin_lock(&zmd->mblk_lock);
@@ -578,10 +575,8 @@ static struct dmz_mblock *dmz_get_mblock_slow(struct dmz_metadata *zmd,
/* Submit read BIO */
bio->bi_iter.bi_sector = dmz_blk2sect(block);
- bio_set_dev(bio, dev->bdev);
bio->bi_private = mblk;
bio->bi_end_io = dmz_mblock_bio_end_io;
- bio_set_op_attrs(bio, REQ_OP_READ, REQ_META | REQ_PRIO);
bio_add_page(bio, mblk->page, DMZ_BLOCK_SIZE, 0);
submit_bio(bio);
@@ -725,19 +720,14 @@ static int dmz_write_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk,
if (dmz_bdev_is_dying(dev))
return -EIO;
- bio = bio_alloc(GFP_NOIO, 1);
- if (!bio) {
- set_bit(DMZ_META_ERROR, &mblk->state);
- return -ENOMEM;
- }
+ bio = bio_alloc(dev->bdev, 1, REQ_OP_WRITE | REQ_META | REQ_PRIO,
+ GFP_NOIO);
set_bit(DMZ_META_WRITING, &mblk->state);
bio->bi_iter.bi_sector = dmz_blk2sect(block);
- bio_set_dev(bio, dev->bdev);
bio->bi_private = mblk;
bio->bi_end_io = dmz_mblock_bio_end_io;
- bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_META | REQ_PRIO);
bio_add_page(bio, mblk->page, DMZ_BLOCK_SIZE, 0);
submit_bio(bio);
@@ -759,13 +749,9 @@ static int dmz_rdwr_block(struct dmz_dev *dev, int op,
if (dmz_bdev_is_dying(dev))
return -EIO;
- bio = bio_alloc(GFP_NOIO, 1);
- if (!bio)
- return -ENOMEM;
-
+ bio = bio_alloc(dev->bdev, 1, op | REQ_SYNC | REQ_META | REQ_PRIO,
+ GFP_NOIO);
bio->bi_iter.bi_sector = dmz_blk2sect(block);
- bio_set_dev(bio, dev->bdev);
- bio_set_op_attrs(bio, op, REQ_SYNC | REQ_META | REQ_PRIO);
bio_add_page(bio, page, DMZ_BLOCK_SIZE, 0);
ret = submit_bio_wait(bio);
bio_put(bio);
diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c
index 166c4e9d99c9..a3f6d3ef3817 100644
--- a/drivers/md/dm-zoned-target.c
+++ b/drivers/md/dm-zoned-target.c
@@ -125,11 +125,10 @@ static int dmz_submit_bio(struct dmz_target *dmz, struct dm_zone *zone,
if (dev->flags & DMZ_BDEV_DYING)
return -EIO;
- clone = bio_clone_fast(bio, GFP_NOIO, &dmz->bio_set);
+ clone = bio_alloc_clone(dev->bdev, bio, GFP_NOIO, &dmz->bio_set);
if (!clone)
return -ENOMEM;
- bio_set_dev(clone, dev->bdev);
bioctx->dev = dev;
clone->bi_iter.bi_sector =
dmz_start_sect(dmz->metadata, zone) + dmz_blk2sect(chunk_block);
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 997ace47bbd5..183ce0d6728f 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -79,10 +79,14 @@ struct clone_info {
#define DM_IO_BIO_OFFSET \
(offsetof(struct dm_target_io, clone) + offsetof(struct dm_io, tio))
+static inline struct dm_target_io *clone_to_tio(struct bio *clone)
+{
+ return container_of(clone, struct dm_target_io, clone);
+}
+
void *dm_per_bio_data(struct bio *bio, size_t data_size)
{
- struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
- if (!tio->inside_dm_io)
+ if (!clone_to_tio(bio)->inside_dm_io)
return (char *)bio - DM_TARGET_IO_BIO_OFFSET - data_size;
return (char *)bio - DM_IO_BIO_OFFSET - data_size;
}
@@ -477,10 +481,7 @@ out:
u64 dm_start_time_ns_from_clone(struct bio *bio)
{
- struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
- struct dm_io *io = tio->io;
-
- return jiffies_to_nsecs(io->start_time);
+ return jiffies_to_nsecs(clone_to_tio(bio)->io->start_time);
}
EXPORT_SYMBOL_GPL(dm_start_time_ns_from_clone);
@@ -519,11 +520,9 @@ static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio)
struct dm_target_io *tio;
struct bio *clone;
- clone = bio_alloc_bioset(GFP_NOIO, 0, &md->io_bs);
- if (!clone)
- return NULL;
+ clone = bio_alloc_clone(bio->bi_bdev, bio, GFP_NOIO, &md->io_bs);
- tio = container_of(clone, struct dm_target_io, clone);
+ tio = clone_to_tio(clone);
tio->inside_dm_io = true;
tio->io = NULL;
@@ -545,8 +544,8 @@ static void free_io(struct mapped_device *md, struct dm_io *io)
bio_put(&io->tio.clone);
}
-static struct dm_target_io *alloc_tio(struct clone_info *ci, struct dm_target *ti,
- unsigned target_bio_nr, gfp_t gfp_mask)
+static struct bio *alloc_tio(struct clone_info *ci, struct dm_target *ti,
+ unsigned target_bio_nr, unsigned *len, gfp_t gfp_mask)
{
struct dm_target_io *tio;
@@ -554,11 +553,12 @@ static struct dm_target_io *alloc_tio(struct clone_info *ci, struct dm_target *t
/* the dm_target_io embedded in ci->io is available */
tio = &ci->io->tio;
} else {
- struct bio *clone = bio_alloc_bioset(gfp_mask, 0, &ci->io->md->bs);
+ struct bio *clone = bio_alloc_clone(ci->bio->bi_bdev, ci->bio,
+ gfp_mask, &ci->io->md->bs);
if (!clone)
return NULL;
- tio = container_of(clone, struct dm_target_io, clone);
+ tio = clone_to_tio(clone);
tio->inside_dm_io = false;
}
@@ -566,15 +566,16 @@ static struct dm_target_io *alloc_tio(struct clone_info *ci, struct dm_target *t
tio->io = ci->io;
tio->ti = ti;
tio->target_bio_nr = target_bio_nr;
+ tio->len_ptr = len;
- return tio;
+ return &tio->clone;
}
-static void free_tio(struct dm_target_io *tio)
+static void free_tio(struct bio *clone)
{
- if (tio->inside_dm_io)
+ if (clone_to_tio(clone)->inside_dm_io)
return;
- bio_put(&tio->clone);
+ bio_put(clone);
}
/*
@@ -879,7 +880,7 @@ static bool swap_bios_limit(struct dm_target *ti, struct bio *bio)
static void clone_endio(struct bio *bio)
{
blk_status_t error = bio->bi_status;
- struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
+ struct dm_target_io *tio = clone_to_tio(bio);
struct dm_io *io = tio->io;
struct mapped_device *md = tio->io->md;
dm_endio_fn endio = tio->ti->type->end_io;
@@ -930,7 +931,7 @@ static void clone_endio(struct bio *bio)
up(&md->swap_bios_semaphore);
}
- free_tio(tio);
+ free_tio(bio);
dm_io_dec_pending(io, error);
}
@@ -1085,7 +1086,7 @@ static int dm_dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
*/
void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors)
{
- struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
+ struct dm_target_io *tio = clone_to_tio(bio);
unsigned bi_size = bio->bi_iter.bi_size >> SECTOR_SHIFT;
BUG_ON(bio->bi_opf & REQ_PREFLUSH);
@@ -1115,11 +1116,11 @@ static noinline void __set_swap_bios_limit(struct mapped_device *md, int latch)
mutex_unlock(&md->swap_bios_lock);
}
-static void __map_bio(struct dm_target_io *tio)
+static void __map_bio(struct bio *clone)
{
+ struct dm_target_io *tio = clone_to_tio(clone);
int r;
sector_t sector;
- struct bio *clone = &tio->clone;
struct dm_io *io = tio->io;
struct dm_target *ti = tio->ti;
@@ -1164,7 +1165,7 @@ static void __map_bio(struct dm_target_io *tio)
struct mapped_device *md = io->md;
up(&md->swap_bios_semaphore);
}
- free_tio(tio);
+ free_tio(clone);
dm_io_dec_pending(io, BLK_STS_IOERR);
break;
case DM_MAPIO_REQUEUE:
@@ -1172,7 +1173,7 @@ static void __map_bio(struct dm_target_io *tio)
struct mapped_device *md = io->md;
up(&md->swap_bios_semaphore);
}
- free_tio(tio);
+ free_tio(clone);
dm_io_dec_pending(io, BLK_STS_DM_REQUEUE);
break;
default:
@@ -1190,106 +1191,75 @@ static void bio_setup_sector(struct bio *bio, sector_t sector, unsigned len)
/*
* Creates a bio that consists of range of complete bvecs.
*/
-static int clone_bio(struct dm_target_io *tio, struct bio *bio,
- sector_t sector, unsigned len)
+static int __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti,
+ sector_t sector, unsigned *len)
{
- struct bio *clone = &tio->clone;
- int r;
-
- __bio_clone_fast(clone, bio);
-
- r = bio_crypt_clone(clone, bio, GFP_NOIO);
- if (r < 0)
- return r;
-
- if (bio_integrity(bio)) {
- if (unlikely(!dm_target_has_integrity(tio->ti->type) &&
- !dm_target_passes_integrity(tio->ti->type))) {
- DMWARN("%s: the target %s doesn't support integrity data.",
- dm_device_name(tio->io->md),
- tio->ti->type->name);
- return -EIO;
- }
-
- r = bio_integrity_clone(clone, bio, GFP_NOIO);
- if (r < 0)
- return r;
- }
+ struct bio *bio = ci->bio, *clone;
+ clone = alloc_tio(ci, ti, 0, len, GFP_NOIO);
bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector));
- clone->bi_iter.bi_size = to_bytes(len);
+ clone->bi_iter.bi_size = to_bytes(*len);
if (bio_integrity(bio))
bio_integrity_trim(clone);
+ __map_bio(clone);
return 0;
}
static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci,
- struct dm_target *ti, unsigned num_bios)
+ struct dm_target *ti, unsigned num_bios,
+ unsigned *len)
{
- struct dm_target_io *tio;
+ struct bio *bio;
int try;
- if (!num_bios)
- return;
-
- if (num_bios == 1) {
- tio = alloc_tio(ci, ti, 0, GFP_NOIO);
- bio_list_add(blist, &tio->clone);
- return;
- }
-
for (try = 0; try < 2; try++) {
int bio_nr;
- struct bio *bio;
if (try)
mutex_lock(&ci->io->md->table_devices_lock);
for (bio_nr = 0; bio_nr < num_bios; bio_nr++) {
- tio = alloc_tio(ci, ti, bio_nr, try ? GFP_NOIO : GFP_NOWAIT);
- if (!tio)
+ bio = alloc_tio(ci, ti, bio_nr, len,
+ try ? GFP_NOIO : GFP_NOWAIT);
+ if (!bio)
break;
- bio_list_add(blist, &tio->clone);
+ bio_list_add(blist, bio);
}
if (try)
mutex_unlock(&ci->io->md->table_devices_lock);
if (bio_nr == num_bios)
return;
- while ((bio = bio_list_pop(blist))) {
- tio = container_of(bio, struct dm_target_io, clone);
- free_tio(tio);
- }
+ while ((bio = bio_list_pop(blist)))
+ free_tio(bio);
}
}
-static void __clone_and_map_simple_bio(struct clone_info *ci,
- struct dm_target_io *tio, unsigned *len)
-{
- struct bio *clone = &tio->clone;
-
- tio->len_ptr = len;
-
- __bio_clone_fast(clone, ci->bio);
- if (len)
- bio_setup_sector(clone, ci->sector, *len);
- __map_bio(tio);
-}
-
static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti,
unsigned num_bios, unsigned *len)
{
struct bio_list blist = BIO_EMPTY_LIST;
- struct bio *bio;
- struct dm_target_io *tio;
-
- alloc_multiple_bios(&blist, ci, ti, num_bios);
+ struct bio *clone;
- while ((bio = bio_list_pop(&blist))) {
- tio = container_of(bio, struct dm_target_io, clone);
- __clone_and_map_simple_bio(ci, tio, len);
+ switch (num_bios) {
+ case 0:
+ break;
+ case 1:
+ clone = alloc_tio(ci, ti, 0, len, GFP_NOIO);
+ if (len)
+ bio_setup_sector(clone, ci->sector, *len);
+ __map_bio(clone);
+ break;
+ default:
+ alloc_multiple_bios(&blist, ci, ti, num_bios, len);
+ while ((clone = bio_list_pop(&blist))) {
+ if (len)
+ bio_setup_sector(clone, ci->sector, *len);
+ __map_bio(clone);
+ }
+ break;
}
}
@@ -1304,9 +1274,8 @@ static int __send_empty_flush(struct clone_info *ci)
* need to reference it after submit. It's just used as
* the basis for the clone(s).
*/
- bio_init(&flush_bio, NULL, 0);
- flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
- bio_set_dev(&flush_bio, ci->io->md->disk->part0);
+ bio_init(&flush_bio, ci->io->md->disk->part0, NULL, 0,
+ REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC);
ci->bio = &flush_bio;
ci->sector_count = 0;
@@ -1319,25 +1288,6 @@ static int __send_empty_flush(struct clone_info *ci)
return 0;
}
-static int __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti,
- sector_t sector, unsigned *len)
-{
- struct bio *bio = ci->bio;
- struct dm_target_io *tio;
- int r;
-
- tio = alloc_tio(ci, ti, 0, GFP_NOIO);
- tio->len_ptr = len;
- r = clone_bio(tio, bio, sector, *len);
- if (r < 0) {
- free_tio(tio);
- return r;
- }
- __map_bio(tio);
-
- return 0;
-}
-
static int __send_changing_extent_only(struct clone_info *ci, struct dm_target *ti,
unsigned num_bios)
{
diff --git a/drivers/md/md-faulty.c b/drivers/md/md-faulty.c
index c0dc6f2ef4a3..50ad818978a4 100644
--- a/drivers/md/md-faulty.c
+++ b/drivers/md/md-faulty.c
@@ -205,9 +205,9 @@ static bool faulty_make_request(struct mddev *mddev, struct bio *bio)
}
}
if (failit) {
- struct bio *b = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
+ struct bio *b = bio_alloc_clone(conf->rdev->bdev, bio, GFP_NOIO,
+ &mddev->bio_set);
- bio_set_dev(b, conf->rdev->bdev);
b->bi_private = bio;
b->bi_end_io = faulty_fail;
bio = b;
diff --git a/drivers/md/md-multipath.c b/drivers/md/md-multipath.c
index e7d6486f090f..3081a936350d 100644
--- a/drivers/md/md-multipath.c
+++ b/drivers/md/md-multipath.c
@@ -121,11 +121,9 @@ static bool multipath_make_request(struct mddev *mddev, struct bio * bio)
}
multipath = conf->multipaths + mp_bh->path;
- bio_init(&mp_bh->bio, NULL, 0);
- __bio_clone_fast(&mp_bh->bio, bio);
+ bio_init_clone(multipath->rdev->bdev, &mp_bh->bio, bio, GFP_NOIO);
mp_bh->bio.bi_iter.bi_sector += multipath->rdev->data_offset;
- bio_set_dev(&mp_bh->bio, multipath->rdev->bdev);
mp_bh->bio.bi_opf |= REQ_FAILFAST_TRANSPORT;
mp_bh->bio.bi_end_io = multipath_end_request;
mp_bh->bio.bi_private = mp_bh;
@@ -299,7 +297,6 @@ static void multipathd(struct md_thread *thread)
md_check_recovery(mddev);
for (;;) {
- char b[BDEVNAME_SIZE];
spin_lock_irqsave(&conf->device_lock, flags);
if (list_empty(head))
break;
@@ -311,13 +308,13 @@ static void multipathd(struct md_thread *thread)
bio->bi_iter.bi_sector = mp_bh->master_bio->bi_iter.bi_sector;
if ((mp_bh->path = multipath_map (conf))<0) {
- pr_err("multipath: %s: unrecoverable IO read error for block %llu\n",
- bio_devname(bio, b),
+ pr_err("multipath: %pg: unrecoverable IO read error for block %llu\n",
+ bio->bi_bdev,
(unsigned long long)bio->bi_iter.bi_sector);
multipath_end_bh_io(mp_bh, BLK_STS_IOERR);
} else {
- pr_err("multipath: %s: redirecting sector %llu to another IO path\n",
- bio_devname(bio, b),
+ pr_err("multipath: %pg: redirecting sector %llu to another IO path\n",
+ bio->bi_bdev,
(unsigned long long)bio->bi_iter.bi_sector);
*bio = *(mp_bh->master_bio);
bio->bi_iter.bi_sector +=
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 4d38bd7dadd6..f210a55af201 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -562,11 +562,11 @@ static void submit_flushes(struct work_struct *ws)
atomic_inc(&rdev->nr_pending);
atomic_inc(&rdev->nr_pending);
rcu_read_unlock();
- bi = bio_alloc_bioset(GFP_NOIO, 0, &mddev->bio_set);
+ bi = bio_alloc_bioset(rdev->bdev, 0,
+ REQ_OP_WRITE | REQ_PREFLUSH,
+ GFP_NOIO, &mddev->bio_set);
bi->bi_end_io = md_end_flush;
bi->bi_private = rdev;
- bio_set_dev(bi, rdev->bdev);
- bi->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
atomic_inc(&mddev->flush_pending);
submit_bio(bi);
rcu_read_lock();
@@ -955,7 +955,6 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
* If an error occurred, call md_error
*/
struct bio *bio;
- int ff = 0;
if (!page)
return;
@@ -963,11 +962,13 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
if (test_bit(Faulty, &rdev->flags))
return;
- bio = bio_alloc_bioset(GFP_NOIO, 1, &mddev->sync_set);
+ bio = bio_alloc_bioset(rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev,
+ 1,
+ REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH | REQ_FUA,
+ GFP_NOIO, &mddev->sync_set);
atomic_inc(&rdev->nr_pending);
- bio_set_dev(bio, rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev);
bio->bi_iter.bi_sector = sector;
bio_add_page(bio, page, size, 0);
bio->bi_private = rdev;
@@ -976,8 +977,7 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
if (test_bit(MD_FAILFAST_SUPPORTED, &mddev->flags) &&
test_bit(FailFast, &rdev->flags) &&
!test_bit(LastDev, &rdev->flags))
- ff = MD_FAILFAST;
- bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH | REQ_FUA | ff;
+ bio->bi_opf |= MD_FAILFAST;
atomic_inc(&mddev->pending_writes);
submit_bio(bio);
@@ -998,13 +998,11 @@ int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
struct bio bio;
struct bio_vec bvec;
- bio_init(&bio, &bvec, 1);
-
if (metadata_op && rdev->meta_bdev)
- bio_set_dev(&bio, rdev->meta_bdev);
+ bio_init(&bio, rdev->meta_bdev, &bvec, 1, op | op_flags);
else
- bio_set_dev(&bio, rdev->bdev);
- bio.bi_opf = op | op_flags;
+ bio_init(&bio, rdev->bdev, &bvec, 1, op | op_flags);
+
if (metadata_op)
bio.bi_iter.bi_sector = sector + rdev->sb_start;
else if (rdev->mddev->reshape_position != MaxSector &&
@@ -8636,13 +8634,14 @@ static void md_end_io_acct(struct bio *bio)
*/
void md_account_bio(struct mddev *mddev, struct bio **bio)
{
+ struct block_device *bdev = (*bio)->bi_bdev;
struct md_io_acct *md_io_acct;
struct bio *clone;
- if (!blk_queue_io_stat((*bio)->bi_bdev->bd_disk->queue))
+ if (!blk_queue_io_stat(bdev->bd_disk->queue))
return;
- clone = bio_clone_fast(*bio, GFP_NOIO, &mddev->io_acct_set);
+ clone = bio_alloc_clone(bdev, *bio, GFP_NOIO, &mddev->io_acct_set);
md_io_acct = container_of(clone, struct md_io_acct, bio_clone);
md_io_acct->orig_bio = *bio;
md_io_acct->start_time = bio_start_io_acct(*bio);
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index e2d8acb1e988..03477e971699 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1126,7 +1126,8 @@ static void alloc_behind_master_bio(struct r1bio *r1_bio,
int i = 0;
struct bio *behind_bio = NULL;
- behind_bio = bio_alloc_bioset(GFP_NOIO, vcnt, &r1_bio->mddev->bio_set);
+ behind_bio = bio_alloc_bioset(NULL, vcnt, 0, GFP_NOIO,
+ &r1_bio->mddev->bio_set);
if (!behind_bio)
return;
@@ -1319,13 +1320,13 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
if (!r1bio_existed && blk_queue_io_stat(bio->bi_bdev->bd_disk->queue))
r1_bio->start_time = bio_start_io_acct(bio);
- read_bio = bio_clone_fast(bio, gfp, &mddev->bio_set);
+ read_bio = bio_alloc_clone(mirror->rdev->bdev, bio, gfp,
+ &mddev->bio_set);
r1_bio->bios[rdisk] = read_bio;
read_bio->bi_iter.bi_sector = r1_bio->sector +
mirror->rdev->data_offset;
- bio_set_dev(read_bio, mirror->rdev->bdev);
read_bio->bi_end_io = raid1_end_read_request;
bio_set_op_attrs(read_bio, op, do_sync);
if (test_bit(FailFast, &mirror->rdev->flags) &&
@@ -1545,24 +1546,25 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
first_clone = 0;
}
- if (r1_bio->behind_master_bio)
- mbio = bio_clone_fast(r1_bio->behind_master_bio,
- GFP_NOIO, &mddev->bio_set);
- else
- mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
-
if (r1_bio->behind_master_bio) {
+ mbio = bio_alloc_clone(rdev->bdev,
+ r1_bio->behind_master_bio,
+ GFP_NOIO, &mddev->bio_set);
if (test_bit(CollisionCheck, &rdev->flags))
wait_for_serialization(rdev, r1_bio);
if (test_bit(WriteMostly, &rdev->flags))
atomic_inc(&r1_bio->behind_remaining);
- } else if (mddev->serialize_policy)
- wait_for_serialization(rdev, r1_bio);
+ } else {
+ mbio = bio_alloc_clone(rdev->bdev, bio, GFP_NOIO,
+ &mddev->bio_set);
+
+ if (mddev->serialize_policy)
+ wait_for_serialization(rdev, r1_bio);
+ }
r1_bio->bios[i] = mbio;
mbio->bi_iter.bi_sector = (r1_bio->sector + rdev->data_offset);
- bio_set_dev(mbio, rdev->bdev);
mbio->bi_end_io = raid1_end_write_request;
mbio->bi_opf = bio_op(bio) | (bio->bi_opf & (REQ_SYNC | REQ_FUA));
if (test_bit(FailFast, &rdev->flags) &&
@@ -2070,15 +2072,14 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
} while (!success && d != r1_bio->read_disk);
if (!success) {
- char b[BDEVNAME_SIZE];
int abort = 0;
/* Cannot read from anywhere, this block is lost.
* Record a bad block on each device. If that doesn't
* work just disable and interrupt the recovery.
* Don't fail devices as that won't really help.
*/
- pr_crit_ratelimited("md/raid1:%s: %s: unrecoverable I/O read error for block %llu\n",
- mdname(mddev), bio_devname(bio, b),
+ pr_crit_ratelimited("md/raid1:%s: %pg: unrecoverable I/O read error for block %llu\n",
+ mdname(mddev), bio->bi_bdev,
(unsigned long long)r1_bio->sector);
for (d = 0; d < conf->raid_disks * 2; d++) {
rdev = conf->mirrors[d].rdev;
@@ -2165,11 +2166,10 @@ static void process_checks(struct r1bio *r1_bio)
continue;
/* fixup the bio for reuse, but preserve errno */
status = b->bi_status;
- bio_reset(b);
+ bio_reset(b, conf->mirrors[i].rdev->bdev, REQ_OP_READ);
b->bi_status = status;
b->bi_iter.bi_sector = r1_bio->sector +
conf->mirrors[i].rdev->data_offset;
- bio_set_dev(b, conf->mirrors[i].rdev->bdev);
b->bi_end_io = end_sync_read;
rp->raid_bio = r1_bio;
b->bi_private = rp;
@@ -2416,12 +2416,12 @@ static int narrow_write_error(struct r1bio *r1_bio, int i)
/* Write at 'sector' for 'sectors'*/
if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
- wbio = bio_clone_fast(r1_bio->behind_master_bio,
- GFP_NOIO,
- &mddev->bio_set);
+ wbio = bio_alloc_clone(rdev->bdev,
+ r1_bio->behind_master_bio,
+ GFP_NOIO, &mddev->bio_set);
} else {
- wbio = bio_clone_fast(r1_bio->master_bio, GFP_NOIO,
- &mddev->bio_set);
+ wbio = bio_alloc_clone(rdev->bdev, r1_bio->master_bio,
+ GFP_NOIO, &mddev->bio_set);
}
bio_set_op_attrs(wbio, REQ_OP_WRITE, 0);
@@ -2430,7 +2430,6 @@ static int narrow_write_error(struct r1bio *r1_bio, int i)
bio_trim(wbio, sector - r1_bio->sector, sectors);
wbio->bi_iter.bi_sector += rdev->data_offset;
- bio_set_dev(wbio, rdev->bdev);
if (submit_bio_wait(wbio) < 0)
/* failure! */
@@ -2650,7 +2649,7 @@ static struct r1bio *raid1_alloc_init_r1buf(struct r1conf *conf)
for (i = conf->poolinfo->raid_disks; i--; ) {
bio = r1bio->bios[i];
rps = bio->bi_private;
- bio_reset(bio);
+ bio_reset(bio, NULL, 0);
bio->bi_private = rps;
}
r1bio->master_bio = NULL;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 2b969f70a31f..5dd2e17e1d0e 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1208,14 +1208,13 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue))
r10_bio->start_time = bio_start_io_acct(bio);
- read_bio = bio_clone_fast(bio, gfp, &mddev->bio_set);
+ read_bio = bio_alloc_clone(rdev->bdev, bio, gfp, &mddev->bio_set);
r10_bio->devs[slot].bio = read_bio;
r10_bio->devs[slot].rdev = rdev;
read_bio->bi_iter.bi_sector = r10_bio->devs[slot].addr +
choose_data_offset(r10_bio, rdev);
- bio_set_dev(read_bio, rdev->bdev);
read_bio->bi_end_io = raid10_end_read_request;
bio_set_op_attrs(read_bio, op, do_sync);
if (test_bit(FailFast, &rdev->flags) &&
@@ -1255,7 +1254,7 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
} else
rdev = conf->mirrors[devnum].rdev;
- mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
+ mbio = bio_alloc_clone(rdev->bdev, bio, GFP_NOIO, &mddev->bio_set);
if (replacement)
r10_bio->devs[n_copy].repl_bio = mbio;
else
@@ -1263,7 +1262,6 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
mbio->bi_iter.bi_sector = (r10_bio->devs[n_copy].addr +
choose_data_offset(r10_bio, rdev));
- bio_set_dev(mbio, rdev->bdev);
mbio->bi_end_io = raid10_end_write_request;
bio_set_op_attrs(mbio, op, do_sync | do_fua);
if (!replacement && test_bit(FailFast,
@@ -1812,7 +1810,8 @@ retry_discard:
*/
if (r10_bio->devs[disk].bio) {
struct md_rdev *rdev = conf->mirrors[disk].rdev;
- mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
+ mbio = bio_alloc_clone(bio->bi_bdev, bio, GFP_NOIO,
+ &mddev->bio_set);
mbio->bi_end_io = raid10_end_discard_request;
mbio->bi_private = r10_bio;
r10_bio->devs[disk].bio = mbio;
@@ -1825,7 +1824,8 @@ retry_discard:
}
if (r10_bio->devs[disk].repl_bio) {
struct md_rdev *rrdev = conf->mirrors[disk].replacement;
- rbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
+ rbio = bio_alloc_clone(bio->bi_bdev, bio, GFP_NOIO,
+ &mddev->bio_set);
rbio->bi_end_io = raid10_end_discard_request;
rbio->bi_private = r10_bio;
r10_bio->devs[disk].repl_bio = rbio;
@@ -2422,7 +2422,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
* bi_vecs, as the read request might have corrupted these
*/
rp = get_resync_pages(tbio);
- bio_reset(tbio);
+ bio_reset(tbio, conf->mirrors[d].rdev->bdev, REQ_OP_WRITE);
md_bio_reset_resync_pages(tbio, rp, fbio->bi_iter.bi_size);
@@ -2430,7 +2430,6 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
tbio->bi_private = rp;
tbio->bi_iter.bi_sector = r10_bio->devs[i].addr;
tbio->bi_end_io = end_sync_write;
- bio_set_op_attrs(tbio, REQ_OP_WRITE, 0);
bio_copy_data(tbio, fbio);
@@ -2441,7 +2440,6 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
if (test_bit(FailFast, &conf->mirrors[d].rdev->flags))
tbio->bi_opf |= MD_FAILFAST;
tbio->bi_iter.bi_sector += conf->mirrors[d].rdev->data_offset;
- bio_set_dev(tbio, conf->mirrors[d].rdev->bdev);
submit_bio_noacct(tbio);
}
@@ -2894,12 +2892,12 @@ static int narrow_write_error(struct r10bio *r10_bio, int i)
if (sectors > sect_to_write)
sectors = sect_to_write;
/* Write at 'sector' for 'sectors' */
- wbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
+ wbio = bio_alloc_clone(rdev->bdev, bio, GFP_NOIO,
+ &mddev->bio_set);
bio_trim(wbio, sector - bio->bi_iter.bi_sector, sectors);
wsector = r10_bio->devs[i].addr + (sector - r10_bio->sector);
wbio->bi_iter.bi_sector = wsector +
choose_data_offset(r10_bio, rdev);
- bio_set_dev(wbio, rdev->bdev);
bio_set_op_attrs(wbio, REQ_OP_WRITE, 0);
if (submit_bio_wait(wbio) < 0)
@@ -3160,12 +3158,12 @@ static struct r10bio *raid10_alloc_init_r10buf(struct r10conf *conf)
for (i = 0; i < nalloc; i++) {
bio = r10bio->devs[i].bio;
rp = bio->bi_private;
- bio_reset(bio);
+ bio_reset(bio, NULL, 0);
bio->bi_private = rp;
bio = r10bio->devs[i].repl_bio;
if (bio) {
rp = bio->bi_private;
- bio_reset(bio);
+ bio_reset(bio, NULL, 0);
bio->bi_private = rp;
}
}
@@ -4892,14 +4890,12 @@ read_more:
return sectors_done;
}
- read_bio = bio_alloc_bioset(GFP_KERNEL, RESYNC_PAGES, &mddev->bio_set);
-
- bio_set_dev(read_bio, rdev->bdev);
+ read_bio = bio_alloc_bioset(rdev->bdev, RESYNC_PAGES, REQ_OP_READ,
+ GFP_KERNEL, &mddev->bio_set);
read_bio->bi_iter.bi_sector = (r10_bio->devs[r10_bio->read_slot].addr
+ rdev->data_offset);
read_bio->bi_private = r10_bio;
read_bio->bi_end_io = end_reshape_read;
- bio_set_op_attrs(read_bio, REQ_OP_READ, 0);
r10_bio->master_bio = read_bio;
r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum;
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 0b5dcaabbc15..86e2bb89d9c7 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -735,10 +735,9 @@ static void r5l_submit_current_io(struct r5l_log *log)
static struct bio *r5l_bio_alloc(struct r5l_log *log)
{
- struct bio *bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_VECS, &log->bs);
+ struct bio *bio = bio_alloc_bioset(log->rdev->bdev, BIO_MAX_VECS,
+ REQ_OP_WRITE, GFP_NOIO, &log->bs);
- bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
- bio_set_dev(bio, log->rdev->bdev);
bio->bi_iter.bi_sector = log->rdev->data_offset + log->log_start;
return bio;
@@ -1302,10 +1301,9 @@ void r5l_flush_stripe_to_raid(struct r5l_log *log)
if (!do_flush)
return;
- bio_reset(&log->flush_bio);
- bio_set_dev(&log->flush_bio, log->rdev->bdev);
+ bio_reset(&log->flush_bio, log->rdev->bdev,
+ REQ_OP_WRITE | REQ_PREFLUSH);
log->flush_bio.bi_end_io = r5l_log_flush_endio;
- log->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
submit_bio(&log->flush_bio);
}
@@ -1634,7 +1632,8 @@ static int r5l_recovery_allocate_ra_pool(struct r5l_log *log,
{
struct page *page;
- ctx->ra_bio = bio_alloc_bioset(GFP_KERNEL, BIO_MAX_VECS, &log->bs);
+ ctx->ra_bio = bio_alloc_bioset(NULL, BIO_MAX_VECS, 0, GFP_KERNEL,
+ &log->bs);
if (!ctx->ra_bio)
return -ENOMEM;
@@ -1678,9 +1677,7 @@ static int r5l_recovery_fetch_ra_pool(struct r5l_log *log,
struct r5l_recovery_ctx *ctx,
sector_t offset)
{
- bio_reset(ctx->ra_bio);
- bio_set_dev(ctx->ra_bio, log->rdev->bdev);
- bio_set_op_attrs(ctx->ra_bio, REQ_OP_READ, 0);
+ bio_reset(ctx->ra_bio, log->rdev->bdev, REQ_OP_READ);
ctx->ra_bio->bi_iter.bi_sector = log->rdev->data_offset + offset;
ctx->valid_pages = 0;
@@ -3108,7 +3105,7 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
INIT_LIST_HEAD(&log->io_end_ios);
INIT_LIST_HEAD(&log->flushing_ios);
INIT_LIST_HEAD(&log->finished_ios);
- bio_init(&log->flush_bio, NULL, 0);
+ bio_init(&log->flush_bio, NULL, NULL, 0, 0);
log->io_kc = KMEM_CACHE(r5l_io_unit, 0);
if (!log->io_kc)
diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
index 4ab417915d7f..bbb5673104ec 100644
--- a/drivers/md/raid5-ppl.c
+++ b/drivers/md/raid5-ppl.c
@@ -250,7 +250,7 @@ static struct ppl_io_unit *ppl_new_iounit(struct ppl_log *log,
INIT_LIST_HEAD(&io->stripe_list);
atomic_set(&io->pending_stripes, 0);
atomic_set(&io->pending_flushes, 0);
- bio_init(&io->bio, io->biovec, PPL_IO_INLINE_BVECS);
+ bio_init(&io->bio, NULL, io->biovec, PPL_IO_INLINE_BVECS, 0);
pplhdr = page_address(io->header_page);
clear_page(pplhdr);
@@ -416,12 +416,10 @@ static void ppl_log_endio(struct bio *bio)
static void ppl_submit_iounit_bio(struct ppl_io_unit *io, struct bio *bio)
{
- char b[BDEVNAME_SIZE];
-
- pr_debug("%s: seq: %llu size: %u sector: %llu dev: %s\n",
+ pr_debug("%s: seq: %llu size: %u sector: %llu dev: %pg\n",
__func__, io->seq, bio->bi_iter.bi_size,
(unsigned long long)bio->bi_iter.bi_sector,
- bio_devname(bio, b));
+ bio->bi_bdev);
submit_bio(bio);
}
@@ -496,11 +494,10 @@ static void ppl_submit_iounit(struct ppl_io_unit *io)
if (!bio_add_page(bio, sh->ppl_page, PAGE_SIZE, 0)) {
struct bio *prev = bio;
- bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_VECS,
+ bio = bio_alloc_bioset(prev->bi_bdev, BIO_MAX_VECS,
+ prev->bi_opf, GFP_NOIO,
&ppl_conf->bs);
- bio->bi_opf = prev->bi_opf;
bio->bi_write_hint = prev->bi_write_hint;
- bio_copy_dev(bio, prev);
bio->bi_iter.bi_sector = bio_end_sector(prev);
bio_add_page(bio, sh->ppl_page, PAGE_SIZE, 0);
@@ -590,9 +587,8 @@ static void ppl_flush_endio(struct bio *bio)
struct ppl_log *log = io->log;
struct ppl_conf *ppl_conf = log->ppl_conf;
struct r5conf *conf = ppl_conf->mddev->private;
- char b[BDEVNAME_SIZE];
- pr_debug("%s: dev: %s\n", __func__, bio_devname(bio, b));
+ pr_debug("%s: dev: %pg\n", __func__, bio->bi_bdev);
if (bio->bi_status) {
struct md_rdev *rdev;
@@ -635,16 +631,14 @@ static void ppl_do_flush(struct ppl_io_unit *io)
if (bdev) {
struct bio *bio;
- char b[BDEVNAME_SIZE];
- bio = bio_alloc_bioset(GFP_NOIO, 0, &ppl_conf->flush_bs);
- bio_set_dev(bio, bdev);
+ bio = bio_alloc_bioset(bdev, 0, GFP_NOIO,
+ REQ_OP_WRITE | REQ_PREFLUSH,
+ &ppl_conf->flush_bs);
bio->bi_private = io;
- bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
bio->bi_end_io = ppl_flush_endio;
- pr_debug("%s: dev: %s\n", __func__,
- bio_devname(bio, b));
+ pr_debug("%s: dev: %ps\n", __func__, bio->bi_bdev);
submit_bio(bio);
flushed_disks++;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index ffe720c73b0a..8891aaba6596 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2310,8 +2310,8 @@ static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp,
for (i = 0; i < disks; i++) {
struct r5dev *dev = &sh->dev[i];
- bio_init(&dev->req, &dev->vec, 1);
- bio_init(&dev->rreq, &dev->rvec, 1);
+ bio_init(&dev->req, NULL, &dev->vec, 1, 0);
+ bio_init(&dev->rreq, NULL, &dev->rvec, 1, 0);
}
if (raid5_has_ppl(conf)) {
@@ -2677,7 +2677,7 @@ static void raid5_end_read_request(struct bio * bi)
(unsigned long long)sh->sector, i, atomic_read(&sh->count),
bi->bi_status);
if (i == disks) {
- bio_reset(bi);
+ bio_reset(bi, NULL, 0);
BUG();
return;
}
@@ -2785,7 +2785,7 @@ static void raid5_end_read_request(struct bio * bi)
}
}
rdev_dec_pending(rdev, conf->mddev);
- bio_reset(bi);
+ bio_reset(bi, NULL, 0);
clear_bit(R5_LOCKED, &sh->dev[i].flags);
set_bit(STRIPE_HANDLE, &sh->state);
raid5_release_stripe(sh);
@@ -2823,7 +2823,7 @@ static void raid5_end_write_request(struct bio *bi)
(unsigned long long)sh->sector, i, atomic_read(&sh->count),
bi->bi_status);
if (i == disks) {
- bio_reset(bi);
+ bio_reset(bi, NULL, 0);
BUG();
return;
}
@@ -2860,7 +2860,7 @@ static void raid5_end_write_request(struct bio *bi)
if (sh->batch_head && bi->bi_status && !replacement)
set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state);
- bio_reset(bi);
+ bio_reset(bi, NULL, 0);
if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags))
clear_bit(R5_LOCKED, &sh->dev[i].flags);
set_bit(STRIPE_HANDLE, &sh->state);
@@ -5438,14 +5438,14 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
return 0;
}
- align_bio = bio_clone_fast(raid_bio, GFP_NOIO, &mddev->io_acct_set);
+ align_bio = bio_alloc_clone(rdev->bdev, raid_bio, GFP_NOIO,
+ &mddev->io_acct_set);
md_io_acct = container_of(align_bio, struct md_io_acct, bio_clone);
raid_bio->bi_next = (void *)rdev;
if (blk_queue_io_stat(raid_bio->bi_bdev->bd_disk->queue))
md_io_acct->start_time = bio_start_io_acct(raid_bio);
md_io_acct->orig_bio = raid_bio;
- bio_set_dev(align_bio, rdev->bdev);
align_bio->bi_end_io = raid5_align_endio;
align_bio->bi_private = md_io_acct;
align_bio->bi_iter.bi_sector = sector;
diff --git a/drivers/memstick/core/ms_block.c b/drivers/memstick/core/ms_block.c
index 0cda6c6baefc..3993bdd4b519 100644
--- a/drivers/memstick/core/ms_block.c
+++ b/drivers/memstick/core/ms_block.c
@@ -1943,22 +1943,6 @@ static void msb_io_work(struct work_struct *work)
static DEFINE_IDR(msb_disk_idr); /*set of used disk numbers */
static DEFINE_MUTEX(msb_disk_lock); /* protects against races in open/release */
-static int msb_bd_open(struct block_device *bdev, fmode_t mode)
-{
- struct gendisk *disk = bdev->bd_disk;
- struct msb_data *msb = disk->private_data;
-
- dbg_verbose("block device open");
-
- mutex_lock(&msb_disk_lock);
-
- if (msb && msb->card)
- msb->usage_count++;
-
- mutex_unlock(&msb_disk_lock);
- return 0;
-}
-
static void msb_data_clear(struct msb_data *msb)
{
kfree(msb->boot_page);
@@ -1968,33 +1952,6 @@ static void msb_data_clear(struct msb_data *msb)
msb->card = NULL;
}
-static int msb_disk_release(struct gendisk *disk)
-{
- struct msb_data *msb = disk->private_data;
-
- dbg_verbose("block device release");
- mutex_lock(&msb_disk_lock);
-
- if (msb) {
- if (msb->usage_count)
- msb->usage_count--;
-
- if (!msb->usage_count) {
- disk->private_data = NULL;
- idr_remove(&msb_disk_idr, msb->disk_id);
- put_disk(disk);
- kfree(msb);
- }
- }
- mutex_unlock(&msb_disk_lock);
- return 0;
-}
-
-static void msb_bd_release(struct gendisk *disk, fmode_t mode)
-{
- msb_disk_release(disk);
-}
-
static int msb_bd_getgeo(struct block_device *bdev,
struct hd_geometry *geo)
{
@@ -2003,6 +1960,17 @@ static int msb_bd_getgeo(struct block_device *bdev,
return 0;
}
+static void msb_bd_free_disk(struct gendisk *disk)
+{
+ struct msb_data *msb = disk->private_data;
+
+ mutex_lock(&msb_disk_lock);
+ idr_remove(&msb_disk_idr, msb->disk_id);
+ mutex_unlock(&msb_disk_lock);
+
+ kfree(msb);
+}
+
static blk_status_t msb_queue_rq(struct blk_mq_hw_ctx *hctx,
const struct blk_mq_queue_data *bd)
{
@@ -2096,10 +2064,9 @@ static void msb_start(struct memstick_dev *card)
}
static const struct block_device_operations msb_bdops = {
- .open = msb_bd_open,
- .release = msb_bd_release,
- .getgeo = msb_bd_getgeo,
- .owner = THIS_MODULE
+ .owner = THIS_MODULE,
+ .getgeo = msb_bd_getgeo,
+ .free_disk = msb_bd_free_disk,
};
static const struct blk_mq_ops msb_mq_ops = {
@@ -2147,7 +2114,6 @@ static int msb_init_disk(struct memstick_dev *card)
set_capacity(msb->disk, capacity);
dbg("Set total disk size to %lu sectors", capacity);
- msb->usage_count = 1;
msb->io_queue = alloc_ordered_workqueue("ms_block", WQ_MEM_RECLAIM);
INIT_WORK(&msb->io_work, msb_io_work);
sg_init_table(msb->prealloc_sg, MS_BLOCK_MAX_SEGS+1);
@@ -2229,7 +2195,7 @@ static void msb_remove(struct memstick_dev *card)
msb_data_clear(msb);
mutex_unlock(&msb_disk_lock);
- msb_disk_release(msb->disk);
+ put_disk(msb->disk);
memstick_set_drvdata(card, NULL);
}
diff --git a/drivers/memstick/core/ms_block.h b/drivers/memstick/core/ms_block.h
index 122e1a8a8bd5..7058f9aefeb9 100644
--- a/drivers/memstick/core/ms_block.h
+++ b/drivers/memstick/core/ms_block.h
@@ -143,7 +143,6 @@ struct ms_boot_page {
} __packed;
struct msb_data {
- unsigned int usage_count;
struct memstick_dev *card;
struct gendisk *disk;
struct request_queue *queue;
diff --git a/drivers/memstick/core/mspro_block.c b/drivers/memstick/core/mspro_block.c
index c0450397b673..725ba74ded30 100644
--- a/drivers/memstick/core/mspro_block.c
+++ b/drivers/memstick/core/mspro_block.c
@@ -133,7 +133,6 @@ struct mspro_devinfo {
struct mspro_block_data {
struct memstick_dev *card;
- unsigned int usage_count;
unsigned int caps;
struct gendisk *disk;
struct request_queue *queue;
@@ -178,53 +177,16 @@ static int mspro_block_complete_req(struct memstick_dev *card, int error);
/*** Block device ***/
-static int mspro_block_bd_open(struct block_device *bdev, fmode_t mode)
-{
- struct gendisk *disk = bdev->bd_disk;
- struct mspro_block_data *msb = disk->private_data;
- int rc = -ENXIO;
-
- mutex_lock(&mspro_block_disk_lock);
-
- if (msb && msb->card) {
- msb->usage_count++;
- if ((mode & FMODE_WRITE) && msb->read_only)
- rc = -EROFS;
- else
- rc = 0;
- }
-
- mutex_unlock(&mspro_block_disk_lock);
-
- return rc;
-}
-
-
-static void mspro_block_disk_release(struct gendisk *disk)
+static void mspro_block_bd_free_disk(struct gendisk *disk)
{
struct mspro_block_data *msb = disk->private_data;
int disk_id = MINOR(disk_devt(disk)) >> MSPRO_BLOCK_PART_SHIFT;
mutex_lock(&mspro_block_disk_lock);
-
- if (msb) {
- if (msb->usage_count)
- msb->usage_count--;
-
- if (!msb->usage_count) {
- kfree(msb);
- disk->private_data = NULL;
- idr_remove(&mspro_block_disk_idr, disk_id);
- put_disk(disk);
- }
- }
-
+ idr_remove(&mspro_block_disk_idr, disk_id);
mutex_unlock(&mspro_block_disk_lock);
-}
-static void mspro_block_bd_release(struct gendisk *disk, fmode_t mode)
-{
- mspro_block_disk_release(disk);
+ kfree(msb);
}
static int mspro_block_bd_getgeo(struct block_device *bdev,
@@ -240,10 +202,9 @@ static int mspro_block_bd_getgeo(struct block_device *bdev,
}
static const struct block_device_operations ms_block_bdops = {
- .open = mspro_block_bd_open,
- .release = mspro_block_bd_release,
- .getgeo = mspro_block_bd_getgeo,
- .owner = THIS_MODULE
+ .owner = THIS_MODULE,
+ .getgeo = mspro_block_bd_getgeo,
+ .free_disk = mspro_block_bd_free_disk,
};
/*** Information ***/
@@ -1226,7 +1187,6 @@ static int mspro_block_init_disk(struct memstick_dev *card)
msb->disk->first_minor = disk_id << MSPRO_BLOCK_PART_SHIFT;
msb->disk->minors = 1 << MSPRO_BLOCK_PART_SHIFT;
msb->disk->fops = &ms_block_bdops;
- msb->usage_count = 1;
msb->disk->private_data = msb;
sprintf(msb->disk->disk_name, "mspblk%d", disk_id);
@@ -1239,6 +1199,9 @@ static int mspro_block_init_disk(struct memstick_dev *card)
set_capacity(msb->disk, capacity);
dev_dbg(&card->dev, "capacity set %ld\n", capacity);
+ if (msb->read_only)
+ set_disk_ro(msb->disk, true);
+
rc = device_add_disk(&card->dev, msb->disk, NULL);
if (rc)
goto out_cleanup_disk;
@@ -1341,7 +1304,7 @@ static void mspro_block_remove(struct memstick_dev *card)
mspro_block_data_clear(msb);
mutex_unlock(&mspro_block_disk_lock);
- mspro_block_disk_release(msb->disk);
+ put_disk(msb->disk);
memstick_set_drvdata(card, NULL);
}
diff --git a/drivers/mtd/mtdswap.c b/drivers/mtd/mtdswap.c
index e86b04bc1d6b..dc7f1532a37f 100644
--- a/drivers/mtd/mtdswap.c
+++ b/drivers/mtd/mtdswap.c
@@ -19,7 +19,7 @@
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
-#include <linux/genhd.h>
+#include <linux/blkdev.h>
#include <linux/swap.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
diff --git a/drivers/mtd/nand/raw/sharpsl.c b/drivers/mtd/nand/raw/sharpsl.c
index 5612ee628425..52ce5162538a 100644
--- a/drivers/mtd/nand/raw/sharpsl.c
+++ b/drivers/mtd/nand/raw/sharpsl.c
@@ -6,7 +6,6 @@
* Based on Sharp's NAND driver sharp_sl.c
*/
-#include <linux/genhd.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/delay.h>
diff --git a/drivers/nvdimm/blk.c b/drivers/nvdimm/blk.c
index 228c33b8d1d6..c1db43524d75 100644
--- a/drivers/nvdimm/blk.c
+++ b/drivers/nvdimm/blk.c
@@ -6,7 +6,6 @@
#include <linux/blkdev.h>
#include <linux/fs.h>
-#include <linux/genhd.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/nd.h>
diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c
index da3f007a1211..cbd994f7f1fe 100644
--- a/drivers/nvdimm/btt.c
+++ b/drivers/nvdimm/btt.c
@@ -11,7 +11,6 @@
#include <linux/device.h>
#include <linux/mutex.h>
#include <linux/hdreg.h>
-#include <linux/genhd.h>
#include <linux/sizes.h>
#include <linux/ndctl.h>
#include <linux/fs.h>
diff --git a/drivers/nvdimm/btt_devs.c b/drivers/nvdimm/btt_devs.c
index 8b52e5144f08..e5a58520d398 100644
--- a/drivers/nvdimm/btt_devs.c
+++ b/drivers/nvdimm/btt_devs.c
@@ -4,7 +4,6 @@
*/
#include <linux/blkdev.h>
#include <linux/device.h>
-#include <linux/genhd.h>
#include <linux/sizes.h>
#include <linux/slab.h>
#include <linux/fs.h>
diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
index 9dc7f3edd42b..5bbe31b08581 100644
--- a/drivers/nvdimm/bus.c
+++ b/drivers/nvdimm/bus.c
@@ -11,7 +11,6 @@
#include <linux/blkdev.h>
#include <linux/fcntl.h>
#include <linux/async.h>
-#include <linux/genhd.h>
#include <linux/ndctl.h>
#include <linux/sched.h>
#include <linux/slab.h>
diff --git a/drivers/nvdimm/nd_virtio.c b/drivers/nvdimm/nd_virtio.c
index 10351d5b49fa..c6a648fd8744 100644
--- a/drivers/nvdimm/nd_virtio.c
+++ b/drivers/nvdimm/nd_virtio.c
@@ -105,12 +105,12 @@ int async_pmem_flush(struct nd_region *nd_region, struct bio *bio)
* parent bio. Otherwise directly call nd_region flush.
*/
if (bio && bio->bi_iter.bi_sector != -1) {
- struct bio *child = bio_alloc(GFP_ATOMIC, 0);
+ struct bio *child = bio_alloc(bio->bi_bdev, 0, REQ_PREFLUSH,
+ GFP_ATOMIC);
if (!child)
return -ENOMEM;
- bio_copy_dev(child, bio);
- child->bi_opf = REQ_PREFLUSH;
+ bio_clone_blkg_association(child, bio);
child->bi_iter.bi_sector = -1;
bio_chain(child, bio);
submit_bio(child);
diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
index 58eda16f5c53..c31e184bfa45 100644
--- a/drivers/nvdimm/pfn_devs.c
+++ b/drivers/nvdimm/pfn_devs.c
@@ -5,7 +5,6 @@
#include <linux/memremap.h>
#include <linux/blkdev.h>
#include <linux/device.h>
-#include <linux/genhd.h>
#include <linux/sizes.h>
#include <linux/slab.h>
#include <linux/fs.h>
diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c
index 70ca9dfc1771..95c2bbb0b2f5 100644
--- a/drivers/nvme/target/io-cmd-bdev.c
+++ b/drivers/nvme/target/io-cmd-bdev.c
@@ -267,15 +267,15 @@ static void nvmet_bdev_execute_rw(struct nvmet_req *req)
if (nvmet_use_inline_bvec(req)) {
bio = &req->b.inline_bio;
- bio_init(bio, req->inline_bvec, ARRAY_SIZE(req->inline_bvec));
+ bio_init(bio, req->ns->bdev, req->inline_bvec,
+ ARRAY_SIZE(req->inline_bvec), op);
} else {
- bio = bio_alloc(GFP_KERNEL, bio_max_segs(sg_cnt));
+ bio = bio_alloc(req->ns->bdev, bio_max_segs(sg_cnt), op,
+ GFP_KERNEL);
}
- bio_set_dev(bio, req->ns->bdev);
bio->bi_iter.bi_sector = sector;
bio->bi_private = req;
bio->bi_end_io = nvmet_bio_done;
- bio->bi_opf = op;
blk_start_plug(&plug);
if (req->metadata_len)
@@ -296,10 +296,9 @@ static void nvmet_bdev_execute_rw(struct nvmet_req *req)
}
}
- bio = bio_alloc(GFP_KERNEL, bio_max_segs(sg_cnt));
- bio_set_dev(bio, req->ns->bdev);
+ bio = bio_alloc(req->ns->bdev, bio_max_segs(sg_cnt),
+ op, GFP_KERNEL);
bio->bi_iter.bi_sector = sector;
- bio->bi_opf = op;
bio_chain(bio, prev);
submit_bio(prev);
@@ -328,11 +327,10 @@ static void nvmet_bdev_execute_flush(struct nvmet_req *req)
if (!nvmet_check_transfer_len(req, 0))
return;
- bio_init(bio, req->inline_bvec, ARRAY_SIZE(req->inline_bvec));
- bio_set_dev(bio, req->ns->bdev);
+ bio_init(bio, req->ns->bdev, req->inline_bvec,
+ ARRAY_SIZE(req->inline_bvec), REQ_OP_WRITE | REQ_PREFLUSH);
bio->bi_private = req;
bio->bi_end_io = nvmet_bio_done;
- bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
submit_bio(bio);
}
diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c
index 9e5b89ae29df..a810bf569fff 100644
--- a/drivers/nvme/target/passthru.c
+++ b/drivers/nvme/target/passthru.c
@@ -206,12 +206,13 @@ static int nvmet_passthru_map_sg(struct nvmet_req *req, struct request *rq)
if (nvmet_use_inline_bvec(req)) {
bio = &req->p.inline_bio;
- bio_init(bio, req->inline_bvec, ARRAY_SIZE(req->inline_bvec));
+ bio_init(bio, NULL, req->inline_bvec,
+ ARRAY_SIZE(req->inline_bvec), req_op(rq));
} else {
- bio = bio_alloc(GFP_KERNEL, bio_max_segs(req->sg_cnt));
+ bio = bio_alloc(NULL, bio_max_segs(req->sg_cnt), req_op(rq),
+ GFP_KERNEL);
bio->bi_end_io = bio_put;
}
- bio->bi_opf = req_op(rq);
for_each_sg(req->sg, sg, req->sg_cnt, i) {
if (bio_add_pc_page(rq->q, bio, sg_page(sg), sg->length,
diff --git a/drivers/nvme/target/zns.c b/drivers/nvme/target/zns.c
index 46bc30fe85d2..3e421217a7ad 100644
--- a/drivers/nvme/target/zns.c
+++ b/drivers/nvme/target/zns.c
@@ -412,10 +412,10 @@ static u16 nvmet_bdev_zone_mgmt_emulate_all(struct nvmet_req *req)
while (sector < get_capacity(bdev->bd_disk)) {
if (test_bit(blk_queue_zone_no(q, sector), d.zbitmap)) {
- bio = blk_next_bio(bio, 0, GFP_KERNEL);
- bio->bi_opf = zsa_req_op(req->cmd->zms.zsa) | REQ_SYNC;
+ bio = blk_next_bio(bio, bdev, 0,
+ zsa_req_op(req->cmd->zms.zsa) | REQ_SYNC,
+ GFP_KERNEL);
bio->bi_iter.bi_sector = sector;
- bio_set_dev(bio, bdev);
/* This may take a while, so be nice to others */
cond_resched();
}
@@ -522,6 +522,7 @@ static void nvmet_bdev_zone_append_bio_done(struct bio *bio)
void nvmet_bdev_execute_zone_append(struct nvmet_req *req)
{
sector_t sect = nvmet_lba_to_sect(req->ns, req->cmd->rw.slba);
+ const unsigned int op = REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE;
u16 status = NVME_SC_SUCCESS;
unsigned int total_len = 0;
struct scatterlist *sg;
@@ -551,14 +552,13 @@ void nvmet_bdev_execute_zone_append(struct nvmet_req *req)
if (nvmet_use_inline_bvec(req)) {
bio = &req->z.inline_bio;
- bio_init(bio, req->inline_bvec, ARRAY_SIZE(req->inline_bvec));
+ bio_init(bio, req->ns->bdev, req->inline_bvec,
+ ARRAY_SIZE(req->inline_bvec), op);
} else {
- bio = bio_alloc(GFP_KERNEL, req->sg_cnt);
+ bio = bio_alloc(req->ns->bdev, req->sg_cnt, op, GFP_KERNEL);
}
- bio->bi_opf = REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE;
bio->bi_end_io = nvmet_bdev_zone_append_bio_done;
- bio_set_dev(bio, req->ns->bdev);
bio->bi_iter.bi_sector = sect;
bio->bi_private = req;
if (req->cmd->rw.control & cpu_to_le16(NVME_RW_FUA))
diff --git a/drivers/s390/block/dasd_int.h b/drivers/s390/block/dasd_int.h
index 8b458010f88a..3b7af00a7825 100644
--- a/drivers/s390/block/dasd_int.h
+++ b/drivers/s390/block/dasd_int.h
@@ -47,7 +47,6 @@
#include <linux/module.h>
#include <linux/wait.h>
#include <linux/blkdev.h>
-#include <linux/genhd.h>
#include <linux/hdreg.h>
#include <linux/interrupt.h>
#include <linux/log2.h>
diff --git a/drivers/s390/block/scm_blk.c b/drivers/s390/block/scm_blk.c
index 61ecdcb2cc6a..2a9c0ddcade5 100644
--- a/drivers/s390/block/scm_blk.c
+++ b/drivers/s390/block/scm_blk.c
@@ -15,7 +15,6 @@
#include <linux/module.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
-#include <linux/genhd.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <asm/eadm.h>
diff --git a/drivers/s390/block/scm_blk.h b/drivers/s390/block/scm_blk.h
index a05a4297cfae..af82b3214774 100644
--- a/drivers/s390/block/scm_blk.h
+++ b/drivers/s390/block/scm_blk.h
@@ -6,7 +6,6 @@
#include <linux/spinlock.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
-#include <linux/genhd.h>
#include <linux/list.h>
#include <asm/debug.h>
diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c
index 2104973a35cd..911cc72dd7ac 100644
--- a/drivers/scsi/scsi_debug.c
+++ b/drivers/scsi/scsi_debug.c
@@ -23,7 +23,6 @@
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/string.h>
-#include <linux/genhd.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 0a70aa763a96..e30bc51578e9 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1276,7 +1276,7 @@ scsi_device_state_check(struct scsi_device *sdev, struct request *req)
* power management commands.
*/
if (req && !(req->rq_flags & RQF_PM))
- return BLK_STS_IOERR;
+ return BLK_STS_OFFLINE;
return BLK_STS_OK;
}
}
diff --git a/drivers/scsi/scsicam.c b/drivers/scsi/scsicam.c
index 0ffdb8f2995f..acdc0aceca5e 100644
--- a/drivers/scsi/scsicam.c
+++ b/drivers/scsi/scsicam.c
@@ -14,7 +14,6 @@
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/fs.h>
-#include <linux/genhd.h>
#include <linux/kernel.h>
#include <linux/blkdev.h>
#include <linux/pagemap.h>
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 62eb9921cc94..73e6f5f0f37c 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -38,7 +38,6 @@
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/bio.h>
-#include <linux/genhd.h>
#include <linux/hdreg.h>
#include <linux/errno.h>
#include <linux/idr.h>
@@ -122,11 +121,6 @@ static void scsi_disk_release(struct device *cdev);
static DEFINE_IDA(sd_index_ida);
-/* This semaphore is used to mediate the 0->1 reference get in the
- * face of object destruction (i.e. we can't allow a get on an
- * object after last put) */
-static DEFINE_MUTEX(sd_ref_mutex);
-
static struct kmem_cache *sd_cdb_cache;
static mempool_t *sd_cdb_pool;
static mempool_t *sd_page_pool;
@@ -664,33 +658,6 @@ static int sd_major(int major_idx)
}
}
-static struct scsi_disk *scsi_disk_get(struct gendisk *disk)
-{
- struct scsi_disk *sdkp = NULL;
-
- mutex_lock(&sd_ref_mutex);
-
- if (disk->private_data) {
- sdkp = scsi_disk(disk);
- if (scsi_device_get(sdkp->device) == 0)
- get_device(&sdkp->dev);
- else
- sdkp = NULL;
- }
- mutex_unlock(&sd_ref_mutex);
- return sdkp;
-}
-
-static void scsi_disk_put(struct scsi_disk *sdkp)
-{
- struct scsi_device *sdev = sdkp->device;
-
- mutex_lock(&sd_ref_mutex);
- put_device(&sdkp->dev);
- scsi_device_put(sdev);
- mutex_unlock(&sd_ref_mutex);
-}
-
#ifdef CONFIG_BLK_SED_OPAL
static int sd_sec_submit(void *data, u16 spsp, u8 secp, void *buffer,
size_t len, bool send)
@@ -1419,17 +1386,15 @@ static bool sd_need_revalidate(struct block_device *bdev,
**/
static int sd_open(struct block_device *bdev, fmode_t mode)
{
- struct scsi_disk *sdkp = scsi_disk_get(bdev->bd_disk);
- struct scsi_device *sdev;
+ struct scsi_disk *sdkp = scsi_disk(bdev->bd_disk);
+ struct scsi_device *sdev = sdkp->device;
int retval;
- if (!sdkp)
+ if (scsi_device_get(sdev))
return -ENXIO;
SCSI_LOG_HLQUEUE(3, sd_printk(KERN_INFO, sdkp, "sd_open\n"));
- sdev = sdkp->device;
-
/*
* If the device is in error recovery, wait until it is done.
* If the device is offline, then disallow any access to it.
@@ -1474,7 +1439,7 @@ static int sd_open(struct block_device *bdev, fmode_t mode)
return 0;
error_out:
- scsi_disk_put(sdkp);
+ scsi_device_put(sdev);
return retval;
}
@@ -1503,7 +1468,7 @@ static void sd_release(struct gendisk *disk, fmode_t mode)
scsi_set_medium_removal(sdev, SCSI_REMOVAL_ALLOW);
}
- scsi_disk_put(sdkp);
+ scsi_device_put(sdev);
}
static int sd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
@@ -1617,7 +1582,7 @@ static int media_not_present(struct scsi_disk *sdkp,
**/
static unsigned int sd_check_events(struct gendisk *disk, unsigned int clearing)
{
- struct scsi_disk *sdkp = scsi_disk_get(disk);
+ struct scsi_disk *sdkp = disk->private_data;
struct scsi_device *sdp;
int retval;
bool disk_changed;
@@ -1680,7 +1645,6 @@ out:
*/
disk_changed = sdp->changed;
sdp->changed = 0;
- scsi_disk_put(sdkp);
return disk_changed ? DISK_EVENT_MEDIA_CHANGE : 0;
}
@@ -1888,6 +1852,13 @@ static const struct pr_ops sd_pr_ops = {
.pr_clear = sd_pr_clear,
};
+static void scsi_disk_free_disk(struct gendisk *disk)
+{
+ struct scsi_disk *sdkp = scsi_disk(disk);
+
+ put_device(&sdkp->disk_dev);
+}
+
static const struct block_device_operations sd_fops = {
.owner = THIS_MODULE,
.open = sd_open,
@@ -1899,6 +1870,7 @@ static const struct block_device_operations sd_fops = {
.unlock_native_capacity = sd_unlock_native_capacity,
.report_zones = sd_zbc_report_zones,
.get_unique_id = sd_get_unique_id,
+ .free_disk = scsi_disk_free_disk,
.pr_ops = &sd_pr_ops,
};
@@ -3516,7 +3488,6 @@ static int sd_probe(struct device *dev)
}
sdkp->device = sdp;
- sdkp->driver = &sd_template;
sdkp->disk = gd;
sdkp->index = index;
sdkp->max_retries = SD_MAX_RETRIES;
@@ -3531,14 +3502,14 @@ static int sd_probe(struct device *dev)
SD_MOD_TIMEOUT);
}
- device_initialize(&sdkp->dev);
- sdkp->dev.parent = get_device(dev);
- sdkp->dev.class = &sd_disk_class;
- dev_set_name(&sdkp->dev, "%s", dev_name(dev));
+ device_initialize(&sdkp->disk_dev);
+ sdkp->disk_dev.parent = get_device(dev);
+ sdkp->disk_dev.class = &sd_disk_class;
+ dev_set_name(&sdkp->disk_dev, "%s", dev_name(dev));
- error = device_add(&sdkp->dev);
+ error = device_add(&sdkp->disk_dev);
if (error) {
- put_device(&sdkp->dev);
+ put_device(&sdkp->disk_dev);
goto out;
}
@@ -3549,7 +3520,7 @@ static int sd_probe(struct device *dev)
gd->minors = SD_MINORS;
gd->fops = &sd_fops;
- gd->private_data = &sdkp->driver;
+ gd->private_data = sdkp;
/* defaults, until the device tells us otherwise */
sdp->sector_size = 512;
@@ -3579,7 +3550,7 @@ static int sd_probe(struct device *dev)
error = device_add_disk(dev, gd, NULL);
if (error) {
- put_device(&sdkp->dev);
+ put_device(&sdkp->disk_dev);
goto out;
}
@@ -3625,58 +3596,26 @@ static int sd_probe(struct device *dev)
**/
static int sd_remove(struct device *dev)
{
- struct scsi_disk *sdkp;
+ struct scsi_disk *sdkp = dev_get_drvdata(dev);
- sdkp = dev_get_drvdata(dev);
scsi_autopm_get_device(sdkp->device);
- device_del(&sdkp->dev);
+ device_del(&sdkp->disk_dev);
del_gendisk(sdkp->disk);
sd_shutdown(dev);
- free_opal_dev(sdkp->opal_dev);
-
- mutex_lock(&sd_ref_mutex);
- dev_set_drvdata(dev, NULL);
- put_device(&sdkp->dev);
- mutex_unlock(&sd_ref_mutex);
-
+ put_disk(sdkp->disk);
return 0;
}
-/**
- * scsi_disk_release - Called to free the scsi_disk structure
- * @dev: pointer to embedded class device
- *
- * sd_ref_mutex must be held entering this routine. Because it is
- * called on last put, you should always use the scsi_disk_get()
- * scsi_disk_put() helpers which manipulate the semaphore directly
- * and never do a direct put_device.
- **/
static void scsi_disk_release(struct device *dev)
{
struct scsi_disk *sdkp = to_scsi_disk(dev);
- struct gendisk *disk = sdkp->disk;
- struct request_queue *q = disk->queue;
ida_free(&sd_index_ida, sdkp->index);
-
- /*
- * Wait until all requests that are in progress have completed.
- * This is necessary to avoid that e.g. scsi_end_request() crashes
- * due to clearing the disk->private_data pointer. Wait from inside
- * scsi_disk_release() instead of from sd_release() to avoid that
- * freezing and unfreezing the request queue affects user space I/O
- * in case multiple processes open a /dev/sd... node concurrently.
- */
- blk_mq_freeze_queue(q);
- blk_mq_unfreeze_queue(q);
-
- disk->private_data = NULL;
- put_disk(disk);
- put_device(&sdkp->device->sdev_gendev);
-
sd_zbc_release_disk(sdkp);
+ put_device(&sdkp->device->sdev_gendev);
+ free_opal_dev(sdkp->opal_dev);
kfree(sdkp);
}
diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h
index 2e5932bde43d..0a33a4b68ffb 100644
--- a/drivers/scsi/sd.h
+++ b/drivers/scsi/sd.h
@@ -68,9 +68,13 @@ enum {
};
struct scsi_disk {
- struct scsi_driver *driver; /* always &sd_template */
struct scsi_device *device;
- struct device dev;
+
+ /*
+ * disk_dev is used to show attributes in /sys/class/scsi_disk/,
+ * but otherwise not really needed. Do not use for refcounting.
+ */
+ struct device disk_dev;
struct gendisk *disk;
struct opal_dev *opal_dev;
#ifdef CONFIG_BLK_DEV_ZONED
@@ -127,11 +131,11 @@ struct scsi_disk {
unsigned security : 1;
unsigned ignore_medium_access_errors : 1;
};
-#define to_scsi_disk(obj) container_of(obj,struct scsi_disk,dev)
+#define to_scsi_disk(obj) container_of(obj, struct scsi_disk, disk_dev)
static inline struct scsi_disk *scsi_disk(struct gendisk *disk)
{
- return container_of(disk->private_data, struct scsi_disk, driver);
+ return disk->private_data;
}
#define sd_printk(prefix, sdsk, fmt, a...) \
diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c
index f925b1f1f9ad..641552d6330b 100644
--- a/drivers/scsi/sr.c
+++ b/drivers/scsi/sr.c
@@ -109,11 +109,6 @@ static DEFINE_SPINLOCK(sr_index_lock);
static struct lock_class_key sr_bio_compl_lkclass;
-/* This semaphore is used to mediate the 0->1 reference get in the
- * face of object destruction (i.e. we can't allow a get on an
- * object after last put) */
-static DEFINE_MUTEX(sr_ref_mutex);
-
static int sr_open(struct cdrom_device_info *, int);
static void sr_release(struct cdrom_device_info *);
@@ -143,11 +138,9 @@ static const struct cdrom_device_ops sr_dops = {
.capability = SR_CAPABILITIES,
};
-static void sr_kref_release(struct kref *kref);
-
static inline struct scsi_cd *scsi_cd(struct gendisk *disk)
{
- return container_of(disk->private_data, struct scsi_cd, driver);
+ return disk->private_data;
}
static int sr_runtime_suspend(struct device *dev)
@@ -163,38 +156,6 @@ static int sr_runtime_suspend(struct device *dev)
return 0;
}
-/*
- * The get and put routines for the struct scsi_cd. Note this entity
- * has a scsi_device pointer and owns a reference to this.
- */
-static inline struct scsi_cd *scsi_cd_get(struct gendisk *disk)
-{
- struct scsi_cd *cd = NULL;
-
- mutex_lock(&sr_ref_mutex);
- if (disk->private_data == NULL)
- goto out;
- cd = scsi_cd(disk);
- kref_get(&cd->kref);
- if (scsi_device_get(cd->device)) {
- kref_put(&cd->kref, sr_kref_release);
- cd = NULL;
- }
- out:
- mutex_unlock(&sr_ref_mutex);
- return cd;
-}
-
-static void scsi_cd_put(struct scsi_cd *cd)
-{
- struct scsi_device *sdev = cd->device;
-
- mutex_lock(&sr_ref_mutex);
- kref_put(&cd->kref, sr_kref_release);
- scsi_device_put(sdev);
- mutex_unlock(&sr_ref_mutex);
-}
-
static unsigned int sr_get_events(struct scsi_device *sdev)
{
u8 buf[8];
@@ -522,15 +483,13 @@ static void sr_revalidate_disk(struct scsi_cd *cd)
static int sr_block_open(struct block_device *bdev, fmode_t mode)
{
- struct scsi_cd *cd;
- struct scsi_device *sdev;
- int ret = -ENXIO;
+ struct scsi_cd *cd = scsi_cd(bdev->bd_disk);
+ struct scsi_device *sdev = cd->device;
+ int ret;
- cd = scsi_cd_get(bdev->bd_disk);
- if (!cd)
- goto out;
+ if (scsi_device_get(cd->device))
+ return -ENXIO;
- sdev = cd->device;
scsi_autopm_get_device(sdev);
if (bdev_check_media_change(bdev))
sr_revalidate_disk(cd);
@@ -541,9 +500,7 @@ static int sr_block_open(struct block_device *bdev, fmode_t mode)
scsi_autopm_put_device(sdev);
if (ret)
- scsi_cd_put(cd);
-
-out:
+ scsi_device_put(cd->device);
return ret;
}
@@ -555,7 +512,7 @@ static void sr_block_release(struct gendisk *disk, fmode_t mode)
cdrom_release(&cd->cdi, mode);
mutex_unlock(&cd->lock);
- scsi_cd_put(cd);
+ scsi_device_put(cd->device);
}
static int sr_block_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
@@ -595,18 +552,24 @@ out:
static unsigned int sr_block_check_events(struct gendisk *disk,
unsigned int clearing)
{
- unsigned int ret = 0;
- struct scsi_cd *cd;
+ struct scsi_cd *cd = disk->private_data;
- cd = scsi_cd_get(disk);
- if (!cd)
+ if (atomic_read(&cd->device->disk_events_disable_depth))
return 0;
+ return cdrom_check_events(&cd->cdi, clearing);
+}
- if (!atomic_read(&cd->device->disk_events_disable_depth))
- ret = cdrom_check_events(&cd->cdi, clearing);
+static void sr_free_disk(struct gendisk *disk)
+{
+ struct scsi_cd *cd = disk->private_data;
- scsi_cd_put(cd);
- return ret;
+ spin_lock(&sr_index_lock);
+ clear_bit(MINOR(disk_devt(disk)), sr_index_bits);
+ spin_unlock(&sr_index_lock);
+
+ unregister_cdrom(&cd->cdi);
+ mutex_destroy(&cd->lock);
+ kfree(cd);
}
static const struct block_device_operations sr_bdops =
@@ -617,6 +580,7 @@ static const struct block_device_operations sr_bdops =
.ioctl = sr_block_ioctl,
.compat_ioctl = blkdev_compat_ptr_ioctl,
.check_events = sr_block_check_events,
+ .free_disk = sr_free_disk,
};
static int sr_open(struct cdrom_device_info *cdi, int purpose)
@@ -660,8 +624,6 @@ static int sr_probe(struct device *dev)
if (!cd)
goto fail;
- kref_init(&cd->kref);
-
disk = __alloc_disk_node(sdev->request_queue, NUMA_NO_NODE,
&sr_bio_compl_lkclass);
if (!disk)
@@ -692,7 +654,6 @@ static int sr_probe(struct device *dev)
cd->device = sdev;
cd->disk = disk;
- cd->driver = &sr_template;
cd->capacity = 0x1fffff;
cd->device->changed = 1; /* force recheck CD type */
cd->media_present = 1;
@@ -713,7 +674,7 @@ static int sr_probe(struct device *dev)
sr_vendor_init(cd);
set_capacity(disk, cd->capacity);
- disk->private_data = &cd->driver;
+ disk->private_data = cd;
if (register_cdrom(disk, &cd->cdi))
goto fail_minor;
@@ -728,10 +689,8 @@ static int sr_probe(struct device *dev)
sr_revalidate_disk(cd);
error = device_add_disk(&sdev->sdev_gendev, disk, NULL);
- if (error) {
- kref_put(&cd->kref, sr_kref_release);
- goto fail;
- }
+ if (error)
+ goto unregister_cdrom;
sdev_printk(KERN_DEBUG, sdev,
"Attached scsi CD-ROM %s\n", cd->cdi.name);
@@ -739,6 +698,8 @@ static int sr_probe(struct device *dev)
return 0;
+unregister_cdrom:
+ unregister_cdrom(&cd->cdi);
fail_minor:
spin_lock(&sr_index_lock);
clear_bit(minor, sr_index_bits);
@@ -1010,36 +971,6 @@ out_put_request:
return ret;
}
-
-/**
- * sr_kref_release - Called to free the scsi_cd structure
- * @kref: pointer to embedded kref
- *
- * sr_ref_mutex must be held entering this routine. Because it is
- * called on last put, you should always use the scsi_cd_get()
- * scsi_cd_put() helpers which manipulate the semaphore directly
- * and never do a direct kref_put().
- **/
-static void sr_kref_release(struct kref *kref)
-{
- struct scsi_cd *cd = container_of(kref, struct scsi_cd, kref);
- struct gendisk *disk = cd->disk;
-
- spin_lock(&sr_index_lock);
- clear_bit(MINOR(disk_devt(disk)), sr_index_bits);
- spin_unlock(&sr_index_lock);
-
- unregister_cdrom(&cd->cdi);
-
- disk->private_data = NULL;
-
- put_disk(disk);
-
- mutex_destroy(&cd->lock);
-
- kfree(cd);
-}
-
static int sr_remove(struct device *dev)
{
struct scsi_cd *cd = dev_get_drvdata(dev);
@@ -1047,11 +978,7 @@ static int sr_remove(struct device *dev)
scsi_autopm_get_device(cd->device);
del_gendisk(cd->disk);
- dev_set_drvdata(dev, NULL);
-
- mutex_lock(&sr_ref_mutex);
- kref_put(&cd->kref, sr_kref_release);
- mutex_unlock(&sr_ref_mutex);
+ put_disk(cd->disk);
return 0;
}
diff --git a/drivers/scsi/sr.h b/drivers/scsi/sr.h
index 339c624e04d8..1175f2e213b5 100644
--- a/drivers/scsi/sr.h
+++ b/drivers/scsi/sr.h
@@ -18,8 +18,6 @@
#ifndef _SR_H
#define _SR_H
-#include <linux/genhd.h>
-#include <linux/kref.h>
#include <linux/mutex.h>
#define MAX_RETRIES 3
@@ -33,7 +31,6 @@ struct scsi_device;
typedef struct scsi_cd {
- struct scsi_driver *driver;
unsigned capacity; /* size in blocks */
struct scsi_device *device;
unsigned int vendor; /* vendor code, see sr_vendor.c */
@@ -53,9 +50,6 @@ typedef struct scsi_cd {
struct cdrom_device_info cdi;
struct mutex lock;
- /* We hold gendisk and scsi_device references on probe and use
- * the refs on this kref to decide when to release them */
- struct kref kref;
struct gendisk *disk;
} Scsi_CD;
diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c
index e869e90e05af..ebe9412c86f4 100644
--- a/drivers/scsi/st.c
+++ b/drivers/scsi/st.c
@@ -4276,7 +4276,6 @@ static int st_probe(struct device *dev)
goto out_buffer_free;
}
kref_init(&tpnt->kref);
- tpnt->driver = &st_template;
tpnt->device = SDp;
if (SDp->scsi_level <= 2)
diff --git a/drivers/scsi/st.h b/drivers/scsi/st.h
index c0ef0d9aaf8a..7a68eaba7e81 100644
--- a/drivers/scsi/st.h
+++ b/drivers/scsi/st.h
@@ -117,7 +117,6 @@ struct scsi_tape_stats {
/* The tape drive descriptor */
struct scsi_tape {
- struct scsi_driver *driver;
struct scsi_device *device;
struct mutex lock; /* For serialization */
struct completion wait; /* For SCSI commands */
diff --git a/drivers/scsi/ufs/ufshpb.c b/drivers/scsi/ufs/ufshpb.c
index 2d36a0715fca..8970068314ef 100644
--- a/drivers/scsi/ufs/ufshpb.c
+++ b/drivers/scsi/ufs/ufshpb.c
@@ -494,7 +494,7 @@ static struct ufshpb_req *ufshpb_get_map_req(struct ufshpb_lu *hpb,
if (!map_req)
return NULL;
- bio = bio_alloc(GFP_KERNEL, hpb->pages_per_srgn);
+ bio = bio_alloc(NULL, hpb->pages_per_srgn, 0, GFP_KERNEL);
if (!bio) {
ufshpb_put_req(hpb, map_req);
return NULL;
@@ -2050,7 +2050,7 @@ static int ufshpb_pre_req_mempool_init(struct ufshpb_lu *hpb)
INIT_LIST_HEAD(&pre_req->list_req);
pre_req->req = NULL;
- pre_req->bio = bio_alloc(GFP_KERNEL, 1);
+ pre_req->bio = bio_alloc(NULL, 1, 0, GFP_KERNEL);
if (!pre_req->bio)
goto release_mem;
diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c
index bf8ae4825a06..87ede165ddba 100644
--- a/drivers/target/target_core_iblock.c
+++ b/drivers/target/target_core_iblock.c
@@ -20,7 +20,6 @@
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/bio.h>
-#include <linux/genhd.h>
#include <linux/file.h>
#include <linux/module.h>
#include <linux/scatterlist.h>
@@ -353,18 +352,16 @@ static struct bio *iblock_get_bio(struct se_cmd *cmd, sector_t lba, u32 sg_num,
* Only allocate as many vector entries as the bio code allows us to,
* we'll loop later on until we have handled the whole request.
*/
- bio = bio_alloc_bioset(GFP_NOIO, bio_max_segs(sg_num),
- &ib_dev->ibd_bio_set);
+ bio = bio_alloc_bioset(ib_dev->ibd_bd, bio_max_segs(sg_num), opf,
+ GFP_NOIO, &ib_dev->ibd_bio_set);
if (!bio) {
pr_err("Unable to allocate memory for bio\n");
return NULL;
}
- bio_set_dev(bio, ib_dev->ibd_bd);
bio->bi_private = cmd;
bio->bi_end_io = &iblock_bio_done;
bio->bi_iter.bi_sector = lba;
- bio->bi_opf = opf;
return bio;
}
@@ -418,10 +415,9 @@ iblock_execute_sync_cache(struct se_cmd *cmd)
if (immed)
target_complete_cmd(cmd, SAM_STAT_GOOD);
- bio = bio_alloc(GFP_KERNEL, 0);
+ bio = bio_alloc(ib_dev->ibd_bd, 0, REQ_OP_WRITE | REQ_PREFLUSH,
+ GFP_KERNEL);
bio->bi_end_io = iblock_end_io_flush;
- bio_set_dev(bio, ib_dev->ibd_bd);
- bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
if (!immed)
bio->bi_private = cmd;
submit_bio(bio);
diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c
index 807d06ecadee..0fae71ac5cc8 100644
--- a/drivers/target/target_core_pscsi.c
+++ b/drivers/target/target_core_pscsi.c
@@ -17,7 +17,6 @@
#include <linux/blk_types.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
-#include <linux/genhd.h>
#include <linux/cdrom.h>
#include <linux/ratelimit.h>
#include <linux/module.h>
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 7e9f90fa0388..abac86a75840 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -78,7 +78,6 @@
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/mutex.h>
-#include <linux/genhd.h>
#include <linux/blkdev.h>
#include <linux/mm.h>
#include <linux/string.h>
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 48590a380762..b3e9cf3fd1dd 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -4033,8 +4033,9 @@ static int write_dev_supers(struct btrfs_device *device,
* to do I/O, so we don't lose the ability to do integrity
* checking.
*/
- bio = bio_alloc(GFP_NOFS, 1);
- bio_set_dev(bio, device->bdev);
+ bio = bio_alloc(device->bdev, 1,
+ REQ_OP_WRITE | REQ_SYNC | REQ_META | REQ_PRIO,
+ GFP_NOFS);
bio->bi_iter.bi_sector = bytenr >> SECTOR_SHIFT;
bio->bi_private = device;
bio->bi_end_io = btrfs_end_super_write;
@@ -4046,7 +4047,6 @@ static int write_dev_supers(struct btrfs_device *device,
* go down lazy and there's a short window where the on-disk
* copies might still contain the older version.
*/
- bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_META | REQ_PRIO;
if (i == 0 && !btrfs_test_opt(device->fs_info, NOBARRIER))
bio->bi_opf |= REQ_FUA;
@@ -4158,10 +4158,8 @@ static void write_dev_flush(struct btrfs_device *device)
return;
#endif
- bio_reset(bio);
+ bio_reset(bio, device->bdev, REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH);
bio->bi_end_io = btrfs_end_empty_barrier;
- bio_set_dev(bio, device->bdev);
- bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH;
init_completion(&device->flush_wait);
bio->bi_private = &device->flush_wait;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 4c91060d103a..5923eec8caa8 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3143,7 +3143,7 @@ struct bio *btrfs_bio_alloc(unsigned int nr_iovecs)
struct bio *bio;
ASSERT(0 < nr_iovecs && nr_iovecs <= BIO_MAX_VECS);
- bio = bio_alloc_bioset(GFP_NOFS, nr_iovecs, &btrfs_bioset);
+ bio = bio_alloc_bioset(NULL, nr_iovecs, 0, GFP_NOFS, &btrfs_bioset);
btrfs_bio_init(btrfs_bio(bio));
return bio;
}
@@ -3154,7 +3154,7 @@ struct bio *btrfs_bio_clone(struct bio *bio)
struct bio *new;
/* Bio allocation backed by a bioset does not fail */
- new = bio_clone_fast(bio, GFP_NOFS, &btrfs_bioset);
+ new = bio_alloc_clone(bio->bi_bdev, bio, GFP_NOFS, &btrfs_bioset);
bbio = btrfs_bio(new);
btrfs_bio_init(bbio);
bbio->iter = bio->bi_iter;
@@ -3169,7 +3169,7 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size)
ASSERT(offset <= UINT_MAX && size <= UINT_MAX);
/* this will never fail when it's backed by a bioset */
- bio = bio_clone_fast(orig, GFP_NOFS, &btrfs_bioset);
+ bio = bio_alloc_clone(orig->bi_bdev, orig, GFP_NOFS, &btrfs_bioset);
ASSERT(bio);
bbio = btrfs_bio(bio);
diff --git a/fs/buffer.c b/fs/buffer.c
index 8e112b6bd371..a17c386a142c 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -3024,12 +3024,16 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
if (test_set_buffer_req(bh) && (op == REQ_OP_WRITE))
clear_buffer_write_io_error(bh);
- bio = bio_alloc(GFP_NOIO, 1);
+ if (buffer_meta(bh))
+ op_flags |= REQ_META;
+ if (buffer_prio(bh))
+ op_flags |= REQ_PRIO;
+
+ bio = bio_alloc(bh->b_bdev, 1, op | op_flags, GFP_NOIO);
fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO);
bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
- bio_set_dev(bio, bh->b_bdev);
bio->bi_write_hint = write_hint;
bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
@@ -3038,12 +3042,6 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
bio->bi_end_io = end_bio_bh_io_sync;
bio->bi_private = bh;
- if (buffer_meta(bh))
- op_flags |= REQ_META;
- if (buffer_prio(bh))
- op_flags |= REQ_PRIO;
- bio_set_op_attrs(bio, op, op_flags);
-
/* Take care of bh's that straddle the end of the device */
guard_bio_eod(bio);
diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c
index bfc2a5b74ed3..2217fe5ece6f 100644
--- a/fs/crypto/bio.c
+++ b/fs/crypto/bio.c
@@ -54,7 +54,8 @@ static int fscrypt_zeroout_range_inline_crypt(const struct inode *inode,
int num_pages = 0;
/* This always succeeds since __GFP_DIRECT_RECLAIM is set. */
- bio = bio_alloc(GFP_NOFS, BIO_MAX_VECS);
+ bio = bio_alloc(inode->i_sb->s_bdev, BIO_MAX_VECS, REQ_OP_WRITE,
+ GFP_NOFS);
while (len) {
unsigned int blocks_this_page = min(len, blocks_per_page);
@@ -62,10 +63,8 @@ static int fscrypt_zeroout_range_inline_crypt(const struct inode *inode,
if (num_pages == 0) {
fscrypt_set_bio_crypt_ctx(bio, inode, lblk, GFP_NOFS);
- bio_set_dev(bio, inode->i_sb->s_bdev);
bio->bi_iter.bi_sector =
pblk << (blockbits - SECTOR_SHIFT);
- bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
}
ret = bio_add_page(bio, ZERO_PAGE(0), bytes_this_page, 0);
if (WARN_ON(ret != bytes_this_page)) {
@@ -81,7 +80,7 @@ static int fscrypt_zeroout_range_inline_crypt(const struct inode *inode,
err = submit_bio_wait(bio);
if (err)
goto out;
- bio_reset(bio);
+ bio_reset(bio, inode->i_sb->s_bdev, REQ_OP_WRITE);
num_pages = 0;
}
}
@@ -150,12 +149,10 @@ int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,
return -EINVAL;
/* This always succeeds since __GFP_DIRECT_RECLAIM is set. */
- bio = bio_alloc(GFP_NOFS, nr_pages);
+ bio = bio_alloc(inode->i_sb->s_bdev, nr_pages, REQ_OP_WRITE, GFP_NOFS);
do {
- bio_set_dev(bio, inode->i_sb->s_bdev);
bio->bi_iter.bi_sector = pblk << (blockbits - 9);
- bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
i = 0;
offset = 0;
@@ -182,7 +179,7 @@ int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,
err = submit_bio_wait(bio);
if (err)
goto out;
- bio_reset(bio);
+ bio_reset(bio, inode->i_sb->s_bdev, REQ_OP_WRITE);
} while (len != 0);
err = 0;
out:
diff --git a/fs/dax.c b/fs/dax.c
index cd03485867a7..ab0978739eaa 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -11,7 +11,6 @@
#include <linux/buffer_head.h>
#include <linux/dax.h>
#include <linux/fs.h>
-#include <linux/genhd.h>
#include <linux/highmem.h>
#include <linux/memcontrol.h>
#include <linux/mm.h>
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 654443558047..38bca4980a1c 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -396,11 +396,8 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio,
* bio_alloc() is guaranteed to return a bio when allowed to sleep and
* we request a valid number of vectors.
*/
- bio = bio_alloc(GFP_KERNEL, nr_vecs);
-
- bio_set_dev(bio, bdev);
+ bio = bio_alloc(bdev, nr_vecs, dio->op | dio->op_flags, GFP_KERNEL);
bio->bi_iter.bi_sector = first_sector;
- bio_set_op_attrs(bio, dio->op, dio->op_flags);
if (dio->is_async)
bio->bi_end_io = dio_bio_end_aio;
else
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index 423bc1a61da5..27d42ffdafd2 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -1370,15 +1370,14 @@ submit_bio_retry:
}
if (!bio) {
- bio = bio_alloc(GFP_NOIO, BIO_MAX_VECS);
+ bio = bio_alloc(mdev.m_bdev, BIO_MAX_VECS,
+ REQ_OP_READ, GFP_NOIO);
bio->bi_end_io = z_erofs_decompressqueue_endio;
- bio_set_dev(bio, mdev.m_bdev);
last_bdev = mdev.m_bdev;
bio->bi_iter.bi_sector = (sector_t)cur <<
LOG_SECTORS_PER_BLOCK;
bio->bi_private = bi_private;
- bio->bi_opf = REQ_OP_READ;
if (f->readahead)
bio->bi_opf |= REQ_RAHEAD;
++nr_bios;
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 1d370364230e..18373fa52922 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -323,10 +323,9 @@ static void ext4_end_bio(struct bio *bio)
{
ext4_io_end_t *io_end = bio->bi_private;
sector_t bi_sector = bio->bi_iter.bi_sector;
- char b[BDEVNAME_SIZE];
- if (WARN_ONCE(!io_end, "io_end is NULL: %s: sector %Lu len %u err %d\n",
- bio_devname(bio, b),
+ if (WARN_ONCE(!io_end, "io_end is NULL: %pg: sector %Lu len %u err %d\n",
+ bio->bi_bdev,
(long long) bio->bi_iter.bi_sector,
(unsigned) bio_sectors(bio),
bio->bi_status)) {
@@ -398,10 +397,9 @@ static void io_submit_init_bio(struct ext4_io_submit *io,
* bio_alloc will _always_ be able to allocate a bio if
* __GFP_DIRECT_RECLAIM is set, see comments for bio_alloc_bioset().
*/
- bio = bio_alloc(GFP_NOIO, BIO_MAX_VECS);
+ bio = bio_alloc(bh->b_bdev, BIO_MAX_VECS, 0, GFP_NOIO);
fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO);
bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
- bio_set_dev(bio, bh->b_bdev);
bio->bi_end_io = ext4_end_bio;
bio->bi_private = ext4_get_io_end(io->io_end);
io->io_bio = bio;
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index 4cd62f1d848c..1aa26d6634fc 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -365,15 +365,15 @@ int ext4_mpage_readpages(struct inode *inode,
* bio_alloc will _always_ be able to allocate a bio if
* __GFP_DIRECT_RECLAIM is set, see bio_alloc_bioset().
*/
- bio = bio_alloc(GFP_KERNEL, bio_max_segs(nr_pages));
+ bio = bio_alloc(bdev, bio_max_segs(nr_pages),
+ REQ_OP_READ, GFP_KERNEL);
fscrypt_set_bio_crypt_ctx(bio, inode, next_block,
GFP_KERNEL);
ext4_set_bio_post_read_ctx(bio, inode, page->index);
- bio_set_dev(bio, bdev);
bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9);
bio->bi_end_io = mpage_end_io;
- bio_set_op_attrs(bio, REQ_OP_READ,
- rac ? REQ_RAHEAD : 0);
+ if (rac)
+ bio->bi_opf |= REQ_RAHEAD;
}
length = first_hole << blkbits;
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 8c417864c66a..e71dde8de0db 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -394,7 +394,7 @@ static struct bio *__bio_alloc(struct f2fs_io_info *fio, int npages)
struct f2fs_sb_info *sbi = fio->sbi;
struct bio *bio;
- bio = bio_alloc_bioset(GFP_NOIO, npages, &f2fs_bioset);
+ bio = bio_alloc_bioset(NULL, npages, 0, GFP_NOIO, &f2fs_bioset);
f2fs_target_device(sbi, fio->new_blkaddr, bio);
if (is_read_io(fio->op)) {
@@ -985,8 +985,8 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr,
struct bio_post_read_ctx *ctx = NULL;
unsigned int post_read_steps = 0;
- bio = bio_alloc_bioset(for_write ? GFP_NOIO : GFP_KERNEL,
- bio_max_segs(nr_pages), &f2fs_bioset);
+ bio = bio_alloc_bioset(NULL, bio_max_segs(nr_pages), REQ_OP_READ,
+ for_write ? GFP_NOIO : GFP_KERNEL, &f2fs_bioset);
if (!bio)
return ERR_PTR(-ENOMEM);
@@ -994,7 +994,6 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr,
f2fs_target_device(sbi, blkaddr, bio);
bio->bi_end_io = f2fs_read_end_io;
- bio_set_op_attrs(bio, REQ_OP_READ, op_flag);
if (fscrypt_inode_uses_fs_layer_crypto(inode))
post_read_steps |= STEP_DECRYPT;
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index f8d7fe6db989..33d54c9fbefc 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1903,8 +1903,7 @@ static long writeback_sb_inodes(struct super_block *sb,
* unplug, so get our IOs out the door before we
* give up the CPU.
*/
- if (current->plug)
- blk_flush_plug(current->plug, false);
+ blk_flush_plug(current->plug, false);
cond_resched();
}
@@ -2301,8 +2300,7 @@ void wakeup_flusher_threads(enum wb_reason reason)
/*
* If we are expecting writeback progress we must submit plugged IO.
*/
- if (blk_needs_flush_plug(current))
- blk_flush_plug(current->plug, true);
+ blk_flush_plug(current->plug, true);
rcu_read_lock();
list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index ca0bb3a73912..4ae1eefae616 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -265,10 +265,9 @@ static struct bio *gfs2_log_alloc_bio(struct gfs2_sbd *sdp, u64 blkno,
bio_end_io_t *end_io)
{
struct super_block *sb = sdp->sd_vfs;
- struct bio *bio = bio_alloc(GFP_NOIO, BIO_MAX_VECS);
+ struct bio *bio = bio_alloc(sb->s_bdev, BIO_MAX_VECS, 0, GFP_NOIO);
bio->bi_iter.bi_sector = blkno << sdp->sd_fsb2bb_shift;
- bio_set_dev(bio, sb->s_bdev);
bio->bi_end_io = end_io;
bio->bi_private = sdp;
@@ -489,10 +488,9 @@ static struct bio *gfs2_chain_bio(struct bio *prev, unsigned int nr_iovecs)
{
struct bio *new;
- new = bio_alloc(GFP_NOIO, nr_iovecs);
- bio_copy_dev(new, prev);
+ new = bio_alloc(prev->bi_bdev, nr_iovecs, prev->bi_opf, GFP_NOIO);
+ bio_clone_blkg_association(new, prev);
new->bi_iter.bi_sector = bio_end_sector(prev);
- new->bi_opf = prev->bi_opf;
new->bi_write_hint = prev->bi_write_hint;
bio_chain(new, prev);
submit_bio(prev);
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 72d30a682ece..a580b90b7522 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -222,9 +222,8 @@ static void gfs2_submit_bhs(int op, int op_flags, struct buffer_head *bhs[],
struct buffer_head *bh = *bhs;
struct bio *bio;
- bio = bio_alloc(GFP_NOIO, num);
+ bio = bio_alloc(bh->b_bdev, num, op | op_flags, GFP_NOIO);
bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
- bio_set_dev(bio, bh->b_bdev);
while (num > 0) {
bh = *bhs;
if (!bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh))) {
@@ -235,7 +234,6 @@ static void gfs2_submit_bhs(int op, int op_flags, struct buffer_head *bhs[],
num--;
}
bio->bi_end_io = gfs2_meta_read_endio;
- bio_set_op_attrs(bio, op, op_flags);
submit_bio(bio);
}
}
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 7f8410d8fdc1..c9b423c874a3 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -251,14 +251,12 @@ static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector, int silent)
ClearPageDirty(page);
lock_page(page);
- bio = bio_alloc(GFP_NOFS, 1);
+ bio = bio_alloc(sb->s_bdev, 1, REQ_OP_READ | REQ_META, GFP_NOFS);
bio->bi_iter.bi_sector = sector * (sb->s_blocksize >> 9);
- bio_set_dev(bio, sb->s_bdev);
bio_add_page(bio, page, PAGE_SIZE, 0);
bio->bi_end_io = end_bio_io_page;
bio->bi_private = page;
- bio_set_op_attrs(bio, REQ_OP_READ, REQ_META);
submit_bio(bio);
wait_on_page_locked(page);
bio_put(bio);
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index a6002b2d146d..d87ea98cf535 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -15,7 +15,7 @@
#include <linux/kobject.h>
#include <linux/uaccess.h>
#include <linux/gfs2_ondisk.h>
-#include <linux/genhd.h>
+#include <linux/blkdev.h>
#include "gfs2.h"
#include "incore.h"
diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c
index 5beb82652435..8082eb01127c 100644
--- a/fs/hfs/mdb.c
+++ b/fs/hfs/mdb.c
@@ -9,7 +9,7 @@
*/
#include <linux/cdrom.h>
-#include <linux/genhd.h>
+#include <linux/blkdev.h>
#include <linux/nls.h>
#include <linux/slab.h>
diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c
index 51ae6f1eb4a5..0b8ad6586df5 100644
--- a/fs/hfsplus/wrapper.c
+++ b/fs/hfsplus/wrapper.c
@@ -12,7 +12,6 @@
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/cdrom.h>
-#include <linux/genhd.h>
#include <asm/unaligned.h>
#include "hfsplus_fs.h"
@@ -64,10 +63,8 @@ int hfsplus_submit_bio(struct super_block *sb, sector_t sector,
offset = start & (io_size - 1);
sector &= ~((io_size >> HFSPLUS_SECTOR_SHIFT) - 1);
- bio = bio_alloc(GFP_NOIO, 1);
+ bio = bio_alloc(sb->s_bdev, 1, op | op_flags, GFP_NOIO);
bio->bi_iter.bi_sector = sector;
- bio_set_dev(bio, sb->s_bdev);
- bio_set_op_attrs(bio, op, op_flags);
if (op != WRITE && data)
*data = (u8 *)buf + offset;
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 6c51a75d0be6..4653f3d07a1d 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -292,19 +292,20 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter,
if (ctx->rac) /* same as readahead_gfp_mask */
gfp |= __GFP_NORETRY | __GFP_NOWARN;
- ctx->bio = bio_alloc(gfp, bio_max_segs(nr_vecs));
+ ctx->bio = bio_alloc(iomap->bdev, bio_max_segs(nr_vecs),
+ REQ_OP_READ, gfp);
/*
* If the bio_alloc fails, try it again for a single page to
* avoid having to deal with partial page reads. This emulates
* what do_mpage_readpage does.
*/
- if (!ctx->bio)
- ctx->bio = bio_alloc(orig_gfp, 1);
- ctx->bio->bi_opf = REQ_OP_READ;
+ if (!ctx->bio) {
+ ctx->bio = bio_alloc(iomap->bdev, 1, REQ_OP_READ,
+ orig_gfp);
+ }
if (ctx->rac)
ctx->bio->bi_opf |= REQ_RAHEAD;
ctx->bio->bi_iter.bi_sector = sector;
- bio_set_dev(ctx->bio, iomap->bdev);
ctx->bio->bi_end_io = iomap_read_end_io;
bio_add_folio(ctx->bio, folio, plen, poff);
}
@@ -550,10 +551,8 @@ static int iomap_read_folio_sync(loff_t block_start, struct folio *folio,
struct bio_vec bvec;
struct bio bio;
- bio_init(&bio, &bvec, 1);
- bio.bi_opf = REQ_OP_READ;
+ bio_init(&bio, iomap->bdev, &bvec, 1, REQ_OP_READ);
bio.bi_iter.bi_sector = iomap_sector(iomap, block_start);
- bio_set_dev(&bio, iomap->bdev);
bio_add_folio(&bio, folio, plen, poff);
return submit_bio_wait(&bio);
}
@@ -1229,10 +1228,10 @@ iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc,
struct iomap_ioend *ioend;
struct bio *bio;
- bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_VECS, &iomap_ioend_bioset);
- bio_set_dev(bio, wpc->iomap.bdev);
+ bio = bio_alloc_bioset(wpc->iomap.bdev, BIO_MAX_VECS,
+ REQ_OP_WRITE | wbc_to_write_flags(wbc),
+ GFP_NOFS, &iomap_ioend_bioset);
bio->bi_iter.bi_sector = sector;
- bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
bio->bi_write_hint = inode->i_write_hint;
wbc_init_bio(wbc, bio);
@@ -1261,10 +1260,9 @@ iomap_chain_bio(struct bio *prev)
{
struct bio *new;
- new = bio_alloc(GFP_NOFS, BIO_MAX_VECS);
- bio_copy_dev(new, prev);/* also copies over blkcg information */
+ new = bio_alloc(prev->bi_bdev, BIO_MAX_VECS, prev->bi_opf, GFP_NOFS);
+ bio_clone_blkg_association(new, prev);
new->bi_iter.bi_sector = bio_end_sector(prev);
- new->bi_opf = prev->bi_opf;
new->bi_write_hint = prev->bi_write_hint;
bio_chain(prev, new);
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index 03ea367df19a..e2ba13645ef2 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -183,15 +183,13 @@ static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
int flags = REQ_SYNC | REQ_IDLE;
struct bio *bio;
- bio = bio_alloc(GFP_KERNEL, 1);
- bio_set_dev(bio, iter->iomap.bdev);
+ bio = bio_alloc(iter->iomap.bdev, 1, REQ_OP_WRITE | flags, GFP_KERNEL);
bio->bi_iter.bi_sector = iomap_sector(&iter->iomap, pos);
bio->bi_private = dio;
bio->bi_end_io = iomap_dio_bio_end_io;
get_page(page);
__bio_add_page(bio, page, len, 0);
- bio_set_op_attrs(bio, REQ_OP_WRITE, flags);
iomap_dio_submit_bio(iter, dio, bio, pos);
}
@@ -309,14 +307,12 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
goto out;
}
- bio = bio_alloc(GFP_KERNEL, nr_pages);
- bio_set_dev(bio, iomap->bdev);
+ bio = bio_alloc(iomap->bdev, nr_pages, bio_opf, GFP_KERNEL);
bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
bio->bi_write_hint = dio->iocb->ki_hint;
bio->bi_ioprio = dio->iocb->ki_ioprio;
bio->bi_private = dio;
bio->bi_end_io = iomap_dio_bio_end_io;
- bio->bi_opf = bio_opf;
ret = bio_iov_iter_get_pages(bio, dio->submit.iter);
if (unlikely(ret)) {
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index 78fd136ac13b..997c81fcea34 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -1980,17 +1980,13 @@ static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp)
bp->l_flag |= lbmREAD;
- bio = bio_alloc(GFP_NOFS, 1);
-
+ bio = bio_alloc(log->bdev, 1, REQ_OP_READ, GFP_NOFS);
bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9);
- bio_set_dev(bio, log->bdev);
-
bio_add_page(bio, bp->l_page, LOGPSIZE, bp->l_offset);
BUG_ON(bio->bi_iter.bi_size != LOGPSIZE);
bio->bi_end_io = lbmIODone;
bio->bi_private = bp;
- bio->bi_opf = REQ_OP_READ;
/*check if journaling to disk has been disabled*/
if (log->no_integrity) {
bio->bi_iter.bi_size = 0;
@@ -2125,16 +2121,13 @@ static void lbmStartIO(struct lbuf * bp)
jfs_info("lbmStartIO");
- bio = bio_alloc(GFP_NOFS, 1);
+ bio = bio_alloc(log->bdev, 1, REQ_OP_WRITE | REQ_SYNC, GFP_NOFS);
bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9);
- bio_set_dev(bio, log->bdev);
-
bio_add_page(bio, bp->l_page, LOGPSIZE, bp->l_offset);
BUG_ON(bio->bi_iter.bi_size != LOGPSIZE);
bio->bi_end_io = lbmIODone;
bio->bi_private = bp;
- bio->bi_opf = REQ_OP_WRITE | REQ_SYNC;
/* check if journaling to disk has been disabled */
if (log->no_integrity) {
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 104ae698443e..fde1a9cf902e 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -417,12 +417,10 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc)
}
len = min(xlen, (int)JFS_SBI(inode->i_sb)->nbperpage);
- bio = bio_alloc(GFP_NOFS, 1);
- bio_set_dev(bio, inode->i_sb->s_bdev);
+ bio = bio_alloc(inode->i_sb->s_bdev, 1, REQ_OP_WRITE, GFP_NOFS);
bio->bi_iter.bi_sector = pblock << (inode->i_blkbits - 9);
bio->bi_end_io = metapage_write_end_io;
bio->bi_private = page;
- bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
/* Don't call bio_add_page yet, we may add to this vec */
bio_offset = offset;
@@ -497,13 +495,12 @@ static int metapage_readpage(struct file *fp, struct page *page)
if (bio)
submit_bio(bio);
- bio = bio_alloc(GFP_NOFS, 1);
- bio_set_dev(bio, inode->i_sb->s_bdev);
+ bio = bio_alloc(inode->i_sb->s_bdev, 1, REQ_OP_READ,
+ GFP_NOFS);
bio->bi_iter.bi_sector =
pblock << (inode->i_blkbits - 9);
bio->bi_end_io = metapage_read_end_io;
bio->bi_private = page;
- bio_set_op_attrs(bio, REQ_OP_READ, 0);
len = xlen << inode->i_blkbits;
offset = block_offset << inode->i_blkbits;
if (bio_add_page(bio, page, len, offset) < len)
diff --git a/fs/ksmbd/vfs.c b/fs/ksmbd/vfs.c
index 19d36393974c..9cebb6ba555b 100644
--- a/fs/ksmbd/vfs.c
+++ b/fs/ksmbd/vfs.c
@@ -11,7 +11,6 @@
#include <linux/writeback.h>
#include <linux/xattr.h>
#include <linux/falloc.h>
-#include <linux/genhd.h>
#include <linux/fsnotify.h>
#include <linux/dcache.h>
#include <linux/slab.h>
diff --git a/fs/mpage.c b/fs/mpage.c
index 87f5cfef6caa..dbfc02e23d97 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -66,29 +66,6 @@ static struct bio *mpage_bio_submit(int op, int op_flags, struct bio *bio)
return NULL;
}
-static struct bio *
-mpage_alloc(struct block_device *bdev,
- sector_t first_sector, int nr_vecs,
- gfp_t gfp_flags)
-{
- struct bio *bio;
-
- /* Restrict the given (page cache) mask for slab allocations */
- gfp_flags &= GFP_KERNEL;
- bio = bio_alloc(gfp_flags, nr_vecs);
-
- if (bio == NULL && (current->flags & PF_MEMALLOC)) {
- while (!bio && (nr_vecs /= 2))
- bio = bio_alloc(gfp_flags, nr_vecs);
- }
-
- if (bio) {
- bio_set_dev(bio, bdev);
- bio->bi_iter.bi_sector = first_sector;
- }
- return bio;
-}
-
/*
* support function for mpage_readahead. The fs supplied get_block might
* return an up to date buffer. This is used to map that buffer into
@@ -296,10 +273,11 @@ alloc_new:
page))
goto out;
}
- args->bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9),
- bio_max_segs(args->nr_pages), gfp);
+ args->bio = bio_alloc(bdev, bio_max_segs(args->nr_pages), 0,
+ gfp);
if (args->bio == NULL)
goto confused;
+ args->bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9);
}
length = first_hole << blkbits;
@@ -608,10 +586,8 @@ alloc_new:
page, wbc))
goto out;
}
- bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9),
- BIO_MAX_VECS, GFP_NOFS|__GFP_HIGH);
- if (bio == NULL)
- goto confused;
+ bio = bio_alloc(bdev, BIO_MAX_VECS, 0, GFP_NOFS);
+ bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9);
wbc_init_bio(wbc, bio);
bio->bi_write_hint = inode->i_write_hint;
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index fe860c538747..79a8b451791f 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -115,23 +115,6 @@ bl_submit_bio(struct bio *bio)
return NULL;
}
-static struct bio *bl_alloc_init_bio(unsigned int npg,
- struct block_device *bdev, sector_t disk_sector,
- bio_end_io_t end_io, struct parallel_io *par)
-{
- struct bio *bio;
-
- npg = bio_max_segs(npg);
- bio = bio_alloc(GFP_NOIO, npg);
- if (bio) {
- bio->bi_iter.bi_sector = disk_sector;
- bio_set_dev(bio, bdev);
- bio->bi_end_io = end_io;
- bio->bi_private = par;
- }
- return bio;
-}
-
static bool offset_in_map(u64 offset, struct pnfs_block_dev_map *map)
{
return offset >= map->start && offset < map->start + map->len;
@@ -171,11 +154,10 @@ do_add_page_to_bio(struct bio *bio, int npg, int rw, sector_t isect,
retry:
if (!bio) {
- bio = bl_alloc_init_bio(npg, map->bdev,
- disk_addr >> SECTOR_SHIFT, end_io, par);
- if (!bio)
- return ERR_PTR(-ENOMEM);
- bio_set_op_attrs(bio, rw, 0);
+ bio = bio_alloc(map->bdev, bio_max_segs(npg), rw, GFP_NOIO);
+ bio->bi_iter.bi_sector = disk_addr >> SECTOR_SHIFT;
+ bio->bi_end_io = end_io;
+ bio->bi_private = par;
}
if (bio_add_page(bio, page, *len, offset) < *len) {
bio = bl_submit_bio(bio);
diff --git a/fs/nfs/blocklayout/rpc_pipefs.c b/fs/nfs/blocklayout/rpc_pipefs.c
index ef9db135c649..6c977288cc28 100644
--- a/fs/nfs/blocklayout/rpc_pipefs.c
+++ b/fs/nfs/blocklayout/rpc_pipefs.c
@@ -27,7 +27,6 @@
*/
#include <linux/module.h>
-#include <linux/genhd.h>
#include <linux/blkdev.h>
#include "blocklayout.h"
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
index e5c0982a381d..b6d01d51a746 100644
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -4,7 +4,6 @@
*/
#include <linux/exportfs.h>
#include <linux/iomap.h>
-#include <linux/genhd.h>
#include <linux/slab.h>
#include <linux/pr.h>
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 43287b0d3e9b..4f71faacd825 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -371,29 +371,6 @@ static int nilfs_segbuf_submit_bio(struct nilfs_segment_buffer *segbuf,
return err;
}
-/**
- * nilfs_alloc_seg_bio - allocate a new bio for writing log
- * @nilfs: nilfs object
- * @start: start block number of the bio
- * @nr_vecs: request size of page vector.
- *
- * Return Value: On success, pointer to the struct bio is returned.
- * On error, NULL is returned.
- */
-static struct bio *nilfs_alloc_seg_bio(struct the_nilfs *nilfs, sector_t start,
- int nr_vecs)
-{
- struct bio *bio;
-
- bio = bio_alloc(GFP_NOIO, nr_vecs);
- if (likely(bio)) {
- bio_set_dev(bio, nilfs->ns_bdev);
- bio->bi_iter.bi_sector =
- start << (nilfs->ns_blocksize_bits - 9);
- }
- return bio;
-}
-
static void nilfs_segbuf_prepare_write(struct nilfs_segment_buffer *segbuf,
struct nilfs_write_info *wi)
{
@@ -414,10 +391,10 @@ static int nilfs_segbuf_submit_bh(struct nilfs_segment_buffer *segbuf,
BUG_ON(wi->nr_vecs <= 0);
repeat:
if (!wi->bio) {
- wi->bio = nilfs_alloc_seg_bio(wi->nilfs, wi->blocknr + wi->end,
- wi->nr_vecs);
- if (unlikely(!wi->bio))
- return -ENOMEM;
+ wi->bio = bio_alloc(wi->nilfs->ns_bdev, wi->nr_vecs, 0,
+ GFP_NOIO);
+ wi->bio->bi_iter.bi_sector = (wi->blocknr + wi->end) <<
+ (wi->nilfs->ns_blocksize_bits - 9);
}
len = bio_add_page(wi->bio, bh->b_page, bh->b_size, bh_offset(bh));
diff --git a/fs/ntfs3/fsntfs.c b/fs/ntfs3/fsntfs.c
index 4de9acb16968..3de5700a9b83 100644
--- a/fs/ntfs3/fsntfs.c
+++ b/fs/ntfs3/fsntfs.c
@@ -1443,17 +1443,6 @@ int ntfs_write_bh(struct ntfs_sb_info *sbi, struct NTFS_RECORD_HEADER *rhdr,
return err;
}
-static inline struct bio *ntfs_alloc_bio(u32 nr_vecs)
-{
- struct bio *bio = bio_alloc(GFP_NOFS | __GFP_HIGH, nr_vecs);
-
- if (!bio && (current->flags & PF_MEMALLOC)) {
- while (!bio && (nr_vecs /= 2))
- bio = bio_alloc(GFP_NOFS | __GFP_HIGH, nr_vecs);
- }
- return bio;
-}
-
/*
* ntfs_bio_pages - Read/write pages from/to disk.
*/
@@ -1496,19 +1485,13 @@ int ntfs_bio_pages(struct ntfs_sb_info *sbi, const struct runs_tree *run,
lbo = ((u64)lcn << cluster_bits) + off;
len = ((u64)clen << cluster_bits) - off;
new_bio:
- new = ntfs_alloc_bio(nr_pages - page_idx);
- if (!new) {
- err = -ENOMEM;
- goto out;
- }
+ new = bio_alloc(bdev, nr_pages - page_idx, op, GFP_NOFS);
if (bio) {
bio_chain(bio, new);
submit_bio(bio);
}
bio = new;
- bio_set_dev(bio, bdev);
bio->bi_iter.bi_sector = lbo >> 9;
- bio->bi_opf = op;
while (len) {
off = vbo & (PAGE_SIZE - 1);
@@ -1599,18 +1582,12 @@ int ntfs_bio_fill_1(struct ntfs_sb_info *sbi, const struct runs_tree *run)
lbo = (u64)lcn << cluster_bits;
len = (u64)clen << cluster_bits;
new_bio:
- new = ntfs_alloc_bio(BIO_MAX_VECS);
- if (!new) {
- err = -ENOMEM;
- break;
- }
+ new = bio_alloc(bdev, BIO_MAX_VECS, REQ_OP_WRITE, GFP_NOFS);
if (bio) {
bio_chain(bio, new);
submit_bio(bio);
}
bio = new;
- bio_set_dev(bio, bdev);
- bio->bi_opf = REQ_OP_WRITE;
bio->bi_iter.bi_sector = lbo >> 9;
for (;;) {
@@ -1626,11 +1603,10 @@ new_bio:
}
} while (run_get_entry(run, ++run_idx, NULL, &lcn, &clen));
- if (bio) {
- if (!err)
- err = submit_bio_wait(bio);
- bio_put(bio);
- }
+ if (!err)
+ err = submit_bio_wait(bio);
+ bio_put(bio);
+
blk_finish_plug(&plug);
out:
unlock_page(fill);
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index a17be1618bf7..ea0e70c0fce0 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -518,7 +518,7 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
* GFP_KERNEL that the local node can get fenced. It would be
* nicest if we could pre-allocate these bios and avoid this
* all together. */
- bio = bio_alloc(GFP_ATOMIC, 16);
+ bio = bio_alloc(reg->hr_bdev, 16, op | op_flags, GFP_ATOMIC);
if (!bio) {
mlog(ML_ERROR, "Could not alloc slots BIO!\n");
bio = ERR_PTR(-ENOMEM);
@@ -527,10 +527,8 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
/* Must put everything in 512 byte sectors for the bio... */
bio->bi_iter.bi_sector = (reg->hr_start_block + cs) << (bits - 9);
- bio_set_dev(bio, reg->hr_bdev);
bio->bi_private = wc;
bio->bi_end_io = o2hb_bio_end_io;
- bio_set_op_attrs(bio, op, op_flags);
vec_start = (cs << bits) % PAGE_SIZE;
while(cs < max_slots) {
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index 2db8bcf7ff85..622c844f6d11 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -86,16 +86,17 @@ static int squashfs_bio_read(struct super_block *sb, u64 index, int length,
int error, i;
struct bio *bio;
- if (page_count <= BIO_MAX_VECS)
- bio = bio_alloc(GFP_NOIO, page_count);
- else
+ if (page_count <= BIO_MAX_VECS) {
+ bio = bio_alloc(sb->s_bdev, page_count, REQ_OP_READ, GFP_NOIO);
+ } else {
bio = bio_kmalloc(GFP_NOIO, page_count);
+ bio_set_dev(bio, sb->s_bdev);
+ bio->bi_opf = REQ_OP_READ;
+ }
if (!bio)
return -ENOMEM;
- bio_set_dev(bio, sb->s_bdev);
- bio->bi_opf = READ;
bio->bi_iter.bi_sector = block * (msblk->devblksize >> SECTOR_SHIFT);
for (i = 0; i < page_count; ++i) {
diff --git a/fs/xfs/xfs_bio_io.c b/fs/xfs/xfs_bio_io.c
index 667e297f59b1..32fa02945f73 100644
--- a/fs/xfs/xfs_bio_io.c
+++ b/fs/xfs/xfs_bio_io.c
@@ -36,9 +36,7 @@ xfs_flush_bdev_async(
return;
}
- bio_init(bio, NULL, 0);
- bio_set_dev(bio, bdev);
- bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
+ bio_init(bio, bdev, NULL, 0, REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC);
bio->bi_private = done;
bio->bi_end_io = xfs_flush_bdev_async_endio;
@@ -61,10 +59,9 @@ xfs_rw_bdev(
if (is_vmalloc && op == REQ_OP_WRITE)
flush_kernel_vmap_range(data, count);
- bio = bio_alloc(GFP_KERNEL, bio_max_vecs(left));
- bio_set_dev(bio, bdev);
+ bio = bio_alloc(bdev, bio_max_vecs(left), op | REQ_META | REQ_SYNC,
+ GFP_KERNEL);
bio->bi_iter.bi_sector = sector;
- bio->bi_opf = op | REQ_META | REQ_SYNC;
do {
struct page *page = kmem_to_page(data);
@@ -74,10 +71,9 @@ xfs_rw_bdev(
while (bio_add_page(bio, page, len, off) != len) {
struct bio *prev = bio;
- bio = bio_alloc(GFP_KERNEL, bio_max_vecs(left));
- bio_copy_dev(bio, prev);
+ bio = bio_alloc(prev->bi_bdev, bio_max_vecs(left),
+ prev->bi_opf, GFP_KERNEL);
bio->bi_iter.bi_sector = bio_end_sector(prev);
- bio->bi_opf = prev->bi_opf;
bio_chain(prev, bio);
submit_bio(prev);
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index b45e0d50a405..ae87fd95b17e 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1440,12 +1440,10 @@ next_chunk:
atomic_inc(&bp->b_io_remaining);
nr_pages = bio_max_segs(total_nr_pages);
- bio = bio_alloc(GFP_NOIO, nr_pages);
- bio_set_dev(bio, bp->b_target->bt_bdev);
+ bio = bio_alloc(bp->b_target->bt_bdev, nr_pages, op, GFP_NOIO);
bio->bi_iter.bi_sector = sector;
bio->bi_end_io = xfs_buf_bio_end_io;
bio->bi_private = bp;
- bio->bi_opf = op;
for (; size && nr_pages; nr_pages--, page_index++) {
int rbytes, nbytes = PAGE_SIZE - offset;
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 89fec9a18c34..16f9edbda4eb 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1883,19 +1883,19 @@ xlog_write_iclog(
return;
}
- bio_init(&iclog->ic_bio, iclog->ic_bvec, howmany(count, PAGE_SIZE));
- bio_set_dev(&iclog->ic_bio, log->l_targ->bt_bdev);
- iclog->ic_bio.bi_iter.bi_sector = log->l_logBBstart + bno;
- iclog->ic_bio.bi_end_io = xlog_bio_end_io;
- iclog->ic_bio.bi_private = iclog;
-
/*
* We use REQ_SYNC | REQ_IDLE here to tell the block layer the are more
* IOs coming immediately after this one. This prevents the block layer
* writeback throttle from throttling log writes behind background
* metadata writeback and causing priority inversions.
*/
- iclog->ic_bio.bi_opf = REQ_OP_WRITE | REQ_META | REQ_SYNC | REQ_IDLE;
+ bio_init(&iclog->ic_bio, log->l_targ->bt_bdev, iclog->ic_bvec,
+ howmany(count, PAGE_SIZE),
+ REQ_OP_WRITE | REQ_META | REQ_SYNC | REQ_IDLE);
+ iclog->ic_bio.bi_iter.bi_sector = log->l_logBBstart + bno;
+ iclog->ic_bio.bi_end_io = xlog_bio_end_io;
+ iclog->ic_bio.bi_private = iclog;
+
if (iclog->ic_flags & XLOG_ICL_NEED_FLUSH) {
iclog->ic_bio.bi_opf |= REQ_PREFLUSH;
/*
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index b76dfb310ab6..d331b52592a0 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -692,12 +692,11 @@ static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from)
if (!nr_pages)
return 0;
- bio = bio_alloc(GFP_NOFS, nr_pages);
- bio_set_dev(bio, bdev);
+ bio = bio_alloc(bdev, nr_pages,
+ REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE, GFP_NOFS);
bio->bi_iter.bi_sector = zi->i_zsector;
bio->bi_write_hint = iocb->ki_hint;
bio->bi_ioprio = iocb->ki_ioprio;
- bio->bi_opf = REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE;
if (iocb->ki_flags & IOCB_DSYNC)
bio->bi_opf |= REQ_FUA;
@@ -1541,10 +1540,8 @@ static int zonefs_read_super(struct super_block *sb)
if (!page)
return -ENOMEM;
- bio_init(&bio, &bio_vec, 1);
+ bio_init(&bio, sb->s_bdev, &bio_vec, 1, REQ_OP_READ);
bio.bi_iter.bi_sector = 0;
- bio.bi_opf = REQ_OP_READ;
- bio_set_dev(&bio, sb->s_bdev);
bio_add_page(&bio, page, PAGE_SIZE, 0);
ret = submit_bio_wait(&bio);
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 117d7f248ac9..4c21f6e69e18 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -405,21 +405,25 @@ extern void bioset_exit(struct bio_set *);
extern int biovec_init_pool(mempool_t *pool, int pool_entries);
extern int bioset_init_from_src(struct bio_set *bs, struct bio_set *src);
-struct bio *bio_alloc_bioset(gfp_t gfp, unsigned short nr_iovecs,
- struct bio_set *bs);
-struct bio *bio_alloc_kiocb(struct kiocb *kiocb, unsigned short nr_vecs,
- struct bio_set *bs);
+struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs,
+ unsigned int opf, gfp_t gfp_mask,
+ struct bio_set *bs);
+struct bio *bio_alloc_kiocb(struct kiocb *kiocb, struct block_device *bdev,
+ unsigned short nr_vecs, unsigned int opf, struct bio_set *bs);
struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned short nr_iovecs);
extern void bio_put(struct bio *);
-extern void __bio_clone_fast(struct bio *, struct bio *);
-extern struct bio *bio_clone_fast(struct bio *, gfp_t, struct bio_set *);
+struct bio *bio_alloc_clone(struct block_device *bdev, struct bio *bio_src,
+ gfp_t gfp, struct bio_set *bs);
+int bio_init_clone(struct block_device *bdev, struct bio *bio,
+ struct bio *bio_src, gfp_t gfp);
extern struct bio_set fs_bio_set;
-static inline struct bio *bio_alloc(gfp_t gfp_mask, unsigned short nr_iovecs)
+static inline struct bio *bio_alloc(struct block_device *bdev,
+ unsigned short nr_vecs, unsigned int opf, gfp_t gfp_mask)
{
- return bio_alloc_bioset(gfp_mask, nr_iovecs, &fs_bio_set);
+ return bio_alloc_bioset(bdev, nr_vecs, opf, gfp_mask, &fs_bio_set);
}
void submit_bio(struct bio *bio);
@@ -454,10 +458,10 @@ static inline int bio_iov_vecs_to_alloc(struct iov_iter *iter, int max_segs)
struct request_queue;
extern int submit_bio_wait(struct bio *bio);
-extern void bio_init(struct bio *bio, struct bio_vec *table,
- unsigned short max_vecs);
+void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table,
+ unsigned short max_vecs, unsigned int opf);
extern void bio_uninit(struct bio *);
-extern void bio_reset(struct bio *);
+void bio_reset(struct bio *bio, struct block_device *bdev, unsigned int opf);
void bio_chain(struct bio *, struct bio *);
int bio_add_page(struct bio *, struct page *, unsigned len, unsigned off);
@@ -487,8 +491,6 @@ static inline void bio_release_pages(struct bio *bio, bool mark_dirty)
__bio_release_pages(bio, mark_dirty);
}
-extern const char *bio_devname(struct bio *bio, char *buffer);
-
#define bio_dev(bio) \
disk_devt((bio)->bi_bdev->bd_disk)
@@ -515,13 +517,6 @@ static inline void bio_set_dev(struct bio *bio, struct block_device *bdev)
bio_associate_blkg(bio);
}
-static inline void bio_copy_dev(struct bio *dst, struct bio *src)
-{
- bio_clear_flag(dst, BIO_REMAPPED);
- dst->bi_bdev = src->bi_bdev;
- bio_clone_blkg_association(dst, src);
-}
-
/*
* BIO list management for use by remapping drivers (e.g. DM or MD) and loop.
*
@@ -790,6 +785,7 @@ static inline void bio_set_polled(struct bio *bio, struct kiocb *kiocb)
bio->bi_opf |= REQ_NOWAIT;
}
-struct bio *blk_next_bio(struct bio *bio, unsigned int nr_pages, gfp_t gfp);
+struct bio *blk_next_bio(struct bio *bio, struct block_device *bdev,
+ unsigned int nr_pages, unsigned int opf, gfp_t gfp);
#endif /* __LINUX_BIO_H */
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index b4de2010fba5..f2ad8ed8f777 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -25,14 +25,8 @@
#include <linux/kthread.h>
#include <linux/fs.h>
-/* percpu_counter batch for blkg_[rw]stats, per-cpu drift doesn't matter */
-#define BLKG_STAT_CPU_BATCH (INT_MAX / 2)
-
-/* Max limits for throttle policy */
-#define THROTL_IOPS_MAX UINT_MAX
#define FC_APPID_LEN 129
-
#ifdef CONFIG_BLK_CGROUP
enum blkg_iostat_type {
@@ -44,6 +38,7 @@ enum blkg_iostat_type {
};
struct blkcg_gq;
+struct blkg_policy_data;
struct blkcg {
struct cgroup_subsys_state css;
@@ -76,36 +71,6 @@ struct blkg_iostat_set {
struct blkg_iostat last;
};
-/*
- * A blkcg_gq (blkg) is association between a block cgroup (blkcg) and a
- * request_queue (q). This is used by blkcg policies which need to track
- * information per blkcg - q pair.
- *
- * There can be multiple active blkcg policies and each blkg:policy pair is
- * represented by a blkg_policy_data which is allocated and freed by each
- * policy's pd_alloc/free_fn() methods. A policy can allocate private data
- * area by allocating larger data structure which embeds blkg_policy_data
- * at the beginning.
- */
-struct blkg_policy_data {
- /* the blkg and policy id this per-policy data belongs to */
- struct blkcg_gq *blkg;
- int plid;
-};
-
-/*
- * Policies that need to keep per-blkcg data which is independent from any
- * request_queue associated to it should implement cpd_alloc/free_fn()
- * methods. A policy can allocate private data area by allocating larger
- * data structure which embeds blkcg_policy_data at the beginning.
- * cpd_init() is invoked to let each policy handle per-blkcg data.
- */
-struct blkcg_policy_data {
- /* the blkcg and policy id this per-policy data belongs to */
- struct blkcg *blkcg;
- int plid;
-};
-
/* association between a blk cgroup and a request queue */
struct blkcg_gq {
/* Pointer to the associated request_queue */
@@ -141,93 +106,11 @@ struct blkcg_gq {
struct rcu_head rcu_head;
};
-typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp);
-typedef void (blkcg_pol_init_cpd_fn)(struct blkcg_policy_data *cpd);
-typedef void (blkcg_pol_free_cpd_fn)(struct blkcg_policy_data *cpd);
-typedef void (blkcg_pol_bind_cpd_fn)(struct blkcg_policy_data *cpd);
-typedef struct blkg_policy_data *(blkcg_pol_alloc_pd_fn)(gfp_t gfp,
- struct request_queue *q, struct blkcg *blkcg);
-typedef void (blkcg_pol_init_pd_fn)(struct blkg_policy_data *pd);
-typedef void (blkcg_pol_online_pd_fn)(struct blkg_policy_data *pd);
-typedef void (blkcg_pol_offline_pd_fn)(struct blkg_policy_data *pd);
-typedef void (blkcg_pol_free_pd_fn)(struct blkg_policy_data *pd);
-typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkg_policy_data *pd);
-typedef bool (blkcg_pol_stat_pd_fn)(struct blkg_policy_data *pd,
- struct seq_file *s);
-
-struct blkcg_policy {
- int plid;
- /* cgroup files for the policy */
- struct cftype *dfl_cftypes;
- struct cftype *legacy_cftypes;
-
- /* operations */
- blkcg_pol_alloc_cpd_fn *cpd_alloc_fn;
- blkcg_pol_init_cpd_fn *cpd_init_fn;
- blkcg_pol_free_cpd_fn *cpd_free_fn;
- blkcg_pol_bind_cpd_fn *cpd_bind_fn;
-
- blkcg_pol_alloc_pd_fn *pd_alloc_fn;
- blkcg_pol_init_pd_fn *pd_init_fn;
- blkcg_pol_online_pd_fn *pd_online_fn;
- blkcg_pol_offline_pd_fn *pd_offline_fn;
- blkcg_pol_free_pd_fn *pd_free_fn;
- blkcg_pol_reset_pd_stats_fn *pd_reset_stats_fn;
- blkcg_pol_stat_pd_fn *pd_stat_fn;
-};
-
-extern struct blkcg blkcg_root;
extern struct cgroup_subsys_state * const blkcg_root_css;
-extern bool blkcg_debug_stats;
-
-struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg,
- struct request_queue *q, bool update_hint);
-int blkcg_init_queue(struct request_queue *q);
-void blkcg_exit_queue(struct request_queue *q);
-
-/* Blkio controller policy registration */
-int blkcg_policy_register(struct blkcg_policy *pol);
-void blkcg_policy_unregister(struct blkcg_policy *pol);
-int blkcg_activate_policy(struct request_queue *q,
- const struct blkcg_policy *pol);
-void blkcg_deactivate_policy(struct request_queue *q,
- const struct blkcg_policy *pol);
-
-const char *blkg_dev_name(struct blkcg_gq *blkg);
-void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
- u64 (*prfill)(struct seq_file *,
- struct blkg_policy_data *, int),
- const struct blkcg_policy *pol, int data,
- bool show_total);
-u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v);
-
-struct blkg_conf_ctx {
- struct block_device *bdev;
- struct blkcg_gq *blkg;
- char *body;
-};
-
-struct block_device *blkcg_conf_open_bdev(char **inputp);
-int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
- char *input, struct blkg_conf_ctx *ctx);
-void blkg_conf_finish(struct blkg_conf_ctx *ctx);
-/**
- * blkcg_css - find the current css
- *
- * Find the css associated with either the kthread or the current task.
- * This may return a dying css, so it is up to the caller to use tryget logic
- * to confirm it is alive and well.
- */
-static inline struct cgroup_subsys_state *blkcg_css(void)
-{
- struct cgroup_subsys_state *css;
-
- css = kthread_blkcg();
- if (css)
- return css;
- return task_css(current, io_cgrp_id);
-}
+void blkcg_destroy_blkgs(struct blkcg *blkcg);
+void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay);
+void blkcg_maybe_throttle_current(void);
static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css)
{
@@ -235,27 +118,6 @@ static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css)
}
/**
- * __bio_blkcg - internal, inconsistent version to get blkcg
- *
- * DO NOT USE.
- * This function is inconsistent and consequently is dangerous to use. The
- * first part of the function returns a blkcg where a reference is owned by the
- * bio. This means it does not need to be rcu protected as it cannot go away
- * with the bio owning a reference to it. However, the latter potentially gets
- * it from task_css(). This can race against task migration and the cgroup
- * dying. It is also semantically different as it must be called rcu protected
- * and is susceptible to failure when trying to get a reference to it.
- * Therefore, it is not ok to assume that *_get() will always succeed on the
- * blkcg returned here.
- */
-static inline struct blkcg *__bio_blkcg(struct bio *bio)
-{
- if (bio && bio->bi_blkg)
- return bio->bi_blkg->blkcg;
- return css_to_blkcg(blkcg_css());
-}
-
-/**
* bio_blkcg - grab the blkcg associated with a bio
* @bio: target bio
*
@@ -291,22 +153,6 @@ static inline bool blk_cgroup_congested(void)
}
/**
- * bio_issue_as_root_blkg - see if this bio needs to be issued as root blkg
- * @return: true if this bio needs to be submitted with the root blkg context.
- *
- * In order to avoid priority inversions we sometimes need to issue a bio as if
- * it were attached to the root blkg, and then backcharge to the actual owning
- * blkg. The idea is we do bio_blkcg() to look up the actual context for the
- * bio and attach the appropriate blkg to the bio. Then we call this helper and
- * if it is true run with the root blkg for that queue and then do any
- * backcharging to the originating cgroup once the io is complete.
- */
-static inline bool bio_issue_as_root_blkg(struct bio *bio)
-{
- return (bio->bi_opf & (REQ_META | REQ_SWAP)) != 0;
-}
-
-/**
* blkcg_parent - get the parent of a blkcg
* @blkcg: blkcg of interest
*
@@ -318,96 +164,6 @@ static inline struct blkcg *blkcg_parent(struct blkcg *blkcg)
}
/**
- * __blkg_lookup - internal version of blkg_lookup()
- * @blkcg: blkcg of interest
- * @q: request_queue of interest
- * @update_hint: whether to update lookup hint with the result or not
- *
- * This is internal version and shouldn't be used by policy
- * implementations. Looks up blkgs for the @blkcg - @q pair regardless of
- * @q's bypass state. If @update_hint is %true, the caller should be
- * holding @q->queue_lock and lookup hint is updated on success.
- */
-static inline struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
- struct request_queue *q,
- bool update_hint)
-{
- struct blkcg_gq *blkg;
-
- if (blkcg == &blkcg_root)
- return q->root_blkg;
-
- blkg = rcu_dereference(blkcg->blkg_hint);
- if (blkg && blkg->q == q)
- return blkg;
-
- return blkg_lookup_slowpath(blkcg, q, update_hint);
-}
-
-/**
- * blkg_lookup - lookup blkg for the specified blkcg - q pair
- * @blkcg: blkcg of interest
- * @q: request_queue of interest
- *
- * Lookup blkg for the @blkcg - @q pair. This function should be called
- * under RCU read lock.
- */
-static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg,
- struct request_queue *q)
-{
- WARN_ON_ONCE(!rcu_read_lock_held());
- return __blkg_lookup(blkcg, q, false);
-}
-
-/**
- * blk_queue_root_blkg - return blkg for the (blkcg_root, @q) pair
- * @q: request_queue of interest
- *
- * Lookup blkg for @q at the root level. See also blkg_lookup().
- */
-static inline struct blkcg_gq *blk_queue_root_blkg(struct request_queue *q)
-{
- return q->root_blkg;
-}
-
-/**
- * blkg_to_pdata - get policy private data
- * @blkg: blkg of interest
- * @pol: policy of interest
- *
- * Return pointer to private data associated with the @blkg-@pol pair.
- */
-static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
- struct blkcg_policy *pol)
-{
- return blkg ? blkg->pd[pol->plid] : NULL;
-}
-
-static inline struct blkcg_policy_data *blkcg_to_cpd(struct blkcg *blkcg,
- struct blkcg_policy *pol)
-{
- return blkcg ? blkcg->cpd[pol->plid] : NULL;
-}
-
-/**
- * pdata_to_blkg - get blkg associated with policy private data
- * @pd: policy private data of interest
- *
- * @pd is policy private data. Determine the blkg it's associated with.
- */
-static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd)
-{
- return pd ? pd->blkg : NULL;
-}
-
-static inline struct blkcg *cpd_to_blkcg(struct blkcg_policy_data *cpd)
-{
- return cpd ? cpd->blkcg : NULL;
-}
-
-extern void blkcg_destroy_blkgs(struct blkcg *blkcg);
-
-/**
* blkcg_pin_online - pin online state
* @blkcg: blkcg of interest
*
@@ -439,231 +195,24 @@ static inline void blkcg_unpin_online(struct blkcg *blkcg)
} while (blkcg);
}
-/**
- * blkg_path - format cgroup path of blkg
- * @blkg: blkg of interest
- * @buf: target buffer
- * @buflen: target buffer length
- *
- * Format the path of the cgroup of @blkg into @buf.
- */
-static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen)
-{
- return cgroup_path(blkg->blkcg->css.cgroup, buf, buflen);
-}
-
-/**
- * blkg_get - get a blkg reference
- * @blkg: blkg to get
- *
- * The caller should be holding an existing reference.
- */
-static inline void blkg_get(struct blkcg_gq *blkg)
-{
- percpu_ref_get(&blkg->refcnt);
-}
-
-/**
- * blkg_tryget - try and get a blkg reference
- * @blkg: blkg to get
- *
- * This is for use when doing an RCU lookup of the blkg. We may be in the midst
- * of freeing this blkg, so we can only use it if the refcnt is not zero.
- */
-static inline bool blkg_tryget(struct blkcg_gq *blkg)
-{
- return blkg && percpu_ref_tryget(&blkg->refcnt);
-}
-
-/**
- * blkg_put - put a blkg reference
- * @blkg: blkg to put
- */
-static inline void blkg_put(struct blkcg_gq *blkg)
-{
- percpu_ref_put(&blkg->refcnt);
-}
-
-/**
- * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants
- * @d_blkg: loop cursor pointing to the current descendant
- * @pos_css: used for iteration
- * @p_blkg: target blkg to walk descendants of
- *
- * Walk @c_blkg through the descendants of @p_blkg. Must be used with RCU
- * read locked. If called under either blkcg or queue lock, the iteration
- * is guaranteed to include all and only online blkgs. The caller may
- * update @pos_css by calling css_rightmost_descendant() to skip subtree.
- * @p_blkg is included in the iteration and the first node to be visited.
- */
-#define blkg_for_each_descendant_pre(d_blkg, pos_css, p_blkg) \
- css_for_each_descendant_pre((pos_css), &(p_blkg)->blkcg->css) \
- if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \
- (p_blkg)->q, false)))
-
-/**
- * blkg_for_each_descendant_post - post-order walk of a blkg's descendants
- * @d_blkg: loop cursor pointing to the current descendant
- * @pos_css: used for iteration
- * @p_blkg: target blkg to walk descendants of
- *
- * Similar to blkg_for_each_descendant_pre() but performs post-order
- * traversal instead. Synchronization rules are the same. @p_blkg is
- * included in the iteration and the last node to be visited.
- */
-#define blkg_for_each_descendant_post(d_blkg, pos_css, p_blkg) \
- css_for_each_descendant_post((pos_css), &(p_blkg)->blkcg->css) \
- if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \
- (p_blkg)->q, false)))
-
-bool __blkcg_punt_bio_submit(struct bio *bio);
-
-static inline bool blkcg_punt_bio_submit(struct bio *bio)
-{
- if (bio->bi_opf & REQ_CGROUP_PUNT)
- return __blkcg_punt_bio_submit(bio);
- else
- return false;
-}
-
-static inline void blkcg_bio_issue_init(struct bio *bio)
-{
- bio_issue_init(&bio->bi_issue, bio_sectors(bio));
-}
-
-static inline void blkcg_use_delay(struct blkcg_gq *blkg)
-{
- if (WARN_ON_ONCE(atomic_read(&blkg->use_delay) < 0))
- return;
- if (atomic_add_return(1, &blkg->use_delay) == 1)
- atomic_inc(&blkg->blkcg->css.cgroup->congestion_count);
-}
-
-static inline int blkcg_unuse_delay(struct blkcg_gq *blkg)
-{
- int old = atomic_read(&blkg->use_delay);
-
- if (WARN_ON_ONCE(old < 0))
- return 0;
- if (old == 0)
- return 0;
-
- /*
- * We do this song and dance because we can race with somebody else
- * adding or removing delay. If we just did an atomic_dec we'd end up
- * negative and we'd already be in trouble. We need to subtract 1 and
- * then check to see if we were the last delay so we can drop the
- * congestion count on the cgroup.
- */
- while (old) {
- int cur = atomic_cmpxchg(&blkg->use_delay, old, old - 1);
- if (cur == old)
- break;
- old = cur;
- }
-
- if (old == 0)
- return 0;
- if (old == 1)
- atomic_dec(&blkg->blkcg->css.cgroup->congestion_count);
- return 1;
-}
-
-/**
- * blkcg_set_delay - Enable allocator delay mechanism with the specified delay amount
- * @blkg: target blkg
- * @delay: delay duration in nsecs
- *
- * When enabled with this function, the delay is not decayed and must be
- * explicitly cleared with blkcg_clear_delay(). Must not be mixed with
- * blkcg_[un]use_delay() and blkcg_add_delay() usages.
- */
-static inline void blkcg_set_delay(struct blkcg_gq *blkg, u64 delay)
-{
- int old = atomic_read(&blkg->use_delay);
-
- /* We only want 1 person setting the congestion count for this blkg. */
- if (!old && atomic_cmpxchg(&blkg->use_delay, old, -1) == old)
- atomic_inc(&blkg->blkcg->css.cgroup->congestion_count);
-
- atomic64_set(&blkg->delay_nsec, delay);
-}
-
-/**
- * blkcg_clear_delay - Disable allocator delay mechanism
- * @blkg: target blkg
- *
- * Disable use_delay mechanism. See blkcg_set_delay().
- */
-static inline void blkcg_clear_delay(struct blkcg_gq *blkg)
-{
- int old = atomic_read(&blkg->use_delay);
-
- /* We only want 1 person clearing the congestion count for this blkg. */
- if (old && atomic_cmpxchg(&blkg->use_delay, old, 0) == old)
- atomic_dec(&blkg->blkcg->css.cgroup->congestion_count);
-}
-
-void blk_cgroup_bio_start(struct bio *bio);
-void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta);
-void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay);
-void blkcg_maybe_throttle_current(void);
#else /* CONFIG_BLK_CGROUP */
struct blkcg {
};
-struct blkg_policy_data {
-};
-
-struct blkcg_policy_data {
-};
-
struct blkcg_gq {
};
-struct blkcg_policy {
-};
-
#define blkcg_root_css ((struct cgroup_subsys_state *)ERR_PTR(-EINVAL))
static inline void blkcg_maybe_throttle_current(void) { }
static inline bool blk_cgroup_congested(void) { return false; }
#ifdef CONFIG_BLOCK
-
static inline void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay) { }
-
-static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; }
-static inline struct blkcg_gq *blk_queue_root_blkg(struct request_queue *q)
-{ return NULL; }
-static inline int blkcg_init_queue(struct request_queue *q) { return 0; }
-static inline void blkcg_exit_queue(struct request_queue *q) { }
-static inline int blkcg_policy_register(struct blkcg_policy *pol) { return 0; }
-static inline void blkcg_policy_unregister(struct blkcg_policy *pol) { }
-static inline int blkcg_activate_policy(struct request_queue *q,
- const struct blkcg_policy *pol) { return 0; }
-static inline void blkcg_deactivate_policy(struct request_queue *q,
- const struct blkcg_policy *pol) { }
-
-static inline struct blkcg *__bio_blkcg(struct bio *bio) { return NULL; }
static inline struct blkcg *bio_blkcg(struct bio *bio) { return NULL; }
+#endif /* CONFIG_BLOCK */
-static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
- struct blkcg_policy *pol) { return NULL; }
-static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; }
-static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; }
-static inline void blkg_get(struct blkcg_gq *blkg) { }
-static inline void blkg_put(struct blkcg_gq *blkg) { }
-
-static inline bool blkcg_punt_bio_submit(struct bio *bio) { return false; }
-static inline void blkcg_bio_issue_init(struct bio *bio) { }
-static inline void blk_cgroup_bio_start(struct bio *bio) { }
-
-#define blk_queue_for_each_rl(rl, q) \
- for ((rl) = &(q)->root_rl; (rl); (rl) = NULL)
-
-#endif /* CONFIG_BLOCK */
#endif /* CONFIG_BLK_CGROUP */
#ifdef CONFIG_BLK_CGROUP_FC_APPID
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index d319ffa59354..7aa5c54901a9 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -917,8 +917,7 @@ static inline void *blk_mq_rq_to_pdu(struct request *rq)
}
#define queue_for_each_hw_ctx(q, hctx, i) \
- for ((i) = 0; (i) < (q)->nr_hw_queues && \
- ({ hctx = (q)->queue_hw_ctx[i]; 1; }); (i)++)
+ xa_for_each(&(q)->hctx_table, (i), (hctx))
#define hctx_for_each_ctx(hctx, ctx, i) \
for ((i) = 0; (i) < (hctx)->nr_ctx && \
@@ -952,8 +951,7 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
struct bio_set *bs, gfp_t gfp_mask,
int (*bio_ctr)(struct bio *, struct bio *, void *), void *data);
void blk_rq_unprep_clone(struct request *rq);
-blk_status_t blk_insert_cloned_request(struct request_queue *q,
- struct request *rq);
+blk_status_t blk_insert_cloned_request(struct request *rq);
struct rq_map_data {
struct page **pages;
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index fe065c394fff..0c3563b45fe9 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -153,6 +153,13 @@ typedef u8 __bitwise blk_status_t;
*/
#define BLK_STS_ZONE_ACTIVE_RESOURCE ((__force blk_status_t)16)
+/*
+ * BLK_STS_OFFLINE is returned from the driver when the target device is offline
+ * or is being taken offline. This could help differentiate the case where a
+ * device is intentionally being shut down from a real I/O error.
+ */
+#define BLK_STS_OFFLINE ((__force blk_status_t)17)
+
/**
* blk_path_error - returns true if error may be path related
* @error: status the request was completed with
@@ -317,7 +324,8 @@ enum {
BIO_TRACE_COMPLETION, /* bio_endio() should trace the final completion
* of this bio. */
BIO_CGROUP_ACCT, /* has been accounted to a cgroup */
- BIO_TRACKED, /* set if bio goes through the rq_qos path */
+ BIO_QOS_THROTTLED, /* bio went through rq_qos throttle path */
+ BIO_QOS_MERGED, /* but went through rq_qos merge path */
BIO_REMAPPED,
BIO_ZONE_WRITE_LOCKED, /* Owns a zoned device zone write lock */
BIO_PERCPU_CACHE, /* can participate in per-cpu alloc cache */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 16b47035e4b0..eb27312a1b8f 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1,9 +1,13 @@
/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Portions Copyright (C) 1992 Drew Eckhardt
+ */
#ifndef _LINUX_BLKDEV_H
#define _LINUX_BLKDEV_H
-#include <linux/sched.h>
-#include <linux/genhd.h>
+#include <linux/types.h>
+#include <linux/blk_types.h>
+#include <linux/device.h>
#include <linux/list.h>
#include <linux/llist.h>
#include <linux/minmax.h>
@@ -12,11 +16,15 @@
#include <linux/wait.h>
#include <linux/bio.h>
#include <linux/gfp.h>
+#include <linux/kdev_t.h>
#include <linux/rcupdate.h>
#include <linux/percpu-refcount.h>
#include <linux/blkzoned.h>
+#include <linux/sched.h>
#include <linux/sbitmap.h>
#include <linux/srcu.h>
+#include <linux/uuid.h>
+#include <linux/xarray.h>
struct module;
struct request_queue;
@@ -33,6 +41,10 @@ struct blk_queue_stats;
struct blk_stat_callback;
struct blk_crypto_profile;
+extern const struct device_type disk_type;
+extern struct device_type part_type;
+extern struct class block_class;
+
/* Must be consistent with blk_mq_poll_stats_bkt() */
#define BLK_MQ_POLL_STATS_BKTS 16
@@ -45,6 +57,145 @@ struct blk_crypto_profile;
*/
#define BLKCG_MAX_POLS 6
+#define DISK_MAX_PARTS 256
+#define DISK_NAME_LEN 32
+
+#define PARTITION_META_INFO_VOLNAMELTH 64
+/*
+ * Enough for the string representation of any kind of UUID plus NULL.
+ * EFI UUID is 36 characters. MSDOS UUID is 11 characters.
+ */
+#define PARTITION_META_INFO_UUIDLTH (UUID_STRING_LEN + 1)
+
+struct partition_meta_info {
+ char uuid[PARTITION_META_INFO_UUIDLTH];
+ u8 volname[PARTITION_META_INFO_VOLNAMELTH];
+};
+
+/**
+ * DOC: genhd capability flags
+ *
+ * ``GENHD_FL_REMOVABLE``: indicates that the block device gives access to
+ * removable media. When set, the device remains present even when media is not
+ * inserted. Shall not be set for devices which are removed entirely when the
+ * media is removed.
+ *
+ * ``GENHD_FL_HIDDEN``: the block device is hidden; it doesn't produce events,
+ * doesn't appear in sysfs, and can't be opened from userspace or using
+ * blkdev_get*. Used for the underlying components of multipath devices.
+ *
+ * ``GENHD_FL_NO_PART``: partition support is disabled. The kernel will not
+ * scan for partitions from add_disk, and users can't add partitions manually.
+ *
+ */
+enum {
+ GENHD_FL_REMOVABLE = 1 << 0,
+ GENHD_FL_HIDDEN = 1 << 1,
+ GENHD_FL_NO_PART = 1 << 2,
+};
+
+enum {
+ DISK_EVENT_MEDIA_CHANGE = 1 << 0, /* media changed */
+ DISK_EVENT_EJECT_REQUEST = 1 << 1, /* eject requested */
+};
+
+enum {
+ /* Poll even if events_poll_msecs is unset */
+ DISK_EVENT_FLAG_POLL = 1 << 0,
+ /* Forward events to udev */
+ DISK_EVENT_FLAG_UEVENT = 1 << 1,
+ /* Block event polling when open for exclusive write */
+ DISK_EVENT_FLAG_BLOCK_ON_EXCL_WRITE = 1 << 2,
+};
+
+struct disk_events;
+struct badblocks;
+
+struct blk_integrity {
+ const struct blk_integrity_profile *profile;
+ unsigned char flags;
+ unsigned char tuple_size;
+ unsigned char interval_exp;
+ unsigned char tag_size;
+};
+
+struct gendisk {
+ /*
+ * major/first_minor/minors should not be set by any new driver, the
+ * block core will take care of allocating them automatically.
+ */
+ int major;
+ int first_minor;
+ int minors;
+
+ char disk_name[DISK_NAME_LEN]; /* name of major driver */
+
+ unsigned short events; /* supported events */
+ unsigned short event_flags; /* flags related to event processing */
+
+ struct xarray part_tbl;
+ struct block_device *part0;
+
+ const struct block_device_operations *fops;
+ struct request_queue *queue;
+ void *private_data;
+
+ int flags;
+ unsigned long state;
+#define GD_NEED_PART_SCAN 0
+#define GD_READ_ONLY 1
+#define GD_DEAD 2
+#define GD_NATIVE_CAPACITY 3
+#define GD_ADDED 4
+
+ struct mutex open_mutex; /* open/close mutex */
+ unsigned open_partitions; /* number of open partitions */
+
+ struct backing_dev_info *bdi;
+ struct kobject *slave_dir;
+#ifdef CONFIG_BLOCK_HOLDER_DEPRECATED
+ struct list_head slave_bdevs;
+#endif
+ struct timer_rand_state *random;
+ atomic_t sync_io; /* RAID */
+ struct disk_events *ev;
+#ifdef CONFIG_BLK_DEV_INTEGRITY
+ struct kobject integrity_kobj;
+#endif /* CONFIG_BLK_DEV_INTEGRITY */
+#if IS_ENABLED(CONFIG_CDROM)
+ struct cdrom_device_info *cdi;
+#endif
+ int node_id;
+ struct badblocks *bb;
+ struct lockdep_map lockdep_map;
+ u64 diskseq;
+};
+
+static inline bool disk_live(struct gendisk *disk)
+{
+ return !inode_unhashed(disk->part0->bd_inode);
+}
+
+/*
+ * The gendisk is refcounted by the part0 block_device, and the bd_device
+ * therein is also used for device model presentation in sysfs.
+ */
+#define dev_to_disk(device) \
+ (dev_to_bdev(device)->bd_disk)
+#define disk_to_dev(disk) \
+ (&((disk)->part0->bd_device))
+
+#if IS_REACHABLE(CONFIG_CDROM)
+#define disk_to_cdi(disk) ((disk)->cdi)
+#else
+#define disk_to_cdi(disk) NULL
+#endif
+
+static inline dev_t disk_devt(struct gendisk *disk)
+{
+ return MKDEV(disk->major, disk->first_minor);
+}
+
static inline int blk_validate_block_size(unsigned long bsize)
{
if (bsize < 512 || bsize > PAGE_SIZE || !is_power_of_2(bsize))
@@ -204,7 +355,7 @@ struct request_queue {
unsigned int queue_depth;
/* hw dispatch queues */
- struct blk_mq_hw_ctx **queue_hw_ctx;
+ struct xarray hctx_table;
unsigned int nr_hw_queues;
/*
@@ -262,6 +413,7 @@ struct request_queue {
#ifdef CONFIG_BLK_INLINE_ENCRYPTION
struct blk_crypto_profile *crypto_profile;
+ struct kobject *crypto_kobject;
#endif
unsigned int rq_timeout;
@@ -596,6 +748,118 @@ static inline unsigned int blk_queue_depth(struct request_queue *q)
#define for_each_bio(_bio) \
for (; _bio; _bio = _bio->bi_next)
+int __must_check device_add_disk(struct device *parent, struct gendisk *disk,
+ const struct attribute_group **groups);
+static inline int __must_check add_disk(struct gendisk *disk)
+{
+ return device_add_disk(NULL, disk, NULL);
+}
+void del_gendisk(struct gendisk *gp);
+void invalidate_disk(struct gendisk *disk);
+void set_disk_ro(struct gendisk *disk, bool read_only);
+void disk_uevent(struct gendisk *disk, enum kobject_action action);
+
+static inline int get_disk_ro(struct gendisk *disk)
+{
+ return disk->part0->bd_read_only ||
+ test_bit(GD_READ_ONLY, &disk->state);
+}
+
+static inline int bdev_read_only(struct block_device *bdev)
+{
+ return bdev->bd_read_only || get_disk_ro(bdev->bd_disk);
+}
+
+bool set_capacity_and_notify(struct gendisk *disk, sector_t size);
+bool disk_force_media_change(struct gendisk *disk, unsigned int events);
+
+void add_disk_randomness(struct gendisk *disk) __latent_entropy;
+void rand_initialize_disk(struct gendisk *disk);
+
+static inline sector_t get_start_sect(struct block_device *bdev)
+{
+ return bdev->bd_start_sect;
+}
+
+static inline sector_t bdev_nr_sectors(struct block_device *bdev)
+{
+ return bdev->bd_nr_sectors;
+}
+
+static inline loff_t bdev_nr_bytes(struct block_device *bdev)
+{
+ return (loff_t)bdev_nr_sectors(bdev) << SECTOR_SHIFT;
+}
+
+static inline sector_t get_capacity(struct gendisk *disk)
+{
+ return bdev_nr_sectors(disk->part0);
+}
+
+static inline u64 sb_bdev_nr_blocks(struct super_block *sb)
+{
+ return bdev_nr_sectors(sb->s_bdev) >>
+ (sb->s_blocksize_bits - SECTOR_SHIFT);
+}
+
+int bdev_disk_changed(struct gendisk *disk, bool invalidate);
+
+struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
+ struct lock_class_key *lkclass);
+void put_disk(struct gendisk *disk);
+struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass);
+
+/**
+ * blk_alloc_disk - allocate a gendisk structure
+ * @node_id: numa node to allocate on
+ *
+ * Allocate and pre-initialize a gendisk structure for use with BIO based
+ * drivers.
+ *
+ * Context: can sleep
+ */
+#define blk_alloc_disk(node_id) \
+({ \
+ static struct lock_class_key __key; \
+ \
+ __blk_alloc_disk(node_id, &__key); \
+})
+void blk_cleanup_disk(struct gendisk *disk);
+
+int __register_blkdev(unsigned int major, const char *name,
+ void (*probe)(dev_t devt));
+#define register_blkdev(major, name) \
+ __register_blkdev(major, name, NULL)
+void unregister_blkdev(unsigned int major, const char *name);
+
+bool bdev_check_media_change(struct block_device *bdev);
+int __invalidate_device(struct block_device *bdev, bool kill_dirty);
+void set_capacity(struct gendisk *disk, sector_t size);
+
+#ifdef CONFIG_BLOCK_HOLDER_DEPRECATED
+int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk);
+void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk);
+int bd_register_pending_holders(struct gendisk *disk);
+#else
+static inline int bd_link_disk_holder(struct block_device *bdev,
+ struct gendisk *disk)
+{
+ return 0;
+}
+static inline void bd_unlink_disk_holder(struct block_device *bdev,
+ struct gendisk *disk)
+{
+}
+static inline int bd_register_pending_holders(struct gendisk *disk)
+{
+ return 0;
+}
+#endif /* CONFIG_BLOCK_HOLDER_DEPRECATED */
+
+dev_t part_devt(struct gendisk *disk, u8 partno);
+void inc_diskseq(struct gendisk *disk);
+dev_t blk_lookup_devt(const char *name, int partno);
+void blk_request_module(dev_t devt);
extern int blk_register_queue(struct gendisk *disk);
extern void blk_unregister_queue(struct gendisk *disk);
@@ -792,14 +1056,11 @@ extern void blk_start_plug(struct blk_plug *);
extern void blk_start_plug_nr_ios(struct blk_plug *, unsigned short);
extern void blk_finish_plug(struct blk_plug *);
-void blk_flush_plug(struct blk_plug *plug, bool from_schedule);
-
-static inline bool blk_needs_flush_plug(struct task_struct *tsk)
+void __blk_flush_plug(struct blk_plug *plug, bool from_schedule);
+static inline void blk_flush_plug(struct blk_plug *plug, bool async)
{
- struct blk_plug *plug = tsk->plug;
-
- return plug &&
- (plug->mq_list || !list_empty(&plug->cb_list));
+ if (plug)
+ __blk_flush_plug(plug, async);
}
int blkdev_issue_flush(struct block_device *bdev);
@@ -825,11 +1086,6 @@ static inline void blk_flush_plug(struct blk_plug *plug, bool async)
{
}
-static inline bool blk_needs_flush_plug(struct task_struct *tsk)
-{
- return false;
-}
-
static inline int blkdev_issue_flush(struct block_device *bdev)
{
return 0;
@@ -1211,6 +1467,7 @@ struct block_device_operations {
void (*unlock_native_capacity) (struct gendisk *);
int (*getgeo)(struct block_device *, struct hd_geometry *);
int (*set_read_only)(struct block_device *bdev, bool ro);
+ void (*free_disk)(struct gendisk *disk);
/* this callback is with swap_lock and sometimes page table lock held */
void (*swap_slot_free_notify) (struct block_device *, unsigned long);
int (*report_zones)(struct gendisk *, sector_t sector,
@@ -1267,7 +1524,7 @@ void bio_end_io_acct_remapped(struct bio *bio, unsigned long start_time,
/**
* bio_end_io_acct - end I/O accounting for bio based drivers
* @bio: bio to end account for
- * @start: start time returned by bio_start_io_acct()
+ * @start_time: start time returned by bio_start_io_acct()
*/
static inline void bio_end_io_acct(struct bio *bio, unsigned long start_time)
{
@@ -1312,6 +1569,7 @@ void invalidate_bdev(struct block_device *bdev);
int sync_blockdev(struct block_device *bdev);
int sync_blockdev_nowait(struct block_device *bdev);
void sync_bdevs(bool wait);
+void printk_all_partitions(void);
#else
static inline void invalidate_bdev(struct block_device *bdev)
{
@@ -1327,7 +1585,11 @@ static inline int sync_blockdev_nowait(struct block_device *bdev)
static inline void sync_bdevs(bool wait)
{
}
-#endif
+static inline void printk_all_partitions(void)
+{
+}
+#endif /* CONFIG_BLOCK */
+
int fsync_bdev(struct block_device *bdev);
int freeze_bdev(struct block_device *bdev);
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
deleted file mode 100644
index 6906a45bc761..000000000000
--- a/include/linux/genhd.h
+++ /dev/null
@@ -1,291 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LINUX_GENHD_H
-#define _LINUX_GENHD_H
-
-/*
- * genhd.h Copyright (C) 1992 Drew Eckhardt
- * Generic hard disk header file by
- * Drew Eckhardt
- *
- * <drew@colorado.edu>
- */
-
-#include <linux/types.h>
-#include <linux/kdev_t.h>
-#include <linux/uuid.h>
-#include <linux/blk_types.h>
-#include <linux/device.h>
-#include <linux/xarray.h>
-
-extern const struct device_type disk_type;
-extern struct device_type part_type;
-extern struct class block_class;
-
-#define DISK_MAX_PARTS 256
-#define DISK_NAME_LEN 32
-
-#define PARTITION_META_INFO_VOLNAMELTH 64
-/*
- * Enough for the string representation of any kind of UUID plus NULL.
- * EFI UUID is 36 characters. MSDOS UUID is 11 characters.
- */
-#define PARTITION_META_INFO_UUIDLTH (UUID_STRING_LEN + 1)
-
-struct partition_meta_info {
- char uuid[PARTITION_META_INFO_UUIDLTH];
- u8 volname[PARTITION_META_INFO_VOLNAMELTH];
-};
-
-/**
- * DOC: genhd capability flags
- *
- * ``GENHD_FL_REMOVABLE``: indicates that the block device gives access to
- * removable media. When set, the device remains present even when media is not
- * inserted. Shall not be set for devices which are removed entirely when the
- * media is removed.
- *
- * ``GENHD_FL_HIDDEN``: the block device is hidden; it doesn't produce events,
- * doesn't appear in sysfs, and can't be opened from userspace or using
- * blkdev_get*. Used for the underlying components of multipath devices.
- *
- * ``GENHD_FL_NO_PART``: partition support is disabled. The kernel will not
- * scan for partitions from add_disk, and users can't add partitions manually.
- *
- */
-enum {
- GENHD_FL_REMOVABLE = 1 << 0,
- GENHD_FL_HIDDEN = 1 << 1,
- GENHD_FL_NO_PART = 1 << 2,
-};
-
-enum {
- DISK_EVENT_MEDIA_CHANGE = 1 << 0, /* media changed */
- DISK_EVENT_EJECT_REQUEST = 1 << 1, /* eject requested */
-};
-
-enum {
- /* Poll even if events_poll_msecs is unset */
- DISK_EVENT_FLAG_POLL = 1 << 0,
- /* Forward events to udev */
- DISK_EVENT_FLAG_UEVENT = 1 << 1,
- /* Block event polling when open for exclusive write */
- DISK_EVENT_FLAG_BLOCK_ON_EXCL_WRITE = 1 << 2,
-};
-
-struct disk_events;
-struct badblocks;
-
-struct blk_integrity {
- const struct blk_integrity_profile *profile;
- unsigned char flags;
- unsigned char tuple_size;
- unsigned char interval_exp;
- unsigned char tag_size;
-};
-
-struct gendisk {
- /*
- * major/first_minor/minors should not be set by any new driver, the
- * block core will take care of allocating them automatically.
- */
- int major;
- int first_minor;
- int minors;
-
- char disk_name[DISK_NAME_LEN]; /* name of major driver */
-
- unsigned short events; /* supported events */
- unsigned short event_flags; /* flags related to event processing */
-
- struct xarray part_tbl;
- struct block_device *part0;
-
- const struct block_device_operations *fops;
- struct request_queue *queue;
- void *private_data;
-
- int flags;
- unsigned long state;
-#define GD_NEED_PART_SCAN 0
-#define GD_READ_ONLY 1
-#define GD_DEAD 2
-#define GD_NATIVE_CAPACITY 3
-
- struct mutex open_mutex; /* open/close mutex */
- unsigned open_partitions; /* number of open partitions */
-
- struct backing_dev_info *bdi;
- struct kobject *slave_dir;
-#ifdef CONFIG_BLOCK_HOLDER_DEPRECATED
- struct list_head slave_bdevs;
-#endif
- struct timer_rand_state *random;
- atomic_t sync_io; /* RAID */
- struct disk_events *ev;
-#ifdef CONFIG_BLK_DEV_INTEGRITY
- struct kobject integrity_kobj;
-#endif /* CONFIG_BLK_DEV_INTEGRITY */
-#if IS_ENABLED(CONFIG_CDROM)
- struct cdrom_device_info *cdi;
-#endif
- int node_id;
- struct badblocks *bb;
- struct lockdep_map lockdep_map;
- u64 diskseq;
-};
-
-static inline bool disk_live(struct gendisk *disk)
-{
- return !inode_unhashed(disk->part0->bd_inode);
-}
-
-/*
- * The gendisk is refcounted by the part0 block_device, and the bd_device
- * therein is also used for device model presentation in sysfs.
- */
-#define dev_to_disk(device) \
- (dev_to_bdev(device)->bd_disk)
-#define disk_to_dev(disk) \
- (&((disk)->part0->bd_device))
-
-#if IS_REACHABLE(CONFIG_CDROM)
-#define disk_to_cdi(disk) ((disk)->cdi)
-#else
-#define disk_to_cdi(disk) NULL
-#endif
-
-static inline dev_t disk_devt(struct gendisk *disk)
-{
- return MKDEV(disk->major, disk->first_minor);
-}
-
-void disk_uevent(struct gendisk *disk, enum kobject_action action);
-
-/* block/genhd.c */
-int __must_check device_add_disk(struct device *parent, struct gendisk *disk,
- const struct attribute_group **groups);
-static inline int __must_check add_disk(struct gendisk *disk)
-{
- return device_add_disk(NULL, disk, NULL);
-}
-extern void del_gendisk(struct gendisk *gp);
-
-void invalidate_disk(struct gendisk *disk);
-
-void set_disk_ro(struct gendisk *disk, bool read_only);
-
-static inline int get_disk_ro(struct gendisk *disk)
-{
- return disk->part0->bd_read_only ||
- test_bit(GD_READ_ONLY, &disk->state);
-}
-
-static inline int bdev_read_only(struct block_device *bdev)
-{
- return bdev->bd_read_only || get_disk_ro(bdev->bd_disk);
-}
-
-extern void disk_block_events(struct gendisk *disk);
-extern void disk_unblock_events(struct gendisk *disk);
-extern void disk_flush_events(struct gendisk *disk, unsigned int mask);
-bool set_capacity_and_notify(struct gendisk *disk, sector_t size);
-bool disk_force_media_change(struct gendisk *disk, unsigned int events);
-
-/* drivers/char/random.c */
-extern void add_disk_randomness(struct gendisk *disk) __latent_entropy;
-extern void rand_initialize_disk(struct gendisk *disk);
-
-static inline sector_t get_start_sect(struct block_device *bdev)
-{
- return bdev->bd_start_sect;
-}
-
-static inline sector_t bdev_nr_sectors(struct block_device *bdev)
-{
- return bdev->bd_nr_sectors;
-}
-
-static inline loff_t bdev_nr_bytes(struct block_device *bdev)
-{
- return (loff_t)bdev_nr_sectors(bdev) << SECTOR_SHIFT;
-}
-
-static inline sector_t get_capacity(struct gendisk *disk)
-{
- return bdev_nr_sectors(disk->part0);
-}
-
-static inline u64 sb_bdev_nr_blocks(struct super_block *sb)
-{
- return bdev_nr_sectors(sb->s_bdev) >>
- (sb->s_blocksize_bits - SECTOR_SHIFT);
-}
-
-int bdev_disk_changed(struct gendisk *disk, bool invalidate);
-void blk_drop_partitions(struct gendisk *disk);
-
-struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
- struct lock_class_key *lkclass);
-extern void put_disk(struct gendisk *disk);
-struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass);
-
-/**
- * blk_alloc_disk - allocate a gendisk structure
- * @node_id: numa node to allocate on
- *
- * Allocate and pre-initialize a gendisk structure for use with BIO based
- * drivers.
- *
- * Context: can sleep
- */
-#define blk_alloc_disk(node_id) \
-({ \
- static struct lock_class_key __key; \
- \
- __blk_alloc_disk(node_id, &__key); \
-})
-void blk_cleanup_disk(struct gendisk *disk);
-
-int __register_blkdev(unsigned int major, const char *name,
- void (*probe)(dev_t devt));
-#define register_blkdev(major, name) \
- __register_blkdev(major, name, NULL)
-void unregister_blkdev(unsigned int major, const char *name);
-
-bool bdev_check_media_change(struct block_device *bdev);
-int __invalidate_device(struct block_device *bdev, bool kill_dirty);
-void set_capacity(struct gendisk *disk, sector_t size);
-
-#ifdef CONFIG_BLOCK_HOLDER_DEPRECATED
-int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk);
-void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk);
-int bd_register_pending_holders(struct gendisk *disk);
-#else
-static inline int bd_link_disk_holder(struct block_device *bdev,
- struct gendisk *disk)
-{
- return 0;
-}
-static inline void bd_unlink_disk_holder(struct block_device *bdev,
- struct gendisk *disk)
-{
-}
-static inline int bd_register_pending_holders(struct gendisk *disk)
-{
- return 0;
-}
-#endif /* CONFIG_BLOCK_HOLDER_DEPRECATED */
-
-dev_t part_devt(struct gendisk *disk, u8 partno);
-void inc_diskseq(struct gendisk *disk);
-dev_t blk_lookup_devt(const char *name, int partno);
-void blk_request_module(dev_t devt);
-#ifdef CONFIG_BLOCK
-void printk_all_partitions(void);
-#else /* CONFIG_BLOCK */
-static inline void printk_all_partitions(void)
-{
-}
-#endif /* CONFIG_BLOCK */
-
-#endif /* _LINUX_GENHD_H */
diff --git a/include/linux/part_stat.h b/include/linux/part_stat.h
index 6f7949b2fd8d..abeba356bc3f 100644
--- a/include/linux/part_stat.h
+++ b/include/linux/part_stat.h
@@ -2,7 +2,7 @@
#ifndef _LINUX_PART_STAT_H
#define _LINUX_PART_STAT_H
-#include <linux/genhd.h>
+#include <linux/blkdev.h>
#include <asm/local.h>
struct disk_stats {
diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
index 95df357ec009..dffeb8281c2d 100644
--- a/include/linux/sbitmap.h
+++ b/include/linux/sbitmap.h
@@ -28,14 +28,9 @@ struct seq_file;
*/
struct sbitmap_word {
/**
- * @depth: Number of bits being used in @word/@cleared
- */
- unsigned long depth;
-
- /**
* @word: word holding free bits
*/
- unsigned long word ____cacheline_aligned_in_smp;
+ unsigned long word;
/**
* @cleared: word holding cleared bits
@@ -140,7 +135,7 @@ struct sbitmap_queue {
/**
* @min_shallow_depth: The minimum shallow depth which may be passed to
- * sbitmap_queue_get_shallow() or __sbitmap_queue_get_shallow().
+ * sbitmap_queue_get_shallow()
*/
unsigned int min_shallow_depth;
};
@@ -164,6 +159,14 @@ struct sbitmap_queue {
int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift,
gfp_t flags, int node, bool round_robin, bool alloc_hint);
+/* sbitmap internal helper */
+static inline unsigned int __map_depth(const struct sbitmap *sb, int index)
+{
+ if (index == sb->map_nr - 1)
+ return sb->depth - (index << sb->shift);
+ return 1U << sb->shift;
+}
+
/**
* sbitmap_free() - Free memory used by a &struct sbitmap.
* @sb: Bitmap to free.
@@ -251,7 +254,7 @@ static inline void __sbitmap_for_each_set(struct sbitmap *sb,
while (scanned < sb->depth) {
unsigned long word;
unsigned int depth = min_t(unsigned int,
- sb->map[index].depth - nr,
+ __map_depth(sb, index) - nr,
sb->depth - scanned);
scanned += depth;
@@ -460,7 +463,7 @@ unsigned long __sbitmap_queue_get_batch(struct sbitmap_queue *sbq, int nr_tags,
unsigned int *offset);
/**
- * __sbitmap_queue_get_shallow() - Try to allocate a free bit from a &struct
+ * sbitmap_queue_get_shallow() - Try to allocate a free bit from a &struct
* sbitmap_queue, limiting the depth used from each word, with preemption
* already disabled.
* @sbq: Bitmap queue to allocate from.
@@ -472,8 +475,8 @@ unsigned long __sbitmap_queue_get_batch(struct sbitmap_queue *sbq, int nr_tags,
*
* Return: Non-negative allocated bit number if successful, -1 otherwise.
*/
-int __sbitmap_queue_get_shallow(struct sbitmap_queue *sbq,
- unsigned int shallow_depth);
+int sbitmap_queue_get_shallow(struct sbitmap_queue *sbq,
+ unsigned int shallow_depth);
/**
* sbitmap_queue_get() - Try to allocate a free bit from a &struct
@@ -496,32 +499,6 @@ static inline int sbitmap_queue_get(struct sbitmap_queue *sbq,
}
/**
- * sbitmap_queue_get_shallow() - Try to allocate a free bit from a &struct
- * sbitmap_queue, limiting the depth used from each word.
- * @sbq: Bitmap queue to allocate from.
- * @cpu: Output parameter; will contain the CPU we ran on (e.g., to be passed to
- * sbitmap_queue_clear()).
- * @shallow_depth: The maximum number of bits to allocate from a single word.
- * See sbitmap_get_shallow().
- *
- * If you call this, make sure to call sbitmap_queue_min_shallow_depth() after
- * initializing @sbq.
- *
- * Return: Non-negative allocated bit number if successful, -1 otherwise.
- */
-static inline int sbitmap_queue_get_shallow(struct sbitmap_queue *sbq,
- unsigned int *cpu,
- unsigned int shallow_depth)
-{
- int nr;
-
- *cpu = get_cpu();
- nr = __sbitmap_queue_get_shallow(sbq, shallow_depth);
- put_cpu();
- return nr;
-}
-
-/**
* sbitmap_queue_min_shallow_depth() - Inform a &struct sbitmap_queue of the
* minimum shallow depth that will be used.
* @sbq: Bitmap queue in question.
diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h
index 6794d7322cbd..e3a4c67794b1 100644
--- a/include/scsi/scsi_cmnd.h
+++ b/include/scsi/scsi_cmnd.h
@@ -13,7 +13,6 @@
#include <scsi/scsi_request.h>
struct Scsi_Host;
-struct scsi_driver;
/*
* MAX_COMMAND_SIZE is:
@@ -159,14 +158,6 @@ static inline void *scsi_cmd_priv(struct scsi_cmnd *cmd)
return cmd + 1;
}
-/* make sure not to use it with passthrough commands */
-static inline struct scsi_driver *scsi_cmd_to_driver(struct scsi_cmnd *cmd)
-{
- struct request *rq = scsi_cmd_to_rq(cmd);
-
- return *(struct scsi_driver **)rq->q->disk->private_data;
-}
-
void scsi_done(struct scsi_cmnd *cmd);
extern void scsi_finish_command(struct scsi_cmnd *cmd);
diff --git a/include/scsi/scsi_driver.h b/include/scsi/scsi_driver.h
index 6dffa8555a39..4ce1988b2ba0 100644
--- a/include/scsi/scsi_driver.h
+++ b/include/scsi/scsi_driver.h
@@ -4,11 +4,10 @@
#include <linux/blk_types.h>
#include <linux/device.h>
+#include <scsi/scsi_cmnd.h>
struct module;
struct request;
-struct scsi_cmnd;
-struct scsi_device;
struct scsi_driver {
struct device_driver gendrv;
@@ -31,4 +30,10 @@ extern int scsi_register_interface(struct class_interface *);
#define scsi_unregister_interface(intf) \
class_interface_unregister(intf)
+/* make sure not to use it with passthrough commands */
+static inline struct scsi_driver *scsi_cmd_to_driver(struct scsi_cmnd *cmd)
+{
+ return to_scsi_driver(cmd->device->sdev_gendev.driver);
+}
+
#endif /* _SCSI_SCSI_DRIVER_H */
diff --git a/include/trace/events/block.h b/include/trace/events/block.h
index 27170e40e8c9..7f4dfbdf12a6 100644
--- a/include/trace/events/block.h
+++ b/include/trace/events/block.h
@@ -100,19 +100,7 @@ TRACE_EVENT(block_rq_requeue,
__entry->nr_sector, 0)
);
-/**
- * block_rq_complete - block IO operation completed by device driver
- * @rq: block operations request
- * @error: status code
- * @nr_bytes: number of completed bytes
- *
- * The block_rq_complete tracepoint event indicates that some portion
- * of operation request has been completed by the device driver. If
- * the @rq->bio is %NULL, then there is absolutely no additional work to
- * do for the request. If @rq->bio is non-NULL then there is
- * additional work required to complete the request.
- */
-TRACE_EVENT(block_rq_complete,
+DECLARE_EVENT_CLASS(block_rq_completion,
TP_PROTO(struct request *rq, blk_status_t error, unsigned int nr_bytes),
@@ -144,6 +132,41 @@ TRACE_EVENT(block_rq_complete,
__entry->nr_sector, __entry->error)
);
+/**
+ * block_rq_complete - block IO operation completed by device driver
+ * @rq: block operations request
+ * @error: status code
+ * @nr_bytes: number of completed bytes
+ *
+ * The block_rq_complete tracepoint event indicates that some portion
+ * of operation request has been completed by the device driver. If
+ * the @rq->bio is %NULL, then there is absolutely no additional work to
+ * do for the request. If @rq->bio is non-NULL then there is
+ * additional work required to complete the request.
+ */
+DEFINE_EVENT(block_rq_completion, block_rq_complete,
+
+ TP_PROTO(struct request *rq, blk_status_t error, unsigned int nr_bytes),
+
+ TP_ARGS(rq, error, nr_bytes)
+);
+
+/**
+ * block_rq_error - block IO operation error reported by device driver
+ * @rq: block operations request
+ * @error: status code
+ * @nr_bytes: number of completed bytes
+ *
+ * The block_rq_error tracepoint event indicates that some portion
+ * of operation request has failed as reported by the device driver.
+ */
+DEFINE_EVENT(block_rq_completion, block_rq_error,
+
+ TP_PROTO(struct request *rq, blk_status_t error, unsigned int nr_bytes),
+
+ TP_ARGS(rq, error, nr_bytes)
+);
+
DECLARE_EVENT_CLASS(block_rq,
TP_PROTO(struct request *rq),
diff --git a/init/do_mounts.c b/init/do_mounts.c
index 762b534978d9..7058e14ad5f7 100644
--- a/init/do_mounts.c
+++ b/init/do_mounts.c
@@ -8,7 +8,6 @@
#include <linux/root_dev.h>
#include <linux/security.h>
#include <linux/delay.h>
-#include <linux/genhd.h>
#include <linux/mount.h>
#include <linux/device.h>
#include <linux/init.h>
diff --git a/kernel/exit.c b/kernel/exit.c
index c303cffe7fdb..192b90a9ce16 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -735,7 +735,7 @@ void __noreturn do_exit(long code)
struct task_struct *tsk = current;
int group_dead;
- WARN_ON(blk_needs_flush_plug(tsk));
+ WARN_ON(tsk->plug);
/*
* If do_dead is called because this processes oopsed, it's possible
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 0ac805b753e5..938d5c78b421 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -28,7 +28,6 @@
#include <linux/gfp.h>
#include <linux/syscore_ops.h>
#include <linux/ctype.h>
-#include <linux/genhd.h>
#include <linux/ktime.h>
#include <linux/security.h>
#include <linux/secretmem.h>
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index c51f5507b34f..91fffdd2c7fb 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -16,7 +16,6 @@
#include <linux/file.h>
#include <linux/delay.h>
#include <linux/bitops.h>
-#include <linux/genhd.h>
#include <linux/device.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
@@ -277,10 +276,9 @@ static int hib_submit_io(int op, int op_flags, pgoff_t page_off, void *addr,
struct bio *bio;
int error = 0;
- bio = bio_alloc(GFP_NOIO | __GFP_HIGH, 1);
+ bio = bio_alloc(hib_resume_bdev, 1, op | op_flags,
+ GFP_NOIO | __GFP_HIGH);
bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9);
- bio_set_dev(bio, hib_resume_bdev);
- bio_set_op_attrs(bio, op, op_flags);
if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
pr_err("Adding page to bio failed at %llu\n",
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9745613d531c..6c56ee848e2a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6353,8 +6353,7 @@ static inline void sched_submit_work(struct task_struct *tsk)
* If we are going to sleep and we have plugged IO queued,
* make sure to submit it to avoid deadlocks.
*/
- if (blk_needs_flush_plug(tsk))
- blk_flush_plug(tsk->plug, true);
+ blk_flush_plug(tsk->plug, true);
}
static void sched_update_worker(struct task_struct *tsk)
@@ -8380,9 +8379,7 @@ int io_schedule_prepare(void)
int old_iowait = current->in_iowait;
current->in_iowait = 1;
- if (current->plug)
- blk_flush_plug(current->plug, true);
-
+ blk_flush_plug(current->plug, true);
return old_iowait;
}
diff --git a/lib/sbitmap.c b/lib/sbitmap.c
index 09d293c30fd2..2eb3de18ded3 100644
--- a/lib/sbitmap.c
+++ b/lib/sbitmap.c
@@ -85,7 +85,6 @@ int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift,
bool alloc_hint)
{
unsigned int bits_per_word;
- unsigned int i;
if (shift < 0)
shift = sbitmap_calculate_shift(depth);
@@ -117,10 +116,6 @@ int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift,
return -ENOMEM;
}
- for (i = 0; i < sb->map_nr; i++) {
- sb->map[i].depth = min(depth, bits_per_word);
- depth -= sb->map[i].depth;
- }
return 0;
}
EXPORT_SYMBOL_GPL(sbitmap_init_node);
@@ -135,11 +130,6 @@ void sbitmap_resize(struct sbitmap *sb, unsigned int depth)
sb->depth = depth;
sb->map_nr = DIV_ROUND_UP(sb->depth, bits_per_word);
-
- for (i = 0; i < sb->map_nr; i++) {
- sb->map[i].depth = min(depth, bits_per_word);
- depth -= sb->map[i].depth;
- }
}
EXPORT_SYMBOL_GPL(sbitmap_resize);
@@ -184,8 +174,8 @@ static int sbitmap_find_bit_in_index(struct sbitmap *sb, int index,
int nr;
do {
- nr = __sbitmap_get_word(&map->word, map->depth, alloc_hint,
- !sb->round_robin);
+ nr = __sbitmap_get_word(&map->word, __map_depth(sb, index),
+ alloc_hint, !sb->round_robin);
if (nr != -1)
break;
if (!sbitmap_deferred_clear(map))
@@ -257,7 +247,9 @@ static int __sbitmap_get_shallow(struct sbitmap *sb,
for (i = 0; i < sb->map_nr; i++) {
again:
nr = __sbitmap_get_word(&sb->map[index].word,
- min(sb->map[index].depth, shallow_depth),
+ min_t(unsigned int,
+ __map_depth(sb, index),
+ shallow_depth),
SB_NR_TO_BIT(sb, alloc_hint), true);
if (nr != -1) {
nr += index << sb->shift;
@@ -315,11 +307,12 @@ static unsigned int __sbitmap_weight(const struct sbitmap *sb, bool set)
for (i = 0; i < sb->map_nr; i++) {
const struct sbitmap_word *word = &sb->map[i];
+ unsigned int word_depth = __map_depth(sb, i);
if (set)
- weight += bitmap_weight(&word->word, word->depth);
+ weight += bitmap_weight(&word->word, word_depth);
else
- weight += bitmap_weight(&word->cleared, word->depth);
+ weight += bitmap_weight(&word->cleared, word_depth);
}
return weight;
}
@@ -367,7 +360,7 @@ void sbitmap_bitmap_show(struct sbitmap *sb, struct seq_file *m)
for (i = 0; i < sb->map_nr; i++) {
unsigned long word = READ_ONCE(sb->map[i].word);
unsigned long cleared = READ_ONCE(sb->map[i].cleared);
- unsigned int word_bits = READ_ONCE(sb->map[i].depth);
+ unsigned int word_bits = __map_depth(sb, i);
word &= ~cleared;
@@ -531,15 +524,16 @@ unsigned long __sbitmap_queue_get_batch(struct sbitmap_queue *sbq, int nr_tags,
for (i = 0; i < sb->map_nr; i++) {
struct sbitmap_word *map = &sb->map[index];
unsigned long get_mask;
+ unsigned int map_depth = __map_depth(sb, index);
sbitmap_deferred_clear(map);
- if (map->word == (1UL << (map->depth - 1)) - 1)
+ if (map->word == (1UL << (map_depth - 1)) - 1)
continue;
- nr = find_first_zero_bit(&map->word, map->depth);
- if (nr + nr_tags <= map->depth) {
+ nr = find_first_zero_bit(&map->word, map_depth);
+ if (nr + nr_tags <= map_depth) {
atomic_long_t *ptr = (atomic_long_t *) &map->word;
- int map_tags = min_t(int, nr_tags, map->depth);
+ int map_tags = min_t(int, nr_tags, map_depth);
unsigned long val, ret;
get_mask = ((1UL << map_tags) - 1) << nr;
@@ -563,14 +557,14 @@ unsigned long __sbitmap_queue_get_batch(struct sbitmap_queue *sbq, int nr_tags,
return 0;
}
-int __sbitmap_queue_get_shallow(struct sbitmap_queue *sbq,
- unsigned int shallow_depth)
+int sbitmap_queue_get_shallow(struct sbitmap_queue *sbq,
+ unsigned int shallow_depth)
{
WARN_ON_ONCE(shallow_depth < sbq->min_shallow_depth);
return sbitmap_get_shallow(&sbq->sb, shallow_depth);
}
-EXPORT_SYMBOL_GPL(__sbitmap_queue_get_shallow);
+EXPORT_SYMBOL_GPL(sbitmap_queue_get_shallow);
void sbitmap_queue_min_shallow_depth(struct sbitmap_queue *sbq,
unsigned int min_shallow_depth)
diff --git a/mm/page_io.c b/mm/page_io.c
index 0bf8e40f4e57..61c792f916fa 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -338,10 +338,10 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
return 0;
}
- bio = bio_alloc(GFP_NOIO, 1);
- bio_set_dev(bio, sis->bdev);
+ bio = bio_alloc(sis->bdev, 1,
+ REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc),
+ GFP_NOIO);
bio->bi_iter.bi_sector = swap_page_sector(page);
- bio->bi_opf = REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc);
bio->bi_end_io = end_write_func;
bio_add_page(bio, page, thp_size(page), 0);
@@ -403,9 +403,7 @@ int swap_readpage(struct page *page, bool synchronous)
}
ret = 0;
- bio = bio_alloc(GFP_KERNEL, 1);
- bio_set_dev(bio, sis->bdev);
- bio->bi_opf = REQ_OP_READ;
+ bio = bio_alloc(sis->bdev, 1, REQ_OP_READ, GFP_KERNEL);
bio->bi_iter.bi_sector = swap_page_sector(page);
bio->bi_end_io = end_swap_bio_read;
bio_add_page(bio, page, thp_size(page), 0);
diff --git a/security/integrity/ima/ima_policy.c b/security/integrity/ima/ima_policy.c
index 2a1f6418b10a..99d23ac4c35d 100644
--- a/security/integrity/ima/ima_policy.c
+++ b/security/integrity/ima/ima_policy.c
@@ -16,7 +16,6 @@
#include <linux/parser.h>
#include <linux/slab.h>
#include <linux/rculist.h>
-#include <linux/genhd.h>
#include <linux/seq_file.h>
#include <linux/ima.h>