summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/ABI/testing/sysfs-block10
-rw-r--r--Documentation/block/00-INDEX2
-rw-r--r--Documentation/block/bfq-iosched.txt531
-rw-r--r--Documentation/block/kyber-iosched.txt14
-rw-r--r--Documentation/block/queue-sysfs.txt11
-rw-r--r--Documentation/blockdev/mflash.txt84
-rw-r--r--Documentation/devicetree/bindings/ata/ahci-dm816.txt21
-rw-r--r--Documentation/devicetree/bindings/hwmon/ads7828.txt25
-rw-r--r--Documentation/devicetree/bindings/hwmon/aspeed-pwm-tacho.txt68
-rw-r--r--Documentation/devicetree/bindings/hwmon/lm87.txt30
-rw-r--r--Documentation/devicetree/bindings/ipmi/aspeed,ast2400-ibt-bmc.txt4
-rw-r--r--Documentation/devicetree/bindings/leds/leds-cpcap.txt29
-rw-r--r--Documentation/devicetree/bindings/leds/leds-mt6323.txt60
-rw-r--r--Documentation/devicetree/bindings/leds/leds-pca9532.txt10
-rw-r--r--Documentation/devicetree/bindings/mailbox/brcm,iproc-flexrm-mbox.txt59
-rw-r--r--Documentation/devicetree/bindings/mailbox/brcm,iproc-pdc-mbox.txt6
-rw-r--r--Documentation/devicetree/bindings/power/reset/gemini-poweroff.txt17
-rw-r--r--Documentation/devicetree/bindings/power/reset/syscon-poweroff.txt11
-rw-r--r--Documentation/devicetree/bindings/power/supply/cpcap-charger.txt37
-rw-r--r--Documentation/devicetree/bindings/power/supply/lego_ev3_battery.txt21
-rw-r--r--Documentation/devicetree/bindings/power/supply/ltc2941.txt6
-rw-r--r--Documentation/devicetree/bindings/power/supply/max8925_battery.txt (renamed from Documentation/devicetree/bindings/power/supply/max8925_batter.txt)0
-rw-r--r--Documentation/devicetree/bindings/vendor-prefixes.txt1
-rw-r--r--Documentation/hwmon/aspeed-pwm-tacho22
-rw-r--r--Documentation/hwmon/tc6542
-rw-r--r--Documentation/lightnvm/pblk.txt21
-rw-r--r--MAINTAINERS9
-rw-r--r--Makefile2
-rw-r--r--arch/arc/Kconfig8
-rw-r--r--arch/arc/include/asm/atomic.h3
-rw-r--r--arch/arc/include/asm/entry-arcv2.h10
-rw-r--r--arch/arc/include/asm/ptrace.h4
-rw-r--r--arch/arc/kernel/setup.c30
-rw-r--r--arch/arm/configs/multi_v7_defconfig1
-rw-r--r--arch/arm/configs/pxa_defconfig3
-rw-r--r--arch/mips/Makefile6
-rw-r--r--arch/mips/include/asm/asm-prototypes.h1
-rw-r--r--arch/mips/kernel/cevt-r4k.c2
-rw-r--r--arch/mips/kernel/elf.c2
-rw-r--r--arch/mips/kernel/kgdb.c48
-rw-r--r--arch/mips/kernel/perf_event_mipsxx.c9
-rw-r--r--arch/mips/kernel/relocate.c2
-rw-r--r--arch/mips/kernel/smp-cps.c3
-rw-r--r--arch/mips/mti-malta/malta-int.c11
-rw-r--r--arch/mips/pci/pci-legacy.c2
-rw-r--r--arch/powerpc/configs/85xx-hw.config3
-rw-r--r--arch/powerpc/configs/85xx/ge_imp3a_defconfig1
-rw-r--r--arch/powerpc/configs/85xx/xes_mpc85xx_defconfig1
-rw-r--r--arch/powerpc/configs/cell_defconfig1
-rw-r--r--arch/powerpc/configs/pasemi_defconfig1
-rw-r--r--arch/powerpc/configs/ppc64_defconfig1
-rw-r--r--arch/powerpc/configs/ppc64e_defconfig1
-rw-r--r--arch/powerpc/configs/ppc6xx_defconfig3
-rw-r--r--arch/sparc/Kconfig1
-rw-r--r--arch/sparc/include/asm/ptrace.h3
-rw-r--r--arch/sparc/include/uapi/asm/unistd.h8
-rw-r--r--arch/sparc/kernel/ptrace_64.c36
-rw-r--r--arch/sparc/kernel/systbls_32.S1
-rw-r--r--arch/sparc/kernel/systbls_64.S2
-rw-r--r--arch/tile/configs/tilegx_defconfig1
-rw-r--r--arch/tile/configs/tilepro_defconfig1
-rw-r--r--arch/x86/Makefile8
-rw-r--r--arch/x86/kernel/ftrace.c18
-rw-r--r--arch/x86/kernel/nmi.c11
-rw-r--r--arch/x86/lib/delay.c7
-rw-r--r--block/Kconfig12
-rw-r--r--block/Kconfig.iosched30
-rw-r--r--block/Makefile3
-rw-r--r--block/bfq-cgroup.c1139
-rw-r--r--block/bfq-iosched.c5047
-rw-r--r--block/bfq-iosched.h941
-rw-r--r--block/bfq-wf2q.c1616
-rw-r--r--block/bio.c19
-rw-r--r--block/blk-cgroup.c123
-rw-r--r--block/blk-core.c143
-rw-r--r--block/blk-exec.c11
-rw-r--r--block/blk-flush.c5
-rw-r--r--block/blk-integrity.c24
-rw-r--r--block/blk-lib.c78
-rw-r--r--block/blk-merge.c17
-rw-r--r--block/blk-mq-debugfs.c331
-rw-r--r--block/blk-mq-pci.c2
-rw-r--r--block/blk-mq-sched.c103
-rw-r--r--block/blk-mq-sched.h18
-rw-r--r--block/blk-mq-sysfs.c61
-rw-r--r--block/blk-mq-tag.c5
-rw-r--r--block/blk-mq.c565
-rw-r--r--block/blk-mq.h16
-rw-r--r--block/blk-settings.c3
-rw-r--r--block/blk-stat.c323
-rw-r--r--block/blk-stat.h204
-rw-r--r--block/blk-sysfs.c82
-rw-r--r--block/blk-throttle.c985
-rw-r--r--block/blk-timeout.c1
-rw-r--r--block/blk-wbt.c95
-rw-r--r--block/blk-wbt.h16
-rw-r--r--block/blk.h15
-rw-r--r--block/bsg-lib.c8
-rw-r--r--block/bsg.c12
-rw-r--r--block/cfq-iosched.c17
-rw-r--r--block/compat_ioctl.c2
-rw-r--r--block/elevator.c3
-rw-r--r--block/genhd.c13
-rw-r--r--block/ioctl.c4
-rw-r--r--block/ioprio.c12
-rw-r--r--block/kyber-iosched.c719
-rw-r--r--block/partition-generic.c1
-rw-r--r--block/scsi_ioctl.c23
-rw-r--r--block/sed-opal.c153
-rw-r--r--block/t10-pi.c8
-rw-r--r--drivers/acpi/Kconfig5
-rw-r--r--drivers/acpi/acpi_extlog.c8
-rw-r--r--drivers/acpi/acpi_ipmi.c3
-rw-r--r--drivers/ata/Kconfig18
-rw-r--r--drivers/ata/Makefile2
-rw-r--r--drivers/ata/ahci_dm816.c200
-rw-r--r--drivers/ata/ahci_octeon.c5
-rw-r--r--drivers/ata/libata-core.c8
-rw-r--r--drivers/ata/libata-scsi.c140
-rw-r--r--drivers/ata/pata_at91.c503
-rw-r--r--drivers/ata/pata_macio.c2
-rw-r--r--drivers/ata/pata_mpc52xx.c2
-rw-r--r--drivers/ata/pata_of_platform.c2
-rw-r--r--drivers/ata/sata_fsl.c2
-rw-r--r--drivers/ata/sata_mv.c2
-rw-r--r--drivers/block/Kconfig47
-rw-r--r--drivers/block/Makefile3
-rw-r--r--drivers/block/ataflop.c12
-rw-r--r--drivers/block/brd.c54
-rw-r--r--drivers/block/cciss.c42
-rw-r--r--drivers/block/drbd/drbd_debugfs.c3
-rw-r--r--drivers/block/drbd/drbd_int.h6
-rw-r--r--drivers/block/drbd/drbd_main.c5
-rw-r--r--drivers/block/drbd/drbd_nl.c9
-rw-r--r--drivers/block/drbd/drbd_receiver.c105
-rw-r--r--drivers/block/drbd/drbd_req.c13
-rw-r--r--drivers/block/drbd/drbd_worker.c4
-rw-r--r--drivers/block/floppy.c10
-rw-r--r--drivers/block/hd.c803
-rw-r--r--drivers/block/loop.c38
-rw-r--r--drivers/block/loop.h1
-rw-r--r--drivers/block/mg_disk.c1112
-rw-r--r--drivers/block/mtip32xx/mtip32xx.c58
-rw-r--r--drivers/block/mtip32xx/mtip32xx.h1
-rw-r--r--drivers/block/nbd.c1417
-rw-r--r--drivers/block/null_blk.c22
-rw-r--r--drivers/block/osdblk.c693
-rw-r--r--drivers/block/paride/pcd.c57
-rw-r--r--drivers/block/paride/pd.c57
-rw-r--r--drivers/block/paride/pf.c57
-rw-r--r--drivers/block/pktcdvd.c2
-rw-r--r--drivers/block/rbd.c3
-rw-r--r--drivers/block/rsxx/dev.c1
-rw-r--r--drivers/block/swim.c55
-rw-r--r--drivers/block/swim3.c4
-rw-r--r--drivers/block/virtio_blk.c21
-rw-r--r--drivers/block/xen-blkfront.c41
-rw-r--r--drivers/block/zram/zram_drv.c13
-rw-r--r--drivers/cdrom/cdrom.c3
-rw-r--r--drivers/char/ipmi/bt-bmc.c1
-rw-r--r--drivers/char/ipmi/ipmi_si_intf.c19
-rw-r--r--drivers/char/ipmi/ipmi_ssif.c9
-rw-r--r--drivers/char/ipmi/ipmi_watchdog.c8
-rw-r--r--drivers/clk/sunxi-ng/Kconfig2
-rw-r--r--drivers/edac/Kconfig129
-rw-r--r--drivers/edac/Makefile8
-rw-r--r--drivers/edac/altera_edac.c22
-rw-r--r--drivers/edac/edac_mc.c99
-rw-r--r--drivers/edac/edac_stub.c68
-rw-r--r--drivers/edac/pnd2_edac.c2
-rw-r--r--drivers/edac/sb_edac.c4
-rw-r--r--drivers/edac/skx_edac.c2
-rw-r--r--drivers/edac/thunderx_edac.c2174
-rw-r--r--drivers/extcon/devres.c61
-rw-r--r--drivers/extcon/extcon.c66
-rw-r--r--drivers/extcon/extcon.h3
-rw-r--r--drivers/hsi/clients/ssi_protocol.c5
-rw-r--r--drivers/hwmon/Kconfig19
-rw-r--r--drivers/hwmon/Makefile2
-rw-r--r--drivers/hwmon/ad7414.c7
-rw-r--r--drivers/hwmon/adc128d818.c7
-rw-r--r--drivers/hwmon/ads1015.c22
-rw-r--r--drivers/hwmon/ads7828.c39
-rw-r--r--drivers/hwmon/adt7475.c44
-rw-r--r--drivers/hwmon/aspeed-pwm-tacho.c835
-rw-r--r--drivers/hwmon/dell-smm-hwmon.c7
-rw-r--r--drivers/hwmon/hwmon.c2
-rw-r--r--drivers/hwmon/ina209.c11
-rw-r--r--drivers/hwmon/ina2xx.c35
-rw-r--r--drivers/hwmon/lm63.c23
-rw-r--r--drivers/hwmon/lm75.c98
-rw-r--r--drivers/hwmon/lm85.c56
-rw-r--r--drivers/hwmon/lm87.c37
-rw-r--r--drivers/hwmon/lm90.c100
-rw-r--r--drivers/hwmon/lm95245.c8
-rw-r--r--drivers/hwmon/max6697.c52
-rw-r--r--drivers/hwmon/pmbus/adm1275.c4
-rw-r--r--drivers/hwmon/pmbus/ucd9000.c39
-rw-r--r--drivers/hwmon/pmbus/ucd9200.c48
-rw-r--r--drivers/hwmon/stts751.c7
-rw-r--r--drivers/hwmon/tmp102.c7
-rw-r--r--drivers/hwmon/tmp103.c24
-rw-r--r--drivers/hwmon/tmp421.c35
-rw-r--r--drivers/hwmon/twl4030-madc-hwmon.c118
-rw-r--r--drivers/hwmon/w83627ehf.c35
-rw-r--r--drivers/ide/ide-atapi.c11
-rw-r--r--drivers/ide/ide-cd.c21
-rw-r--r--drivers/ide/ide-cd_ioctl.c3
-rw-r--r--drivers/ide/ide-devsets.c8
-rw-r--r--drivers/ide/ide-disk.c3
-rw-r--r--drivers/ide/ide-dma.c2
-rw-r--r--drivers/ide/ide-eh.c36
-rw-r--r--drivers/ide/ide-floppy.c10
-rw-r--r--drivers/ide/ide-io.c10
-rw-r--r--drivers/ide/ide-ioctls.c7
-rw-r--r--drivers/ide/ide-park.c3
-rw-r--r--drivers/ide/ide-pm.c9
-rw-r--r--drivers/ide/ide-tape.c4
-rw-r--r--drivers/ide/ide-taskfile.c8
-rw-r--r--drivers/input/serio/i8042-x86ia64io.h7
-rw-r--r--drivers/leds/Kconfig27
-rw-r--r--drivers/leds/Makefile3
-rw-r--r--drivers/leds/led-class.c26
-rw-r--r--drivers/leds/leds-cpcap.c239
-rw-r--r--drivers/leds/leds-gpio.c12
-rw-r--r--drivers/leds/leds-lp3952.c18
-rw-r--r--drivers/leds/leds-mt6323.c502
-rw-r--r--drivers/leds/leds-pca9532.c27
-rw-r--r--drivers/leds/trigger/ledtrig-cpu.c33
-rw-r--r--drivers/lightnvm/Kconfig9
-rw-r--r--drivers/lightnvm/Makefile5
-rw-r--r--drivers/lightnvm/core.c147
-rw-r--r--drivers/lightnvm/pblk-cache.c114
-rw-r--r--drivers/lightnvm/pblk-core.c1667
-rw-r--r--drivers/lightnvm/pblk-gc.c555
-rw-r--r--drivers/lightnvm/pblk-init.c962
-rw-r--r--drivers/lightnvm/pblk-map.c136
-rw-r--r--drivers/lightnvm/pblk-rb.c852
-rw-r--r--drivers/lightnvm/pblk-read.c529
-rw-r--r--drivers/lightnvm/pblk-recovery.c998
-rw-r--r--drivers/lightnvm/pblk-rl.c184
-rw-r--r--drivers/lightnvm/pblk-sysfs.c507
-rw-r--r--drivers/lightnvm/pblk-write.c414
-rw-r--r--drivers/lightnvm/pblk.h1121
-rw-r--r--drivers/lightnvm/rrpc.c25
-rw-r--r--drivers/mailbox/Kconfig18
-rw-r--r--drivers/mailbox/Makefile2
-rw-r--r--drivers/mailbox/bcm-flexrm-mailbox.c1595
-rw-r--r--drivers/mailbox/bcm-pdc-mailbox.c61
-rw-r--r--drivers/mailbox/hi6220-mailbox.c2
-rw-r--r--drivers/mailbox/mailbox-xgene-slimpro.c2
-rw-r--r--drivers/mailbox/mailbox.c19
-rw-r--r--drivers/md/dm-cache-target.c1
-rw-r--r--drivers/md/dm-core.h1
-rw-r--r--drivers/md/dm-crypt.c1
-rw-r--r--drivers/md/dm-io.c18
-rw-r--r--drivers/md/dm-kcopyd.c6
-rw-r--r--drivers/md/dm-linear.c1
-rw-r--r--drivers/md/dm-mpath.c3
-rw-r--r--drivers/md/dm-raid.c6
-rw-r--r--drivers/md/dm-raid1.c1
-rw-r--r--drivers/md/dm-rq.c15
-rw-r--r--drivers/md/dm-stripe.c2
-rw-r--r--drivers/md/dm-table.c49
-rw-r--r--drivers/md/dm-thin.c2
-rw-r--r--drivers/md/dm.c32
-rw-r--r--drivers/md/linear.c1
-rw-r--r--drivers/md/md.h7
-rw-r--r--drivers/md/multipath.c1
-rw-r--r--drivers/md/raid0.c2
-rw-r--r--drivers/md/raid1.c4
-rw-r--r--drivers/md/raid10.c1
-rw-r--r--drivers/md/raid5.c53
-rw-r--r--drivers/mmc/core/queue.c2
-rw-r--r--drivers/mtd/mtdcore.c23
-rw-r--r--drivers/mtd/mtdsuper.c6
-rw-r--r--drivers/mtd/ubi/block.c2
-rw-r--r--drivers/net/bonding/bond_main.c2
-rw-r--r--drivers/net/can/usb/Kconfig2
-rw-r--r--drivers/net/can/usb/gs_usb.c17
-rw-r--r--drivers/net/can/usb/peak_usb/pcan_usb_core.c2
-rw-r--r--drivers/net/can/usb/peak_usb/pcan_usb_core.h2
-rw-r--r--drivers/net/can/usb/peak_usb/pcan_usb_fd.c72
-rw-r--r--drivers/net/dsa/b53/b53_common.c37
-rw-r--r--drivers/net/dsa/b53/b53_regs.h5
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en.h2
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c1
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en_main.c4
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en_tc.c87
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c36
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/main.c2
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/uar.c1
-rw-r--r--drivers/net/ethernet/qlogic/qed/qed_dcbx.c10
-rw-r--r--drivers/net/ethernet/renesas/ravb_main.c7
-rw-r--r--drivers/net/ethernet/sfc/efx.h5
-rw-r--r--drivers/net/ethernet/sfc/workarounds.h1
-rw-r--r--drivers/net/ethernet/ti/Kconfig4
-rw-r--r--drivers/net/ethernet/toshiba/tc35815.c2
-rw-r--r--drivers/net/hyperv/hyperv_net.h1
-rw-r--r--drivers/net/hyperv/netvsc.c9
-rw-r--r--drivers/net/macsec.c27
-rw-r--r--drivers/net/macvlan.c11
-rw-r--r--drivers/net/phy/micrel.c11
-rw-r--r--drivers/net/phy/phy.c40
-rw-r--r--drivers/net/team/team.c8
-rw-r--r--drivers/net/usb/Kconfig2
-rw-r--r--drivers/net/usb/hso.c16
-rw-r--r--drivers/net/usb/plusb.c15
-rw-r--r--drivers/nvme/host/core.c132
-rw-r--r--drivers/nvme/host/fabrics.c28
-rw-r--r--drivers/nvme/host/fabrics.h10
-rw-r--r--drivers/nvme/host/fc.c1300
-rw-r--r--drivers/nvme/host/lightnvm.c51
-rw-r--r--drivers/nvme/host/nvme.h48
-rw-r--r--drivers/nvme/host/pci.c223
-rw-r--r--drivers/nvme/host/rdma.c154
-rw-r--r--drivers/nvme/host/scsi.c15
-rw-r--r--drivers/nvme/target/admin-cmd.c31
-rw-r--r--drivers/nvme/target/core.c21
-rw-r--r--drivers/nvme/target/discovery.c19
-rw-r--r--drivers/nvme/target/fabrics-cmd.c36
-rw-r--r--drivers/nvme/target/fc.c282
-rw-r--r--drivers/nvme/target/fcloop.c201
-rw-r--r--drivers/nvme/target/io-cmd.c24
-rw-r--r--drivers/nvme/target/loop.c92
-rw-r--r--drivers/nvme/target/nvmet.h11
-rw-r--r--drivers/nvme/target/rdma.c47
-rw-r--r--drivers/platform/x86/Kconfig8
-rw-r--r--drivers/platform/x86/Makefile1
-rw-r--r--drivers/platform/x86/dell-laptop.c28
-rw-r--r--drivers/platform/x86/dell-wmi-led.c (renamed from drivers/leds/dell-led.c)141
-rw-r--r--drivers/power/reset/Kconfig9
-rw-r--r--drivers/power/reset/Makefile1
-rw-r--r--drivers/power/reset/gemini-poweroff.c160
-rw-r--r--drivers/power/reset/syscon-poweroff.c19
-rw-r--r--drivers/power/supply/Kconfig15
-rw-r--r--drivers/power/supply/Makefile2
-rw-r--r--drivers/power/supply/ab8500_bmdata.c8
-rw-r--r--drivers/power/supply/bq24190_charger.c427
-rw-r--r--drivers/power/supply/bq25890_charger.c2
-rw-r--r--drivers/power/supply/charger-manager.c35
-rw-r--r--drivers/power/supply/cpcap-charger.c681
-rw-r--r--drivers/power/supply/lego_ev3_battery.c228
-rw-r--r--drivers/power/supply/lp8788-charger.c2
-rw-r--r--drivers/power/supply/ltc2941-battery-gauge.c19
-rw-r--r--drivers/power/supply/max17040_battery.c8
-rw-r--r--drivers/power/supply/sbs-charger.c5
-rw-r--r--drivers/power/supply/tps65217_charger.c4
-rw-r--r--drivers/power/supply/twl4030_charger.c4
-rw-r--r--drivers/sbus/char/jsflash.c50
-rw-r--r--drivers/scsi/Makefile1
-rw-r--r--drivers/scsi/lpfc/lpfc.h5
-rw-r--r--drivers/scsi/lpfc/lpfc_attr.c10
-rw-r--r--drivers/scsi/lpfc/lpfc_bsg.c4
-rw-r--r--drivers/scsi/lpfc/lpfc_crtn.h9
-rw-r--r--drivers/scsi/lpfc/lpfc_ct.c68
-rw-r--r--drivers/scsi/lpfc/lpfc_debugfs.c67
-rw-r--r--drivers/scsi/lpfc/lpfc_disc.h1
-rw-r--r--drivers/scsi/lpfc/lpfc_els.c68
-rw-r--r--drivers/scsi/lpfc/lpfc_hbadisc.c133
-rw-r--r--drivers/scsi/lpfc/lpfc_hw.h3
-rw-r--r--drivers/scsi/lpfc/lpfc_hw4.h4
-rw-r--r--drivers/scsi/lpfc/lpfc_init.c202
-rw-r--r--drivers/scsi/lpfc/lpfc_mbox.c7
-rw-r--r--drivers/scsi/lpfc/lpfc_nportdisc.c8
-rw-r--r--drivers/scsi/lpfc/lpfc_nvme.c157
-rw-r--r--drivers/scsi/lpfc/lpfc_nvme.h11
-rw-r--r--drivers/scsi/lpfc/lpfc_nvmet.c433
-rw-r--r--drivers/scsi/lpfc/lpfc_nvmet.h13
-rw-r--r--drivers/scsi/lpfc/lpfc_sli.c22
-rw-r--r--drivers/scsi/lpfc/lpfc_sli4.h2
-rw-r--r--drivers/scsi/lpfc/lpfc_version.h2
-rw-r--r--drivers/scsi/lpfc/lpfc_vport.c3
-rw-r--r--drivers/scsi/osd/osd_initiator.c9
-rw-r--r--drivers/scsi/osst.c4
-rw-r--r--drivers/scsi/qla2xxx/qla_bsg.c6
-rw-r--r--drivers/scsi/scsi_debugfs.c13
-rw-r--r--drivers/scsi/scsi_debugfs.h4
-rw-r--r--drivers/scsi/scsi_error.c2
-rw-r--r--drivers/scsi/scsi_lib.c29
-rw-r--r--drivers/scsi/scsi_transport_sas.c4
-rw-r--r--drivers/scsi/sd.c259
-rw-r--r--drivers/scsi/sd.h8
-rw-r--r--drivers/scsi/sd_zbc.c1
-rw-r--r--drivers/scsi/sg.c4
-rw-r--r--drivers/scsi/st.c8
-rw-r--r--drivers/staging/lustre/lustre/include/lustre_disk.h4
-rw-r--r--drivers/staging/lustre/lustre/llite/llite_lib.c24
-rw-r--r--drivers/target/iscsi/iscsi_target_configfs.c46
-rw-r--r--drivers/target/target_core_device.c2
-rw-r--r--drivers/target/target_core_pscsi.c4
-rw-r--r--fs/9p/v9fs.c10
-rw-r--r--fs/9p/v9fs.h1
-rw-r--r--fs/9p/vfs_super.c15
-rw-r--r--fs/afs/internal.h1
-rw-r--r--fs/afs/super.c5
-rw-r--r--fs/afs/volume.c8
-rw-r--r--fs/block_dev.c28
-rw-r--r--fs/btrfs/ctree.h1
-rw-r--r--fs/btrfs/disk-io.c36
-rw-r--r--fs/btrfs/qgroup.c11
-rw-r--r--fs/btrfs/super.c7
-rw-r--r--fs/ceph/addr.c6
-rw-r--r--fs/ceph/debugfs.c2
-rw-r--r--fs/ceph/inode.c22
-rw-r--r--fs/ceph/super.c35
-rw-r--r--fs/ceph/super.h2
-rw-r--r--fs/cifs/cifs_fs_sb.h1
-rw-r--r--fs/cifs/cifsfs.c7
-rw-r--r--fs/cifs/connect.c10
-rw-r--r--fs/coda/inode.c11
-rw-r--r--fs/dax.c2
-rw-r--r--fs/ecryptfs/ecryptfs_kernel.h1
-rw-r--r--fs/ecryptfs/main.c4
-rw-r--r--fs/exofs/exofs.h1
-rw-r--r--fs/exofs/super.c17
-rw-r--r--fs/fuse/dev.c13
-rw-r--r--fs/fuse/fuse_i.h6
-rw-r--r--fs/fuse/inode.c42
-rw-r--r--fs/gfs2/ops_fstype.c8
-rw-r--r--fs/ncpfs/inode.c8
-rw-r--r--fs/ncpfs/ncp_fs_sb.h1
-rw-r--r--fs/nfs/client.c10
-rw-r--r--fs/nfs/direct.c27
-rw-r--r--fs/nfs/internal.h6
-rw-r--r--fs/nfs/super.c33
-rw-r--r--fs/nfs/write.c13
-rw-r--r--fs/nfsd/blocklayout.c7
-rw-r--r--fs/nfsd/nfs3xdr.c13
-rw-r--r--fs/nfsd/nfssvc.c36
-rw-r--r--fs/nfsd/nfsxdr.c10
-rw-r--r--fs/nilfs2/super.c2
-rw-r--r--fs/orangefs/orangefs-bufmap.c4
-rw-r--r--fs/stat.c11
-rw-r--r--fs/super.c53
-rw-r--r--fs/ubifs/super.c25
-rw-r--r--fs/ubifs/ubifs.h3
-rw-r--r--fs/xfs/xfs_bmap_util.c2
-rw-r--r--include/linux/ata.h5
-rw-r--r--include/linux/backing-dev-defs.h8
-rw-r--r--include/linux/backing-dev.h16
-rw-r--r--include/linux/bio.h2
-rw-r--r--include/linux/blk-mq.h21
-rw-r--r--include/linux/blk_types.h47
-rw-r--r--include/linux/blkdev.h76
-rw-r--r--include/linux/cgroup-defs.h12
-rw-r--r--include/linux/cgroup.h8
-rw-r--r--include/linux/coda_psdev.h1
-rw-r--r--include/linux/cpuset.h4
-rw-r--r--include/linux/dell-led.h6
-rw-r--r--include/linux/device-mapper.h11
-rw-r--r--include/linux/edac.h30
-rw-r--r--include/linux/elevator.h4
-rw-r--r--include/linux/extcon.h21
-rw-r--r--include/linux/fs.h3
-rw-r--r--include/linux/genhd.h12
-rw-r--r--include/linux/hwmon.h2
-rw-r--r--include/linux/ide.h2
-rw-r--r--include/linux/inet.h6
-rw-r--r--include/linux/kobject.h2
-rw-r--r--include/linux/leds-pca9532.h4
-rw-r--r--include/linux/leds.h14
-rw-r--r--include/linux/lightnvm.h13
-rw-r--r--include/linux/mailbox/brcm-message.h14
-rw-r--r--include/linux/mfd/motorola-cpcap.h3
-rw-r--r--include/linux/mg_disk.h45
-rw-r--r--include/linux/mtd/mtd.h5
-rw-r--r--include/linux/nfs_fs_sb.h1
-rw-r--r--include/linux/nvme-fc-driver.h104
-rw-r--r--include/linux/nvme-fc.h68
-rw-r--r--include/linux/nvme.h13
-rw-r--r--include/linux/phy.h1
-rw-r--r--include/linux/power/bq24190_charger.h16
-rw-r--r--include/linux/sbitmap.h55
-rw-r--r--include/linux/t10-pi.h8
-rw-r--r--include/linux/writeback.h1
-rw-r--r--include/scsi/scsi_request.h2
-rw-r--r--include/trace/events/block.h61
-rw-r--r--include/uapi/linux/lightnvm.h4
-rw-r--r--include/uapi/linux/nbd-netlink.h98
-rw-r--r--include/uapi/linux/nbd.h6
-rw-r--r--include/uapi/linux/stat.h8
-rw-r--r--kernel/cgroup/cgroup-internal.h7
-rw-r--r--kernel/cgroup/cgroup-v1.c20
-rw-r--r--kernel/cgroup/cgroup.c40
-rw-r--r--kernel/cgroup/cpuset.c11
-rw-r--r--kernel/cgroup/namespace.c2
-rw-r--r--kernel/sched/core.c4
-rw-r--r--kernel/sched/cputime.c27
-rw-r--r--kernel/sched/sched.h9
-rw-r--r--kernel/trace/blktrace.c35
-rw-r--r--kernel/workqueue.c5
-rw-r--r--lib/iov_iter.c2
-rw-r--r--lib/kobject.c5
-rw-r--r--lib/sbitmap.c75
-rw-r--r--mm/backing-dev.c186
-rw-r--r--net/9p/client.c4
-rw-r--r--net/bridge/br_device.c1
-rw-r--r--net/bridge/br_if.c1
-rw-r--r--net/core/dev.c3
-rw-r--r--net/core/skbuff.c2
-rw-r--r--net/core/utils.c103
-rw-r--r--net/ipv4/af_inet.c3
-rw-r--r--net/ipv4/route.c3
-rw-r--r--net/ipv4/tcp_cong.c11
-rw-r--r--net/ipv4/tcp_output.c19
-rw-r--r--net/ipv4/udp_offload.c3
-rw-r--r--net/ipv6/addrconf.c14
-rw-r--r--net/ipv6/af_inet6.c6
-rw-r--r--net/ipv6/exthdrs.c4
-rw-r--r--net/ipv6/ip6_tunnel.c34
-rw-r--r--net/ipv6/ndisc.c3
-rw-r--r--net/ipv6/raw.c3
-rw-r--r--net/packet/af_packet.c2
-rw-r--r--net/tipc/socket.c40
-rw-r--r--net/xfrm/xfrm_input.c2
-rw-r--r--net/xfrm/xfrm_policy.c4
-rw-r--r--sound/core/seq/seq_lock.c9
-rw-r--r--sound/firewire/lib.h2
-rw-r--r--sound/firewire/oxfw/oxfw.c4
-rw-r--r--sound/pci/hda/dell_wmi_helper.c30
-rw-r--r--sound/soc/intel/boards/bytcr_rt5640.c4
-rw-r--r--sound/soc/intel/boards/bytcr_rt5651.c2
-rw-r--r--sound/soc/soc-topology.c1
-rw-r--r--sound/soc/sti/uniperif.h1
-rw-r--r--sound/soc/sti/uniperif_player.c35
-rw-r--r--sound/soc/sti/uniperif_reader.c24
527 files changed, 35908 insertions, 8649 deletions
diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block
index 2da04ce6aeef..dea212db9df3 100644
--- a/Documentation/ABI/testing/sysfs-block
+++ b/Documentation/ABI/testing/sysfs-block
@@ -213,14 +213,8 @@ What: /sys/block/<disk>/queue/discard_zeroes_data
Date: May 2011
Contact: Martin K. Petersen <martin.petersen@oracle.com>
Description:
- Devices that support discard functionality may return
- stale or random data when a previously discarded block
- is read back. This can cause problems if the filesystem
- expects discarded blocks to be explicitly cleared. If a
- device reports that it deterministically returns zeroes
- when a discarded area is read the discard_zeroes_data
- parameter will be set to one. Otherwise it will be 0 and
- the result of reading a discarded area is undefined.
+ Will always return 0. Don't rely on any specific behavior
+ for discards, and don't read this file.
What: /sys/block/<disk>/queue/write_same_max_bytes
Date: January 2012
diff --git a/Documentation/block/00-INDEX b/Documentation/block/00-INDEX
index e55103ace382..8d55b4bbb5e2 100644
--- a/Documentation/block/00-INDEX
+++ b/Documentation/block/00-INDEX
@@ -1,5 +1,7 @@
00-INDEX
- This file
+bfq-iosched.txt
+ - BFQ IO scheduler and its tunables
biodoc.txt
- Notes on the Generic Block Layer Rewrite in Linux 2.5
biovecs.txt
diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt
new file mode 100644
index 000000000000..1b87df6cd476
--- /dev/null
+++ b/Documentation/block/bfq-iosched.txt
@@ -0,0 +1,531 @@
+BFQ (Budget Fair Queueing)
+==========================
+
+BFQ is a proportional-share I/O scheduler, with some extra
+low-latency capabilities. In addition to cgroups support (blkio or io
+controllers), BFQ's main features are:
+- BFQ guarantees a high system and application responsiveness, and a
+ low latency for time-sensitive applications, such as audio or video
+ players;
+- BFQ distributes bandwidth, and not just time, among processes or
+ groups (switching back to time distribution when needed to keep
+ throughput high).
+
+On average CPUs, the current version of BFQ can handle devices
+performing at most ~30K IOPS; at most ~50 KIOPS on faster CPUs. As a
+reference, 30-50 KIOPS correspond to very high bandwidths with
+sequential I/O (e.g., 8-12 GB/s if I/O requests are 256 KB large), and
+to 120-200 MB/s with 4KB random I/O. BFQ has not yet been tested on
+multi-queue devices.
+
+The table of contents follow. Impatients can just jump to Section 3.
+
+CONTENTS
+
+1. When may BFQ be useful?
+ 1-1 Personal systems
+ 1-2 Server systems
+2. How does BFQ work?
+3. What are BFQ's tunable?
+4. BFQ group scheduling
+ 4-1 Service guarantees provided
+ 4-2 Interface
+
+1. When may BFQ be useful?
+==========================
+
+BFQ provides the following benefits on personal and server systems.
+
+1-1 Personal systems
+--------------------
+
+Low latency for interactive applications
+
+Regardless of the actual background workload, BFQ guarantees that, for
+interactive tasks, the storage device is virtually as responsive as if
+it was idle. For example, even if one or more of the following
+background workloads are being executed:
+- one or more large files are being read, written or copied,
+- a tree of source files is being compiled,
+- one or more virtual machines are performing I/O,
+- a software update is in progress,
+- indexing daemons are scanning filesystems and updating their
+ databases,
+starting an application or loading a file from within an application
+takes about the same time as if the storage device was idle. As a
+comparison, with CFQ, NOOP or DEADLINE, and in the same conditions,
+applications experience high latencies, or even become unresponsive
+until the background workload terminates (also on SSDs).
+
+Low latency for soft real-time applications
+
+Also soft real-time applications, such as audio and video
+players/streamers, enjoy a low latency and a low drop rate, regardless
+of the background I/O workload. As a consequence, these applications
+do not suffer from almost any glitch due to the background workload.
+
+Higher speed for code-development tasks
+
+If some additional workload happens to be executed in parallel, then
+BFQ executes the I/O-related components of typical code-development
+tasks (compilation, checkout, merge, ...) much more quickly than CFQ,
+NOOP or DEADLINE.
+
+High throughput
+
+On hard disks, BFQ achieves up to 30% higher throughput than CFQ, and
+up to 150% higher throughput than DEADLINE and NOOP, with all the
+sequential workloads considered in our tests. With random workloads,
+and with all the workloads on flash-based devices, BFQ achieves,
+instead, about the same throughput as the other schedulers.
+
+Strong fairness, bandwidth and delay guarantees
+
+BFQ distributes the device throughput, and not just the device time,
+among I/O-bound applications in proportion their weights, with any
+workload and regardless of the device parameters. From these bandwidth
+guarantees, it is possible to compute tight per-I/O-request delay
+guarantees by a simple formula. If not configured for strict service
+guarantees, BFQ switches to time-based resource sharing (only) for
+applications that would otherwise cause a throughput loss.
+
+1-2 Server systems
+------------------
+
+Most benefits for server systems follow from the same service
+properties as above. In particular, regardless of whether additional,
+possibly heavy workloads are being served, BFQ guarantees:
+
+. audio and video-streaming with zero or very low jitter and drop
+ rate;
+
+. fast retrieval of WEB pages and embedded objects;
+
+. real-time recording of data in live-dumping applications (e.g.,
+ packet logging);
+
+. responsiveness in local and remote access to a server.
+
+
+2. How does BFQ work?
+=====================
+
+BFQ is a proportional-share I/O scheduler, whose general structure,
+plus a lot of code, are borrowed from CFQ.
+
+- Each process doing I/O on a device is associated with a weight and a
+ (bfq_)queue.
+
+- BFQ grants exclusive access to the device, for a while, to one queue
+ (process) at a time, and implements this service model by
+ associating every queue with a budget, measured in number of
+ sectors.
+
+ - After a queue is granted access to the device, the budget of the
+ queue is decremented, on each request dispatch, by the size of the
+ request.
+
+ - The in-service queue is expired, i.e., its service is suspended,
+ only if one of the following events occurs: 1) the queue finishes
+ its budget, 2) the queue empties, 3) a "budget timeout" fires.
+
+ - The budget timeout prevents processes doing random I/O from
+ holding the device for too long and dramatically reducing
+ throughput.
+
+ - Actually, as in CFQ, a queue associated with a process issuing
+ sync requests may not be expired immediately when it empties. In
+ contrast, BFQ may idle the device for a short time interval,
+ giving the process the chance to go on being served if it issues
+ a new request in time. Device idling typically boosts the
+ throughput on rotational devices, if processes do synchronous
+ and sequential I/O. In addition, under BFQ, device idling is
+ also instrumental in guaranteeing the desired throughput
+ fraction to processes issuing sync requests (see the description
+ of the slice_idle tunable in this document, or [1, 2], for more
+ details).
+
+ - With respect to idling for service guarantees, if several
+ processes are competing for the device at the same time, but
+ all processes (and groups, after the following commit) have
+ the same weight, then BFQ guarantees the expected throughput
+ distribution without ever idling the device. Throughput is
+ thus as high as possible in this common scenario.
+
+ - If low-latency mode is enabled (default configuration), BFQ
+ executes some special heuristics to detect interactive and soft
+ real-time applications (e.g., video or audio players/streamers),
+ and to reduce their latency. The most important action taken to
+ achieve this goal is to give to the queues associated with these
+ applications more than their fair share of the device
+ throughput. For brevity, we call just "weight-raising" the whole
+ sets of actions taken by BFQ to privilege these queues. In
+ particular, BFQ provides a milder form of weight-raising for
+ interactive applications, and a stronger form for soft real-time
+ applications.
+
+ - BFQ automatically deactivates idling for queues born in a burst of
+ queue creations. In fact, these queues are usually associated with
+ the processes of applications and services that benefit mostly
+ from a high throughput. Examples are systemd during boot, or git
+ grep.
+
+ - As CFQ, BFQ merges queues performing interleaved I/O, i.e.,
+ performing random I/O that becomes mostly sequential if
+ merged. Differently from CFQ, BFQ achieves this goal with a more
+ reactive mechanism, called Early Queue Merge (EQM). EQM is so
+ responsive in detecting interleaved I/O (cooperating processes),
+ that it enables BFQ to achieve a high throughput, by queue
+ merging, even for queues for which CFQ needs a different
+ mechanism, preemption, to get a high throughput. As such EQM is a
+ unified mechanism to achieve a high throughput with interleaved
+ I/O.
+
+ - Queues are scheduled according to a variant of WF2Q+, named
+ B-WF2Q+, and implemented using an augmented rb-tree to preserve an
+ O(log N) overall complexity. See [2] for more details. B-WF2Q+ is
+ also ready for hierarchical scheduling. However, for a cleaner
+ logical breakdown, the code that enables and completes
+ hierarchical support is provided in the next commit, which focuses
+ exactly on this feature.
+
+ - B-WF2Q+ guarantees a tight deviation with respect to an ideal,
+ perfectly fair, and smooth service. In particular, B-WF2Q+
+ guarantees that each queue receives a fraction of the device
+ throughput proportional to its weight, even if the throughput
+ fluctuates, and regardless of: the device parameters, the current
+ workload and the budgets assigned to the queue.
+
+ - The last, budget-independence, property (although probably
+ counterintuitive in the first place) is definitely beneficial, for
+ the following reasons:
+
+ - First, with any proportional-share scheduler, the maximum
+ deviation with respect to an ideal service is proportional to
+ the maximum budget (slice) assigned to queues. As a consequence,
+ BFQ can keep this deviation tight not only because of the
+ accurate service of B-WF2Q+, but also because BFQ *does not*
+ need to assign a larger budget to a queue to let the queue
+ receive a higher fraction of the device throughput.
+
+ - Second, BFQ is free to choose, for every process (queue), the
+ budget that best fits the needs of the process, or best
+ leverages the I/O pattern of the process. In particular, BFQ
+ updates queue budgets with a simple feedback-loop algorithm that
+ allows a high throughput to be achieved, while still providing
+ tight latency guarantees to time-sensitive applications. When
+ the in-service queue expires, this algorithm computes the next
+ budget of the queue so as to:
+
+ - Let large budgets be eventually assigned to the queues
+ associated with I/O-bound applications performing sequential
+ I/O: in fact, the longer these applications are served once
+ got access to the device, the higher the throughput is.
+
+ - Let small budgets be eventually assigned to the queues
+ associated with time-sensitive applications (which typically
+ perform sporadic and short I/O), because, the smaller the
+ budget assigned to a queue waiting for service is, the sooner
+ B-WF2Q+ will serve that queue (Subsec 3.3 in [2]).
+
+- If several processes are competing for the device at the same time,
+ but all processes and groups have the same weight, then BFQ
+ guarantees the expected throughput distribution without ever idling
+ the device. It uses preemption instead. Throughput is then much
+ higher in this common scenario.
+
+- ioprio classes are served in strict priority order, i.e.,
+ lower-priority queues are not served as long as there are
+ higher-priority queues. Among queues in the same class, the
+ bandwidth is distributed in proportion to the weight of each
+ queue. A very thin extra bandwidth is however guaranteed to
+ the Idle class, to prevent it from starving.
+
+
+3. What are BFQ's tunable?
+==========================
+
+The tunables back_seek-max, back_seek_penalty, fifo_expire_async and
+fifo_expire_sync below are the same as in CFQ. Their description is
+just copied from that for CFQ. Some considerations in the description
+of slice_idle are copied from CFQ too.
+
+per-process ioprio and weight
+-----------------------------
+
+Unless the cgroups interface is used (see "4. BFQ group scheduling"),
+weights can be assigned to processes only indirectly, through I/O
+priorities, and according to the relation:
+weight = (IOPRIO_BE_NR - ioprio) * 10.
+
+Beware that, if low-latency is set, then BFQ automatically raises the
+weight of the queues associated with interactive and soft real-time
+applications. Unset this tunable if you need/want to control weights.
+
+slice_idle
+----------
+
+This parameter specifies how long BFQ should idle for next I/O
+request, when certain sync BFQ queues become empty. By default
+slice_idle is a non-zero value. Idling has a double purpose: boosting
+throughput and making sure that the desired throughput distribution is
+respected (see the description of how BFQ works, and, if needed, the
+papers referred there).
+
+As for throughput, idling can be very helpful on highly seeky media
+like single spindle SATA/SAS disks where we can cut down on overall
+number of seeks and see improved throughput.
+
+Setting slice_idle to 0 will remove all the idling on queues and one
+should see an overall improved throughput on faster storage devices
+like multiple SATA/SAS disks in hardware RAID configuration.
+
+So depending on storage and workload, it might be useful to set
+slice_idle=0. In general for SATA/SAS disks and software RAID of
+SATA/SAS disks keeping slice_idle enabled should be useful. For any
+configurations where there are multiple spindles behind single LUN
+(Host based hardware RAID controller or for storage arrays), setting
+slice_idle=0 might end up in better throughput and acceptable
+latencies.
+
+Idling is however necessary to have service guarantees enforced in
+case of differentiated weights or differentiated I/O-request lengths.
+To see why, suppose that a given BFQ queue A must get several I/O
+requests served for each request served for another queue B. Idling
+ensures that, if A makes a new I/O request slightly after becoming
+empty, then no request of B is dispatched in the middle, and thus A
+does not lose the possibility to get more than one request dispatched
+before the next request of B is dispatched. Note that idling
+guarantees the desired differentiated treatment of queues only in
+terms of I/O-request dispatches. To guarantee that the actual service
+order then corresponds to the dispatch order, the strict_guarantees
+tunable must be set too.
+
+There is an important flipside for idling: apart from the above cases
+where it is beneficial also for throughput, idling can severely impact
+throughput. One important case is random workload. Because of this
+issue, BFQ tends to avoid idling as much as possible, when it is not
+beneficial also for throughput. As a consequence of this behavior, and
+of further issues described for the strict_guarantees tunable,
+short-term service guarantees may be occasionally violated. And, in
+some cases, these guarantees may be more important than guaranteeing
+maximum throughput. For example, in video playing/streaming, a very
+low drop rate may be more important than maximum throughput. In these
+cases, consider setting the strict_guarantees parameter.
+
+strict_guarantees
+-----------------
+
+If this parameter is set (default: unset), then BFQ
+
+- always performs idling when the in-service queue becomes empty;
+
+- forces the device to serve one I/O request at a time, by dispatching a
+ new request only if there is no outstanding request.
+
+In the presence of differentiated weights or I/O-request sizes, both
+the above conditions are needed to guarantee that every BFQ queue
+receives its allotted share of the bandwidth. The first condition is
+needed for the reasons explained in the description of the slice_idle
+tunable. The second condition is needed because all modern storage
+devices reorder internally-queued requests, which may trivially break
+the service guarantees enforced by the I/O scheduler.
+
+Setting strict_guarantees may evidently affect throughput.
+
+back_seek_max
+-------------
+
+This specifies, given in Kbytes, the maximum "distance" for backward seeking.
+The distance is the amount of space from the current head location to the
+sectors that are backward in terms of distance.
+
+This parameter allows the scheduler to anticipate requests in the "backward"
+direction and consider them as being the "next" if they are within this
+distance from the current head location.
+
+back_seek_penalty
+-----------------
+
+This parameter is used to compute the cost of backward seeking. If the
+backward distance of request is just 1/back_seek_penalty from a "front"
+request, then the seeking cost of two requests is considered equivalent.
+
+So scheduler will not bias toward one or the other request (otherwise scheduler
+will bias toward front request). Default value of back_seek_penalty is 2.
+
+fifo_expire_async
+-----------------
+
+This parameter is used to set the timeout of asynchronous requests. Default
+value of this is 248ms.
+
+fifo_expire_sync
+----------------
+
+This parameter is used to set the timeout of synchronous requests. Default
+value of this is 124ms. In case to favor synchronous requests over asynchronous
+one, this value should be decreased relative to fifo_expire_async.
+
+low_latency
+-----------
+
+This parameter is used to enable/disable BFQ's low latency mode. By
+default, low latency mode is enabled. If enabled, interactive and soft
+real-time applications are privileged and experience a lower latency,
+as explained in more detail in the description of how BFQ works.
+
+DO NOT enable this mode if you need full control on bandwidth
+distribution. In fact, if it is enabled, then BFQ automatically
+increases the bandwidth share of privileged applications, as the main
+means to guarantee a lower latency to them.
+
+timeout_sync
+------------
+
+Maximum amount of device time that can be given to a task (queue) once
+it has been selected for service. On devices with costly seeks,
+increasing this time usually increases maximum throughput. On the
+opposite end, increasing this time coarsens the granularity of the
+short-term bandwidth and latency guarantees, especially if the
+following parameter is set to zero.
+
+max_budget
+----------
+
+Maximum amount of service, measured in sectors, that can be provided
+to a BFQ queue once it is set in service (of course within the limits
+of the above timeout). According to what said in the description of
+the algorithm, larger values increase the throughput in proportion to
+the percentage of sequential I/O requests issued. The price of larger
+values is that they coarsen the granularity of short-term bandwidth
+and latency guarantees.
+
+The default value is 0, which enables auto-tuning: BFQ sets max_budget
+to the maximum number of sectors that can be served during
+timeout_sync, according to the estimated peak rate.
+
+weights
+-------
+
+Read-only parameter, used to show the weights of the currently active
+BFQ queues.
+
+
+wr_ tunables
+------------
+
+BFQ exports a few parameters to control/tune the behavior of
+low-latency heuristics.
+
+wr_coeff
+
+Factor by which the weight of a weight-raised queue is multiplied. If
+the queue is deemed soft real-time, then the weight is further
+multiplied by an additional, constant factor.
+
+wr_max_time
+
+Maximum duration of a weight-raising period for an interactive task
+(ms). If set to zero (default value), then this value is computed
+automatically, as a function of the peak rate of the device. In any
+case, when the value of this parameter is read, it always reports the
+current duration, regardless of whether it has been set manually or
+computed automatically.
+
+wr_max_softrt_rate
+
+Maximum service rate below which a queue is deemed to be associated
+with a soft real-time application, and is then weight-raised
+accordingly (sectors/sec).
+
+wr_min_idle_time
+
+Minimum idle period after which interactive weight-raising may be
+reactivated for a queue (in ms).
+
+wr_rt_max_time
+
+Maximum weight-raising duration for soft real-time queues (in ms). The
+start time from which this duration is considered is automatically
+moved forward if the queue is detected to be still soft real-time
+before the current soft real-time weight-raising period finishes.
+
+wr_min_inter_arr_async
+
+Minimum period between I/O request arrivals after which weight-raising
+may be reactivated for an already busy async queue (in ms).
+
+
+4. Group scheduling with BFQ
+============================
+
+BFQ supports both cgroups-v1 and cgroups-v2 io controllers, namely
+blkio and io. In particular, BFQ supports weight-based proportional
+share. To activate cgroups support, set BFQ_GROUP_IOSCHED.
+
+4-1 Service guarantees provided
+-------------------------------
+
+With BFQ, proportional share means true proportional share of the
+device bandwidth, according to group weights. For example, a group
+with weight 200 gets twice the bandwidth, and not just twice the time,
+of a group with weight 100.
+
+BFQ supports hierarchies (group trees) of any depth. Bandwidth is
+distributed among groups and processes in the expected way: for each
+group, the children of the group share the whole bandwidth of the
+group in proportion to their weights. In particular, this implies
+that, for each leaf group, every process of the group receives the
+same share of the whole group bandwidth, unless the ioprio of the
+process is modified.
+
+The resource-sharing guarantee for a group may partially or totally
+switch from bandwidth to time, if providing bandwidth guarantees to
+the group lowers the throughput too much. This switch occurs on a
+per-process basis: if a process of a leaf group causes throughput loss
+if served in such a way to receive its share of the bandwidth, then
+BFQ switches back to just time-based proportional share for that
+process.
+
+4-2 Interface
+-------------
+
+To get proportional sharing of bandwidth with BFQ for a given device,
+BFQ must of course be the active scheduler for that device.
+
+Within each group directory, the names of the files associated with
+BFQ-specific cgroup parameters and stats begin with the "bfq."
+prefix. So, with cgroups-v1 or cgroups-v2, the full prefix for
+BFQ-specific files is "blkio.bfq." or "io.bfq." For example, the group
+parameter to set the weight of a group with BFQ is blkio.bfq.weight
+or io.bfq.weight.
+
+Parameters to set
+-----------------
+
+For each group, there is only the following parameter to set.
+
+weight (namely blkio.bfq.weight or io.bfq-weight): the weight of the
+group inside its parent. Available values: 1..10000 (default 100). The
+linear mapping between ioprio and weights, described at the beginning
+of the tunable section, is still valid, but all weights higher than
+IOPRIO_BE_NR*10 are mapped to ioprio 0.
+
+Recall that, if low-latency is set, then BFQ automatically raises the
+weight of the queues associated with interactive and soft real-time
+applications. Unset this tunable if you need/want to control weights.
+
+
+[1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O
+ Scheduler", Proceedings of the First Workshop on Mobile System
+ Technologies (MST-2015), May 2015.
+ http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf
+
+[2] P. Valente and M. Andreolini, "Improving Application
+ Responsiveness with the BFQ Disk I/O Scheduler", Proceedings of
+ the 5th Annual International Systems and Storage Conference
+ (SYSTOR '12), June 2012.
+ Slightly extended version:
+ http://algogroup.unimore.it/people/paolo/disk_sched/bfq-v1-suite-
+ results.pdf
diff --git a/Documentation/block/kyber-iosched.txt b/Documentation/block/kyber-iosched.txt
new file mode 100644
index 000000000000..e94feacd7edc
--- /dev/null
+++ b/Documentation/block/kyber-iosched.txt
@@ -0,0 +1,14 @@
+Kyber I/O scheduler tunables
+===========================
+
+The only two tunables for the Kyber scheduler are the target latencies for
+reads and synchronous writes. Kyber will throttle requests in order to meet
+these target latencies.
+
+read_lat_nsec
+-------------
+Target latency for reads (in nanoseconds).
+
+write_lat_nsec
+--------------
+Target latency for synchronous writes (in nanoseconds).
diff --git a/Documentation/block/queue-sysfs.txt b/Documentation/block/queue-sysfs.txt
index c0a3bb5a6e4e..2c1e67058fd3 100644
--- a/Documentation/block/queue-sysfs.txt
+++ b/Documentation/block/queue-sysfs.txt
@@ -43,11 +43,6 @@ large discards are issued, setting this value lower will make Linux issue
smaller discards and potentially help reduce latencies induced by large
discard operations.
-discard_zeroes_data (RO)
-------------------------
-When read, this file will show if the discarded block are zeroed by the
-device or not. If its value is '1' the blocks are zeroed otherwise not.
-
hw_sector_size (RO)
-------------------
This is the hardware sector size of the device, in bytes.
@@ -192,5 +187,11 @@ scaling back writes. Writing a value of '0' to this file disables the
feature. Writing a value of '-1' to this file resets the value to the
default setting.
+throttle_sample_time (RW)
+-------------------------
+This is the time window that blk-throttle samples data, in millisecond.
+blk-throttle makes decision based on the samplings. Lower time means cgroups
+have more smooth throughput, but higher CPU overhead. This exists only when
+CONFIG_BLK_DEV_THROTTLING_LOW is enabled.
Jens Axboe <jens.axboe@oracle.com>, February 2009
diff --git a/Documentation/blockdev/mflash.txt b/Documentation/blockdev/mflash.txt
deleted file mode 100644
index f7e050551487..000000000000
--- a/Documentation/blockdev/mflash.txt
+++ /dev/null
@@ -1,84 +0,0 @@
-This document describes m[g]flash support in linux.
-
-Contents
- 1. Overview
- 2. Reserved area configuration
- 3. Example of mflash platform driver registration
-
-1. Overview
-
-Mflash and gflash are embedded flash drive. The only difference is mflash is
-MCP(Multi Chip Package) device. These two device operate exactly same way.
-So the rest mflash repersents mflash and gflash altogether.
-
-Internally, mflash has nand flash and other hardware logics and supports
-2 different operation (ATA, IO) modes. ATA mode doesn't need any new
-driver and currently works well under standard IDE subsystem. Actually it's
-one chip SSD. IO mode is ATA-like custom mode for the host that doesn't have
-IDE interface.
-
-Following are brief descriptions about IO mode.
-A. IO mode based on ATA protocol and uses some custom command. (read confirm,
-write confirm)
-B. IO mode uses SRAM bus interface.
-C. IO mode supports 4kB boot area, so host can boot from mflash.
-
-2. Reserved area configuration
-If host boot from mflash, usually needs raw area for boot loader image. All of
-the mflash's block device operation will be taken this value as start offset.
-Note that boot loader's size of reserved area and kernel configuration value
-must be same.
-
-3. Example of mflash platform driver registration
-Working mflash is very straight forward. Adding platform device stuff to board
-configuration file is all. Here is some pseudo example.
-
-static struct mg_drv_data mflash_drv_data = {
- /* If you want to polling driver set to 1 */
- .use_polling = 0,
- /* device attribution */
- .dev_attr = MG_BOOT_DEV
-};
-
-static struct resource mg_mflash_rsc[] = {
- /* Base address of mflash */
- [0] = {
- .start = 0x08000000,
- .end = 0x08000000 + SZ_64K - 1,
- .flags = IORESOURCE_MEM
- },
- /* mflash interrupt pin */
- [1] = {
- .start = IRQ_GPIO(84),
- .end = IRQ_GPIO(84),
- .flags = IORESOURCE_IRQ
- },
- /* mflash reset pin */
- [2] = {
- .start = 43,
- .end = 43,
- .name = MG_RST_PIN,
- .flags = IORESOURCE_IO
- },
- /* mflash reset-out pin
- * If you use mflash as storage device (i.e. other than MG_BOOT_DEV),
- * should assign this */
- [3] = {
- .start = 51,
- .end = 51,
- .name = MG_RSTOUT_PIN,
- .flags = IORESOURCE_IO
- }
-};
-
-static struct platform_device mflash_dev = {
- .name = MG_DEV_NAME,
- .id = -1,
- .dev = {
- .platform_data = &mflash_drv_data,
- },
- .num_resources = ARRAY_SIZE(mg_mflash_rsc),
- .resource = mg_mflash_rsc
-};
-
-platform_device_register(&mflash_dev);
diff --git a/Documentation/devicetree/bindings/ata/ahci-dm816.txt b/Documentation/devicetree/bindings/ata/ahci-dm816.txt
new file mode 100644
index 000000000000..f8c535f3541f
--- /dev/null
+++ b/Documentation/devicetree/bindings/ata/ahci-dm816.txt
@@ -0,0 +1,21 @@
+Device tree binding for the TI DM816 AHCI SATA Controller
+---------------------------------------------------------
+
+Required properties:
+ - compatible: must be "ti,dm816-ahci"
+ - reg: physical base address and size of the register region used by
+ the controller (as defined by the AHCI 1.1 standard)
+ - interrupts: interrupt specifier (refer to the interrupt binding)
+ - clocks: list of phandle and clock specifier pairs (or only
+ phandles for clock providers with '0' defined for
+ #clock-cells); two clocks must be specified: the functional
+ clock and an external reference clock
+
+Example:
+
+ sata: sata@4a140000 {
+ compatible = "ti,dm816-ahci";
+ reg = <0x4a140000 0x10000>;
+ interrupts = <16>;
+ clocks = <&sysclk5_ck>, <&sata_refclk>;
+ };
diff --git a/Documentation/devicetree/bindings/hwmon/ads7828.txt b/Documentation/devicetree/bindings/hwmon/ads7828.txt
new file mode 100644
index 000000000000..fe0cc4ad7ea9
--- /dev/null
+++ b/Documentation/devicetree/bindings/hwmon/ads7828.txt
@@ -0,0 +1,25 @@
+ads7828 properties
+
+Required properties:
+- compatible: Should be one of
+ ti,ads7828
+ ti,ads7830
+- reg: I2C address
+
+Optional properties:
+
+- ti,differential-input
+ Set to use the device in differential mode.
+- vref-supply
+ The external reference on the device is set to this regulators output. If it
+ does not exists the internal reference will be used and output by the ads78xx
+ on the "external vref" pin.
+
+ Example ADS7828 node:
+
+ ads7828: ads@48 {
+ comatible = "ti,ads7828";
+ reg = <0x48>;
+ vref-supply = <&vref>;
+ ti,differential-input;
+ };
diff --git a/Documentation/devicetree/bindings/hwmon/aspeed-pwm-tacho.txt b/Documentation/devicetree/bindings/hwmon/aspeed-pwm-tacho.txt
new file mode 100644
index 000000000000..cf4460564adb
--- /dev/null
+++ b/Documentation/devicetree/bindings/hwmon/aspeed-pwm-tacho.txt
@@ -0,0 +1,68 @@
+ASPEED AST2400/AST2500 PWM and Fan Tacho controller device driver
+
+The ASPEED PWM controller can support upto 8 PWM outputs. The ASPEED Fan Tacho
+controller can support upto 16 Fan tachometer inputs.
+
+There can be upto 8 fans supported. Each fan can have one PWM output and
+one/two Fan tach inputs.
+
+Required properties for pwm-tacho node:
+- #address-cells : should be 1.
+
+- #size-cells : should be 1.
+
+- reg : address and length of the register set for the device.
+
+- pinctrl-names : a pinctrl state named "default" must be defined.
+
+- pinctrl-0 : phandle referencing pin configuration of the PWM ports.
+
+- compatible : should be "aspeed,ast2400-pwm-tacho" for AST2400 and
+ "aspeed,ast2500-pwm-tacho" for AST2500.
+
+- clocks : a fixed clock providing input clock frequency(PWM
+ and Fan Tach clock)
+
+fan subnode format:
+===================
+Under fan subnode there can upto 8 child nodes, with each child node
+representing a fan. If there are 8 fans each fan can have one PWM port and
+one/two Fan tach inputs.
+
+Required properties for each child node:
+- reg : should specify PWM source port.
+ integer value in the range 0 to 7 with 0 indicating PWM port A and
+ 7 indicating PWM port H.
+
+- aspeed,fan-tach-ch : should specify the Fan tach input channel.
+ integer value in the range 0 through 15, with 0 indicating
+ Fan tach channel 0 and 15 indicating Fan tach channel 15.
+ Atleast one Fan tach input channel is required.
+
+Examples:
+
+pwm_tacho_fixed_clk: fixedclk {
+ compatible = "fixed-clock";
+ #clock-cells = <0>;
+ clock-frequency = <24000000>;
+};
+
+pwm_tacho: pwmtachocontroller@1e786000 {
+ #address-cells = <1>;
+ #size-cells = <1>;
+ reg = <0x1E786000 0x1000>;
+ compatible = "aspeed,ast2500-pwm-tacho";
+ clocks = <&pwm_tacho_fixed_clk>;
+ pinctrl-names = "default";
+ pinctrl-0 = <&pinctrl_pwm0_default &pinctrl_pwm1_default>;
+
+ fan@0 {
+ reg = <0x00>;
+ aspeed,fan-tach-ch = /bits/ 8 <0x00>;
+ };
+
+ fan@1 {
+ reg = <0x01>;
+ aspeed,fan-tach-ch = /bits/ 8 <0x01 0x02>;
+ };
+};
diff --git a/Documentation/devicetree/bindings/hwmon/lm87.txt b/Documentation/devicetree/bindings/hwmon/lm87.txt
new file mode 100644
index 000000000000..e1b79903f204
--- /dev/null
+++ b/Documentation/devicetree/bindings/hwmon/lm87.txt
@@ -0,0 +1,30 @@
+*LM87 hwmon sensor.
+
+Required properties:
+- compatible: Should be
+ "ti,lm87"
+
+- reg: I2C address
+
+optional properties:
+- has-temp3: This configures pins 18 and 19 to be used as a second
+ remote temperature sensing channel. By default the pins
+ are configured as voltage input pins in0 and in5.
+
+- has-in6: When set, pin 5 is configured to be used as voltage input
+ in6. Otherwise the pin is set as FAN1 input.
+
+- has-in7: When set, pin 6 is configured to be used as voltage input
+ in7. Otherwise the pin is set as FAN2 input.
+
+- vcc-supply: a Phandle for the regulator supplying power, can be
+ cofigured to measure 5.0V power supply. Default is 3.3V.
+
+Example:
+
+lm87@2e {
+ compatible = "ti,lm87";
+ reg = <0x2e>;
+ has-temp3;
+ vcc-supply = <&reg_5v0>;
+};
diff --git a/Documentation/devicetree/bindings/ipmi/aspeed,ast2400-ibt-bmc.txt b/Documentation/devicetree/bindings/ipmi/aspeed,ast2400-ibt-bmc.txt
index 6f28969af9dc..028268fd99ee 100644
--- a/Documentation/devicetree/bindings/ipmi/aspeed,ast2400-ibt-bmc.txt
+++ b/Documentation/devicetree/bindings/ipmi/aspeed,ast2400-ibt-bmc.txt
@@ -6,7 +6,9 @@ perform in-band IPMI communication with their host.
Required properties:
-- compatible : should be "aspeed,ast2400-ibt-bmc"
+- compatible : should be one of
+ "aspeed,ast2400-ibt-bmc"
+ "aspeed,ast2500-ibt-bmc"
- reg: physical address and size of the registers
Optional properties:
diff --git a/Documentation/devicetree/bindings/leds/leds-cpcap.txt b/Documentation/devicetree/bindings/leds/leds-cpcap.txt
new file mode 100644
index 000000000000..ebf7cdc7f70c
--- /dev/null
+++ b/Documentation/devicetree/bindings/leds/leds-cpcap.txt
@@ -0,0 +1,29 @@
+Motorola CPCAP PMIC LEDs
+------------------------
+
+This module is part of the CPCAP. For more details about the whole
+chip see Documentation/devicetree/bindings/mfd/motorola-cpcap.txt.
+
+Requires node properties:
+- compatible: should be one of
+ * "motorola,cpcap-led-mdl" (Main Display Lighting)
+ * "motorola,cpcap-led-kl" (Keyboard Lighting)
+ * "motorola,cpcap-led-adl" (Aux Display Lighting)
+ * "motorola,cpcap-led-red" (Red Triode)
+ * "motorola,cpcap-led-green" (Green Triode)
+ * "motorola,cpcap-led-blue" (Blue Triode)
+ * "motorola,cpcap-led-cf" (Camera Flash)
+ * "motorola,cpcap-led-bt" (Bluetooth)
+ * "motorola,cpcap-led-cp" (Camera Privacy LED)
+- label: see Documentation/devicetree/bindings/leds/common.txt
+- vdd-supply: A phandle to the regulator powering the LED
+
+Example:
+
+&cpcap {
+ cpcap_led_red: red-led {
+ compatible = "motorola,cpcap-led-red";
+ label = "cpcap:red";
+ vdd-supply = <&sw5>;
+ };
+};
diff --git a/Documentation/devicetree/bindings/leds/leds-mt6323.txt b/Documentation/devicetree/bindings/leds/leds-mt6323.txt
new file mode 100644
index 000000000000..45bf9f7d85f3
--- /dev/null
+++ b/Documentation/devicetree/bindings/leds/leds-mt6323.txt
@@ -0,0 +1,60 @@
+Device Tree Bindings for LED support on MT6323 PMIC
+
+MT6323 LED controller is subfunction provided by MT6323 PMIC, so the LED
+controllers are defined as the subnode of the function node provided by MT6323
+PMIC controller that is being defined as one kind of Muti-Function Device (MFD)
+using shared bus called PMIC wrapper for each subfunction to access remote
+MT6323 PMIC hardware.
+
+For MT6323 MFD bindings see:
+Documentation/devicetree/bindings/mfd/mt6397.txt
+For MediaTek PMIC wrapper bindings see:
+Documentation/devicetree/bindings/soc/mediatek/pwrap.txt
+
+Required properties:
+- compatible : Must be "mediatek,mt6323-led"
+- address-cells : Must be 1
+- size-cells : Must be 0
+
+Each led is represented as a child node of the mediatek,mt6323-led that
+describes the initial behavior for each LED physically and currently only four
+LED child nodes can be supported.
+
+Required properties for the LED child node:
+- reg : LED channel number (0..3)
+
+Optional properties for the LED child node:
+- label : See Documentation/devicetree/bindings/leds/common.txt
+- linux,default-trigger : See Documentation/devicetree/bindings/leds/common.txt
+- default-state: See Documentation/devicetree/bindings/leds/common.txt
+
+Example:
+
+ mt6323: pmic {
+ compatible = "mediatek,mt6323";
+
+ ...
+
+ mt6323led: leds {
+ compatible = "mediatek,mt6323-led";
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ led@0 {
+ reg = <0>;
+ label = "LED0";
+ linux,default-trigger = "timer";
+ default-state = "on";
+ };
+ led@1 {
+ reg = <1>;
+ label = "LED1";
+ default-state = "off";
+ };
+ led@2 {
+ reg = <2>;
+ label = "LED2";
+ default-state = "on";
+ };
+ };
+ };
diff --git a/Documentation/devicetree/bindings/leds/leds-pca9532.txt b/Documentation/devicetree/bindings/leds/leds-pca9532.txt
index 198f3ba0e01f..f769c52e3643 100644
--- a/Documentation/devicetree/bindings/leds/leds-pca9532.txt
+++ b/Documentation/devicetree/bindings/leds/leds-pca9532.txt
@@ -17,6 +17,8 @@ Optional sub-node properties:
- label: see Documentation/devicetree/bindings/leds/common.txt
- type: Output configuration, see dt-bindings/leds/leds-pca9532.h (default NONE)
- linux,default-trigger: see Documentation/devicetree/bindings/leds/common.txt
+ - default-state: see Documentation/devicetree/bindings/leds/common.txt
+ This property is only valid for sub-nodes of type <PCA9532_TYPE_LED>.
Example:
#include <dt-bindings/leds/leds-pca9532.h>
@@ -33,6 +35,14 @@ Example:
label = "pca:green:power";
type = <PCA9532_TYPE_LED>;
};
+ kernel-booting {
+ type = <PCA9532_TYPE_LED>;
+ default-state = "on";
+ };
+ sys-stat {
+ type = <PCA9532_TYPE_LED>;
+ default-state = "keep"; // don't touch, was set by U-Boot
+ };
};
For more product information please see the link below:
diff --git a/Documentation/devicetree/bindings/mailbox/brcm,iproc-flexrm-mbox.txt b/Documentation/devicetree/bindings/mailbox/brcm,iproc-flexrm-mbox.txt
new file mode 100644
index 000000000000..752ae6b00d26
--- /dev/null
+++ b/Documentation/devicetree/bindings/mailbox/brcm,iproc-flexrm-mbox.txt
@@ -0,0 +1,59 @@
+Broadcom FlexRM Ring Manager
+============================
+The Broadcom FlexRM ring manager provides a set of rings which can be
+used to submit work to offload engines. An SoC may have multiple FlexRM
+hardware blocks. There is one device tree entry per FlexRM block. The
+FlexRM driver will create a mailbox-controller instance for given FlexRM
+hardware block where each mailbox channel is a separate FlexRM ring.
+
+Required properties:
+--------------------
+- compatible: Should be "brcm,iproc-flexrm-mbox"
+- reg: Specifies base physical address and size of the FlexRM
+ ring registers
+- msi-parent: Phandles (and potential Device IDs) to MSI controllers
+ The FlexRM engine will send MSIs (instead of wired
+ interrupts) to CPU. There is one MSI for each FlexRM ring.
+ Refer devicetree/bindings/interrupt-controller/msi.txt
+- #mbox-cells: Specifies the number of cells needed to encode a mailbox
+ channel. This should be 3.
+
+ The 1st cell is the mailbox channel number.
+
+ The 2nd cell contains MSI completion threshold. This is the
+ number of completion messages for which FlexRM will inject
+ one MSI interrupt to CPU.
+
+ The 3nd cell contains MSI timer value representing time for
+ which FlexRM will wait to accumulate N completion messages
+ where N is the value specified by 2nd cell above. If FlexRM
+ does not get required number of completion messages in time
+ specified by this cell then it will inject one MSI interrupt
+ to CPU provided atleast one completion message is available.
+
+Optional properties:
+--------------------
+- dma-coherent: Present if DMA operations made by the FlexRM engine (such
+ as DMA descriptor access, access to buffers pointed by DMA
+ descriptors and read/write pointer updates to DDR) are
+ cache coherent with the CPU.
+
+Example:
+--------
+crypto_mbox: mbox@67000000 {
+ compatible = "brcm,iproc-flexrm-mbox";
+ reg = <0x67000000 0x200000>;
+ msi-parent = <&gic_its 0x7f00>;
+ #mbox-cells = <3>;
+};
+
+crypto@672c0000 {
+ compatible = "brcm,spu2-v2-crypto";
+ reg = <0x672c0000 0x1000>;
+ mboxes = <&crypto_mbox 0 0x1 0xffff>,
+ <&crypto_mbox 1 0x1 0xffff>,
+ <&crypto_mbox 16 0x1 0xffff>,
+ <&crypto_mbox 17 0x1 0xffff>,
+ <&crypto_mbox 30 0x1 0xffff>,
+ <&crypto_mbox 31 0x1 0xffff>;
+};
diff --git a/Documentation/devicetree/bindings/mailbox/brcm,iproc-pdc-mbox.txt b/Documentation/devicetree/bindings/mailbox/brcm,iproc-pdc-mbox.txt
index 411ccf421584..0f3ee81d92c2 100644
--- a/Documentation/devicetree/bindings/mailbox/brcm,iproc-pdc-mbox.txt
+++ b/Documentation/devicetree/bindings/mailbox/brcm,iproc-pdc-mbox.txt
@@ -1,9 +1,11 @@
The PDC driver manages data transfer to and from various offload engines
on some Broadcom SoCs. An SoC may have multiple PDC hardware blocks. There is
-one device tree entry per block.
+one device tree entry per block. On some chips, the PDC functionality is
+handled by the FA2 (Northstar Plus).
Required properties:
-- compatible : Should be "brcm,iproc-pdc-mbox".
+- compatible : Should be "brcm,iproc-pdc-mbox" or "brcm,iproc-fa2-mbox" for
+ FA2/Northstar Plus.
- reg: Should contain PDC registers location and length.
- interrupts: Should contain the IRQ line for the PDC.
- #mbox-cells: 1
diff --git a/Documentation/devicetree/bindings/power/reset/gemini-poweroff.txt b/Documentation/devicetree/bindings/power/reset/gemini-poweroff.txt
new file mode 100644
index 000000000000..7fec3e100214
--- /dev/null
+++ b/Documentation/devicetree/bindings/power/reset/gemini-poweroff.txt
@@ -0,0 +1,17 @@
+* Device-Tree bindings for Cortina Systems Gemini Poweroff
+
+This is a special IP block in the Cortina Gemini SoC that only
+deals with different ways to power the system down.
+
+Required properties:
+- compatible: should be "cortina,gemini-power-controller"
+- reg: should contain the physical memory base and size
+- interrupts: should contain the power management interrupt
+
+Example:
+
+power-controller@4b000000 {
+ compatible = "cortina,gemini-power-controller";
+ reg = <0x4b000000 0x100>;
+ interrupts = <26 IRQ_TYPE_EDGE_FALLING>;
+};
diff --git a/Documentation/devicetree/bindings/power/reset/syscon-poweroff.txt b/Documentation/devicetree/bindings/power/reset/syscon-poweroff.txt
index 1e2546f8b08a..022ed1f3bc80 100644
--- a/Documentation/devicetree/bindings/power/reset/syscon-poweroff.txt
+++ b/Documentation/devicetree/bindings/power/reset/syscon-poweroff.txt
@@ -3,13 +3,20 @@ Generic SYSCON mapped register poweroff driver
This is a generic poweroff driver using syscon to map the poweroff register.
The poweroff is generally performed with a write to the poweroff register
defined by the register map pointed by syscon reference plus the offset
-with the mask defined in the poweroff node.
+with the value and mask defined in the poweroff node.
Required properties:
- compatible: should contain "syscon-poweroff"
- regmap: this is phandle to the register map node
- offset: offset in the register map for the poweroff register (in bytes)
-- mask: the poweroff value written to the poweroff register (32 bit access)
+- value: the poweroff value written to the poweroff register (32 bit access)
+
+Optional properties:
+- mask: update only the register bits defined by the mask (32 bit)
+
+Legacy usage:
+If a node doesn't contain a value property but contains a mask property, the
+mask property is used as the value.
Default will be little endian mode, 32 bit access only.
diff --git a/Documentation/devicetree/bindings/power/supply/cpcap-charger.txt b/Documentation/devicetree/bindings/power/supply/cpcap-charger.txt
new file mode 100644
index 000000000000..80bd873c3b1d
--- /dev/null
+++ b/Documentation/devicetree/bindings/power/supply/cpcap-charger.txt
@@ -0,0 +1,37 @@
+Motorola CPCAP PMIC battery charger binding
+
+Required properties:
+- compatible: Shall be "motorola,mapphone-cpcap-charger"
+- interrupts: Interrupt specifier for each name in interrupt-names
+- interrupt-names: Should contain the following entries:
+ "chrg_det", "rvrs_chrg", "chrg_se1b", "se0conn",
+ "rvrs_mode", "chrgcurr1", "vbusvld", "battdetb"
+- io-channels: IIO ADC channel specifier for each name in io-channel-names
+- io-channel-names: Should contain the following entries:
+ "battdetb", "battp", "vbus", "chg_isense", "batti"
+
+Optional properties:
+- mode-gpios: Optionally CPCAP charger can have a companion wireless
+ charge controller that is controlled with two GPIOs
+ that are active low.
+
+Example:
+
+cpcap_charger: charger {
+ compatible = "motorola,mapphone-cpcap-charger";
+ interrupts-extended = <
+ &cpcap 13 0 &cpcap 12 0 &cpcap 29 0 &cpcap 28 0
+ &cpcap 22 0 &cpcap 20 0 &cpcap 19 0 &cpcap 54 0
+ >;
+ interrupt-names =
+ "chrg_det", "rvrs_chrg", "chrg_se1b", "se0conn",
+ "rvrs_mode", "chrgcurr1", "vbusvld", "battdetb";
+ mode-gpios = <&gpio3 29 GPIO_ACTIVE_LOW
+ &gpio3 23 GPIO_ACTIVE_LOW>;
+ io-channels = <&cpcap_adc 0 &cpcap_adc 1
+ &cpcap_adc 2 &cpcap_adc 5
+ &cpcap_adc 6>;
+ io-channel-names = "battdetb", "battp",
+ "vbus", "chg_isense",
+ "batti";
+};
diff --git a/Documentation/devicetree/bindings/power/supply/lego_ev3_battery.txt b/Documentation/devicetree/bindings/power/supply/lego_ev3_battery.txt
new file mode 100644
index 000000000000..5485633b1faa
--- /dev/null
+++ b/Documentation/devicetree/bindings/power/supply/lego_ev3_battery.txt
@@ -0,0 +1,21 @@
+LEGO MINDSTORMS EV3 Battery
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+LEGO MINDSTORMS EV3 has some built-in capability for monitoring the battery.
+It uses 6 AA batteries or a special Li-ion rechargeable battery pack that is
+detected by a key switch in the battery compartment.
+
+Required properties:
+ - compatible: Must be "lego,ev3-battery"
+ - io-channels: phandles to analog inputs for reading voltage and current
+ - io-channel-names: Must be "voltage", "current"
+ - rechargeable-gpios: phandle to the rechargeable battery indication gpio
+
+Example:
+
+ battery {
+ compatible = "lego,ev3-battery";
+ io-channels = <&adc 4>, <&adc 3>;
+ io-channel-names = "voltage", "current";
+ rechargeable-gpios = <&gpio 136 GPIO_ACTIVE_LOW>;
+ };
diff --git a/Documentation/devicetree/bindings/power/supply/ltc2941.txt b/Documentation/devicetree/bindings/power/supply/ltc2941.txt
index ea42ae12d924..a9d7aa60558b 100644
--- a/Documentation/devicetree/bindings/power/supply/ltc2941.txt
+++ b/Documentation/devicetree/bindings/power/supply/ltc2941.txt
@@ -6,8 +6,8 @@ temperature monitoring, and uses a slightly different conversion
formula for the charge counter.
Required properties:
-- compatible: Should contain "ltc2941" or "ltc2943" which also indicates the
- type of I2C chip attached.
+- compatible: Should contain "lltc,ltc2941" or "lltc,ltc2943" which also
+ indicates the type of I2C chip attached.
- reg: The 7-bit I2C address.
- lltc,resistor-sense: The sense resistor value in milli-ohms. Can be a 32-bit
negative value when the battery has been connected to the wrong end of the
@@ -20,7 +20,7 @@ Required properties:
Example from the Topic Miami Florida board:
fuelgauge: ltc2943@64 {
- compatible = "ltc2943";
+ compatible = "lltc,ltc2943";
reg = <0x64>;
lltc,resistor-sense = <15>;
lltc,prescaler-exponent = <5>; /* 2^(2*5) = 1024 */
diff --git a/Documentation/devicetree/bindings/power/supply/max8925_batter.txt b/Documentation/devicetree/bindings/power/supply/max8925_battery.txt
index d7e3e0c0f71d..d7e3e0c0f71d 100644
--- a/Documentation/devicetree/bindings/power/supply/max8925_batter.txt
+++ b/Documentation/devicetree/bindings/power/supply/max8925_battery.txt
diff --git a/Documentation/devicetree/bindings/vendor-prefixes.txt b/Documentation/devicetree/bindings/vendor-prefixes.txt
index ec0bfb9bbebd..830c9987fa02 100644
--- a/Documentation/devicetree/bindings/vendor-prefixes.txt
+++ b/Documentation/devicetree/bindings/vendor-prefixes.txt
@@ -265,6 +265,7 @@ sbs Smart Battery System
schindler Schindler
seagate Seagate Technology PLC
semtech Semtech Corporation
+sensirion Sensirion AG
sgx SGX Sensortech
sharp Sharp Corporation
si-en Si-En Technology Ltd.
diff --git a/Documentation/hwmon/aspeed-pwm-tacho b/Documentation/hwmon/aspeed-pwm-tacho
new file mode 100644
index 000000000000..7cfb34977460
--- /dev/null
+++ b/Documentation/hwmon/aspeed-pwm-tacho
@@ -0,0 +1,22 @@
+Kernel driver aspeed-pwm-tacho
+==============================
+
+Supported chips:
+ ASPEED AST2400/2500
+
+Authors:
+ <jaghu@google.com>
+
+Description:
+------------
+This driver implements support for ASPEED AST2400/2500 PWM and Fan Tacho
+controller. The PWM controller supports upto 8 PWM outputs. The Fan tacho
+controller supports up to 16 tachometer inputs.
+
+The driver provides the following sensor accesses in sysfs:
+
+fanX_input ro provide current fan rotation value in RPM as reported
+ by the fan to the device.
+
+pwmX rw get or set PWM fan control value. This is an integer
+ value between 0(off) and 255(full speed).
diff --git a/Documentation/hwmon/tc654 b/Documentation/hwmon/tc654
index 91a2843f5f98..47636a8077b4 100644
--- a/Documentation/hwmon/tc654
+++ b/Documentation/hwmon/tc654
@@ -2,7 +2,7 @@ Kernel driver tc654
===================
Supported chips:
- * Microship TC654 and TC655
+ * Microchip TC654 and TC655
Prefix: 'tc654'
Datasheet: http://ww1.microchip.com/downloads/en/DeviceDoc/20001734C.pdf
diff --git a/Documentation/lightnvm/pblk.txt b/Documentation/lightnvm/pblk.txt
new file mode 100644
index 000000000000..1040ed1cec81
--- /dev/null
+++ b/Documentation/lightnvm/pblk.txt
@@ -0,0 +1,21 @@
+pblk: Physical Block Device Target
+==================================
+
+pblk implements a fully associative, host-based FTL that exposes a traditional
+block I/O interface. Its primary responsibilities are:
+
+ - Map logical addresses onto physical addresses (4KB granularity) in a
+ logical-to-physical (L2P) table.
+ - Maintain the integrity and consistency of the L2P table as well as its
+ recovery from normal tear down and power outage.
+ - Deal with controller- and media-specific constrains.
+ - Handle I/O errors.
+ - Implement garbage collection.
+ - Maintain consistency across the I/O stack during synchronization points.
+
+For more information please refer to:
+
+ http://lightnvm.io
+
+which maintains updated FAQs, manual pages, technical documentation, tools,
+contacts, etc.
diff --git a/MAINTAINERS b/MAINTAINERS
index 56a92ec7d3f7..0f6df84fec54 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2544,6 +2544,14 @@ F: block/
F: kernel/trace/blktrace.c
F: lib/sbitmap.c
+BFQ I/O SCHEDULER
+M: Paolo Valente <paolo.valente@linaro.org>
+M: Jens Axboe <axboe@kernel.dk>
+L: linux-block@vger.kernel.org
+S: Maintained
+F: block/bfq-*
+F: Documentation/block/bfq-iosched.txt
+
BLOCK2MTD DRIVER
M: Joern Engel <joern@lazybastard.org>
L: linux-mtd@lists.infradead.org
@@ -4708,6 +4716,7 @@ L: linux-edac@vger.kernel.org
L: linux-mips@linux-mips.org
S: Supported
F: drivers/edac/octeon_edac*
+F: drivers/edac/thunderx_edac*
EDAC-E752X
M: Mark Gross <mark.gross@intel.com>
diff --git a/Makefile b/Makefile
index 779302695453..4b074a904106 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
VERSION = 4
PATCHLEVEL = 11
SUBLEVEL = 0
-EXTRAVERSION = -rc8
+EXTRAVERSION =
NAME = Fearless Coyote
# *DOCUMENTATION*
diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig
index c9f30f4763ab..5d7fb3e7cb97 100644
--- a/arch/arc/Kconfig
+++ b/arch/arc/Kconfig
@@ -406,6 +406,14 @@ config ARC_HAS_DIV_REM
bool "Insn: div, divu, rem, remu"
default y
+config ARC_HAS_ACCL_REGS
+ bool "Reg Pair ACCL:ACCH (FPU and/or MPY > 6)"
+ default n
+ help
+ Depending on the configuration, CPU can contain accumulator reg-pair
+ (also referred to as r58:r59). These can also be used by gcc as GPR so
+ kernel needs to save/restore per process
+
endif # ISA_ARCV2
endmenu # "ARC CPU Configuration"
diff --git a/arch/arc/include/asm/atomic.h b/arch/arc/include/asm/atomic.h
index b65930a49589..54b54da6384c 100644
--- a/arch/arc/include/asm/atomic.h
+++ b/arch/arc/include/asm/atomic.h
@@ -17,10 +17,11 @@
#include <asm/barrier.h>
#include <asm/smp.h>
+#define ATOMIC_INIT(i) { (i) }
+
#ifndef CONFIG_ARC_PLAT_EZNPS
#define atomic_read(v) READ_ONCE((v)->counter)
-#define ATOMIC_INIT(i) { (i) }
#ifdef CONFIG_ARC_HAS_LLSC
diff --git a/arch/arc/include/asm/entry-arcv2.h b/arch/arc/include/asm/entry-arcv2.h
index aee1a77934cf..ac85380d14a4 100644
--- a/arch/arc/include/asm/entry-arcv2.h
+++ b/arch/arc/include/asm/entry-arcv2.h
@@ -16,6 +16,11 @@
;
; Now manually save: r12, sp, fp, gp, r25
+#ifdef CONFIG_ARC_HAS_ACCL_REGS
+ PUSH r59
+ PUSH r58
+#endif
+
PUSH r30
PUSH r12
@@ -75,6 +80,11 @@
POP r12
POP r30
+#ifdef CONFIG_ARC_HAS_ACCL_REGS
+ POP r58
+ POP r59
+#endif
+
.endm
/*------------------------------------------------------------------------*/
diff --git a/arch/arc/include/asm/ptrace.h b/arch/arc/include/asm/ptrace.h
index 47111d565a95..5297faa8a378 100644
--- a/arch/arc/include/asm/ptrace.h
+++ b/arch/arc/include/asm/ptrace.h
@@ -86,6 +86,10 @@ struct pt_regs {
unsigned long r12, r30;
+#ifdef CONFIG_ARC_HAS_ACCL_REGS
+ unsigned long r58, r59; /* ACCL/ACCH used by FPU / DSP MPY */
+#endif
+
/*------- Below list auto saved by h/w -----------*/
unsigned long r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11;
diff --git a/arch/arc/kernel/setup.c b/arch/arc/kernel/setup.c
index fa62404ba58f..fc8211f338ad 100644
--- a/arch/arc/kernel/setup.c
+++ b/arch/arc/kernel/setup.c
@@ -319,7 +319,8 @@ static char *arc_extn_mumbojumbo(int cpu_id, char *buf, int len)
static void arc_chk_core_config(void)
{
struct cpuinfo_arc *cpu = &cpuinfo_arc700[smp_processor_id()];
- int fpu_enabled;
+ int saved = 0, present = 0;
+ char *opt_nm = NULL;;
if (!cpu->extn.timer0)
panic("Timer0 is not present!\n");
@@ -346,17 +347,28 @@ static void arc_chk_core_config(void)
/*
* FP hardware/software config sanity
- * -If hardware contains DPFP, kernel needs to save/restore FPU state
+ * -If hardware present, kernel needs to save/restore FPU state
* -If not, it will crash trying to save/restore the non-existant regs
- *
- * (only DPDP checked since SP has no arch visible regs)
*/
- fpu_enabled = IS_ENABLED(CONFIG_ARC_FPU_SAVE_RESTORE);
- if (cpu->extn.fpu_dp && !fpu_enabled)
- pr_warn("CONFIG_ARC_FPU_SAVE_RESTORE needed for working apps\n");
- else if (!cpu->extn.fpu_dp && fpu_enabled)
- panic("FPU non-existent, disable CONFIG_ARC_FPU_SAVE_RESTORE\n");
+ if (is_isa_arcompact()) {
+ opt_nm = "CONFIG_ARC_FPU_SAVE_RESTORE";
+ saved = IS_ENABLED(CONFIG_ARC_FPU_SAVE_RESTORE);
+
+ /* only DPDP checked since SP has no arch visible regs */
+ present = cpu->extn.fpu_dp;
+ } else {
+ opt_nm = "CONFIG_ARC_HAS_ACCL_REGS";
+ saved = IS_ENABLED(CONFIG_ARC_HAS_ACCL_REGS);
+
+ /* Accumulator Low:High pair (r58:59) present if DSP MPY or FPU */
+ present = cpu->extn_mpy.dsp | cpu->extn.fpu_sp | cpu->extn.fpu_dp;
+ }
+
+ if (present && !saved)
+ pr_warn("Enable %s for working apps\n", opt_nm);
+ else if (!present && saved)
+ panic("Disable %s, hardware NOT present\n", opt_nm);
}
/*
diff --git a/arch/arm/configs/multi_v7_defconfig b/arch/arm/configs/multi_v7_defconfig
index a94126fb02c2..6aa7be191f1a 100644
--- a/arch/arm/configs/multi_v7_defconfig
+++ b/arch/arm/configs/multi_v7_defconfig
@@ -748,7 +748,6 @@ CONFIG_LEDS_TRIGGER_DEFAULT_ON=y
CONFIG_LEDS_TRIGGER_TRANSIENT=y
CONFIG_LEDS_TRIGGER_CAMERA=y
CONFIG_EDAC=y
-CONFIG_EDAC_MM_EDAC=y
CONFIG_EDAC_HIGHBANK_MC=y
CONFIG_EDAC_HIGHBANK_L2=y
CONFIG_RTC_CLASS=y
diff --git a/arch/arm/configs/pxa_defconfig b/arch/arm/configs/pxa_defconfig
index 2aac99fd1c41..1318f61589dc 100644
--- a/arch/arm/configs/pxa_defconfig
+++ b/arch/arm/configs/pxa_defconfig
@@ -635,8 +635,7 @@ CONFIG_LEDS_TRIGGER_GPIO=m
CONFIG_LEDS_TRIGGER_DEFAULT_ON=m
CONFIG_LEDS_TRIGGER_TRANSIENT=m
CONFIG_LEDS_TRIGGER_CAMERA=m
-CONFIG_EDAC=y
-CONFIG_EDAC_MM_EDAC=m
+CONFIG_EDAC=m
CONFIG_RTC_CLASS=y
CONFIG_RTC_DEBUG=y
CONFIG_RTC_DRV_DS1307=m
diff --git a/arch/mips/Makefile b/arch/mips/Makefile
index 8ef9c02747fa..02a1787c888c 100644
--- a/arch/mips/Makefile
+++ b/arch/mips/Makefile
@@ -489,7 +489,7 @@ $(generic_defconfigs):
$(Q)$(CONFIG_SHELL) $(srctree)/scripts/kconfig/merge_config.sh \
-m -O $(objtree) $(srctree)/arch/$(ARCH)/configs/generic_defconfig $^ \
$(foreach board,$(BOARDS),$(generic_config_dir)/board-$(board).config)
- $(Q)$(MAKE) olddefconfig
+ $(Q)$(MAKE) -f $(srctree)/Makefile olddefconfig
#
# Prevent generic merge_config rules attempting to merge single fragments
@@ -503,8 +503,8 @@ $(generic_config_dir)/%.config: ;
#
.PHONY: sead3_defconfig
sead3_defconfig:
- $(Q)$(MAKE) 32r2el_defconfig BOARDS=sead-3
+ $(Q)$(MAKE) -f $(srctree)/Makefile 32r2el_defconfig BOARDS=sead-3
.PHONY: sead3micro_defconfig
sead3micro_defconfig:
- $(Q)$(MAKE) micro32r2el_defconfig BOARDS=sead-3
+ $(Q)$(MAKE) -f $(srctree)/Makefile micro32r2el_defconfig BOARDS=sead-3
diff --git a/arch/mips/include/asm/asm-prototypes.h b/arch/mips/include/asm/asm-prototypes.h
index a160cf69bb92..6e28971fe73a 100644
--- a/arch/mips/include/asm/asm-prototypes.h
+++ b/arch/mips/include/asm/asm-prototypes.h
@@ -3,3 +3,4 @@
#include <asm/fpu.h>
#include <asm-generic/asm-prototypes.h>
#include <asm/uaccess.h>
+#include <asm/ftrace.h>
diff --git a/arch/mips/kernel/cevt-r4k.c b/arch/mips/kernel/cevt-r4k.c
index 804d2a2a19fe..dd6a18bc10ab 100644
--- a/arch/mips/kernel/cevt-r4k.c
+++ b/arch/mips/kernel/cevt-r4k.c
@@ -80,7 +80,7 @@ static unsigned int calculate_min_delta(void)
}
/* Sorted insert of 75th percentile into buf2 */
- for (k = 0; k < i; ++k) {
+ for (k = 0; k < i && k < ARRAY_SIZE(buf2); ++k) {
if (buf1[ARRAY_SIZE(buf1) - 1] < buf2[k]) {
l = min_t(unsigned int,
i, ARRAY_SIZE(buf2) - 1);
diff --git a/arch/mips/kernel/elf.c b/arch/mips/kernel/elf.c
index 6430bff21fff..5c429d70e17f 100644
--- a/arch/mips/kernel/elf.c
+++ b/arch/mips/kernel/elf.c
@@ -257,7 +257,7 @@ int arch_check_elf(void *_ehdr, bool has_interpreter, void *_interp_ehdr,
else if ((prog_req.fr1 && prog_req.frdefault) ||
(prog_req.single && !prog_req.frdefault))
/* Make sure 64-bit MIPS III/IV/64R1 will not pick FR1 */
- state->overall_fp_mode = ((current_cpu_data.fpu_id & MIPS_FPIR_F64) &&
+ state->overall_fp_mode = ((raw_current_cpu_data.fpu_id & MIPS_FPIR_F64) &&
cpu_has_mips_r2_r6) ?
FP_FR1 : FP_FR0;
else if (prog_req.fr1)
diff --git a/arch/mips/kernel/kgdb.c b/arch/mips/kernel/kgdb.c
index 1f4bd222ba76..eb6c0d582626 100644
--- a/arch/mips/kernel/kgdb.c
+++ b/arch/mips/kernel/kgdb.c
@@ -244,9 +244,6 @@ static int compute_signal(int tt)
void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
{
int reg;
- struct thread_info *ti = task_thread_info(p);
- unsigned long ksp = (unsigned long)ti + THREAD_SIZE - 32;
- struct pt_regs *regs = (struct pt_regs *)ksp - 1;
#if (KGDB_GDB_REG_SIZE == 32)
u32 *ptr = (u32 *)gdb_regs;
#else
@@ -254,25 +251,46 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
#endif
for (reg = 0; reg < 16; reg++)
- *(ptr++) = regs->regs[reg];
+ *(ptr++) = 0;
/* S0 - S7 */
- for (reg = 16; reg < 24; reg++)
- *(ptr++) = regs->regs[reg];
+ *(ptr++) = p->thread.reg16;
+ *(ptr++) = p->thread.reg17;
+ *(ptr++) = p->thread.reg18;
+ *(ptr++) = p->thread.reg19;
+ *(ptr++) = p->thread.reg20;
+ *(ptr++) = p->thread.reg21;
+ *(ptr++) = p->thread.reg22;
+ *(ptr++) = p->thread.reg23;
for (reg = 24; reg < 28; reg++)
*(ptr++) = 0;
/* GP, SP, FP, RA */
- for (reg = 28; reg < 32; reg++)
- *(ptr++) = regs->regs[reg];
-
- *(ptr++) = regs->cp0_status;
- *(ptr++) = regs->lo;
- *(ptr++) = regs->hi;
- *(ptr++) = regs->cp0_badvaddr;
- *(ptr++) = regs->cp0_cause;
- *(ptr++) = regs->cp0_epc;
+ *(ptr++) = (long)p;
+ *(ptr++) = p->thread.reg29;
+ *(ptr++) = p->thread.reg30;
+ *(ptr++) = p->thread.reg31;
+
+ *(ptr++) = p->thread.cp0_status;
+
+ /* lo, hi */
+ *(ptr++) = 0;
+ *(ptr++) = 0;
+
+ /*
+ * BadVAddr, Cause
+ * Ideally these would come from the last exception frame up the stack
+ * but that requires unwinding, otherwise we can't know much for sure.
+ */
+ *(ptr++) = 0;
+ *(ptr++) = 0;
+
+ /*
+ * PC
+ * use return address (RA), i.e. the moment after return from resume()
+ */
+ *(ptr++) = p->thread.reg31;
}
void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long pc)
diff --git a/arch/mips/kernel/perf_event_mipsxx.c b/arch/mips/kernel/perf_event_mipsxx.c
index 8c35b3152e1e..9452b02ce079 100644
--- a/arch/mips/kernel/perf_event_mipsxx.c
+++ b/arch/mips/kernel/perf_event_mipsxx.c
@@ -1446,6 +1446,11 @@ static int mipsxx_pmu_handle_shared_irq(void)
HANDLE_COUNTER(0)
}
+#ifdef CONFIG_MIPS_PERF_SHARED_TC_COUNTERS
+ read_unlock(&pmuint_rwlock);
+#endif
+ resume_local_counters();
+
/*
* Do all the work for the pending perf events. We can do this
* in here because the performance counter interrupt is a regular
@@ -1454,10 +1459,6 @@ static int mipsxx_pmu_handle_shared_irq(void)
if (handled == IRQ_HANDLED)
irq_work_run();
-#ifdef CONFIG_MIPS_PERF_SHARED_TC_COUNTERS
- read_unlock(&pmuint_rwlock);
-#endif
- resume_local_counters();
return handled;
}
diff --git a/arch/mips/kernel/relocate.c b/arch/mips/kernel/relocate.c
index 9103bebc9a8e..2d1a0c438771 100644
--- a/arch/mips/kernel/relocate.c
+++ b/arch/mips/kernel/relocate.c
@@ -18,7 +18,7 @@
#include <linux/kernel.h>
#include <linux/libfdt.h>
#include <linux/of_fdt.h>
-#include <linux/sched.h>
+#include <linux/sched/task.h>
#include <linux/start_kernel.h>
#include <linux/string.h>
#include <linux/printk.h>
diff --git a/arch/mips/kernel/smp-cps.c b/arch/mips/kernel/smp-cps.c
index 6d45f05538c8..795b4aaf8927 100644
--- a/arch/mips/kernel/smp-cps.c
+++ b/arch/mips/kernel/smp-cps.c
@@ -422,13 +422,12 @@ void play_dead(void)
local_irq_disable();
idle_task_exit();
cpu = smp_processor_id();
+ core = cpu_data[cpu].core;
cpu_death = CPU_DEATH_POWER;
pr_debug("CPU%d going offline\n", cpu);
if (cpu_has_mipsmt || cpu_has_vp) {
- core = cpu_data[cpu].core;
-
/* Look for another online VPE within the core */
for_each_online_cpu(cpu_death_sibling) {
if (cpu_data[cpu_death_sibling].core != core)
diff --git a/arch/mips/mti-malta/malta-int.c b/arch/mips/mti-malta/malta-int.c
index cb675ec6f283..54f56d5a96c4 100644
--- a/arch/mips/mti-malta/malta-int.c
+++ b/arch/mips/mti-malta/malta-int.c
@@ -232,6 +232,17 @@ void __init arch_init_irq(void)
{
int corehi_irq;
+ /*
+ * Preallocate the i8259's expected virq's here. Since irqchip_init()
+ * will probe the irqchips in hierarchial order, i8259 is probed last.
+ * If anything allocates a virq before the i8259 is probed, it will
+ * be given one of the i8259's expected range and consequently setup
+ * of the i8259 will fail.
+ */
+ WARN(irq_alloc_descs(I8259A_IRQ_BASE, I8259A_IRQ_BASE,
+ 16, numa_node_id()) < 0,
+ "Cannot reserve i8259 virqs at IRQ%d\n", I8259A_IRQ_BASE);
+
i8259_set_poll(mips_pcibios_iack);
irqchip_init();
diff --git a/arch/mips/pci/pci-legacy.c b/arch/mips/pci/pci-legacy.c
index 014649be158d..3a84f6c0c840 100644
--- a/arch/mips/pci/pci-legacy.c
+++ b/arch/mips/pci/pci-legacy.c
@@ -190,7 +190,7 @@ void register_pci_controller(struct pci_controller *hose)
}
INIT_LIST_HEAD(&hose->list);
- list_add(&hose->list, &controllers);
+ list_add_tail(&hose->list, &controllers);
/*
* Do not panic here but later - this might happen before console init.
diff --git a/arch/powerpc/configs/85xx-hw.config b/arch/powerpc/configs/85xx-hw.config
index 528ff0e714e6..c03d0fb16665 100644
--- a/arch/powerpc/configs/85xx-hw.config
+++ b/arch/powerpc/configs/85xx-hw.config
@@ -16,9 +16,8 @@ CONFIG_DAVICOM_PHY=y
CONFIG_DMADEVICES=y
CONFIG_E1000E=y
CONFIG_E1000=y
-CONFIG_EDAC_MM_EDAC=y
-CONFIG_EDAC_MPC85XX=y
CONFIG_EDAC=y
+CONFIG_EDAC_MPC85XX=y
CONFIG_EEPROM_AT24=y
CONFIG_EEPROM_LEGACY=y
CONFIG_FB_FSL_DIU=y
diff --git a/arch/powerpc/configs/85xx/ge_imp3a_defconfig b/arch/powerpc/configs/85xx/ge_imp3a_defconfig
index c79283be5680..a917f7afb4f9 100644
--- a/arch/powerpc/configs/85xx/ge_imp3a_defconfig
+++ b/arch/powerpc/configs/85xx/ge_imp3a_defconfig
@@ -155,7 +155,6 @@ CONFIG_USB_OHCI_HCD_PPC_OF_BE=y
CONFIG_USB_OHCI_HCD_PPC_OF_LE=y
CONFIG_USB_STORAGE=y
CONFIG_EDAC=y
-CONFIG_EDAC_MM_EDAC=y
CONFIG_EDAC_MPC85XX=y
CONFIG_RTC_CLASS=y
# CONFIG_RTC_INTF_PROC is not set
diff --git a/arch/powerpc/configs/85xx/xes_mpc85xx_defconfig b/arch/powerpc/configs/85xx/xes_mpc85xx_defconfig
index dbd961de251e..72900b84d3e0 100644
--- a/arch/powerpc/configs/85xx/xes_mpc85xx_defconfig
+++ b/arch/powerpc/configs/85xx/xes_mpc85xx_defconfig
@@ -116,7 +116,6 @@ CONFIG_LEDS_TRIGGERS=y
CONFIG_LEDS_TRIGGER_TIMER=y
CONFIG_LEDS_TRIGGER_HEARTBEAT=y
CONFIG_EDAC=y
-CONFIG_EDAC_MM_EDAC=y
CONFIG_RTC_CLASS=y
CONFIG_RTC_DRV_DS1307=y
CONFIG_RTC_DRV_CMOS=y
diff --git a/arch/powerpc/configs/cell_defconfig b/arch/powerpc/configs/cell_defconfig
index 2d7fcbe047ac..aa564599e368 100644
--- a/arch/powerpc/configs/cell_defconfig
+++ b/arch/powerpc/configs/cell_defconfig
@@ -179,7 +179,6 @@ CONFIG_INFINIBAND_MTHCA=m
CONFIG_INFINIBAND_IPOIB=m
CONFIG_INFINIBAND_IPOIB_DEBUG_DATA=y
CONFIG_EDAC=y
-CONFIG_EDAC_MM_EDAC=y
CONFIG_EDAC_CELL=y
CONFIG_UIO=m
CONFIG_EXT2_FS=y
diff --git a/arch/powerpc/configs/pasemi_defconfig b/arch/powerpc/configs/pasemi_defconfig
index 5553c5ce4274..fe43ff47bd2f 100644
--- a/arch/powerpc/configs/pasemi_defconfig
+++ b/arch/powerpc/configs/pasemi_defconfig
@@ -142,7 +142,6 @@ CONFIG_USB_UHCI_HCD=y
CONFIG_USB_SL811_HCD=y
CONFIG_USB_STORAGE=y
CONFIG_EDAC=y
-CONFIG_EDAC_MM_EDAC=y
CONFIG_EDAC_PASEMI=y
CONFIG_RTC_CLASS=y
CONFIG_RTC_DRV_DS1307=y
diff --git a/arch/powerpc/configs/ppc64_defconfig b/arch/powerpc/configs/ppc64_defconfig
index 4f1288b04303..f2e03f032041 100644
--- a/arch/powerpc/configs/ppc64_defconfig
+++ b/arch/powerpc/configs/ppc64_defconfig
@@ -262,7 +262,6 @@ CONFIG_INFINIBAND_IPOIB_CM=y
CONFIG_INFINIBAND_SRP=m
CONFIG_INFINIBAND_ISER=m
CONFIG_EDAC=y
-CONFIG_EDAC_MM_EDAC=y
CONFIG_EDAC_PASEMI=y
CONFIG_RTC_CLASS=y
CONFIG_RTC_DRV_DS1307=y
diff --git a/arch/powerpc/configs/ppc64e_defconfig b/arch/powerpc/configs/ppc64e_defconfig
index 11a3473f9e2e..6340e6c53c54 100644
--- a/arch/powerpc/configs/ppc64e_defconfig
+++ b/arch/powerpc/configs/ppc64e_defconfig
@@ -173,7 +173,6 @@ CONFIG_INFINIBAND_MTHCA=m
CONFIG_INFINIBAND_IPOIB=m
CONFIG_INFINIBAND_ISER=m
CONFIG_EDAC=y
-CONFIG_EDAC_MM_EDAC=y
CONFIG_RTC_CLASS=y
CONFIG_RTC_DRV_DS1307=y
CONFIG_FS_DAX=y
diff --git a/arch/powerpc/configs/ppc6xx_defconfig b/arch/powerpc/configs/ppc6xx_defconfig
index 1d2d69dd6409..18d0d60dadbf 100644
--- a/arch/powerpc/configs/ppc6xx_defconfig
+++ b/arch/powerpc/configs/ppc6xx_defconfig
@@ -988,8 +988,7 @@ CONFIG_LEDS_TRIGGER_BACKLIGHT=m
CONFIG_LEDS_TRIGGER_DEFAULT_ON=m
CONFIG_ACCESSIBILITY=y
CONFIG_A11Y_BRAILLE_CONSOLE=y
-CONFIG_EDAC=y
-CONFIG_EDAC_MM_EDAC=m
+CONFIG_EDAC=m
CONFIG_RTC_CLASS=y
# CONFIG_RTC_HCTOSYS is not set
CONFIG_RTC_DRV_DS1307=m
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index a59deaef21e5..3db2543733a5 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -82,6 +82,7 @@ config SPARC64
select HAVE_ARCH_AUDITSYSCALL
select ARCH_SUPPORTS_ATOMIC_RMW
select HAVE_NMI
+ select HAVE_REGS_AND_STACK_ACCESS_API
config ARCH_DEFCONFIG
string
diff --git a/arch/sparc/include/asm/ptrace.h b/arch/sparc/include/asm/ptrace.h
index ca57f08bd3db..d73428e4333c 100644
--- a/arch/sparc/include/asm/ptrace.h
+++ b/arch/sparc/include/asm/ptrace.h
@@ -83,7 +83,8 @@ unsigned long profile_pc(struct pt_regs *);
#define MAX_REG_OFFSET (offsetof(struct pt_regs, magic))
-extern int regs_query_register_offset(const char *name);
+int regs_query_register_offset(const char *name);
+unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, unsigned int n);
/**
* regs_get_register() - get register value from its offset
diff --git a/arch/sparc/include/uapi/asm/unistd.h b/arch/sparc/include/uapi/asm/unistd.h
index 36eee8132c22..ae77df75bffa 100644
--- a/arch/sparc/include/uapi/asm/unistd.h
+++ b/arch/sparc/include/uapi/asm/unistd.h
@@ -425,8 +425,9 @@
#define __NR_copy_file_range 357
#define __NR_preadv2 358
#define __NR_pwritev2 359
+#define __NR_statx 360
-#define NR_syscalls 360
+#define NR_syscalls 361
/* Bitmask values returned from kern_features system call. */
#define KERN_FEATURE_MIXED_MODE_STACK 0x00000001
@@ -442,4 +443,9 @@
#define __IGNORE_getresgid
#endif
+/* Sparc doesn't have protection keys. */
+#define __IGNORE_pkey_mprotect
+#define __IGNORE_pkey_alloc
+#define __IGNORE_pkey_free
+
#endif /* _UAPI_SPARC_UNISTD_H */
diff --git a/arch/sparc/kernel/ptrace_64.c b/arch/sparc/kernel/ptrace_64.c
index fc5124ccdb53..e1d965e90e16 100644
--- a/arch/sparc/kernel/ptrace_64.c
+++ b/arch/sparc/kernel/ptrace_64.c
@@ -1162,3 +1162,39 @@ int regs_query_register_offset(const char *name)
return roff->offset;
return -EINVAL;
}
+
+/**
+ * regs_within_kernel_stack() - check the address in the stack
+ * @regs: pt_regs which contains kernel stack pointer.
+ * @addr: address which is checked.
+ *
+ * regs_within_kernel_stack() checks @addr is within the kernel stack page(s).
+ * If @addr is within the kernel stack, it returns true. If not, returns false.
+ */
+static inline int regs_within_kernel_stack(struct pt_regs *regs,
+ unsigned long addr)
+{
+ unsigned long ksp = kernel_stack_pointer(regs) + STACK_BIAS;
+ return ((addr & ~(THREAD_SIZE - 1)) ==
+ (ksp & ~(THREAD_SIZE - 1)));
+}
+
+/**
+ * regs_get_kernel_stack_nth() - get Nth entry of the stack
+ * @regs: pt_regs which contains kernel stack pointer.
+ * @n: stack entry number.
+ *
+ * regs_get_kernel_stack_nth() returns @n th entry of the kernel stack which
+ * is specified by @regs. If the @n th entry is NOT in the kernel stack,
+ * this returns 0.
+ */
+unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, unsigned int n)
+{
+ unsigned long ksp = kernel_stack_pointer(regs) + STACK_BIAS;
+ unsigned long *addr = (unsigned long *)ksp;
+ addr += n;
+ if (regs_within_kernel_stack(regs, (unsigned long)addr))
+ return *addr;
+ else
+ return 0;
+}
diff --git a/arch/sparc/kernel/systbls_32.S b/arch/sparc/kernel/systbls_32.S
index eac7f0db5c8c..5253e895b81b 100644
--- a/arch/sparc/kernel/systbls_32.S
+++ b/arch/sparc/kernel/systbls_32.S
@@ -89,3 +89,4 @@ sys_call_table:
/*345*/ .long sys_renameat2, sys_seccomp, sys_getrandom, sys_memfd_create, sys_bpf
/*350*/ .long sys_execveat, sys_membarrier, sys_userfaultfd, sys_bind, sys_listen
/*355*/ .long sys_setsockopt, sys_mlock2, sys_copy_file_range, sys_preadv2, sys_pwritev2
+/*360*/ .long sys_statx
diff --git a/arch/sparc/kernel/systbls_64.S b/arch/sparc/kernel/systbls_64.S
index b0f17ff2ddba..82339f6be0b2 100644
--- a/arch/sparc/kernel/systbls_64.S
+++ b/arch/sparc/kernel/systbls_64.S
@@ -90,6 +90,7 @@ sys_call_table32:
.word sys32_renameat2, sys_seccomp, sys_getrandom, sys_memfd_create, sys_bpf
/*350*/ .word sys32_execveat, sys_membarrier, sys_userfaultfd, sys_bind, sys_listen
.word compat_sys_setsockopt, sys_mlock2, sys_copy_file_range, compat_sys_preadv2, compat_sys_pwritev2
+/*360*/ .word sys_statx
#endif /* CONFIG_COMPAT */
@@ -171,3 +172,4 @@ sys_call_table:
.word sys_renameat2, sys_seccomp, sys_getrandom, sys_memfd_create, sys_bpf
/*350*/ .word sys64_execveat, sys_membarrier, sys_userfaultfd, sys_bind, sys_listen
.word sys_setsockopt, sys_mlock2, sys_copy_file_range, sys_preadv2, sys_pwritev2
+/*360*/ .word sys_statx
diff --git a/arch/tile/configs/tilegx_defconfig b/arch/tile/configs/tilegx_defconfig
index fd122ef45b00..0d925fa0f0c1 100644
--- a/arch/tile/configs/tilegx_defconfig
+++ b/arch/tile/configs/tilegx_defconfig
@@ -249,7 +249,6 @@ CONFIG_USB_EHCI_HCD=y
CONFIG_USB_OHCI_HCD=y
CONFIG_USB_STORAGE=y
CONFIG_EDAC=y
-CONFIG_EDAC_MM_EDAC=y
CONFIG_RTC_CLASS=y
CONFIG_RTC_DRV_TILE=y
CONFIG_EXT2_FS=y
diff --git a/arch/tile/configs/tilepro_defconfig b/arch/tile/configs/tilepro_defconfig
index eb6a55944191..149d8e8eacb8 100644
--- a/arch/tile/configs/tilepro_defconfig
+++ b/arch/tile/configs/tilepro_defconfig
@@ -358,7 +358,6 @@ CONFIG_WATCHDOG_NOWAYOUT=y
# CONFIG_VGA_ARB is not set
# CONFIG_USB_SUPPORT is not set
CONFIG_EDAC=y
-CONFIG_EDAC_MM_EDAC=y
CONFIG_RTC_CLASS=y
CONFIG_RTC_DRV_TILE=y
CONFIG_EXT2_FS=y
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index a94a4d10f2df..49d160b781f0 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -154,6 +154,14 @@ ifdef CONFIG_FUNCTION_GRAPH_TRACER
else
ifeq ($(call cc-option-yn, -mfentry), n)
ACCUMULATE_OUTGOING_ARGS := 1
+
+ # GCC ignores '-maccumulate-outgoing-args' when used with '-Os'.
+ # If '-Os' is enabled, disable it and print a warning.
+ ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE
+ undefine CONFIG_CC_OPTIMIZE_FOR_SIZE
+ $(warning Disabling CONFIG_CC_OPTIMIZE_FOR_SIZE. Your compiler does not have -mfentry so you cannot optimize for size with CONFIG_FUNCTION_GRAPH_TRACER.)
+ endif
+
endif
endif
endif
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index cbd73eb42170..5b7153540727 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -29,12 +29,6 @@
#include <asm/ftrace.h>
#include <asm/nops.h>
-#if defined(CONFIG_FUNCTION_GRAPH_TRACER) && \
- !defined(CC_USING_FENTRY) && \
- !defined(CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE)
-# error The following combination is not supported: ((compiler missing -mfentry) || (CONFIG_X86_32 and !CONFIG_DYNAMIC_FTRACE)) && CONFIG_FUNCTION_GRAPH_TRACER && CONFIG_CC_OPTIMIZE_FOR_SIZE
-#endif
-
#ifdef CONFIG_DYNAMIC_FTRACE
int ftrace_arch_code_modify_prepare(void)
@@ -989,6 +983,18 @@ void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent,
unsigned long return_hooker = (unsigned long)
&return_to_handler;
+ /*
+ * When resuming from suspend-to-ram, this function can be indirectly
+ * called from early CPU startup code while the CPU is in real mode,
+ * which would fail miserably. Make sure the stack pointer is a
+ * virtual address.
+ *
+ * This check isn't as accurate as virt_addr_valid(), but it should be
+ * good enough for this purpose, and it's fast.
+ */
+ if (unlikely((long)__builtin_frame_address(0) >= 0))
+ return;
+
if (unlikely(ftrace_graph_is_dead()))
return;
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index a723ae9440ab..446c8aa09b9b 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -222,17 +222,6 @@ pci_serr_error(unsigned char reason, struct pt_regs *regs)
pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n",
reason, smp_processor_id());
- /*
- * On some machines, PCI SERR line is used to report memory
- * errors. EDAC makes use of it.
- */
-#if defined(CONFIG_EDAC)
- if (edac_handler_set()) {
- edac_atomic_assert_error();
- return;
- }
-#endif
-
if (panic_on_unrecovered_nmi)
nmi_panic(regs, "NMI: Not continuing");
diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c
index a8e91ae89fb3..29df077cb089 100644
--- a/arch/x86/lib/delay.c
+++ b/arch/x86/lib/delay.c
@@ -93,6 +93,13 @@ static void delay_mwaitx(unsigned long __loops)
{
u64 start, end, delay, loops = __loops;
+ /*
+ * Timer value of 0 causes MWAITX to wait indefinitely, unless there
+ * is a store on the memory monitored by MONITORX.
+ */
+ if (loops == 0)
+ return;
+
start = rdtsc_ordered();
for (;;) {
diff --git a/block/Kconfig b/block/Kconfig
index e9f780f815f5..89cd28f8d051 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -115,6 +115,18 @@ config BLK_DEV_THROTTLING
See Documentation/cgroups/blkio-controller.txt for more information.
+config BLK_DEV_THROTTLING_LOW
+ bool "Block throttling .low limit interface support (EXPERIMENTAL)"
+ depends on BLK_DEV_THROTTLING
+ default n
+ ---help---
+ Add .low limit interface for block throttling. The low limit is a best
+ effort limit to prioritize cgroups. Depending on the setting, the limit
+ can be used to protect cgroups in terms of bandwidth/iops and better
+ utilize disk resource.
+
+ Note, this is an experimental interface and could be changed someday.
+
config BLK_CMDLINE_PARSER
bool "Block device command line partition parser"
default n
diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index 58fc8684788d..fd2cefa47d35 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -40,6 +40,7 @@ config CFQ_GROUP_IOSCHED
Enable group IO scheduling in CFQ.
choice
+
prompt "Default I/O scheduler"
default DEFAULT_CFQ
help
@@ -69,6 +70,35 @@ config MQ_IOSCHED_DEADLINE
---help---
MQ version of the deadline IO scheduler.
+config MQ_IOSCHED_KYBER
+ tristate "Kyber I/O scheduler"
+ default y
+ ---help---
+ The Kyber I/O scheduler is a low-overhead scheduler suitable for
+ multiqueue and other fast devices. Given target latencies for reads and
+ synchronous writes, it will self-tune queue depths to achieve that
+ goal.
+
+config IOSCHED_BFQ
+ tristate "BFQ I/O scheduler"
+ default n
+ ---help---
+ BFQ I/O scheduler for BLK-MQ. BFQ distributes the bandwidth of
+ of the device among all processes according to their weights,
+ regardless of the device parameters and with any workload. It
+ also guarantees a low latency to interactive and soft
+ real-time applications. Details in
+ Documentation/block/bfq-iosched.txt
+
+config BFQ_GROUP_IOSCHED
+ bool "BFQ hierarchical scheduling support"
+ depends on IOSCHED_BFQ && BLK_CGROUP
+ default n
+ ---help---
+
+ Enable hierarchical scheduling in BFQ, using the blkio
+ (cgroups-v1) or io (cgroups-v2) controller.
+
endmenu
endif
diff --git a/block/Makefile b/block/Makefile
index 081bb680789b..2b281cf258a0 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -20,6 +20,9 @@ obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o
obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o
obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o
obj-$(CONFIG_MQ_IOSCHED_DEADLINE) += mq-deadline.o
+obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o
+bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o
+obj-$(CONFIG_IOSCHED_BFQ) += bfq.o
obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o
obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
new file mode 100644
index 000000000000..c8a32fb345cf
--- /dev/null
+++ b/block/bfq-cgroup.c
@@ -0,0 +1,1139 @@
+/*
+ * cgroups support for the BFQ I/O scheduler.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include <linux/cgroup.h>
+#include <linux/elevator.h>
+#include <linux/ktime.h>
+#include <linux/rbtree.h>
+#include <linux/ioprio.h>
+#include <linux/sbitmap.h>
+#include <linux/delay.h>
+
+#include "bfq-iosched.h"
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+
+/* bfqg stats flags */
+enum bfqg_stats_flags {
+ BFQG_stats_waiting = 0,
+ BFQG_stats_idling,
+ BFQG_stats_empty,
+};
+
+#define BFQG_FLAG_FNS(name) \
+static void bfqg_stats_mark_##name(struct bfqg_stats *stats) \
+{ \
+ stats->flags |= (1 << BFQG_stats_##name); \
+} \
+static void bfqg_stats_clear_##name(struct bfqg_stats *stats) \
+{ \
+ stats->flags &= ~(1 << BFQG_stats_##name); \
+} \
+static int bfqg_stats_##name(struct bfqg_stats *stats) \
+{ \
+ return (stats->flags & (1 << BFQG_stats_##name)) != 0; \
+} \
+
+BFQG_FLAG_FNS(waiting)
+BFQG_FLAG_FNS(idling)
+BFQG_FLAG_FNS(empty)
+#undef BFQG_FLAG_FNS
+
+/* This should be called with the queue_lock held. */
+static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats)
+{
+ unsigned long long now;
+
+ if (!bfqg_stats_waiting(stats))
+ return;
+
+ now = sched_clock();
+ if (time_after64(now, stats->start_group_wait_time))
+ blkg_stat_add(&stats->group_wait_time,
+ now - stats->start_group_wait_time);
+ bfqg_stats_clear_waiting(stats);
+}
+
+/* This should be called with the queue_lock held. */
+static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg,
+ struct bfq_group *curr_bfqg)
+{
+ struct bfqg_stats *stats = &bfqg->stats;
+
+ if (bfqg_stats_waiting(stats))
+ return;
+ if (bfqg == curr_bfqg)
+ return;
+ stats->start_group_wait_time = sched_clock();
+ bfqg_stats_mark_waiting(stats);
+}
+
+/* This should be called with the queue_lock held. */
+static void bfqg_stats_end_empty_time(struct bfqg_stats *stats)
+{
+ unsigned long long now;
+
+ if (!bfqg_stats_empty(stats))
+ return;
+
+ now = sched_clock();
+ if (time_after64(now, stats->start_empty_time))
+ blkg_stat_add(&stats->empty_time,
+ now - stats->start_empty_time);
+ bfqg_stats_clear_empty(stats);
+}
+
+void bfqg_stats_update_dequeue(struct bfq_group *bfqg)
+{
+ blkg_stat_add(&bfqg->stats.dequeue, 1);
+}
+
+void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg)
+{
+ struct bfqg_stats *stats = &bfqg->stats;
+
+ if (blkg_rwstat_total(&stats->queued))
+ return;
+
+ /*
+ * group is already marked empty. This can happen if bfqq got new
+ * request in parent group and moved to this group while being added
+ * to service tree. Just ignore the event and move on.
+ */
+ if (bfqg_stats_empty(stats))
+ return;
+
+ stats->start_empty_time = sched_clock();
+ bfqg_stats_mark_empty(stats);
+}
+
+void bfqg_stats_update_idle_time(struct bfq_group *bfqg)
+{
+ struct bfqg_stats *stats = &bfqg->stats;
+
+ if (bfqg_stats_idling(stats)) {
+ unsigned long long now = sched_clock();
+
+ if (time_after64(now, stats->start_idle_time))
+ blkg_stat_add(&stats->idle_time,
+ now - stats->start_idle_time);
+ bfqg_stats_clear_idling(stats);
+ }
+}
+
+void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg)
+{
+ struct bfqg_stats *stats = &bfqg->stats;
+
+ stats->start_idle_time = sched_clock();
+ bfqg_stats_mark_idling(stats);
+}
+
+void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg)
+{
+ struct bfqg_stats *stats = &bfqg->stats;
+
+ blkg_stat_add(&stats->avg_queue_size_sum,
+ blkg_rwstat_total(&stats->queued));
+ blkg_stat_add(&stats->avg_queue_size_samples, 1);
+ bfqg_stats_update_group_wait_time(stats);
+}
+
+/*
+ * blk-cgroup policy-related handlers
+ * The following functions help in converting between blk-cgroup
+ * internal structures and BFQ-specific structures.
+ */
+
+static struct bfq_group *pd_to_bfqg(struct blkg_policy_data *pd)
+{
+ return pd ? container_of(pd, struct bfq_group, pd) : NULL;
+}
+
+struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg)
+{
+ return pd_to_blkg(&bfqg->pd);
+}
+
+static struct bfq_group *blkg_to_bfqg(struct blkcg_gq *blkg)
+{
+ return pd_to_bfqg(blkg_to_pd(blkg, &blkcg_policy_bfq));
+}
+
+/*
+ * bfq_group handlers
+ * The following functions help in navigating the bfq_group hierarchy
+ * by allowing to find the parent of a bfq_group or the bfq_group
+ * associated to a bfq_queue.
+ */
+
+static struct bfq_group *bfqg_parent(struct bfq_group *bfqg)
+{
+ struct blkcg_gq *pblkg = bfqg_to_blkg(bfqg)->parent;
+
+ return pblkg ? blkg_to_bfqg(pblkg) : NULL;
+}
+
+struct bfq_group *bfqq_group(struct bfq_queue *bfqq)
+{
+ struct bfq_entity *group_entity = bfqq->entity.parent;
+
+ return group_entity ? container_of(group_entity, struct bfq_group,
+ entity) :
+ bfqq->bfqd->root_group;
+}
+
+/*
+ * The following two functions handle get and put of a bfq_group by
+ * wrapping the related blk-cgroup hooks.
+ */
+
+static void bfqg_get(struct bfq_group *bfqg)
+{
+ return blkg_get(bfqg_to_blkg(bfqg));
+}
+
+void bfqg_put(struct bfq_group *bfqg)
+{
+ return blkg_put(bfqg_to_blkg(bfqg));
+}
+
+void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq,
+ unsigned int op)
+{
+ blkg_rwstat_add(&bfqg->stats.queued, op, 1);
+ bfqg_stats_end_empty_time(&bfqg->stats);
+ if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue))
+ bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq));
+}
+
+void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op)
+{
+ blkg_rwstat_add(&bfqg->stats.queued, op, -1);
+}
+
+void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op)
+{
+ blkg_rwstat_add(&bfqg->stats.merged, op, 1);
+}
+
+void bfqg_stats_update_completion(struct bfq_group *bfqg, uint64_t start_time,
+ uint64_t io_start_time, unsigned int op)
+{
+ struct bfqg_stats *stats = &bfqg->stats;
+ unsigned long long now = sched_clock();
+
+ if (time_after64(now, io_start_time))
+ blkg_rwstat_add(&stats->service_time, op,
+ now - io_start_time);
+ if (time_after64(io_start_time, start_time))
+ blkg_rwstat_add(&stats->wait_time, op,
+ io_start_time - start_time);
+}
+
+/* @stats = 0 */
+static void bfqg_stats_reset(struct bfqg_stats *stats)
+{
+ /* queued stats shouldn't be cleared */
+ blkg_rwstat_reset(&stats->merged);
+ blkg_rwstat_reset(&stats->service_time);
+ blkg_rwstat_reset(&stats->wait_time);
+ blkg_stat_reset(&stats->time);
+ blkg_stat_reset(&stats->avg_queue_size_sum);
+ blkg_stat_reset(&stats->avg_queue_size_samples);
+ blkg_stat_reset(&stats->dequeue);
+ blkg_stat_reset(&stats->group_wait_time);
+ blkg_stat_reset(&stats->idle_time);
+ blkg_stat_reset(&stats->empty_time);
+}
+
+/* @to += @from */
+static void bfqg_stats_add_aux(struct bfqg_stats *to, struct bfqg_stats *from)
+{
+ if (!to || !from)
+ return;
+
+ /* queued stats shouldn't be cleared */
+ blkg_rwstat_add_aux(&to->merged, &from->merged);
+ blkg_rwstat_add_aux(&to->service_time, &from->service_time);
+ blkg_rwstat_add_aux(&to->wait_time, &from->wait_time);
+ blkg_stat_add_aux(&from->time, &from->time);
+ blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
+ blkg_stat_add_aux(&to->avg_queue_size_samples,
+ &from->avg_queue_size_samples);
+ blkg_stat_add_aux(&to->dequeue, &from->dequeue);
+ blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time);
+ blkg_stat_add_aux(&to->idle_time, &from->idle_time);
+ blkg_stat_add_aux(&to->empty_time, &from->empty_time);
+}
+
+/*
+ * Transfer @bfqg's stats to its parent's aux counts so that the ancestors'
+ * recursive stats can still account for the amount used by this bfqg after
+ * it's gone.
+ */
+static void bfqg_stats_xfer_dead(struct bfq_group *bfqg)
+{
+ struct bfq_group *parent;
+
+ if (!bfqg) /* root_group */
+ return;
+
+ parent = bfqg_parent(bfqg);
+
+ lockdep_assert_held(bfqg_to_blkg(bfqg)->q->queue_lock);
+
+ if (unlikely(!parent))
+ return;
+
+ bfqg_stats_add_aux(&parent->stats, &bfqg->stats);
+ bfqg_stats_reset(&bfqg->stats);
+}
+
+void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg)
+{
+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+
+ entity->weight = entity->new_weight;
+ entity->orig_weight = entity->new_weight;
+ if (bfqq) {
+ bfqq->ioprio = bfqq->new_ioprio;
+ bfqq->ioprio_class = bfqq->new_ioprio_class;
+ bfqg_get(bfqg);
+ }
+ entity->parent = bfqg->my_entity; /* NULL for root group */
+ entity->sched_data = &bfqg->sched_data;
+}
+
+static void bfqg_stats_exit(struct bfqg_stats *stats)
+{
+ blkg_rwstat_exit(&stats->merged);
+ blkg_rwstat_exit(&stats->service_time);
+ blkg_rwstat_exit(&stats->wait_time);
+ blkg_rwstat_exit(&stats->queued);
+ blkg_stat_exit(&stats->time);
+ blkg_stat_exit(&stats->avg_queue_size_sum);
+ blkg_stat_exit(&stats->avg_queue_size_samples);
+ blkg_stat_exit(&stats->dequeue);
+ blkg_stat_exit(&stats->group_wait_time);
+ blkg_stat_exit(&stats->idle_time);
+ blkg_stat_exit(&stats->empty_time);
+}
+
+static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp)
+{
+ if (blkg_rwstat_init(&stats->merged, gfp) ||
+ blkg_rwstat_init(&stats->service_time, gfp) ||
+ blkg_rwstat_init(&stats->wait_time, gfp) ||
+ blkg_rwstat_init(&stats->queued, gfp) ||
+ blkg_stat_init(&stats->time, gfp) ||
+ blkg_stat_init(&stats->avg_queue_size_sum, gfp) ||
+ blkg_stat_init(&stats->avg_queue_size_samples, gfp) ||
+ blkg_stat_init(&stats->dequeue, gfp) ||
+ blkg_stat_init(&stats->group_wait_time, gfp) ||
+ blkg_stat_init(&stats->idle_time, gfp) ||
+ blkg_stat_init(&stats->empty_time, gfp)) {
+ bfqg_stats_exit(stats);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static struct bfq_group_data *cpd_to_bfqgd(struct blkcg_policy_data *cpd)
+{
+ return cpd ? container_of(cpd, struct bfq_group_data, pd) : NULL;
+}
+
+static struct bfq_group_data *blkcg_to_bfqgd(struct blkcg *blkcg)
+{
+ return cpd_to_bfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_bfq));
+}
+
+struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp)
+{
+ struct bfq_group_data *bgd;
+
+ bgd = kzalloc(sizeof(*bgd), gfp);
+ if (!bgd)
+ return NULL;
+ return &bgd->pd;
+}
+
+void bfq_cpd_init(struct blkcg_policy_data *cpd)
+{
+ struct bfq_group_data *d = cpd_to_bfqgd(cpd);
+
+ d->weight = cgroup_subsys_on_dfl(io_cgrp_subsys) ?
+ CGROUP_WEIGHT_DFL : BFQ_WEIGHT_LEGACY_DFL;
+}
+
+void bfq_cpd_free(struct blkcg_policy_data *cpd)
+{
+ kfree(cpd_to_bfqgd(cpd));
+}
+
+struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node)
+{
+ struct bfq_group *bfqg;
+
+ bfqg = kzalloc_node(sizeof(*bfqg), gfp, node);
+ if (!bfqg)
+ return NULL;
+
+ if (bfqg_stats_init(&bfqg->stats, gfp)) {
+ kfree(bfqg);
+ return NULL;
+ }
+
+ return &bfqg->pd;
+}
+
+void bfq_pd_init(struct blkg_policy_data *pd)
+{
+ struct blkcg_gq *blkg = pd_to_blkg(pd);
+ struct bfq_group *bfqg = blkg_to_bfqg(blkg);
+ struct bfq_data *bfqd = blkg->q->elevator->elevator_data;
+ struct bfq_entity *entity = &bfqg->entity;
+ struct bfq_group_data *d = blkcg_to_bfqgd(blkg->blkcg);
+
+ entity->orig_weight = entity->weight = entity->new_weight = d->weight;
+ entity->my_sched_data = &bfqg->sched_data;
+ bfqg->my_entity = entity; /*
+ * the root_group's will be set to NULL
+ * in bfq_init_queue()
+ */
+ bfqg->bfqd = bfqd;
+ bfqg->active_entities = 0;
+ bfqg->rq_pos_tree = RB_ROOT;
+}
+
+void bfq_pd_free(struct blkg_policy_data *pd)
+{
+ struct bfq_group *bfqg = pd_to_bfqg(pd);
+
+ bfqg_stats_exit(&bfqg->stats);
+ return kfree(bfqg);
+}
+
+void bfq_pd_reset_stats(struct blkg_policy_data *pd)
+{
+ struct bfq_group *bfqg = pd_to_bfqg(pd);
+
+ bfqg_stats_reset(&bfqg->stats);
+}
+
+static void bfq_group_set_parent(struct bfq_group *bfqg,
+ struct bfq_group *parent)
+{
+ struct bfq_entity *entity;
+
+ entity = &bfqg->entity;
+ entity->parent = parent->my_entity;
+ entity->sched_data = &parent->sched_data;
+}
+
+static struct bfq_group *bfq_lookup_bfqg(struct bfq_data *bfqd,
+ struct blkcg *blkcg)
+{
+ struct blkcg_gq *blkg;
+
+ blkg = blkg_lookup(blkcg, bfqd->queue);
+ if (likely(blkg))
+ return blkg_to_bfqg(blkg);
+ return NULL;
+}
+
+struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd,
+ struct blkcg *blkcg)
+{
+ struct bfq_group *bfqg, *parent;
+ struct bfq_entity *entity;
+
+ bfqg = bfq_lookup_bfqg(bfqd, blkcg);
+
+ if (unlikely(!bfqg))
+ return NULL;
+
+ /*
+ * Update chain of bfq_groups as we might be handling a leaf group
+ * which, along with some of its relatives, has not been hooked yet
+ * to the private hierarchy of BFQ.
+ */
+ entity = &bfqg->entity;
+ for_each_entity(entity) {
+ bfqg = container_of(entity, struct bfq_group, entity);
+ if (bfqg != bfqd->root_group) {
+ parent = bfqg_parent(bfqg);
+ if (!parent)
+ parent = bfqd->root_group;
+ bfq_group_set_parent(bfqg, parent);
+ }
+ }
+
+ return bfqg;
+}
+
+/**
+ * bfq_bfqq_move - migrate @bfqq to @bfqg.
+ * @bfqd: queue descriptor.
+ * @bfqq: the queue to move.
+ * @bfqg: the group to move to.
+ *
+ * Move @bfqq to @bfqg, deactivating it from its old group and reactivating
+ * it on the new one. Avoid putting the entity on the old group idle tree.
+ *
+ * Must be called under the queue lock; the cgroup owning @bfqg must
+ * not disappear (by now this just means that we are called under
+ * rcu_read_lock()).
+ */
+void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+ struct bfq_group *bfqg)
+{
+ struct bfq_entity *entity = &bfqq->entity;
+
+ /* If bfqq is empty, then bfq_bfqq_expire also invokes
+ * bfq_del_bfqq_busy, thereby removing bfqq and its entity
+ * from data structures related to current group. Otherwise we
+ * need to remove bfqq explicitly with bfq_deactivate_bfqq, as
+ * we do below.
+ */
+ if (bfqq == bfqd->in_service_queue)
+ bfq_bfqq_expire(bfqd, bfqd->in_service_queue,
+ false, BFQQE_PREEMPTED);
+
+ if (bfq_bfqq_busy(bfqq))
+ bfq_deactivate_bfqq(bfqd, bfqq, false, false);
+ else if (entity->on_st)
+ bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);
+ bfqg_put(bfqq_group(bfqq));
+
+ /*
+ * Here we use a reference to bfqg. We don't need a refcounter
+ * as the cgroup reference will not be dropped, so that its
+ * destroy() callback will not be invoked.
+ */
+ entity->parent = bfqg->my_entity;
+ entity->sched_data = &bfqg->sched_data;
+ bfqg_get(bfqg);
+
+ if (bfq_bfqq_busy(bfqq)) {
+ bfq_pos_tree_add_move(bfqd, bfqq);
+ bfq_activate_bfqq(bfqd, bfqq);
+ }
+
+ if (!bfqd->in_service_queue && !bfqd->rq_in_driver)
+ bfq_schedule_dispatch(bfqd);
+}
+
+/**
+ * __bfq_bic_change_cgroup - move @bic to @cgroup.
+ * @bfqd: the queue descriptor.
+ * @bic: the bic to move.
+ * @blkcg: the blk-cgroup to move to.
+ *
+ * Move bic to blkcg, assuming that bfqd->queue is locked; the caller
+ * has to make sure that the reference to cgroup is valid across the call.
+ *
+ * NOTE: an alternative approach might have been to store the current
+ * cgroup in bfqq and getting a reference to it, reducing the lookup
+ * time here, at the price of slightly more complex code.
+ */
+static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
+ struct bfq_io_cq *bic,
+ struct blkcg *blkcg)
+{
+ struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0);
+ struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1);
+ struct bfq_group *bfqg;
+ struct bfq_entity *entity;
+
+ bfqg = bfq_find_set_group(bfqd, blkcg);
+
+ if (unlikely(!bfqg))
+ bfqg = bfqd->root_group;
+
+ if (async_bfqq) {
+ entity = &async_bfqq->entity;
+
+ if (entity->sched_data != &bfqg->sched_data) {
+ bic_set_bfqq(bic, NULL, 0);
+ bfq_log_bfqq(bfqd, async_bfqq,
+ "bic_change_group: %p %d",
+ async_bfqq, async_bfqq->ref);
+ bfq_put_queue(async_bfqq);
+ }
+ }
+
+ if (sync_bfqq) {
+ entity = &sync_bfqq->entity;
+ if (entity->sched_data != &bfqg->sched_data)
+ bfq_bfqq_move(bfqd, sync_bfqq, bfqg);
+ }
+
+ return bfqg;
+}
+
+void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio)
+{
+ struct bfq_data *bfqd = bic_to_bfqd(bic);
+ struct bfq_group *bfqg = NULL;
+ uint64_t serial_nr;
+
+ rcu_read_lock();
+ serial_nr = bio_blkcg(bio)->css.serial_nr;
+
+ /*
+ * Check whether blkcg has changed. The condition may trigger
+ * spuriously on a newly created cic but there's no harm.
+ */
+ if (unlikely(!bfqd) || likely(bic->blkcg_serial_nr == serial_nr))
+ goto out;
+
+ bfqg = __bfq_bic_change_cgroup(bfqd, bic, bio_blkcg(bio));
+ bic->blkcg_serial_nr = serial_nr;
+out:
+ rcu_read_unlock();
+}
+
+/**
+ * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st.
+ * @st: the service tree being flushed.
+ */
+static void bfq_flush_idle_tree(struct bfq_service_tree *st)
+{
+ struct bfq_entity *entity = st->first_idle;
+
+ for (; entity ; entity = st->first_idle)
+ __bfq_deactivate_entity(entity, false);
+}
+
+/**
+ * bfq_reparent_leaf_entity - move leaf entity to the root_group.
+ * @bfqd: the device data structure with the root group.
+ * @entity: the entity to move.
+ */
+static void bfq_reparent_leaf_entity(struct bfq_data *bfqd,
+ struct bfq_entity *entity)
+{
+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+
+ bfq_bfqq_move(bfqd, bfqq, bfqd->root_group);
+}
+
+/**
+ * bfq_reparent_active_entities - move to the root group all active
+ * entities.
+ * @bfqd: the device data structure with the root group.
+ * @bfqg: the group to move from.
+ * @st: the service tree with the entities.
+ *
+ * Needs queue_lock to be taken and reference to be valid over the call.
+ */
+static void bfq_reparent_active_entities(struct bfq_data *bfqd,
+ struct bfq_group *bfqg,
+ struct bfq_service_tree *st)
+{
+ struct rb_root *active = &st->active;
+ struct bfq_entity *entity = NULL;
+
+ if (!RB_EMPTY_ROOT(&st->active))
+ entity = bfq_entity_of(rb_first(active));
+
+ for (; entity ; entity = bfq_entity_of(rb_first(active)))
+ bfq_reparent_leaf_entity(bfqd, entity);
+
+ if (bfqg->sched_data.in_service_entity)
+ bfq_reparent_leaf_entity(bfqd,
+ bfqg->sched_data.in_service_entity);
+}
+
+/**
+ * bfq_pd_offline - deactivate the entity associated with @pd,
+ * and reparent its children entities.
+ * @pd: descriptor of the policy going offline.
+ *
+ * blkio already grabs the queue_lock for us, so no need to use
+ * RCU-based magic
+ */
+void bfq_pd_offline(struct blkg_policy_data *pd)
+{
+ struct bfq_service_tree *st;
+ struct bfq_group *bfqg = pd_to_bfqg(pd);
+ struct bfq_data *bfqd = bfqg->bfqd;
+ struct bfq_entity *entity = bfqg->my_entity;
+ unsigned long flags;
+ int i;
+
+ if (!entity) /* root group */
+ return;
+
+ spin_lock_irqsave(&bfqd->lock, flags);
+ /*
+ * Empty all service_trees belonging to this group before
+ * deactivating the group itself.
+ */
+ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) {
+ st = bfqg->sched_data.service_tree + i;
+
+ /*
+ * The idle tree may still contain bfq_queues belonging
+ * to exited task because they never migrated to a different
+ * cgroup from the one being destroyed now. No one else
+ * can access them so it's safe to act without any lock.
+ */
+ bfq_flush_idle_tree(st);
+
+ /*
+ * It may happen that some queues are still active
+ * (busy) upon group destruction (if the corresponding
+ * processes have been forced to terminate). We move
+ * all the leaf entities corresponding to these queues
+ * to the root_group.
+ * Also, it may happen that the group has an entity
+ * in service, which is disconnected from the active
+ * tree: it must be moved, too.
+ * There is no need to put the sync queues, as the
+ * scheduler has taken no reference.
+ */
+ bfq_reparent_active_entities(bfqd, bfqg, st);
+ }
+
+ __bfq_deactivate_entity(entity, false);
+ bfq_put_async_queues(bfqd, bfqg);
+
+ spin_unlock_irqrestore(&bfqd->lock, flags);
+ /*
+ * @blkg is going offline and will be ignored by
+ * blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so
+ * that they don't get lost. If IOs complete after this point, the
+ * stats for them will be lost. Oh well...
+ */
+ bfqg_stats_xfer_dead(bfqg);
+}
+
+void bfq_end_wr_async(struct bfq_data *bfqd)
+{
+ struct blkcg_gq *blkg;
+
+ list_for_each_entry(blkg, &bfqd->queue->blkg_list, q_node) {
+ struct bfq_group *bfqg = blkg_to_bfqg(blkg);
+
+ bfq_end_wr_async_queues(bfqd, bfqg);
+ }
+ bfq_end_wr_async_queues(bfqd, bfqd->root_group);
+}
+
+static int bfq_io_show_weight(struct seq_file *sf, void *v)
+{
+ struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
+ struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg);
+ unsigned int val = 0;
+
+ if (bfqgd)
+ val = bfqgd->weight;
+
+ seq_printf(sf, "%u\n", val);
+
+ return 0;
+}
+
+static int bfq_io_set_weight_legacy(struct cgroup_subsys_state *css,
+ struct cftype *cftype,
+ u64 val)
+{
+ struct blkcg *blkcg = css_to_blkcg(css);
+ struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg);
+ struct blkcg_gq *blkg;
+ int ret = -ERANGE;
+
+ if (val < BFQ_MIN_WEIGHT || val > BFQ_MAX_WEIGHT)
+ return ret;
+
+ ret = 0;
+ spin_lock_irq(&blkcg->lock);
+ bfqgd->weight = (unsigned short)val;
+ hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
+ struct bfq_group *bfqg = blkg_to_bfqg(blkg);
+
+ if (!bfqg)
+ continue;
+ /*
+ * Setting the prio_changed flag of the entity
+ * to 1 with new_weight == weight would re-set
+ * the value of the weight to its ioprio mapping.
+ * Set the flag only if necessary.
+ */
+ if ((unsigned short)val != bfqg->entity.new_weight) {
+ bfqg->entity.new_weight = (unsigned short)val;
+ /*
+ * Make sure that the above new value has been
+ * stored in bfqg->entity.new_weight before
+ * setting the prio_changed flag. In fact,
+ * this flag may be read asynchronously (in
+ * critical sections protected by a different
+ * lock than that held here), and finding this
+ * flag set may cause the execution of the code
+ * for updating parameters whose value may
+ * depend also on bfqg->entity.new_weight (in
+ * __bfq_entity_update_weight_prio).
+ * This barrier makes sure that the new value
+ * of bfqg->entity.new_weight is correctly
+ * seen in that code.
+ */
+ smp_wmb();
+ bfqg->entity.prio_changed = 1;
+ }
+ }
+ spin_unlock_irq(&blkcg->lock);
+
+ return ret;
+}
+
+static ssize_t bfq_io_set_weight(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
+{
+ u64 weight;
+ /* First unsigned long found in the file is used */
+ int ret = kstrtoull(strim(buf), 0, &weight);
+
+ if (ret)
+ return ret;
+
+ return bfq_io_set_weight_legacy(of_css(of), NULL, weight);
+}
+
+static int bfqg_print_stat(struct seq_file *sf, void *v)
+{
+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat,
+ &blkcg_policy_bfq, seq_cft(sf)->private, false);
+ return 0;
+}
+
+static int bfqg_print_rwstat(struct seq_file *sf, void *v)
+{
+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat,
+ &blkcg_policy_bfq, seq_cft(sf)->private, true);
+ return 0;
+}
+
+static u64 bfqg_prfill_stat_recursive(struct seq_file *sf,
+ struct blkg_policy_data *pd, int off)
+{
+ u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd),
+ &blkcg_policy_bfq, off);
+ return __blkg_prfill_u64(sf, pd, sum);
+}
+
+static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf,
+ struct blkg_policy_data *pd, int off)
+{
+ struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd),
+ &blkcg_policy_bfq,
+ off);
+ return __blkg_prfill_rwstat(sf, pd, &sum);
+}
+
+static int bfqg_print_stat_recursive(struct seq_file *sf, void *v)
+{
+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+ bfqg_prfill_stat_recursive, &blkcg_policy_bfq,
+ seq_cft(sf)->private, false);
+ return 0;
+}
+
+static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v)
+{
+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+ bfqg_prfill_rwstat_recursive, &blkcg_policy_bfq,
+ seq_cft(sf)->private, true);
+ return 0;
+}
+
+static u64 bfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd,
+ int off)
+{
+ u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes);
+
+ return __blkg_prfill_u64(sf, pd, sum >> 9);
+}
+
+static int bfqg_print_stat_sectors(struct seq_file *sf, void *v)
+{
+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+ bfqg_prfill_sectors, &blkcg_policy_bfq, 0, false);
+ return 0;
+}
+
+static u64 bfqg_prfill_sectors_recursive(struct seq_file *sf,
+ struct blkg_policy_data *pd, int off)
+{
+ struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL,
+ offsetof(struct blkcg_gq, stat_bytes));
+ u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) +
+ atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]);
+
+ return __blkg_prfill_u64(sf, pd, sum >> 9);
+}
+
+static int bfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v)
+{
+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+ bfqg_prfill_sectors_recursive, &blkcg_policy_bfq, 0,
+ false);
+ return 0;
+}
+
+static u64 bfqg_prfill_avg_queue_size(struct seq_file *sf,
+ struct blkg_policy_data *pd, int off)
+{
+ struct bfq_group *bfqg = pd_to_bfqg(pd);
+ u64 samples = blkg_stat_read(&bfqg->stats.avg_queue_size_samples);
+ u64 v = 0;
+
+ if (samples) {
+ v = blkg_stat_read(&bfqg->stats.avg_queue_size_sum);
+ v = div64_u64(v, samples);
+ }
+ __blkg_prfill_u64(sf, pd, v);
+ return 0;
+}
+
+/* print avg_queue_size */
+static int bfqg_print_avg_queue_size(struct seq_file *sf, void *v)
+{
+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+ bfqg_prfill_avg_queue_size, &blkcg_policy_bfq,
+ 0, false);
+ return 0;
+}
+
+struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node)
+{
+ int ret;
+
+ ret = blkcg_activate_policy(bfqd->queue, &blkcg_policy_bfq);
+ if (ret)
+ return NULL;
+
+ return blkg_to_bfqg(bfqd->queue->root_blkg);
+}
+
+struct blkcg_policy blkcg_policy_bfq = {
+ .dfl_cftypes = bfq_blkg_files,
+ .legacy_cftypes = bfq_blkcg_legacy_files,
+
+ .cpd_alloc_fn = bfq_cpd_alloc,
+ .cpd_init_fn = bfq_cpd_init,
+ .cpd_bind_fn = bfq_cpd_init,
+ .cpd_free_fn = bfq_cpd_free,
+
+ .pd_alloc_fn = bfq_pd_alloc,
+ .pd_init_fn = bfq_pd_init,
+ .pd_offline_fn = bfq_pd_offline,
+ .pd_free_fn = bfq_pd_free,
+ .pd_reset_stats_fn = bfq_pd_reset_stats,
+};
+
+struct cftype bfq_blkcg_legacy_files[] = {
+ {
+ .name = "bfq.weight",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = bfq_io_show_weight,
+ .write_u64 = bfq_io_set_weight_legacy,
+ },
+
+ /* statistics, covers only the tasks in the bfqg */
+ {
+ .name = "bfq.time",
+ .private = offsetof(struct bfq_group, stats.time),
+ .seq_show = bfqg_print_stat,
+ },
+ {
+ .name = "bfq.sectors",
+ .seq_show = bfqg_print_stat_sectors,
+ },
+ {
+ .name = "bfq.io_service_bytes",
+ .private = (unsigned long)&blkcg_policy_bfq,
+ .seq_show = blkg_print_stat_bytes,
+ },
+ {
+ .name = "bfq.io_serviced",
+ .private = (unsigned long)&blkcg_policy_bfq,
+ .seq_show = blkg_print_stat_ios,
+ },
+ {
+ .name = "bfq.io_service_time",
+ .private = offsetof(struct bfq_group, stats.service_time),
+ .seq_show = bfqg_print_rwstat,
+ },
+ {
+ .name = "bfq.io_wait_time",
+ .private = offsetof(struct bfq_group, stats.wait_time),
+ .seq_show = bfqg_print_rwstat,
+ },
+ {
+ .name = "bfq.io_merged",
+ .private = offsetof(struct bfq_group, stats.merged),
+ .seq_show = bfqg_print_rwstat,
+ },
+ {
+ .name = "bfq.io_queued",
+ .private = offsetof(struct bfq_group, stats.queued),
+ .seq_show = bfqg_print_rwstat,
+ },
+
+ /* the same statictics which cover the bfqg and its descendants */
+ {
+ .name = "bfq.time_recursive",
+ .private = offsetof(struct bfq_group, stats.time),
+ .seq_show = bfqg_print_stat_recursive,
+ },
+ {
+ .name = "bfq.sectors_recursive",
+ .seq_show = bfqg_print_stat_sectors_recursive,
+ },
+ {
+ .name = "bfq.io_service_bytes_recursive",
+ .private = (unsigned long)&blkcg_policy_bfq,
+ .seq_show = blkg_print_stat_bytes_recursive,
+ },
+ {
+ .name = "bfq.io_serviced_recursive",
+ .private = (unsigned long)&blkcg_policy_bfq,
+ .seq_show = blkg_print_stat_ios_recursive,
+ },
+ {
+ .name = "bfq.io_service_time_recursive",
+ .private = offsetof(struct bfq_group, stats.service_time),
+ .seq_show = bfqg_print_rwstat_recursive,
+ },
+ {
+ .name = "bfq.io_wait_time_recursive",
+ .private = offsetof(struct bfq_group, stats.wait_time),
+ .seq_show = bfqg_print_rwstat_recursive,
+ },
+ {
+ .name = "bfq.io_merged_recursive",
+ .private = offsetof(struct bfq_group, stats.merged),
+ .seq_show = bfqg_print_rwstat_recursive,
+ },
+ {
+ .name = "bfq.io_queued_recursive",
+ .private = offsetof(struct bfq_group, stats.queued),
+ .seq_show = bfqg_print_rwstat_recursive,
+ },
+ {
+ .name = "bfq.avg_queue_size",
+ .seq_show = bfqg_print_avg_queue_size,
+ },
+ {
+ .name = "bfq.group_wait_time",
+ .private = offsetof(struct bfq_group, stats.group_wait_time),
+ .seq_show = bfqg_print_stat,
+ },
+ {
+ .name = "bfq.idle_time",
+ .private = offsetof(struct bfq_group, stats.idle_time),
+ .seq_show = bfqg_print_stat,
+ },
+ {
+ .name = "bfq.empty_time",
+ .private = offsetof(struct bfq_group, stats.empty_time),
+ .seq_show = bfqg_print_stat,
+ },
+ {
+ .name = "bfq.dequeue",
+ .private = offsetof(struct bfq_group, stats.dequeue),
+ .seq_show = bfqg_print_stat,
+ },
+ { } /* terminate */
+};
+
+struct cftype bfq_blkg_files[] = {
+ {
+ .name = "bfq.weight",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = bfq_io_show_weight,
+ .write = bfq_io_set_weight,
+ },
+ {} /* terminate */
+};
+
+#else /* CONFIG_BFQ_GROUP_IOSCHED */
+
+void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq,
+ unsigned int op) { }
+void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) { }
+void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) { }
+void bfqg_stats_update_completion(struct bfq_group *bfqg, uint64_t start_time,
+ uint64_t io_start_time, unsigned int op) { }
+void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { }
+void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { }
+void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { }
+void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { }
+void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { }
+
+void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+ struct bfq_group *bfqg) {}
+
+void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg)
+{
+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+
+ entity->weight = entity->new_weight;
+ entity->orig_weight = entity->new_weight;
+ if (bfqq) {
+ bfqq->ioprio = bfqq->new_ioprio;
+ bfqq->ioprio_class = bfqq->new_ioprio_class;
+ }
+ entity->sched_data = &bfqg->sched_data;
+}
+
+void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) {}
+
+void bfq_end_wr_async(struct bfq_data *bfqd)
+{
+ bfq_end_wr_async_queues(bfqd, bfqd->root_group);
+}
+
+struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, struct blkcg *blkcg)
+{
+ return bfqd->root_group;
+}
+
+struct bfq_group *bfqq_group(struct bfq_queue *bfqq)
+{
+ return bfqq->bfqd->root_group;
+}
+
+struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node)
+{
+ struct bfq_group *bfqg;
+ int i;
+
+ bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node);
+ if (!bfqg)
+ return NULL;
+
+ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
+ bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
+
+ return bfqg;
+}
+#endif /* CONFIG_BFQ_GROUP_IOSCHED */
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
new file mode 100644
index 000000000000..bd8499ef157c
--- /dev/null
+++ b/block/bfq-iosched.c
@@ -0,0 +1,5047 @@
+/*
+ * Budget Fair Queueing (BFQ) I/O scheduler.
+ *
+ * Based on ideas and code from CFQ:
+ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
+ *
+ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
+ * Paolo Valente <paolo.valente@unimore.it>
+ *
+ * Copyright (C) 2010 Paolo Valente <paolo.valente@unimore.it>
+ * Arianna Avanzini <avanzini@google.com>
+ *
+ * Copyright (C) 2017 Paolo Valente <paolo.valente@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BFQ is a proportional-share I/O scheduler, with some extra
+ * low-latency capabilities. BFQ also supports full hierarchical
+ * scheduling through cgroups. Next paragraphs provide an introduction
+ * on BFQ inner workings. Details on BFQ benefits, usage and
+ * limitations can be found in Documentation/block/bfq-iosched.txt.
+ *
+ * BFQ is a proportional-share storage-I/O scheduling algorithm based
+ * on the slice-by-slice service scheme of CFQ. But BFQ assigns
+ * budgets, measured in number of sectors, to processes instead of
+ * time slices. The device is not granted to the in-service process
+ * for a given time slice, but until it has exhausted its assigned
+ * budget. This change from the time to the service domain enables BFQ
+ * to distribute the device throughput among processes as desired,
+ * without any distortion due to throughput fluctuations, or to device
+ * internal queueing. BFQ uses an ad hoc internal scheduler, called
+ * B-WF2Q+, to schedule processes according to their budgets. More
+ * precisely, BFQ schedules queues associated with processes. Each
+ * process/queue is assigned a user-configurable weight, and B-WF2Q+
+ * guarantees that each queue receives a fraction of the throughput
+ * proportional to its weight. Thanks to the accurate policy of
+ * B-WF2Q+, BFQ can afford to assign high budgets to I/O-bound
+ * processes issuing sequential requests (to boost the throughput),
+ * and yet guarantee a low latency to interactive and soft real-time
+ * applications.
+ *
+ * In particular, to provide these low-latency guarantees, BFQ
+ * explicitly privileges the I/O of two classes of time-sensitive
+ * applications: interactive and soft real-time. This feature enables
+ * BFQ to provide applications in these classes with a very low
+ * latency. Finally, BFQ also features additional heuristics for
+ * preserving both a low latency and a high throughput on NCQ-capable,
+ * rotational or flash-based devices, and to get the job done quickly
+ * for applications consisting in many I/O-bound processes.
+ *
+ * BFQ is described in [1], where also a reference to the initial, more
+ * theoretical paper on BFQ can be found. The interested reader can find
+ * in the latter paper full details on the main algorithm, as well as
+ * formulas of the guarantees and formal proofs of all the properties.
+ * With respect to the version of BFQ presented in these papers, this
+ * implementation adds a few more heuristics, such as the one that
+ * guarantees a low latency to soft real-time applications, and a
+ * hierarchical extension based on H-WF2Q+.
+ *
+ * B-WF2Q+ is based on WF2Q+, which is described in [2], together with
+ * H-WF2Q+, while the augmented tree used here to implement B-WF2Q+
+ * with O(log N) complexity derives from the one introduced with EEVDF
+ * in [3].
+ *
+ * [1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O
+ * Scheduler", Proceedings of the First Workshop on Mobile System
+ * Technologies (MST-2015), May 2015.
+ * http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf
+ *
+ * [2] Jon C.R. Bennett and H. Zhang, "Hierarchical Packet Fair Queueing
+ * Algorithms", IEEE/ACM Transactions on Networking, 5(5):675-689,
+ * Oct 1997.
+ *
+ * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz
+ *
+ * [3] I. Stoica and H. Abdel-Wahab, "Earliest Eligible Virtual Deadline
+ * First: A Flexible and Accurate Mechanism for Proportional Share
+ * Resource Allocation", technical report.
+ *
+ * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf
+ */
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include <linux/cgroup.h>
+#include <linux/elevator.h>
+#include <linux/ktime.h>
+#include <linux/rbtree.h>
+#include <linux/ioprio.h>
+#include <linux/sbitmap.h>
+#include <linux/delay.h>
+
+#include "blk.h"
+#include "blk-mq.h"
+#include "blk-mq-tag.h"
+#include "blk-mq-sched.h"
+#include "bfq-iosched.h"
+
+#define BFQ_BFQQ_FNS(name) \
+void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \
+{ \
+ __set_bit(BFQQF_##name, &(bfqq)->flags); \
+} \
+void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \
+{ \
+ __clear_bit(BFQQF_##name, &(bfqq)->flags); \
+} \
+int bfq_bfqq_##name(const struct bfq_queue *bfqq) \
+{ \
+ return test_bit(BFQQF_##name, &(bfqq)->flags); \
+}
+
+BFQ_BFQQ_FNS(just_created);
+BFQ_BFQQ_FNS(busy);
+BFQ_BFQQ_FNS(wait_request);
+BFQ_BFQQ_FNS(non_blocking_wait_rq);
+BFQ_BFQQ_FNS(fifo_expire);
+BFQ_BFQQ_FNS(idle_window);
+BFQ_BFQQ_FNS(sync);
+BFQ_BFQQ_FNS(IO_bound);
+BFQ_BFQQ_FNS(in_large_burst);
+BFQ_BFQQ_FNS(coop);
+BFQ_BFQQ_FNS(split_coop);
+BFQ_BFQQ_FNS(softrt_update);
+#undef BFQ_BFQQ_FNS \
+
+/* Expiration time of sync (0) and async (1) requests, in ns. */
+static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 };
+
+/* Maximum backwards seek (magic number lifted from CFQ), in KiB. */
+static const int bfq_back_max = 16 * 1024;
+
+/* Penalty of a backwards seek, in number of sectors. */
+static const int bfq_back_penalty = 2;
+
+/* Idling period duration, in ns. */
+static u64 bfq_slice_idle = NSEC_PER_SEC / 125;
+
+/* Minimum number of assigned budgets for which stats are safe to compute. */
+static const int bfq_stats_min_budgets = 194;
+
+/* Default maximum budget values, in sectors and number of requests. */
+static const int bfq_default_max_budget = 16 * 1024;
+
+/*
+ * Async to sync throughput distribution is controlled as follows:
+ * when an async request is served, the entity is charged the number
+ * of sectors of the request, multiplied by the factor below
+ */
+static const int bfq_async_charge_factor = 10;
+
+/* Default timeout values, in jiffies, approximating CFQ defaults. */
+const int bfq_timeout = HZ / 8;
+
+static struct kmem_cache *bfq_pool;
+
+/* Below this threshold (in ns), we consider thinktime immediate. */
+#define BFQ_MIN_TT (2 * NSEC_PER_MSEC)
+
+/* hw_tag detection: parallel requests threshold and min samples needed. */
+#define BFQ_HW_QUEUE_THRESHOLD 4
+#define BFQ_HW_QUEUE_SAMPLES 32
+
+#define BFQQ_SEEK_THR (sector_t)(8 * 100)
+#define BFQQ_SECT_THR_NONROT (sector_t)(2 * 32)
+#define BFQQ_CLOSE_THR (sector_t)(8 * 1024)
+#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 32/8)
+
+/* Min number of samples required to perform peak-rate update */
+#define BFQ_RATE_MIN_SAMPLES 32
+/* Min observation time interval required to perform a peak-rate update (ns) */
+#define BFQ_RATE_MIN_INTERVAL (300*NSEC_PER_MSEC)
+/* Target observation time interval for a peak-rate update (ns) */
+#define BFQ_RATE_REF_INTERVAL NSEC_PER_SEC
+
+/* Shift used for peak rate fixed precision calculations. */
+#define BFQ_RATE_SHIFT 16
+
+/*
+ * By default, BFQ computes the duration of the weight raising for
+ * interactive applications automatically, using the following formula:
+ * duration = (R / r) * T, where r is the peak rate of the device, and
+ * R and T are two reference parameters.
+ * In particular, R is the peak rate of the reference device (see below),
+ * and T is a reference time: given the systems that are likely to be
+ * installed on the reference device according to its speed class, T is
+ * about the maximum time needed, under BFQ and while reading two files in
+ * parallel, to load typical large applications on these systems.
+ * In practice, the slower/faster the device at hand is, the more/less it
+ * takes to load applications with respect to the reference device.
+ * Accordingly, the longer/shorter BFQ grants weight raising to interactive
+ * applications.
+ *
+ * BFQ uses four different reference pairs (R, T), depending on:
+ * . whether the device is rotational or non-rotational;
+ * . whether the device is slow, such as old or portable HDDs, as well as
+ * SD cards, or fast, such as newer HDDs and SSDs.
+ *
+ * The device's speed class is dynamically (re)detected in
+ * bfq_update_peak_rate() every time the estimated peak rate is updated.
+ *
+ * In the following definitions, R_slow[0]/R_fast[0] and
+ * T_slow[0]/T_fast[0] are the reference values for a slow/fast
+ * rotational device, whereas R_slow[1]/R_fast[1] and
+ * T_slow[1]/T_fast[1] are the reference values for a slow/fast
+ * non-rotational device. Finally, device_speed_thresh are the
+ * thresholds used to switch between speed classes. The reference
+ * rates are not the actual peak rates of the devices used as a
+ * reference, but slightly lower values. The reason for using these
+ * slightly lower values is that the peak-rate estimator tends to
+ * yield slightly lower values than the actual peak rate (it can yield
+ * the actual peak rate only if there is only one process doing I/O,
+ * and the process does sequential I/O).
+ *
+ * Both the reference peak rates and the thresholds are measured in
+ * sectors/usec, left-shifted by BFQ_RATE_SHIFT.
+ */
+static int R_slow[2] = {1000, 10700};
+static int R_fast[2] = {14000, 33000};
+/*
+ * To improve readability, a conversion function is used to initialize the
+ * following arrays, which entails that they can be initialized only in a
+ * function.
+ */
+static int T_slow[2];
+static int T_fast[2];
+static int device_speed_thresh[2];
+
+#define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0])
+#define RQ_BFQQ(rq) ((rq)->elv.priv[1])
+
+struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync)
+{
+ return bic->bfqq[is_sync];
+}
+
+void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync)
+{
+ bic->bfqq[is_sync] = bfqq;
+}
+
+struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic)
+{
+ return bic->icq.q->elevator->elevator_data;
+}
+
+/**
+ * icq_to_bic - convert iocontext queue structure to bfq_io_cq.
+ * @icq: the iocontext queue.
+ */
+static struct bfq_io_cq *icq_to_bic(struct io_cq *icq)
+{
+ /* bic->icq is the first member, %NULL will convert to %NULL */
+ return container_of(icq, struct bfq_io_cq, icq);
+}
+
+/**
+ * bfq_bic_lookup - search into @ioc a bic associated to @bfqd.
+ * @bfqd: the lookup key.
+ * @ioc: the io_context of the process doing I/O.
+ * @q: the request queue.
+ */
+static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,
+ struct io_context *ioc,
+ struct request_queue *q)
+{
+ if (ioc) {
+ unsigned long flags;
+ struct bfq_io_cq *icq;
+
+ spin_lock_irqsave(q->queue_lock, flags);
+ icq = icq_to_bic(ioc_lookup_icq(ioc, q));
+ spin_unlock_irqrestore(q->queue_lock, flags);
+
+ return icq;
+ }
+
+ return NULL;
+}
+
+/*
+ * Scheduler run of queue, if there are requests pending and no one in the
+ * driver that will restart queueing.
+ */
+void bfq_schedule_dispatch(struct bfq_data *bfqd)
+{
+ if (bfqd->queued != 0) {
+ bfq_log(bfqd, "schedule dispatch");
+ blk_mq_run_hw_queues(bfqd->queue, true);
+ }
+}
+
+#define bfq_class_idle(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
+#define bfq_class_rt(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_RT)
+
+#define bfq_sample_valid(samples) ((samples) > 80)
+
+/*
+ * Lifted from AS - choose which of rq1 and rq2 that is best served now.
+ * We choose the request that is closesr to the head right now. Distance
+ * behind the head is penalized and only allowed to a certain extent.
+ */
+static struct request *bfq_choose_req(struct bfq_data *bfqd,
+ struct request *rq1,
+ struct request *rq2,
+ sector_t last)
+{
+ sector_t s1, s2, d1 = 0, d2 = 0;
+ unsigned long back_max;
+#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */
+#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */
+ unsigned int wrap = 0; /* bit mask: requests behind the disk head? */
+
+ if (!rq1 || rq1 == rq2)
+ return rq2;
+ if (!rq2)
+ return rq1;
+
+ if (rq_is_sync(rq1) && !rq_is_sync(rq2))
+ return rq1;
+ else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
+ return rq2;
+ if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META))
+ return rq1;
+ else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META))
+ return rq2;
+
+ s1 = blk_rq_pos(rq1);
+ s2 = blk_rq_pos(rq2);
+
+ /*
+ * By definition, 1KiB is 2 sectors.
+ */
+ back_max = bfqd->bfq_back_max * 2;
+
+ /*
+ * Strict one way elevator _except_ in the case where we allow
+ * short backward seeks which are biased as twice the cost of a
+ * similar forward seek.
+ */
+ if (s1 >= last)
+ d1 = s1 - last;
+ else if (s1 + back_max >= last)
+ d1 = (last - s1) * bfqd->bfq_back_penalty;
+ else
+ wrap |= BFQ_RQ1_WRAP;
+
+ if (s2 >= last)
+ d2 = s2 - last;
+ else if (s2 + back_max >= last)
+ d2 = (last - s2) * bfqd->bfq_back_penalty;
+ else
+ wrap |= BFQ_RQ2_WRAP;
+
+ /* Found required data */
+
+ /*
+ * By doing switch() on the bit mask "wrap" we avoid having to
+ * check two variables for all permutations: --> faster!
+ */
+ switch (wrap) {
+ case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
+ if (d1 < d2)
+ return rq1;
+ else if (d2 < d1)
+ return rq2;
+
+ if (s1 >= s2)
+ return rq1;
+ else
+ return rq2;
+
+ case BFQ_RQ2_WRAP:
+ return rq1;
+ case BFQ_RQ1_WRAP:
+ return rq2;
+ case BFQ_RQ1_WRAP|BFQ_RQ2_WRAP: /* both rqs wrapped */
+ default:
+ /*
+ * Since both rqs are wrapped,
+ * start with the one that's further behind head
+ * (--> only *one* back seek required),
+ * since back seek takes more time than forward.
+ */
+ if (s1 <= s2)
+ return rq1;
+ else
+ return rq2;
+ }
+}
+
+static struct bfq_queue *
+bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root,
+ sector_t sector, struct rb_node **ret_parent,
+ struct rb_node ***rb_link)
+{
+ struct rb_node **p, *parent;
+ struct bfq_queue *bfqq = NULL;
+
+ parent = NULL;
+ p = &root->rb_node;
+ while (*p) {
+ struct rb_node **n;
+
+ parent = *p;
+ bfqq = rb_entry(parent, struct bfq_queue, pos_node);
+
+ /*
+ * Sort strictly based on sector. Smallest to the left,
+ * largest to the right.
+ */
+ if (sector > blk_rq_pos(bfqq->next_rq))
+ n = &(*p)->rb_right;
+ else if (sector < blk_rq_pos(bfqq->next_rq))
+ n = &(*p)->rb_left;
+ else
+ break;
+ p = n;
+ bfqq = NULL;
+ }
+
+ *ret_parent = parent;
+ if (rb_link)
+ *rb_link = p;
+
+ bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d",
+ (unsigned long long)sector,
+ bfqq ? bfqq->pid : 0);
+
+ return bfqq;
+}
+
+void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+ struct rb_node **p, *parent;
+ struct bfq_queue *__bfqq;
+
+ if (bfqq->pos_root) {
+ rb_erase(&bfqq->pos_node, bfqq->pos_root);
+ bfqq->pos_root = NULL;
+ }
+
+ if (bfq_class_idle(bfqq))
+ return;
+ if (!bfqq->next_rq)
+ return;
+
+ bfqq->pos_root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree;
+ __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root,
+ blk_rq_pos(bfqq->next_rq), &parent, &p);
+ if (!__bfqq) {
+ rb_link_node(&bfqq->pos_node, parent, p);
+ rb_insert_color(&bfqq->pos_node, bfqq->pos_root);
+ } else
+ bfqq->pos_root = NULL;
+}
+
+/*
+ * Tell whether there are active queues or groups with differentiated weights.
+ */
+static bool bfq_differentiated_weights(struct bfq_data *bfqd)
+{
+ /*
+ * For weights to differ, at least one of the trees must contain
+ * at least two nodes.
+ */
+ return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) &&
+ (bfqd->queue_weights_tree.rb_node->rb_left ||
+ bfqd->queue_weights_tree.rb_node->rb_right)
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+ ) ||
+ (!RB_EMPTY_ROOT(&bfqd->group_weights_tree) &&
+ (bfqd->group_weights_tree.rb_node->rb_left ||
+ bfqd->group_weights_tree.rb_node->rb_right)
+#endif
+ );
+}
+
+/*
+ * The following function returns true if every queue must receive the
+ * same share of the throughput (this condition is used when deciding
+ * whether idling may be disabled, see the comments in the function
+ * bfq_bfqq_may_idle()).
+ *
+ * Such a scenario occurs when:
+ * 1) all active queues have the same weight,
+ * 2) all active groups at the same level in the groups tree have the same
+ * weight,
+ * 3) all active groups at the same level in the groups tree have the same
+ * number of children.
+ *
+ * Unfortunately, keeping the necessary state for evaluating exactly the
+ * above symmetry conditions would be quite complex and time-consuming.
+ * Therefore this function evaluates, instead, the following stronger
+ * sub-conditions, for which it is much easier to maintain the needed
+ * state:
+ * 1) all active queues have the same weight,
+ * 2) all active groups have the same weight,
+ * 3) all active groups have at most one active child each.
+ * In particular, the last two conditions are always true if hierarchical
+ * support and the cgroups interface are not enabled, thus no state needs
+ * to be maintained in this case.
+ */
+static bool bfq_symmetric_scenario(struct bfq_data *bfqd)
+{
+ return !bfq_differentiated_weights(bfqd);
+}
+
+/*
+ * If the weight-counter tree passed as input contains no counter for
+ * the weight of the input entity, then add that counter; otherwise just
+ * increment the existing counter.
+ *
+ * Note that weight-counter trees contain few nodes in mostly symmetric
+ * scenarios. For example, if all queues have the same weight, then the
+ * weight-counter tree for the queues may contain at most one node.
+ * This holds even if low_latency is on, because weight-raised queues
+ * are not inserted in the tree.
+ * In most scenarios, the rate at which nodes are created/destroyed
+ * should be low too.
+ */
+void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_entity *entity,
+ struct rb_root *root)
+{
+ struct rb_node **new = &(root->rb_node), *parent = NULL;
+
+ /*
+ * Do not insert if the entity is already associated with a
+ * counter, which happens if:
+ * 1) the entity is associated with a queue,
+ * 2) a request arrival has caused the queue to become both
+ * non-weight-raised, and hence change its weight, and
+ * backlogged; in this respect, each of the two events
+ * causes an invocation of this function,
+ * 3) this is the invocation of this function caused by the
+ * second event. This second invocation is actually useless,
+ * and we handle this fact by exiting immediately. More
+ * efficient or clearer solutions might possibly be adopted.
+ */
+ if (entity->weight_counter)
+ return;
+
+ while (*new) {
+ struct bfq_weight_counter *__counter = container_of(*new,
+ struct bfq_weight_counter,
+ weights_node);
+ parent = *new;
+
+ if (entity->weight == __counter->weight) {
+ entity->weight_counter = __counter;
+ goto inc_counter;
+ }
+ if (entity->weight < __counter->weight)
+ new = &((*new)->rb_left);
+ else
+ new = &((*new)->rb_right);
+ }
+
+ entity->weight_counter = kzalloc(sizeof(struct bfq_weight_counter),
+ GFP_ATOMIC);
+
+ /*
+ * In the unlucky event of an allocation failure, we just
+ * exit. This will cause the weight of entity to not be
+ * considered in bfq_differentiated_weights, which, in its
+ * turn, causes the scenario to be deemed wrongly symmetric in
+ * case entity's weight would have been the only weight making
+ * the scenario asymmetric. On the bright side, no unbalance
+ * will however occur when entity becomes inactive again (the
+ * invocation of this function is triggered by an activation
+ * of entity). In fact, bfq_weights_tree_remove does nothing
+ * if !entity->weight_counter.
+ */
+ if (unlikely(!entity->weight_counter))
+ return;
+
+ entity->weight_counter->weight = entity->weight;
+ rb_link_node(&entity->weight_counter->weights_node, parent, new);
+ rb_insert_color(&entity->weight_counter->weights_node, root);
+
+inc_counter:
+ entity->weight_counter->num_active++;
+}
+
+/*
+ * Decrement the weight counter associated with the entity, and, if the
+ * counter reaches 0, remove the counter from the tree.
+ * See the comments to the function bfq_weights_tree_add() for considerations
+ * about overhead.
+ */
+void bfq_weights_tree_remove(struct bfq_data *bfqd, struct bfq_entity *entity,
+ struct rb_root *root)
+{
+ if (!entity->weight_counter)
+ return;
+
+ entity->weight_counter->num_active--;
+ if (entity->weight_counter->num_active > 0)
+ goto reset_entity_pointer;
+
+ rb_erase(&entity->weight_counter->weights_node, root);
+ kfree(entity->weight_counter);
+
+reset_entity_pointer:
+ entity->weight_counter = NULL;
+}
+
+/*
+ * Return expired entry, or NULL to just start from scratch in rbtree.
+ */
+static struct request *bfq_check_fifo(struct bfq_queue *bfqq,
+ struct request *last)
+{
+ struct request *rq;
+
+ if (bfq_bfqq_fifo_expire(bfqq))
+ return NULL;
+
+ bfq_mark_bfqq_fifo_expire(bfqq);
+
+ rq = rq_entry_fifo(bfqq->fifo.next);
+
+ if (rq == last || ktime_get_ns() < rq->fifo_time)
+ return NULL;
+
+ bfq_log_bfqq(bfqq->bfqd, bfqq, "check_fifo: returned %p", rq);
+ return rq;
+}
+
+static struct request *bfq_find_next_rq(struct bfq_data *bfqd,
+ struct bfq_queue *bfqq,
+ struct request *last)
+{
+ struct rb_node *rbnext = rb_next(&last->rb_node);
+ struct rb_node *rbprev = rb_prev(&last->rb_node);
+ struct request *next, *prev = NULL;
+
+ /* Follow expired path, else get first next available. */
+ next = bfq_check_fifo(bfqq, last);
+ if (next)
+ return next;
+
+ if (rbprev)
+ prev = rb_entry_rq(rbprev);
+
+ if (rbnext)
+ next = rb_entry_rq(rbnext);
+ else {
+ rbnext = rb_first(&bfqq->sort_list);
+ if (rbnext && rbnext != &last->rb_node)
+ next = rb_entry_rq(rbnext);
+ }
+
+ return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last));
+}
+
+/* see the definition of bfq_async_charge_factor for details */
+static unsigned long bfq_serv_to_charge(struct request *rq,
+ struct bfq_queue *bfqq)
+{
+ if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1)
+ return blk_rq_sectors(rq);
+
+ /*
+ * If there are no weight-raised queues, then amplify service
+ * by just the async charge factor; otherwise amplify service
+ * by twice the async charge factor, to further reduce latency
+ * for weight-raised queues.
+ */
+ if (bfqq->bfqd->wr_busy_queues == 0)
+ return blk_rq_sectors(rq) * bfq_async_charge_factor;
+
+ return blk_rq_sectors(rq) * 2 * bfq_async_charge_factor;
+}
+
+/**
+ * bfq_updated_next_req - update the queue after a new next_rq selection.
+ * @bfqd: the device data the queue belongs to.
+ * @bfqq: the queue to update.
+ *
+ * If the first request of a queue changes we make sure that the queue
+ * has enough budget to serve at least its first request (if the
+ * request has grown). We do this because if the queue has not enough
+ * budget for its first request, it has to go through two dispatch
+ * rounds to actually get it dispatched.
+ */
+static void bfq_updated_next_req(struct bfq_data *bfqd,
+ struct bfq_queue *bfqq)
+{
+ struct bfq_entity *entity = &bfqq->entity;
+ struct request *next_rq = bfqq->next_rq;
+ unsigned long new_budget;
+
+ if (!next_rq)
+ return;
+
+ if (bfqq == bfqd->in_service_queue)
+ /*
+ * In order not to break guarantees, budgets cannot be
+ * changed after an entity has been selected.
+ */
+ return;
+
+ new_budget = max_t(unsigned long, bfqq->max_budget,
+ bfq_serv_to_charge(next_rq, bfqq));
+ if (entity->budget != new_budget) {
+ entity->budget = new_budget;
+ bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu",
+ new_budget);
+ bfq_requeue_bfqq(bfqd, bfqq);
+ }
+}
+
+static void
+bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
+{
+ if (bic->saved_idle_window)
+ bfq_mark_bfqq_idle_window(bfqq);
+ else
+ bfq_clear_bfqq_idle_window(bfqq);
+
+ if (bic->saved_IO_bound)
+ bfq_mark_bfqq_IO_bound(bfqq);
+ else
+ bfq_clear_bfqq_IO_bound(bfqq);
+
+ bfqq->ttime = bic->saved_ttime;
+ bfqq->wr_coeff = bic->saved_wr_coeff;
+ bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt;
+ bfqq->last_wr_start_finish = bic->saved_last_wr_start_finish;
+ bfqq->wr_cur_max_time = bic->saved_wr_cur_max_time;
+
+ if (bfqq->wr_coeff > 1 && (bfq_bfqq_in_large_burst(bfqq) ||
+ time_is_before_jiffies(bfqq->last_wr_start_finish +
+ bfqq->wr_cur_max_time))) {
+ bfq_log_bfqq(bfqq->bfqd, bfqq,
+ "resume state: switching off wr");
+
+ bfqq->wr_coeff = 1;
+ }
+
+ /* make sure weight will be updated, however we got here */
+ bfqq->entity.prio_changed = 1;
+}
+
+static int bfqq_process_refs(struct bfq_queue *bfqq)
+{
+ return bfqq->ref - bfqq->allocated - bfqq->entity.on_st;
+}
+
+/* Empty burst list and add just bfqq (see comments on bfq_handle_burst) */
+static void bfq_reset_burst_list(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+ struct bfq_queue *item;
+ struct hlist_node *n;
+
+ hlist_for_each_entry_safe(item, n, &bfqd->burst_list, burst_list_node)
+ hlist_del_init(&item->burst_list_node);
+ hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list);
+ bfqd->burst_size = 1;
+ bfqd->burst_parent_entity = bfqq->entity.parent;
+}
+
+/* Add bfqq to the list of queues in current burst (see bfq_handle_burst) */
+static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+ /* Increment burst size to take into account also bfqq */
+ bfqd->burst_size++;
+
+ if (bfqd->burst_size == bfqd->bfq_large_burst_thresh) {
+ struct bfq_queue *pos, *bfqq_item;
+ struct hlist_node *n;
+
+ /*
+ * Enough queues have been activated shortly after each
+ * other to consider this burst as large.
+ */
+ bfqd->large_burst = true;
+
+ /*
+ * We can now mark all queues in the burst list as
+ * belonging to a large burst.
+ */
+ hlist_for_each_entry(bfqq_item, &bfqd->burst_list,
+ burst_list_node)
+ bfq_mark_bfqq_in_large_burst(bfqq_item);
+ bfq_mark_bfqq_in_large_burst(bfqq);
+
+ /*
+ * From now on, and until the current burst finishes, any
+ * new queue being activated shortly after the last queue
+ * was inserted in the burst can be immediately marked as
+ * belonging to a large burst. So the burst list is not
+ * needed any more. Remove it.
+ */
+ hlist_for_each_entry_safe(pos, n, &bfqd->burst_list,
+ burst_list_node)
+ hlist_del_init(&pos->burst_list_node);
+ } else /*
+ * Burst not yet large: add bfqq to the burst list. Do
+ * not increment the ref counter for bfqq, because bfqq
+ * is removed from the burst list before freeing bfqq
+ * in put_queue.
+ */
+ hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list);
+}
+
+/*
+ * If many queues belonging to the same group happen to be created
+ * shortly after each other, then the processes associated with these
+ * queues have typically a common goal. In particular, bursts of queue
+ * creations are usually caused by services or applications that spawn
+ * many parallel threads/processes. Examples are systemd during boot,
+ * or git grep. To help these processes get their job done as soon as
+ * possible, it is usually better to not grant either weight-raising
+ * or device idling to their queues.
+ *
+ * In this comment we describe, firstly, the reasons why this fact
+ * holds, and, secondly, the next function, which implements the main
+ * steps needed to properly mark these queues so that they can then be
+ * treated in a different way.
+ *
+ * The above services or applications benefit mostly from a high
+ * throughput: the quicker the requests of the activated queues are
+ * cumulatively served, the sooner the target job of these queues gets
+ * completed. As a consequence, weight-raising any of these queues,
+ * which also implies idling the device for it, is almost always
+ * counterproductive. In most cases it just lowers throughput.
+ *
+ * On the other hand, a burst of queue creations may be caused also by
+ * the start of an application that does not consist of a lot of
+ * parallel I/O-bound threads. In fact, with a complex application,
+ * several short processes may need to be executed to start-up the
+ * application. In this respect, to start an application as quickly as
+ * possible, the best thing to do is in any case to privilege the I/O
+ * related to the application with respect to all other
+ * I/O. Therefore, the best strategy to start as quickly as possible
+ * an application that causes a burst of queue creations is to
+ * weight-raise all the queues created during the burst. This is the
+ * exact opposite of the best strategy for the other type of bursts.
+ *
+ * In the end, to take the best action for each of the two cases, the
+ * two types of bursts need to be distinguished. Fortunately, this
+ * seems relatively easy, by looking at the sizes of the bursts. In
+ * particular, we found a threshold such that only bursts with a
+ * larger size than that threshold are apparently caused by
+ * services or commands such as systemd or git grep. For brevity,
+ * hereafter we call just 'large' these bursts. BFQ *does not*
+ * weight-raise queues whose creation occurs in a large burst. In
+ * addition, for each of these queues BFQ performs or does not perform
+ * idling depending on which choice boosts the throughput more. The
+ * exact choice depends on the device and request pattern at
+ * hand.
+ *
+ * Unfortunately, false positives may occur while an interactive task
+ * is starting (e.g., an application is being started). The
+ * consequence is that the queues associated with the task do not
+ * enjoy weight raising as expected. Fortunately these false positives
+ * are very rare. They typically occur if some service happens to
+ * start doing I/O exactly when the interactive task starts.
+ *
+ * Turning back to the next function, it implements all the steps
+ * needed to detect the occurrence of a large burst and to properly
+ * mark all the queues belonging to it (so that they can then be
+ * treated in a different way). This goal is achieved by maintaining a
+ * "burst list" that holds, temporarily, the queues that belong to the
+ * burst in progress. The list is then used to mark these queues as
+ * belonging to a large burst if the burst does become large. The main
+ * steps are the following.
+ *
+ * . when the very first queue is created, the queue is inserted into the
+ * list (as it could be the first queue in a possible burst)
+ *
+ * . if the current burst has not yet become large, and a queue Q that does
+ * not yet belong to the burst is activated shortly after the last time
+ * at which a new queue entered the burst list, then the function appends
+ * Q to the burst list
+ *
+ * . if, as a consequence of the previous step, the burst size reaches
+ * the large-burst threshold, then
+ *
+ * . all the queues in the burst list are marked as belonging to a
+ * large burst
+ *
+ * . the burst list is deleted; in fact, the burst list already served
+ * its purpose (keeping temporarily track of the queues in a burst,
+ * so as to be able to mark them as belonging to a large burst in the
+ * previous sub-step), and now is not needed any more
+ *
+ * . the device enters a large-burst mode
+ *
+ * . if a queue Q that does not belong to the burst is created while
+ * the device is in large-burst mode and shortly after the last time
+ * at which a queue either entered the burst list or was marked as
+ * belonging to the current large burst, then Q is immediately marked
+ * as belonging to a large burst.
+ *
+ * . if a queue Q that does not belong to the burst is created a while
+ * later, i.e., not shortly after, than the last time at which a queue
+ * either entered the burst list or was marked as belonging to the
+ * current large burst, then the current burst is deemed as finished and:
+ *
+ * . the large-burst mode is reset if set
+ *
+ * . the burst list is emptied
+ *
+ * . Q is inserted in the burst list, as Q may be the first queue
+ * in a possible new burst (then the burst list contains just Q
+ * after this step).
+ */
+static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+ /*
+ * If bfqq is already in the burst list or is part of a large
+ * burst, or finally has just been split, then there is
+ * nothing else to do.
+ */
+ if (!hlist_unhashed(&bfqq->burst_list_node) ||
+ bfq_bfqq_in_large_burst(bfqq) ||
+ time_is_after_eq_jiffies(bfqq->split_time +
+ msecs_to_jiffies(10)))
+ return;
+
+ /*
+ * If bfqq's creation happens late enough, or bfqq belongs to
+ * a different group than the burst group, then the current
+ * burst is finished, and related data structures must be
+ * reset.
+ *
+ * In this respect, consider the special case where bfqq is
+ * the very first queue created after BFQ is selected for this
+ * device. In this case, last_ins_in_burst and
+ * burst_parent_entity are not yet significant when we get
+ * here. But it is easy to verify that, whether or not the
+ * following condition is true, bfqq will end up being
+ * inserted into the burst list. In particular the list will
+ * happen to contain only bfqq. And this is exactly what has
+ * to happen, as bfqq may be the first queue of the first
+ * burst.
+ */
+ if (time_is_before_jiffies(bfqd->last_ins_in_burst +
+ bfqd->bfq_burst_interval) ||
+ bfqq->entity.parent != bfqd->burst_parent_entity) {
+ bfqd->large_burst = false;
+ bfq_reset_burst_list(bfqd, bfqq);
+ goto end;
+ }
+
+ /*
+ * If we get here, then bfqq is being activated shortly after the
+ * last queue. So, if the current burst is also large, we can mark
+ * bfqq as belonging to this large burst immediately.
+ */
+ if (bfqd->large_burst) {
+ bfq_mark_bfqq_in_large_burst(bfqq);
+ goto end;
+ }
+
+ /*
+ * If we get here, then a large-burst state has not yet been
+ * reached, but bfqq is being activated shortly after the last
+ * queue. Then we add bfqq to the burst.
+ */
+ bfq_add_to_burst(bfqd, bfqq);
+end:
+ /*
+ * At this point, bfqq either has been added to the current
+ * burst or has caused the current burst to terminate and a
+ * possible new burst to start. In particular, in the second
+ * case, bfqq has become the first queue in the possible new
+ * burst. In both cases last_ins_in_burst needs to be moved
+ * forward.
+ */
+ bfqd->last_ins_in_burst = jiffies;
+}
+
+static int bfq_bfqq_budget_left(struct bfq_queue *bfqq)
+{
+ struct bfq_entity *entity = &bfqq->entity;
+
+ return entity->budget - entity->service;
+}
+
+/*
+ * If enough samples have been computed, return the current max budget
+ * stored in bfqd, which is dynamically updated according to the
+ * estimated disk peak rate; otherwise return the default max budget
+ */
+static int bfq_max_budget(struct bfq_data *bfqd)
+{
+ if (bfqd->budgets_assigned < bfq_stats_min_budgets)
+ return bfq_default_max_budget;
+ else
+ return bfqd->bfq_max_budget;
+}
+
+/*
+ * Return min budget, which is a fraction of the current or default
+ * max budget (trying with 1/32)
+ */
+static int bfq_min_budget(struct bfq_data *bfqd)
+{
+ if (bfqd->budgets_assigned < bfq_stats_min_budgets)
+ return bfq_default_max_budget / 32;
+ else
+ return bfqd->bfq_max_budget / 32;
+}
+
+/*
+ * The next function, invoked after the input queue bfqq switches from
+ * idle to busy, updates the budget of bfqq. The function also tells
+ * whether the in-service queue should be expired, by returning
+ * true. The purpose of expiring the in-service queue is to give bfqq
+ * the chance to possibly preempt the in-service queue, and the reason
+ * for preempting the in-service queue is to achieve one of the two
+ * goals below.
+ *
+ * 1. Guarantee to bfqq its reserved bandwidth even if bfqq has
+ * expired because it has remained idle. In particular, bfqq may have
+ * expired for one of the following two reasons:
+ *
+ * - BFQQE_NO_MORE_REQUESTS bfqq did not enjoy any device idling
+ * and did not make it to issue a new request before its last
+ * request was served;
+ *
+ * - BFQQE_TOO_IDLE bfqq did enjoy device idling, but did not issue
+ * a new request before the expiration of the idling-time.
+ *
+ * Even if bfqq has expired for one of the above reasons, the process
+ * associated with the queue may be however issuing requests greedily,
+ * and thus be sensitive to the bandwidth it receives (bfqq may have
+ * remained idle for other reasons: CPU high load, bfqq not enjoying
+ * idling, I/O throttling somewhere in the path from the process to
+ * the I/O scheduler, ...). But if, after every expiration for one of
+ * the above two reasons, bfqq has to wait for the service of at least
+ * one full budget of another queue before being served again, then
+ * bfqq is likely to get a much lower bandwidth or resource time than
+ * its reserved ones. To address this issue, two countermeasures need
+ * to be taken.
+ *
+ * First, the budget and the timestamps of bfqq need to be updated in
+ * a special way on bfqq reactivation: they need to be updated as if
+ * bfqq did not remain idle and did not expire. In fact, if they are
+ * computed as if bfqq expired and remained idle until reactivation,
+ * then the process associated with bfqq is treated as if, instead of
+ * being greedy, it stopped issuing requests when bfqq remained idle,
+ * and restarts issuing requests only on this reactivation. In other
+ * words, the scheduler does not help the process recover the "service
+ * hole" between bfqq expiration and reactivation. As a consequence,
+ * the process receives a lower bandwidth than its reserved one. In
+ * contrast, to recover this hole, the budget must be updated as if
+ * bfqq was not expired at all before this reactivation, i.e., it must
+ * be set to the value of the remaining budget when bfqq was
+ * expired. Along the same line, timestamps need to be assigned the
+ * value they had the last time bfqq was selected for service, i.e.,
+ * before last expiration. Thus timestamps need to be back-shifted
+ * with respect to their normal computation (see [1] for more details
+ * on this tricky aspect).
+ *
+ * Secondly, to allow the process to recover the hole, the in-service
+ * queue must be expired too, to give bfqq the chance to preempt it
+ * immediately. In fact, if bfqq has to wait for a full budget of the
+ * in-service queue to be completed, then it may become impossible to
+ * let the process recover the hole, even if the back-shifted
+ * timestamps of bfqq are lower than those of the in-service queue. If
+ * this happens for most or all of the holes, then the process may not
+ * receive its reserved bandwidth. In this respect, it is worth noting
+ * that, being the service of outstanding requests unpreemptible, a
+ * little fraction of the holes may however be unrecoverable, thereby
+ * causing a little loss of bandwidth.
+ *
+ * The last important point is detecting whether bfqq does need this
+ * bandwidth recovery. In this respect, the next function deems the
+ * process associated with bfqq greedy, and thus allows it to recover
+ * the hole, if: 1) the process is waiting for the arrival of a new
+ * request (which implies that bfqq expired for one of the above two
+ * reasons), and 2) such a request has arrived soon. The first
+ * condition is controlled through the flag non_blocking_wait_rq,
+ * while the second through the flag arrived_in_time. If both
+ * conditions hold, then the function computes the budget in the
+ * above-described special way, and signals that the in-service queue
+ * should be expired. Timestamp back-shifting is done later in
+ * __bfq_activate_entity.
+ *
+ * 2. Reduce latency. Even if timestamps are not backshifted to let
+ * the process associated with bfqq recover a service hole, bfqq may
+ * however happen to have, after being (re)activated, a lower finish
+ * timestamp than the in-service queue. That is, the next budget of
+ * bfqq may have to be completed before the one of the in-service
+ * queue. If this is the case, then preempting the in-service queue
+ * allows this goal to be achieved, apart from the unpreemptible,
+ * outstanding requests mentioned above.
+ *
+ * Unfortunately, regardless of which of the above two goals one wants
+ * to achieve, service trees need first to be updated to know whether
+ * the in-service queue must be preempted. To have service trees
+ * correctly updated, the in-service queue must be expired and
+ * rescheduled, and bfqq must be scheduled too. This is one of the
+ * most costly operations (in future versions, the scheduling
+ * mechanism may be re-designed in such a way to make it possible to
+ * know whether preemption is needed without needing to update service
+ * trees). In addition, queue preemptions almost always cause random
+ * I/O, and thus loss of throughput. Because of these facts, the next
+ * function adopts the following simple scheme to avoid both costly
+ * operations and too frequent preemptions: it requests the expiration
+ * of the in-service queue (unconditionally) only for queues that need
+ * to recover a hole, or that either are weight-raised or deserve to
+ * be weight-raised.
+ */
+static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd,
+ struct bfq_queue *bfqq,
+ bool arrived_in_time,
+ bool wr_or_deserves_wr)
+{
+ struct bfq_entity *entity = &bfqq->entity;
+
+ if (bfq_bfqq_non_blocking_wait_rq(bfqq) && arrived_in_time) {
+ /*
+ * We do not clear the flag non_blocking_wait_rq here, as
+ * the latter is used in bfq_activate_bfqq to signal
+ * that timestamps need to be back-shifted (and is
+ * cleared right after).
+ */
+
+ /*
+ * In next assignment we rely on that either
+ * entity->service or entity->budget are not updated
+ * on expiration if bfqq is empty (see
+ * __bfq_bfqq_recalc_budget). Thus both quantities
+ * remain unchanged after such an expiration, and the
+ * following statement therefore assigns to
+ * entity->budget the remaining budget on such an
+ * expiration. For clarity, entity->service is not
+ * updated on expiration in any case, and, in normal
+ * operation, is reset only when bfqq is selected for
+ * service (see bfq_get_next_queue).
+ */
+ entity->budget = min_t(unsigned long,
+ bfq_bfqq_budget_left(bfqq),
+ bfqq->max_budget);
+
+ return true;
+ }
+
+ entity->budget = max_t(unsigned long, bfqq->max_budget,
+ bfq_serv_to_charge(bfqq->next_rq, bfqq));
+ bfq_clear_bfqq_non_blocking_wait_rq(bfqq);
+ return wr_or_deserves_wr;
+}
+
+static unsigned int bfq_wr_duration(struct bfq_data *bfqd)
+{
+ u64 dur;
+
+ if (bfqd->bfq_wr_max_time > 0)
+ return bfqd->bfq_wr_max_time;
+
+ dur = bfqd->RT_prod;
+ do_div(dur, bfqd->peak_rate);
+
+ /*
+ * Limit duration between 3 and 13 seconds. Tests show that
+ * higher values than 13 seconds often yield the opposite of
+ * the desired result, i.e., worsen responsiveness by letting
+ * non-interactive and non-soft-real-time applications
+ * preserve weight raising for a too long time interval.
+ *
+ * On the other end, lower values than 3 seconds make it
+ * difficult for most interactive tasks to complete their jobs
+ * before weight-raising finishes.
+ */
+ if (dur > msecs_to_jiffies(13000))
+ dur = msecs_to_jiffies(13000);
+ else if (dur < msecs_to_jiffies(3000))
+ dur = msecs_to_jiffies(3000);
+
+ return dur;
+}
+
+static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd,
+ struct bfq_queue *bfqq,
+ unsigned int old_wr_coeff,
+ bool wr_or_deserves_wr,
+ bool interactive,
+ bool in_burst,
+ bool soft_rt)
+{
+ if (old_wr_coeff == 1 && wr_or_deserves_wr) {
+ /* start a weight-raising period */
+ if (interactive) {
+ bfqq->wr_coeff = bfqd->bfq_wr_coeff;
+ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
+ } else {
+ bfqq->wr_start_at_switch_to_srt = jiffies;
+ bfqq->wr_coeff = bfqd->bfq_wr_coeff *
+ BFQ_SOFTRT_WEIGHT_FACTOR;
+ bfqq->wr_cur_max_time =
+ bfqd->bfq_wr_rt_max_time;
+ }
+
+ /*
+ * If needed, further reduce budget to make sure it is
+ * close to bfqq's backlog, so as to reduce the
+ * scheduling-error component due to a too large
+ * budget. Do not care about throughput consequences,
+ * but only about latency. Finally, do not assign a
+ * too small budget either, to avoid increasing
+ * latency by causing too frequent expirations.
+ */
+ bfqq->entity.budget = min_t(unsigned long,
+ bfqq->entity.budget,
+ 2 * bfq_min_budget(bfqd));
+ } else if (old_wr_coeff > 1) {
+ if (interactive) { /* update wr coeff and duration */
+ bfqq->wr_coeff = bfqd->bfq_wr_coeff;
+ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
+ } else if (in_burst)
+ bfqq->wr_coeff = 1;
+ else if (soft_rt) {
+ /*
+ * The application is now or still meeting the
+ * requirements for being deemed soft rt. We
+ * can then correctly and safely (re)charge
+ * the weight-raising duration for the
+ * application with the weight-raising
+ * duration for soft rt applications.
+ *
+ * In particular, doing this recharge now, i.e.,
+ * before the weight-raising period for the
+ * application finishes, reduces the probability
+ * of the following negative scenario:
+ * 1) the weight of a soft rt application is
+ * raised at startup (as for any newly
+ * created application),
+ * 2) since the application is not interactive,
+ * at a certain time weight-raising is
+ * stopped for the application,
+ * 3) at that time the application happens to
+ * still have pending requests, and hence
+ * is destined to not have a chance to be
+ * deemed soft rt before these requests are
+ * completed (see the comments to the
+ * function bfq_bfqq_softrt_next_start()
+ * for details on soft rt detection),
+ * 4) these pending requests experience a high
+ * latency because the application is not
+ * weight-raised while they are pending.
+ */
+ if (bfqq->wr_cur_max_time !=
+ bfqd->bfq_wr_rt_max_time) {
+ bfqq->wr_start_at_switch_to_srt =
+ bfqq->last_wr_start_finish;
+
+ bfqq->wr_cur_max_time =
+ bfqd->bfq_wr_rt_max_time;
+ bfqq->wr_coeff = bfqd->bfq_wr_coeff *
+ BFQ_SOFTRT_WEIGHT_FACTOR;
+ }
+ bfqq->last_wr_start_finish = jiffies;
+ }
+ }
+}
+
+static bool bfq_bfqq_idle_for_long_time(struct bfq_data *bfqd,
+ struct bfq_queue *bfqq)
+{
+ return bfqq->dispatched == 0 &&
+ time_is_before_jiffies(
+ bfqq->budget_timeout +
+ bfqd->bfq_wr_min_idle_time);
+}
+
+static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
+ struct bfq_queue *bfqq,
+ int old_wr_coeff,
+ struct request *rq,
+ bool *interactive)
+{
+ bool soft_rt, in_burst, wr_or_deserves_wr,
+ bfqq_wants_to_preempt,
+ idle_for_long_time = bfq_bfqq_idle_for_long_time(bfqd, bfqq),
+ /*
+ * See the comments on
+ * bfq_bfqq_update_budg_for_activation for
+ * details on the usage of the next variable.
+ */
+ arrived_in_time = ktime_get_ns() <=
+ bfqq->ttime.last_end_request +
+ bfqd->bfq_slice_idle * 3;
+
+ bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq, rq->cmd_flags);
+
+ /*
+ * bfqq deserves to be weight-raised if:
+ * - it is sync,
+ * - it does not belong to a large burst,
+ * - it has been idle for enough time or is soft real-time,
+ * - is linked to a bfq_io_cq (it is not shared in any sense).
+ */
+ in_burst = bfq_bfqq_in_large_burst(bfqq);
+ soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 &&
+ !in_burst &&
+ time_is_before_jiffies(bfqq->soft_rt_next_start);
+ *interactive = !in_burst && idle_for_long_time;
+ wr_or_deserves_wr = bfqd->low_latency &&
+ (bfqq->wr_coeff > 1 ||
+ (bfq_bfqq_sync(bfqq) &&
+ bfqq->bic && (*interactive || soft_rt)));
+
+ /*
+ * Using the last flag, update budget and check whether bfqq
+ * may want to preempt the in-service queue.
+ */
+ bfqq_wants_to_preempt =
+ bfq_bfqq_update_budg_for_activation(bfqd, bfqq,
+ arrived_in_time,
+ wr_or_deserves_wr);
+
+ /*
+ * If bfqq happened to be activated in a burst, but has been
+ * idle for much more than an interactive queue, then we
+ * assume that, in the overall I/O initiated in the burst, the
+ * I/O associated with bfqq is finished. So bfqq does not need
+ * to be treated as a queue belonging to a burst
+ * anymore. Accordingly, we reset bfqq's in_large_burst flag
+ * if set, and remove bfqq from the burst list if it's
+ * there. We do not decrement burst_size, because the fact
+ * that bfqq does not need to belong to the burst list any
+ * more does not invalidate the fact that bfqq was created in
+ * a burst.
+ */
+ if (likely(!bfq_bfqq_just_created(bfqq)) &&
+ idle_for_long_time &&
+ time_is_before_jiffies(
+ bfqq->budget_timeout +
+ msecs_to_jiffies(10000))) {
+ hlist_del_init(&bfqq->burst_list_node);
+ bfq_clear_bfqq_in_large_burst(bfqq);
+ }
+
+ bfq_clear_bfqq_just_created(bfqq);
+
+
+ if (!bfq_bfqq_IO_bound(bfqq)) {
+ if (arrived_in_time) {
+ bfqq->requests_within_timer++;
+ if (bfqq->requests_within_timer >=
+ bfqd->bfq_requests_within_timer)
+ bfq_mark_bfqq_IO_bound(bfqq);
+ } else
+ bfqq->requests_within_timer = 0;
+ }
+
+ if (bfqd->low_latency) {
+ if (unlikely(time_is_after_jiffies(bfqq->split_time)))
+ /* wraparound */
+ bfqq->split_time =
+ jiffies - bfqd->bfq_wr_min_idle_time - 1;
+
+ if (time_is_before_jiffies(bfqq->split_time +
+ bfqd->bfq_wr_min_idle_time)) {
+ bfq_update_bfqq_wr_on_rq_arrival(bfqd, bfqq,
+ old_wr_coeff,
+ wr_or_deserves_wr,
+ *interactive,
+ in_burst,
+ soft_rt);
+
+ if (old_wr_coeff != bfqq->wr_coeff)
+ bfqq->entity.prio_changed = 1;
+ }
+ }
+
+ bfqq->last_idle_bklogged = jiffies;
+ bfqq->service_from_backlogged = 0;
+ bfq_clear_bfqq_softrt_update(bfqq);
+
+ bfq_add_bfqq_busy(bfqd, bfqq);
+
+ /*
+ * Expire in-service queue only if preemption may be needed
+ * for guarantees. In this respect, the function
+ * next_queue_may_preempt just checks a simple, necessary
+ * condition, and not a sufficient condition based on
+ * timestamps. In fact, for the latter condition to be
+ * evaluated, timestamps would need first to be updated, and
+ * this operation is quite costly (see the comments on the
+ * function bfq_bfqq_update_budg_for_activation).
+ */
+ if (bfqd->in_service_queue && bfqq_wants_to_preempt &&
+ bfqd->in_service_queue->wr_coeff < bfqq->wr_coeff &&
+ next_queue_may_preempt(bfqd))
+ bfq_bfqq_expire(bfqd, bfqd->in_service_queue,
+ false, BFQQE_PREEMPTED);
+}
+
+static void bfq_add_request(struct request *rq)
+{
+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
+ struct bfq_data *bfqd = bfqq->bfqd;
+ struct request *next_rq, *prev;
+ unsigned int old_wr_coeff = bfqq->wr_coeff;
+ bool interactive = false;
+
+ bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq));
+ bfqq->queued[rq_is_sync(rq)]++;
+ bfqd->queued++;
+
+ elv_rb_add(&bfqq->sort_list, rq);
+
+ /*
+ * Check if this request is a better next-serve candidate.
+ */
+ prev = bfqq->next_rq;
+ next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position);
+ bfqq->next_rq = next_rq;
+
+ /*
+ * Adjust priority tree position, if next_rq changes.
+ */
+ if (prev != bfqq->next_rq)
+ bfq_pos_tree_add_move(bfqd, bfqq);
+
+ if (!bfq_bfqq_busy(bfqq)) /* switching to busy ... */
+ bfq_bfqq_handle_idle_busy_switch(bfqd, bfqq, old_wr_coeff,
+ rq, &interactive);
+ else {
+ if (bfqd->low_latency && old_wr_coeff == 1 && !rq_is_sync(rq) &&
+ time_is_before_jiffies(
+ bfqq->last_wr_start_finish +
+ bfqd->bfq_wr_min_inter_arr_async)) {
+ bfqq->wr_coeff = bfqd->bfq_wr_coeff;
+ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
+
+ bfqd->wr_busy_queues++;
+ bfqq->entity.prio_changed = 1;
+ }
+ if (prev != bfqq->next_rq)
+ bfq_updated_next_req(bfqd, bfqq);
+ }
+
+ /*
+ * Assign jiffies to last_wr_start_finish in the following
+ * cases:
+ *
+ * . if bfqq is not going to be weight-raised, because, for
+ * non weight-raised queues, last_wr_start_finish stores the
+ * arrival time of the last request; as of now, this piece
+ * of information is used only for deciding whether to
+ * weight-raise async queues
+ *
+ * . if bfqq is not weight-raised, because, if bfqq is now
+ * switching to weight-raised, then last_wr_start_finish
+ * stores the time when weight-raising starts
+ *
+ * . if bfqq is interactive, because, regardless of whether
+ * bfqq is currently weight-raised, the weight-raising
+ * period must start or restart (this case is considered
+ * separately because it is not detected by the above
+ * conditions, if bfqq is already weight-raised)
+ *
+ * last_wr_start_finish has to be updated also if bfqq is soft
+ * real-time, because the weight-raising period is constantly
+ * restarted on idle-to-busy transitions for these queues, but
+ * this is already done in bfq_bfqq_handle_idle_busy_switch if
+ * needed.
+ */
+ if (bfqd->low_latency &&
+ (old_wr_coeff == 1 || bfqq->wr_coeff == 1 || interactive))
+ bfqq->last_wr_start_finish = jiffies;
+}
+
+static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd,
+ struct bio *bio,
+ struct request_queue *q)
+{
+ struct bfq_queue *bfqq = bfqd->bio_bfqq;
+
+
+ if (bfqq)
+ return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio));
+
+ return NULL;
+}
+
+static sector_t get_sdist(sector_t last_pos, struct request *rq)
+{
+ if (last_pos)
+ return abs(blk_rq_pos(rq) - last_pos);
+
+ return 0;
+}
+
+#if 0 /* Still not clear if we can do without next two functions */
+static void bfq_activate_request(struct request_queue *q, struct request *rq)
+{
+ struct bfq_data *bfqd = q->elevator->elevator_data;
+
+ bfqd->rq_in_driver++;
+}
+
+static void bfq_deactivate_request(struct request_queue *q, struct request *rq)
+{
+ struct bfq_data *bfqd = q->elevator->elevator_data;
+
+ bfqd->rq_in_driver--;
+}
+#endif
+
+static void bfq_remove_request(struct request_queue *q,
+ struct request *rq)
+{
+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
+ struct bfq_data *bfqd = bfqq->bfqd;
+ const int sync = rq_is_sync(rq);
+
+ if (bfqq->next_rq == rq) {
+ bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq);
+ bfq_updated_next_req(bfqd, bfqq);
+ }
+
+ if (rq->queuelist.prev != &rq->queuelist)
+ list_del_init(&rq->queuelist);
+ bfqq->queued[sync]--;
+ bfqd->queued--;
+ elv_rb_del(&bfqq->sort_list, rq);
+
+ elv_rqhash_del(q, rq);
+ if (q->last_merge == rq)
+ q->last_merge = NULL;
+
+ if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
+ bfqq->next_rq = NULL;
+
+ if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) {
+ bfq_del_bfqq_busy(bfqd, bfqq, false);
+ /*
+ * bfqq emptied. In normal operation, when
+ * bfqq is empty, bfqq->entity.service and
+ * bfqq->entity.budget must contain,
+ * respectively, the service received and the
+ * budget used last time bfqq emptied. These
+ * facts do not hold in this case, as at least
+ * this last removal occurred while bfqq is
+ * not in service. To avoid inconsistencies,
+ * reset both bfqq->entity.service and
+ * bfqq->entity.budget, if bfqq has still a
+ * process that may issue I/O requests to it.
+ */
+ bfqq->entity.budget = bfqq->entity.service = 0;
+ }
+
+ /*
+ * Remove queue from request-position tree as it is empty.
+ */
+ if (bfqq->pos_root) {
+ rb_erase(&bfqq->pos_node, bfqq->pos_root);
+ bfqq->pos_root = NULL;
+ }
+ }
+
+ if (rq->cmd_flags & REQ_META)
+ bfqq->meta_pending--;
+
+ bfqg_stats_update_io_remove(bfqq_group(bfqq), rq->cmd_flags);
+}
+
+static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
+{
+ struct request_queue *q = hctx->queue;
+ struct bfq_data *bfqd = q->elevator->elevator_data;
+ struct request *free = NULL;
+ /*
+ * bfq_bic_lookup grabs the queue_lock: invoke it now and
+ * store its return value for later use, to avoid nesting
+ * queue_lock inside the bfqd->lock. We assume that the bic
+ * returned by bfq_bic_lookup does not go away before
+ * bfqd->lock is taken.
+ */
+ struct bfq_io_cq *bic = bfq_bic_lookup(bfqd, current->io_context, q);
+ bool ret;
+
+ spin_lock_irq(&bfqd->lock);
+
+ if (bic)
+ bfqd->bio_bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf));
+ else
+ bfqd->bio_bfqq = NULL;
+ bfqd->bio_bic = bic;
+
+ ret = blk_mq_sched_try_merge(q, bio, &free);
+
+ if (free)
+ blk_mq_free_request(free);
+ spin_unlock_irq(&bfqd->lock);
+
+ return ret;
+}
+
+static int bfq_request_merge(struct request_queue *q, struct request **req,
+ struct bio *bio)
+{
+ struct bfq_data *bfqd = q->elevator->elevator_data;
+ struct request *__rq;
+
+ __rq = bfq_find_rq_fmerge(bfqd, bio, q);
+ if (__rq && elv_bio_merge_ok(__rq, bio)) {
+ *req = __rq;
+ return ELEVATOR_FRONT_MERGE;
+ }
+
+ return ELEVATOR_NO_MERGE;
+}
+
+static void bfq_request_merged(struct request_queue *q, struct request *req,
+ enum elv_merge type)
+{
+ if (type == ELEVATOR_FRONT_MERGE &&
+ rb_prev(&req->rb_node) &&
+ blk_rq_pos(req) <
+ blk_rq_pos(container_of(rb_prev(&req->rb_node),
+ struct request, rb_node))) {
+ struct bfq_queue *bfqq = RQ_BFQQ(req);
+ struct bfq_data *bfqd = bfqq->bfqd;
+ struct request *prev, *next_rq;
+
+ /* Reposition request in its sort_list */
+ elv_rb_del(&bfqq->sort_list, req);
+ elv_rb_add(&bfqq->sort_list, req);
+
+ /* Choose next request to be served for bfqq */
+ prev = bfqq->next_rq;
+ next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req,
+ bfqd->last_position);
+ bfqq->next_rq = next_rq;
+ /*
+ * If next_rq changes, update both the queue's budget to
+ * fit the new request and the queue's position in its
+ * rq_pos_tree.
+ */
+ if (prev != bfqq->next_rq) {
+ bfq_updated_next_req(bfqd, bfqq);
+ bfq_pos_tree_add_move(bfqd, bfqq);
+ }
+ }
+}
+
+static void bfq_requests_merged(struct request_queue *q, struct request *rq,
+ struct request *next)
+{
+ struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next);
+
+ if (!RB_EMPTY_NODE(&rq->rb_node))
+ goto end;
+ spin_lock_irq(&bfqq->bfqd->lock);
+
+ /*
+ * If next and rq belong to the same bfq_queue and next is older
+ * than rq, then reposition rq in the fifo (by substituting next
+ * with rq). Otherwise, if next and rq belong to different
+ * bfq_queues, never reposition rq: in fact, we would have to
+ * reposition it with respect to next's position in its own fifo,
+ * which would most certainly be too expensive with respect to
+ * the benefits.
+ */
+ if (bfqq == next_bfqq &&
+ !list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
+ next->fifo_time < rq->fifo_time) {
+ list_del_init(&rq->queuelist);
+ list_replace_init(&next->queuelist, &rq->queuelist);
+ rq->fifo_time = next->fifo_time;
+ }
+
+ if (bfqq->next_rq == next)
+ bfqq->next_rq = rq;
+
+ bfq_remove_request(q, next);
+
+ spin_unlock_irq(&bfqq->bfqd->lock);
+end:
+ bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags);
+}
+
+/* Must be called with bfqq != NULL */
+static void bfq_bfqq_end_wr(struct bfq_queue *bfqq)
+{
+ if (bfq_bfqq_busy(bfqq))
+ bfqq->bfqd->wr_busy_queues--;
+ bfqq->wr_coeff = 1;
+ bfqq->wr_cur_max_time = 0;
+ bfqq->last_wr_start_finish = jiffies;
+ /*
+ * Trigger a weight change on the next invocation of
+ * __bfq_entity_update_weight_prio.
+ */
+ bfqq->entity.prio_changed = 1;
+}
+
+void bfq_end_wr_async_queues(struct bfq_data *bfqd,
+ struct bfq_group *bfqg)
+{
+ int i, j;
+
+ for (i = 0; i < 2; i++)
+ for (j = 0; j < IOPRIO_BE_NR; j++)
+ if (bfqg->async_bfqq[i][j])
+ bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]);
+ if (bfqg->async_idle_bfqq)
+ bfq_bfqq_end_wr(bfqg->async_idle_bfqq);
+}
+
+static void bfq_end_wr(struct bfq_data *bfqd)
+{
+ struct bfq_queue *bfqq;
+
+ spin_lock_irq(&bfqd->lock);
+
+ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list)
+ bfq_bfqq_end_wr(bfqq);
+ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list)
+ bfq_bfqq_end_wr(bfqq);
+ bfq_end_wr_async(bfqd);
+
+ spin_unlock_irq(&bfqd->lock);
+}
+
+static sector_t bfq_io_struct_pos(void *io_struct, bool request)
+{
+ if (request)
+ return blk_rq_pos(io_struct);
+ else
+ return ((struct bio *)io_struct)->bi_iter.bi_sector;
+}
+
+static int bfq_rq_close_to_sector(void *io_struct, bool request,
+ sector_t sector)
+{
+ return abs(bfq_io_struct_pos(io_struct, request) - sector) <=
+ BFQQ_CLOSE_THR;
+}
+
+static struct bfq_queue *bfqq_find_close(struct bfq_data *bfqd,
+ struct bfq_queue *bfqq,
+ sector_t sector)
+{
+ struct rb_root *root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree;
+ struct rb_node *parent, *node;
+ struct bfq_queue *__bfqq;
+
+ if (RB_EMPTY_ROOT(root))
+ return NULL;
+
+ /*
+ * First, if we find a request starting at the end of the last
+ * request, choose it.
+ */
+ __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL);
+ if (__bfqq)
+ return __bfqq;
+
+ /*
+ * If the exact sector wasn't found, the parent of the NULL leaf
+ * will contain the closest sector (rq_pos_tree sorted by
+ * next_request position).
+ */
+ __bfqq = rb_entry(parent, struct bfq_queue, pos_node);
+ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))
+ return __bfqq;
+
+ if (blk_rq_pos(__bfqq->next_rq) < sector)
+ node = rb_next(&__bfqq->pos_node);
+ else
+ node = rb_prev(&__bfqq->pos_node);
+ if (!node)
+ return NULL;
+
+ __bfqq = rb_entry(node, struct bfq_queue, pos_node);
+ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))
+ return __bfqq;
+
+ return NULL;
+}
+
+static struct bfq_queue *bfq_find_close_cooperator(struct bfq_data *bfqd,
+ struct bfq_queue *cur_bfqq,
+ sector_t sector)
+{
+ struct bfq_queue *bfqq;
+
+ /*
+ * We shall notice if some of the queues are cooperating,
+ * e.g., working closely on the same area of the device. In
+ * that case, we can group them together and: 1) don't waste
+ * time idling, and 2) serve the union of their requests in
+ * the best possible order for throughput.
+ */
+ bfqq = bfqq_find_close(bfqd, cur_bfqq, sector);
+ if (!bfqq || bfqq == cur_bfqq)
+ return NULL;
+
+ return bfqq;
+}
+
+static struct bfq_queue *
+bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
+{
+ int process_refs, new_process_refs;
+ struct bfq_queue *__bfqq;
+
+ /*
+ * If there are no process references on the new_bfqq, then it is
+ * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
+ * may have dropped their last reference (not just their last process
+ * reference).
+ */
+ if (!bfqq_process_refs(new_bfqq))
+ return NULL;
+
+ /* Avoid a circular list and skip interim queue merges. */
+ while ((__bfqq = new_bfqq->new_bfqq)) {
+ if (__bfqq == bfqq)
+ return NULL;
+ new_bfqq = __bfqq;
+ }
+
+ process_refs = bfqq_process_refs(bfqq);
+ new_process_refs = bfqq_process_refs(new_bfqq);
+ /*
+ * If the process for the bfqq has gone away, there is no
+ * sense in merging the queues.
+ */
+ if (process_refs == 0 || new_process_refs == 0)
+ return NULL;
+
+ bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
+ new_bfqq->pid);
+
+ /*
+ * Merging is just a redirection: the requests of the process
+ * owning one of the two queues are redirected to the other queue.
+ * The latter queue, in its turn, is set as shared if this is the
+ * first time that the requests of some process are redirected to
+ * it.
+ *
+ * We redirect bfqq to new_bfqq and not the opposite, because
+ * we are in the context of the process owning bfqq, thus we
+ * have the io_cq of this process. So we can immediately
+ * configure this io_cq to redirect the requests of the
+ * process to new_bfqq. In contrast, the io_cq of new_bfqq is
+ * not available any more (new_bfqq->bic == NULL).
+ *
+ * Anyway, even in case new_bfqq coincides with the in-service
+ * queue, redirecting requests the in-service queue is the
+ * best option, as we feed the in-service queue with new
+ * requests close to the last request served and, by doing so,
+ * are likely to increase the throughput.
+ */
+ bfqq->new_bfqq = new_bfqq;
+ new_bfqq->ref += process_refs;
+ return new_bfqq;
+}
+
+static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq,
+ struct bfq_queue *new_bfqq)
+{
+ if (bfq_class_idle(bfqq) || bfq_class_idle(new_bfqq) ||
+ (bfqq->ioprio_class != new_bfqq->ioprio_class))
+ return false;
+
+ /*
+ * If either of the queues has already been detected as seeky,
+ * then merging it with the other queue is unlikely to lead to
+ * sequential I/O.
+ */
+ if (BFQQ_SEEKY(bfqq) || BFQQ_SEEKY(new_bfqq))
+ return false;
+
+ /*
+ * Interleaved I/O is known to be done by (some) applications
+ * only for reads, so it does not make sense to merge async
+ * queues.
+ */
+ if (!bfq_bfqq_sync(bfqq) || !bfq_bfqq_sync(new_bfqq))
+ return false;
+
+ return true;
+}
+
+/*
+ * If this function returns true, then bfqq cannot be merged. The idea
+ * is that true cooperation happens very early after processes start
+ * to do I/O. Usually, late cooperations are just accidental false
+ * positives. In case bfqq is weight-raised, such false positives
+ * would evidently degrade latency guarantees for bfqq.
+ */
+static bool wr_from_too_long(struct bfq_queue *bfqq)
+{
+ return bfqq->wr_coeff > 1 &&
+ time_is_before_jiffies(bfqq->last_wr_start_finish +
+ msecs_to_jiffies(100));
+}
+
+/*
+ * Attempt to schedule a merge of bfqq with the currently in-service
+ * queue or with a close queue among the scheduled queues. Return
+ * NULL if no merge was scheduled, a pointer to the shared bfq_queue
+ * structure otherwise.
+ *
+ * The OOM queue is not allowed to participate to cooperation: in fact, since
+ * the requests temporarily redirected to the OOM queue could be redirected
+ * again to dedicated queues at any time, the state needed to correctly
+ * handle merging with the OOM queue would be quite complex and expensive
+ * to maintain. Besides, in such a critical condition as an out of memory,
+ * the benefits of queue merging may be little relevant, or even negligible.
+ *
+ * Weight-raised queues can be merged only if their weight-raising
+ * period has just started. In fact cooperating processes are usually
+ * started together. Thus, with this filter we avoid false positives
+ * that would jeopardize low-latency guarantees.
+ *
+ * WARNING: queue merging may impair fairness among non-weight raised
+ * queues, for at least two reasons: 1) the original weight of a
+ * merged queue may change during the merged state, 2) even being the
+ * weight the same, a merged queue may be bloated with many more
+ * requests than the ones produced by its originally-associated
+ * process.
+ */
+static struct bfq_queue *
+bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+ void *io_struct, bool request)
+{
+ struct bfq_queue *in_service_bfqq, *new_bfqq;
+
+ if (bfqq->new_bfqq)
+ return bfqq->new_bfqq;
+
+ if (!io_struct ||
+ wr_from_too_long(bfqq) ||
+ unlikely(bfqq == &bfqd->oom_bfqq))
+ return NULL;
+
+ /* If there is only one backlogged queue, don't search. */
+ if (bfqd->busy_queues == 1)
+ return NULL;
+
+ in_service_bfqq = bfqd->in_service_queue;
+
+ if (!in_service_bfqq || in_service_bfqq == bfqq
+ || wr_from_too_long(in_service_bfqq) ||
+ unlikely(in_service_bfqq == &bfqd->oom_bfqq))
+ goto check_scheduled;
+
+ if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) &&
+ bfqq->entity.parent == in_service_bfqq->entity.parent &&
+ bfq_may_be_close_cooperator(bfqq, in_service_bfqq)) {
+ new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq);
+ if (new_bfqq)
+ return new_bfqq;
+ }
+ /*
+ * Check whether there is a cooperator among currently scheduled
+ * queues. The only thing we need is that the bio/request is not
+ * NULL, as we need it to establish whether a cooperator exists.
+ */
+check_scheduled:
+ new_bfqq = bfq_find_close_cooperator(bfqd, bfqq,
+ bfq_io_struct_pos(io_struct, request));
+
+ if (new_bfqq && !wr_from_too_long(new_bfqq) &&
+ likely(new_bfqq != &bfqd->oom_bfqq) &&
+ bfq_may_be_close_cooperator(bfqq, new_bfqq))
+ return bfq_setup_merge(bfqq, new_bfqq);
+
+ return NULL;
+}
+
+static void bfq_bfqq_save_state(struct bfq_queue *bfqq)
+{
+ struct bfq_io_cq *bic = bfqq->bic;
+
+ /*
+ * If !bfqq->bic, the queue is already shared or its requests
+ * have already been redirected to a shared queue; both idle window
+ * and weight raising state have already been saved. Do nothing.
+ */
+ if (!bic)
+ return;
+
+ bic->saved_ttime = bfqq->ttime;
+ bic->saved_idle_window = bfq_bfqq_idle_window(bfqq);
+ bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq);
+ bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq);
+ bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node);
+ bic->saved_wr_coeff = bfqq->wr_coeff;
+ bic->saved_wr_start_at_switch_to_srt = bfqq->wr_start_at_switch_to_srt;
+ bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish;
+ bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time;
+}
+
+static void
+bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
+ struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
+{
+ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
+ (unsigned long)new_bfqq->pid);
+ /* Save weight raising and idle window of the merged queues */
+ bfq_bfqq_save_state(bfqq);
+ bfq_bfqq_save_state(new_bfqq);
+ if (bfq_bfqq_IO_bound(bfqq))
+ bfq_mark_bfqq_IO_bound(new_bfqq);
+ bfq_clear_bfqq_IO_bound(bfqq);
+
+ /*
+ * If bfqq is weight-raised, then let new_bfqq inherit
+ * weight-raising. To reduce false positives, neglect the case
+ * where bfqq has just been created, but has not yet made it
+ * to be weight-raised (which may happen because EQM may merge
+ * bfqq even before bfq_add_request is executed for the first
+ * time for bfqq). Handling this case would however be very
+ * easy, thanks to the flag just_created.
+ */
+ if (new_bfqq->wr_coeff == 1 && bfqq->wr_coeff > 1) {
+ new_bfqq->wr_coeff = bfqq->wr_coeff;
+ new_bfqq->wr_cur_max_time = bfqq->wr_cur_max_time;
+ new_bfqq->last_wr_start_finish = bfqq->last_wr_start_finish;
+ new_bfqq->wr_start_at_switch_to_srt =
+ bfqq->wr_start_at_switch_to_srt;
+ if (bfq_bfqq_busy(new_bfqq))
+ bfqd->wr_busy_queues++;
+ new_bfqq->entity.prio_changed = 1;
+ }
+
+ if (bfqq->wr_coeff > 1) { /* bfqq has given its wr to new_bfqq */
+ bfqq->wr_coeff = 1;
+ bfqq->entity.prio_changed = 1;
+ if (bfq_bfqq_busy(bfqq))
+ bfqd->wr_busy_queues--;
+ }
+
+ bfq_log_bfqq(bfqd, new_bfqq, "merge_bfqqs: wr_busy %d",
+ bfqd->wr_busy_queues);
+
+ /*
+ * Merge queues (that is, let bic redirect its requests to new_bfqq)
+ */
+ bic_set_bfqq(bic, new_bfqq, 1);
+ bfq_mark_bfqq_coop(new_bfqq);
+ /*
+ * new_bfqq now belongs to at least two bics (it is a shared queue):
+ * set new_bfqq->bic to NULL. bfqq either:
+ * - does not belong to any bic any more, and hence bfqq->bic must
+ * be set to NULL, or
+ * - is a queue whose owning bics have already been redirected to a
+ * different queue, hence the queue is destined to not belong to
+ * any bic soon and bfqq->bic is already NULL (therefore the next
+ * assignment causes no harm).
+ */
+ new_bfqq->bic = NULL;
+ bfqq->bic = NULL;
+ /* release process reference to bfqq */
+ bfq_put_queue(bfqq);
+}
+
+static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq,
+ struct bio *bio)
+{
+ struct bfq_data *bfqd = q->elevator->elevator_data;
+ bool is_sync = op_is_sync(bio->bi_opf);
+ struct bfq_queue *bfqq = bfqd->bio_bfqq, *new_bfqq;
+
+ /*
+ * Disallow merge of a sync bio into an async request.
+ */
+ if (is_sync && !rq_is_sync(rq))
+ return false;
+
+ /*
+ * Lookup the bfqq that this bio will be queued with. Allow
+ * merge only if rq is queued there.
+ */
+ if (!bfqq)
+ return false;
+
+ /*
+ * We take advantage of this function to perform an early merge
+ * of the queues of possible cooperating processes.
+ */
+ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false);
+ if (new_bfqq) {
+ /*
+ * bic still points to bfqq, then it has not yet been
+ * redirected to some other bfq_queue, and a queue
+ * merge beween bfqq and new_bfqq can be safely
+ * fulfillled, i.e., bic can be redirected to new_bfqq
+ * and bfqq can be put.
+ */
+ bfq_merge_bfqqs(bfqd, bfqd->bio_bic, bfqq,
+ new_bfqq);
+ /*
+ * If we get here, bio will be queued into new_queue,
+ * so use new_bfqq to decide whether bio and rq can be
+ * merged.
+ */
+ bfqq = new_bfqq;
+
+ /*
+ * Change also bqfd->bio_bfqq, as
+ * bfqd->bio_bic now points to new_bfqq, and
+ * this function may be invoked again (and then may
+ * use again bqfd->bio_bfqq).
+ */
+ bfqd->bio_bfqq = bfqq;
+ }
+
+ return bfqq == RQ_BFQQ(rq);
+}
+
+/*
+ * Set the maximum time for the in-service queue to consume its
+ * budget. This prevents seeky processes from lowering the throughput.
+ * In practice, a time-slice service scheme is used with seeky
+ * processes.
+ */
+static void bfq_set_budget_timeout(struct bfq_data *bfqd,
+ struct bfq_queue *bfqq)
+{
+ unsigned int timeout_coeff;
+
+ if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time)
+ timeout_coeff = 1;
+ else
+ timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight;
+
+ bfqd->last_budget_start = ktime_get();
+
+ bfqq->budget_timeout = jiffies +
+ bfqd->bfq_timeout * timeout_coeff;
+}
+
+static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
+ struct bfq_queue *bfqq)
+{
+ if (bfqq) {
+ bfqg_stats_update_avg_queue_size(bfqq_group(bfqq));
+ bfq_clear_bfqq_fifo_expire(bfqq);
+
+ bfqd->budgets_assigned = (bfqd->budgets_assigned * 7 + 256) / 8;
+
+ if (time_is_before_jiffies(bfqq->last_wr_start_finish) &&
+ bfqq->wr_coeff > 1 &&
+ bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time &&
+ time_is_before_jiffies(bfqq->budget_timeout)) {
+ /*
+ * For soft real-time queues, move the start
+ * of the weight-raising period forward by the
+ * time the queue has not received any
+ * service. Otherwise, a relatively long
+ * service delay is likely to cause the
+ * weight-raising period of the queue to end,
+ * because of the short duration of the
+ * weight-raising period of a soft real-time
+ * queue. It is worth noting that this move
+ * is not so dangerous for the other queues,
+ * because soft real-time queues are not
+ * greedy.
+ *
+ * To not add a further variable, we use the
+ * overloaded field budget_timeout to
+ * determine for how long the queue has not
+ * received service, i.e., how much time has
+ * elapsed since the queue expired. However,
+ * this is a little imprecise, because
+ * budget_timeout is set to jiffies if bfqq
+ * not only expires, but also remains with no
+ * request.
+ */
+ if (time_after(bfqq->budget_timeout,
+ bfqq->last_wr_start_finish))
+ bfqq->last_wr_start_finish +=
+ jiffies - bfqq->budget_timeout;
+ else
+ bfqq->last_wr_start_finish = jiffies;
+ }
+
+ bfq_set_budget_timeout(bfqd, bfqq);
+ bfq_log_bfqq(bfqd, bfqq,
+ "set_in_service_queue, cur-budget = %d",
+ bfqq->entity.budget);
+ }
+
+ bfqd->in_service_queue = bfqq;
+}
+
+/*
+ * Get and set a new queue for service.
+ */
+static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd)
+{
+ struct bfq_queue *bfqq = bfq_get_next_queue(bfqd);
+
+ __bfq_set_in_service_queue(bfqd, bfqq);
+ return bfqq;
+}
+
+static void bfq_arm_slice_timer(struct bfq_data *bfqd)
+{
+ struct bfq_queue *bfqq = bfqd->in_service_queue;
+ u32 sl;
+
+ bfq_mark_bfqq_wait_request(bfqq);
+
+ /*
+ * We don't want to idle for seeks, but we do want to allow
+ * fair distribution of slice time for a process doing back-to-back
+ * seeks. So allow a little bit of time for him to submit a new rq.
+ */
+ sl = bfqd->bfq_slice_idle;
+ /*
+ * Unless the queue is being weight-raised or the scenario is
+ * asymmetric, grant only minimum idle time if the queue
+ * is seeky. A long idling is preserved for a weight-raised
+ * queue, or, more in general, in an asymmetric scenario,
+ * because a long idling is needed for guaranteeing to a queue
+ * its reserved share of the throughput (in particular, it is
+ * needed if the queue has a higher weight than some other
+ * queue).
+ */
+ if (BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1 &&
+ bfq_symmetric_scenario(bfqd))
+ sl = min_t(u64, sl, BFQ_MIN_TT);
+
+ bfqd->last_idling_start = ktime_get();
+ hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(sl),
+ HRTIMER_MODE_REL);
+ bfqg_stats_set_start_idle_time(bfqq_group(bfqq));
+}
+
+/*
+ * In autotuning mode, max_budget is dynamically recomputed as the
+ * amount of sectors transferred in timeout at the estimated peak
+ * rate. This enables BFQ to utilize a full timeslice with a full
+ * budget, even if the in-service queue is served at peak rate. And
+ * this maximises throughput with sequential workloads.
+ */
+static unsigned long bfq_calc_max_budget(struct bfq_data *bfqd)
+{
+ return (u64)bfqd->peak_rate * USEC_PER_MSEC *
+ jiffies_to_msecs(bfqd->bfq_timeout)>>BFQ_RATE_SHIFT;
+}
+
+/*
+ * Update parameters related to throughput and responsiveness, as a
+ * function of the estimated peak rate. See comments on
+ * bfq_calc_max_budget(), and on T_slow and T_fast arrays.
+ */
+static void update_thr_responsiveness_params(struct bfq_data *bfqd)
+{
+ int dev_type = blk_queue_nonrot(bfqd->queue);
+
+ if (bfqd->bfq_user_max_budget == 0)
+ bfqd->bfq_max_budget =
+ bfq_calc_max_budget(bfqd);
+
+ if (bfqd->device_speed == BFQ_BFQD_FAST &&
+ bfqd->peak_rate < device_speed_thresh[dev_type]) {
+ bfqd->device_speed = BFQ_BFQD_SLOW;
+ bfqd->RT_prod = R_slow[dev_type] *
+ T_slow[dev_type];
+ } else if (bfqd->device_speed == BFQ_BFQD_SLOW &&
+ bfqd->peak_rate > device_speed_thresh[dev_type]) {
+ bfqd->device_speed = BFQ_BFQD_FAST;
+ bfqd->RT_prod = R_fast[dev_type] *
+ T_fast[dev_type];
+ }
+
+ bfq_log(bfqd,
+"dev_type %s dev_speed_class = %s (%llu sects/sec), thresh %llu setcs/sec",
+ dev_type == 0 ? "ROT" : "NONROT",
+ bfqd->device_speed == BFQ_BFQD_FAST ? "FAST" : "SLOW",
+ bfqd->device_speed == BFQ_BFQD_FAST ?
+ (USEC_PER_SEC*(u64)R_fast[dev_type])>>BFQ_RATE_SHIFT :
+ (USEC_PER_SEC*(u64)R_slow[dev_type])>>BFQ_RATE_SHIFT,
+ (USEC_PER_SEC*(u64)device_speed_thresh[dev_type])>>
+ BFQ_RATE_SHIFT);
+}
+
+static void bfq_reset_rate_computation(struct bfq_data *bfqd,
+ struct request *rq)
+{
+ if (rq != NULL) { /* new rq dispatch now, reset accordingly */
+ bfqd->last_dispatch = bfqd->first_dispatch = ktime_get_ns();
+ bfqd->peak_rate_samples = 1;
+ bfqd->sequential_samples = 0;
+ bfqd->tot_sectors_dispatched = bfqd->last_rq_max_size =
+ blk_rq_sectors(rq);
+ } else /* no new rq dispatched, just reset the number of samples */
+ bfqd->peak_rate_samples = 0; /* full re-init on next disp. */
+
+ bfq_log(bfqd,
+ "reset_rate_computation at end, sample %u/%u tot_sects %llu",
+ bfqd->peak_rate_samples, bfqd->sequential_samples,
+ bfqd->tot_sectors_dispatched);
+}
+
+static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq)
+{
+ u32 rate, weight, divisor;
+
+ /*
+ * For the convergence property to hold (see comments on
+ * bfq_update_peak_rate()) and for the assessment to be
+ * reliable, a minimum number of samples must be present, and
+ * a minimum amount of time must have elapsed. If not so, do
+ * not compute new rate. Just reset parameters, to get ready
+ * for a new evaluation attempt.
+ */
+ if (bfqd->peak_rate_samples < BFQ_RATE_MIN_SAMPLES ||
+ bfqd->delta_from_first < BFQ_RATE_MIN_INTERVAL)
+ goto reset_computation;
+
+ /*
+ * If a new request completion has occurred after last
+ * dispatch, then, to approximate the rate at which requests
+ * have been served by the device, it is more precise to
+ * extend the observation interval to the last completion.
+ */
+ bfqd->delta_from_first =
+ max_t(u64, bfqd->delta_from_first,
+ bfqd->last_completion - bfqd->first_dispatch);
+
+ /*
+ * Rate computed in sects/usec, and not sects/nsec, for
+ * precision issues.
+ */
+ rate = div64_ul(bfqd->tot_sectors_dispatched<<BFQ_RATE_SHIFT,
+ div_u64(bfqd->delta_from_first, NSEC_PER_USEC));
+
+ /*
+ * Peak rate not updated if:
+ * - the percentage of sequential dispatches is below 3/4 of the
+ * total, and rate is below the current estimated peak rate
+ * - rate is unreasonably high (> 20M sectors/sec)
+ */
+ if ((bfqd->sequential_samples < (3 * bfqd->peak_rate_samples)>>2 &&
+ rate <= bfqd->peak_rate) ||
+ rate > 20<<BFQ_RATE_SHIFT)
+ goto reset_computation;
+
+ /*
+ * We have to update the peak rate, at last! To this purpose,
+ * we use a low-pass filter. We compute the smoothing constant
+ * of the filter as a function of the 'weight' of the new
+ * measured rate.
+ *
+ * As can be seen in next formulas, we define this weight as a
+ * quantity proportional to how sequential the workload is,
+ * and to how long the observation time interval is.
+ *
+ * The weight runs from 0 to 8. The maximum value of the
+ * weight, 8, yields the minimum value for the smoothing
+ * constant. At this minimum value for the smoothing constant,
+ * the measured rate contributes for half of the next value of
+ * the estimated peak rate.
+ *
+ * So, the first step is to compute the weight as a function
+ * of how sequential the workload is. Note that the weight
+ * cannot reach 9, because bfqd->sequential_samples cannot
+ * become equal to bfqd->peak_rate_samples, which, in its
+ * turn, holds true because bfqd->sequential_samples is not
+ * incremented for the first sample.
+ */
+ weight = (9 * bfqd->sequential_samples) / bfqd->peak_rate_samples;
+
+ /*
+ * Second step: further refine the weight as a function of the
+ * duration of the observation interval.
+ */
+ weight = min_t(u32, 8,
+ div_u64(weight * bfqd->delta_from_first,
+ BFQ_RATE_REF_INTERVAL));
+
+ /*
+ * Divisor ranging from 10, for minimum weight, to 2, for
+ * maximum weight.
+ */
+ divisor = 10 - weight;
+
+ /*
+ * Finally, update peak rate:
+ *
+ * peak_rate = peak_rate * (divisor-1) / divisor + rate / divisor
+ */
+ bfqd->peak_rate *= divisor-1;
+ bfqd->peak_rate /= divisor;
+ rate /= divisor; /* smoothing constant alpha = 1/divisor */
+
+ bfqd->peak_rate += rate;
+ update_thr_responsiveness_params(bfqd);
+
+reset_computation:
+ bfq_reset_rate_computation(bfqd, rq);
+}
+
+/*
+ * Update the read/write peak rate (the main quantity used for
+ * auto-tuning, see update_thr_responsiveness_params()).
+ *
+ * It is not trivial to estimate the peak rate (correctly): because of
+ * the presence of sw and hw queues between the scheduler and the
+ * device components that finally serve I/O requests, it is hard to
+ * say exactly when a given dispatched request is served inside the
+ * device, and for how long. As a consequence, it is hard to know
+ * precisely at what rate a given set of requests is actually served
+ * by the device.
+ *
+ * On the opposite end, the dispatch time of any request is trivially
+ * available, and, from this piece of information, the "dispatch rate"
+ * of requests can be immediately computed. So, the idea in the next
+ * function is to use what is known, namely request dispatch times
+ * (plus, when useful, request completion times), to estimate what is
+ * unknown, namely in-device request service rate.
+ *
+ * The main issue is that, because of the above facts, the rate at
+ * which a certain set of requests is dispatched over a certain time
+ * interval can vary greatly with respect to the rate at which the
+ * same requests are then served. But, since the size of any
+ * intermediate queue is limited, and the service scheme is lossless
+ * (no request is silently dropped), the following obvious convergence
+ * property holds: the number of requests dispatched MUST become
+ * closer and closer to the number of requests completed as the
+ * observation interval grows. This is the key property used in
+ * the next function to estimate the peak service rate as a function
+ * of the observed dispatch rate. The function assumes to be invoked
+ * on every request dispatch.
+ */
+static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq)
+{
+ u64 now_ns = ktime_get_ns();
+
+ if (bfqd->peak_rate_samples == 0) { /* first dispatch */
+ bfq_log(bfqd, "update_peak_rate: goto reset, samples %d",
+ bfqd->peak_rate_samples);
+ bfq_reset_rate_computation(bfqd, rq);
+ goto update_last_values; /* will add one sample */
+ }
+
+ /*
+ * Device idle for very long: the observation interval lasting
+ * up to this dispatch cannot be a valid observation interval
+ * for computing a new peak rate (similarly to the late-
+ * completion event in bfq_completed_request()). Go to
+ * update_rate_and_reset to have the following three steps
+ * taken:
+ * - close the observation interval at the last (previous)
+ * request dispatch or completion
+ * - compute rate, if possible, for that observation interval
+ * - start a new observation interval with this dispatch
+ */
+ if (now_ns - bfqd->last_dispatch > 100*NSEC_PER_MSEC &&
+ bfqd->rq_in_driver == 0)
+ goto update_rate_and_reset;
+
+ /* Update sampling information */
+ bfqd->peak_rate_samples++;
+
+ if ((bfqd->rq_in_driver > 0 ||
+ now_ns - bfqd->last_completion < BFQ_MIN_TT)
+ && get_sdist(bfqd->last_position, rq) < BFQQ_SEEK_THR)
+ bfqd->sequential_samples++;
+
+ bfqd->tot_sectors_dispatched += blk_rq_sectors(rq);
+
+ /* Reset max observed rq size every 32 dispatches */
+ if (likely(bfqd->peak_rate_samples % 32))
+ bfqd->last_rq_max_size =
+ max_t(u32, blk_rq_sectors(rq), bfqd->last_rq_max_size);
+ else
+ bfqd->last_rq_max_size = blk_rq_sectors(rq);
+
+ bfqd->delta_from_first = now_ns - bfqd->first_dispatch;
+
+ /* Target observation interval not yet reached, go on sampling */
+ if (bfqd->delta_from_first < BFQ_RATE_REF_INTERVAL)
+ goto update_last_values;
+
+update_rate_and_reset:
+ bfq_update_rate_reset(bfqd, rq);
+update_last_values:
+ bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
+ bfqd->last_dispatch = now_ns;
+}
+
+/*
+ * Remove request from internal lists.
+ */
+static void bfq_dispatch_remove(struct request_queue *q, struct request *rq)
+{
+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
+
+ /*
+ * For consistency, the next instruction should have been
+ * executed after removing the request from the queue and
+ * dispatching it. We execute instead this instruction before
+ * bfq_remove_request() (and hence introduce a temporary
+ * inconsistency), for efficiency. In fact, should this
+ * dispatch occur for a non in-service bfqq, this anticipated
+ * increment prevents two counters related to bfqq->dispatched
+ * from risking to be, first, uselessly decremented, and then
+ * incremented again when the (new) value of bfqq->dispatched
+ * happens to be taken into account.
+ */
+ bfqq->dispatched++;
+ bfq_update_peak_rate(q->elevator->elevator_data, rq);
+
+ bfq_remove_request(q, rq);
+}
+
+static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+ /*
+ * If this bfqq is shared between multiple processes, check
+ * to make sure that those processes are still issuing I/Os
+ * within the mean seek distance. If not, it may be time to
+ * break the queues apart again.
+ */
+ if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq))
+ bfq_mark_bfqq_split_coop(bfqq);
+
+ if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
+ if (bfqq->dispatched == 0)
+ /*
+ * Overloading budget_timeout field to store
+ * the time at which the queue remains with no
+ * backlog and no outstanding request; used by
+ * the weight-raising mechanism.
+ */
+ bfqq->budget_timeout = jiffies;
+
+ bfq_del_bfqq_busy(bfqd, bfqq, true);
+ } else {
+ bfq_requeue_bfqq(bfqd, bfqq);
+ /*
+ * Resort priority tree of potential close cooperators.
+ */
+ bfq_pos_tree_add_move(bfqd, bfqq);
+ }
+
+ /*
+ * All in-service entities must have been properly deactivated
+ * or requeued before executing the next function, which
+ * resets all in-service entites as no more in service.
+ */
+ __bfq_bfqd_reset_in_service(bfqd);
+}
+
+/**
+ * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior.
+ * @bfqd: device data.
+ * @bfqq: queue to update.
+ * @reason: reason for expiration.
+ *
+ * Handle the feedback on @bfqq budget at queue expiration.
+ * See the body for detailed comments.
+ */
+static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
+ struct bfq_queue *bfqq,
+ enum bfqq_expiration reason)
+{
+ struct request *next_rq;
+ int budget, min_budget;
+
+ min_budget = bfq_min_budget(bfqd);
+
+ if (bfqq->wr_coeff == 1)
+ budget = bfqq->max_budget;
+ else /*
+ * Use a constant, low budget for weight-raised queues,
+ * to help achieve a low latency. Keep it slightly higher
+ * than the minimum possible budget, to cause a little
+ * bit fewer expirations.
+ */
+ budget = 2 * min_budget;
+
+ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %d, budg left %d",
+ bfqq->entity.budget, bfq_bfqq_budget_left(bfqq));
+ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %d, min budg %d",
+ budget, bfq_min_budget(bfqd));
+ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d",
+ bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue));
+
+ if (bfq_bfqq_sync(bfqq) && bfqq->wr_coeff == 1) {
+ switch (reason) {
+ /*
+ * Caveat: in all the following cases we trade latency
+ * for throughput.
+ */
+ case BFQQE_TOO_IDLE:
+ /*
+ * This is the only case where we may reduce
+ * the budget: if there is no request of the
+ * process still waiting for completion, then
+ * we assume (tentatively) that the timer has
+ * expired because the batch of requests of
+ * the process could have been served with a
+ * smaller budget. Hence, betting that
+ * process will behave in the same way when it
+ * becomes backlogged again, we reduce its
+ * next budget. As long as we guess right,
+ * this budget cut reduces the latency
+ * experienced by the process.
+ *
+ * However, if there are still outstanding
+ * requests, then the process may have not yet
+ * issued its next request just because it is
+ * still waiting for the completion of some of
+ * the still outstanding ones. So in this
+ * subcase we do not reduce its budget, on the
+ * contrary we increase it to possibly boost
+ * the throughput, as discussed in the
+ * comments to the BUDGET_TIMEOUT case.
+ */
+ if (bfqq->dispatched > 0) /* still outstanding reqs */
+ budget = min(budget * 2, bfqd->bfq_max_budget);
+ else {
+ if (budget > 5 * min_budget)
+ budget -= 4 * min_budget;
+ else
+ budget = min_budget;
+ }
+ break;
+ case BFQQE_BUDGET_TIMEOUT:
+ /*
+ * We double the budget here because it gives
+ * the chance to boost the throughput if this
+ * is not a seeky process (and has bumped into
+ * this timeout because of, e.g., ZBR).
+ */
+ budget = min(budget * 2, bfqd->bfq_max_budget);
+ break;
+ case BFQQE_BUDGET_EXHAUSTED:
+ /*
+ * The process still has backlog, and did not
+ * let either the budget timeout or the disk
+ * idling timeout expire. Hence it is not
+ * seeky, has a short thinktime and may be
+ * happy with a higher budget too. So
+ * definitely increase the budget of this good
+ * candidate to boost the disk throughput.
+ */
+ budget = min(budget * 4, bfqd->bfq_max_budget);
+ break;
+ case BFQQE_NO_MORE_REQUESTS:
+ /*
+ * For queues that expire for this reason, it
+ * is particularly important to keep the
+ * budget close to the actual service they
+ * need. Doing so reduces the timestamp
+ * misalignment problem described in the
+ * comments in the body of
+ * __bfq_activate_entity. In fact, suppose
+ * that a queue systematically expires for
+ * BFQQE_NO_MORE_REQUESTS and presents a
+ * new request in time to enjoy timestamp
+ * back-shifting. The larger the budget of the
+ * queue is with respect to the service the
+ * queue actually requests in each service
+ * slot, the more times the queue can be
+ * reactivated with the same virtual finish
+ * time. It follows that, even if this finish
+ * time is pushed to the system virtual time
+ * to reduce the consequent timestamp
+ * misalignment, the queue unjustly enjoys for
+ * many re-activations a lower finish time
+ * than all newly activated queues.
+ *
+ * The service needed by bfqq is measured
+ * quite precisely by bfqq->entity.service.
+ * Since bfqq does not enjoy device idling,
+ * bfqq->entity.service is equal to the number
+ * of sectors that the process associated with
+ * bfqq requested to read/write before waiting
+ * for request completions, or blocking for
+ * other reasons.
+ */
+ budget = max_t(int, bfqq->entity.service, min_budget);
+ break;
+ default:
+ return;
+ }
+ } else if (!bfq_bfqq_sync(bfqq)) {
+ /*
+ * Async queues get always the maximum possible
+ * budget, as for them we do not care about latency
+ * (in addition, their ability to dispatch is limited
+ * by the charging factor).
+ */
+ budget = bfqd->bfq_max_budget;
+ }
+
+ bfqq->max_budget = budget;
+
+ if (bfqd->budgets_assigned >= bfq_stats_min_budgets &&
+ !bfqd->bfq_user_max_budget)
+ bfqq->max_budget = min(bfqq->max_budget, bfqd->bfq_max_budget);
+
+ /*
+ * If there is still backlog, then assign a new budget, making
+ * sure that it is large enough for the next request. Since
+ * the finish time of bfqq must be kept in sync with the
+ * budget, be sure to call __bfq_bfqq_expire() *after* this
+ * update.
+ *
+ * If there is no backlog, then no need to update the budget;
+ * it will be updated on the arrival of a new request.
+ */
+ next_rq = bfqq->next_rq;
+ if (next_rq)
+ bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget,
+ bfq_serv_to_charge(next_rq, bfqq));
+
+ bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %d",
+ next_rq ? blk_rq_sectors(next_rq) : 0,
+ bfqq->entity.budget);
+}
+
+/*
+ * Return true if the process associated with bfqq is "slow". The slow
+ * flag is used, in addition to the budget timeout, to reduce the
+ * amount of service provided to seeky processes, and thus reduce
+ * their chances to lower the throughput. More details in the comments
+ * on the function bfq_bfqq_expire().
+ *
+ * An important observation is in order: as discussed in the comments
+ * on the function bfq_update_peak_rate(), with devices with internal
+ * queues, it is hard if ever possible to know when and for how long
+ * an I/O request is processed by the device (apart from the trivial
+ * I/O pattern where a new request is dispatched only after the
+ * previous one has been completed). This makes it hard to evaluate
+ * the real rate at which the I/O requests of each bfq_queue are
+ * served. In fact, for an I/O scheduler like BFQ, serving a
+ * bfq_queue means just dispatching its requests during its service
+ * slot (i.e., until the budget of the queue is exhausted, or the
+ * queue remains idle, or, finally, a timeout fires). But, during the
+ * service slot of a bfq_queue, around 100 ms at most, the device may
+ * be even still processing requests of bfq_queues served in previous
+ * service slots. On the opposite end, the requests of the in-service
+ * bfq_queue may be completed after the service slot of the queue
+ * finishes.
+ *
+ * Anyway, unless more sophisticated solutions are used
+ * (where possible), the sum of the sizes of the requests dispatched
+ * during the service slot of a bfq_queue is probably the only
+ * approximation available for the service received by the bfq_queue
+ * during its service slot. And this sum is the quantity used in this
+ * function to evaluate the I/O speed of a process.
+ */
+static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+ bool compensate, enum bfqq_expiration reason,
+ unsigned long *delta_ms)
+{
+ ktime_t delta_ktime;
+ u32 delta_usecs;
+ bool slow = BFQQ_SEEKY(bfqq); /* if delta too short, use seekyness */
+
+ if (!bfq_bfqq_sync(bfqq))
+ return false;
+
+ if (compensate)
+ delta_ktime = bfqd->last_idling_start;
+ else
+ delta_ktime = ktime_get();
+ delta_ktime = ktime_sub(delta_ktime, bfqd->last_budget_start);
+ delta_usecs = ktime_to_us(delta_ktime);
+
+ /* don't use too short time intervals */
+ if (delta_usecs < 1000) {
+ if (blk_queue_nonrot(bfqd->queue))
+ /*
+ * give same worst-case guarantees as idling
+ * for seeky
+ */
+ *delta_ms = BFQ_MIN_TT / NSEC_PER_MSEC;
+ else /* charge at least one seek */
+ *delta_ms = bfq_slice_idle / NSEC_PER_MSEC;
+
+ return slow;
+ }
+
+ *delta_ms = delta_usecs / USEC_PER_MSEC;
+
+ /*
+ * Use only long (> 20ms) intervals to filter out excessive
+ * spikes in service rate estimation.
+ */
+ if (delta_usecs > 20000) {
+ /*
+ * Caveat for rotational devices: processes doing I/O
+ * in the slower disk zones tend to be slow(er) even
+ * if not seeky. In this respect, the estimated peak
+ * rate is likely to be an average over the disk
+ * surface. Accordingly, to not be too harsh with
+ * unlucky processes, a process is deemed slow only if
+ * its rate has been lower than half of the estimated
+ * peak rate.
+ */
+ slow = bfqq->entity.service < bfqd->bfq_max_budget / 2;
+ }
+
+ bfq_log_bfqq(bfqd, bfqq, "bfq_bfqq_is_slow: slow %d", slow);
+
+ return slow;
+}
+
+/*
+ * To be deemed as soft real-time, an application must meet two
+ * requirements. First, the application must not require an average
+ * bandwidth higher than the approximate bandwidth required to playback or
+ * record a compressed high-definition video.
+ * The next function is invoked on the completion of the last request of a
+ * batch, to compute the next-start time instant, soft_rt_next_start, such
+ * that, if the next request of the application does not arrive before
+ * soft_rt_next_start, then the above requirement on the bandwidth is met.
+ *
+ * The second requirement is that the request pattern of the application is
+ * isochronous, i.e., that, after issuing a request or a batch of requests,
+ * the application stops issuing new requests until all its pending requests
+ * have been completed. After that, the application may issue a new batch,
+ * and so on.
+ * For this reason the next function is invoked to compute
+ * soft_rt_next_start only for applications that meet this requirement,
+ * whereas soft_rt_next_start is set to infinity for applications that do
+ * not.
+ *
+ * Unfortunately, even a greedy application may happen to behave in an
+ * isochronous way if the CPU load is high. In fact, the application may
+ * stop issuing requests while the CPUs are busy serving other processes,
+ * then restart, then stop again for a while, and so on. In addition, if
+ * the disk achieves a low enough throughput with the request pattern
+ * issued by the application (e.g., because the request pattern is random
+ * and/or the device is slow), then the application may meet the above
+ * bandwidth requirement too. To prevent such a greedy application to be
+ * deemed as soft real-time, a further rule is used in the computation of
+ * soft_rt_next_start: soft_rt_next_start must be higher than the current
+ * time plus the maximum time for which the arrival of a request is waited
+ * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle.
+ * This filters out greedy applications, as the latter issue instead their
+ * next request as soon as possible after the last one has been completed
+ * (in contrast, when a batch of requests is completed, a soft real-time
+ * application spends some time processing data).
+ *
+ * Unfortunately, the last filter may easily generate false positives if
+ * only bfqd->bfq_slice_idle is used as a reference time interval and one
+ * or both the following cases occur:
+ * 1) HZ is so low that the duration of a jiffy is comparable to or higher
+ * than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with
+ * HZ=100.
+ * 2) jiffies, instead of increasing at a constant rate, may stop increasing
+ * for a while, then suddenly 'jump' by several units to recover the lost
+ * increments. This seems to happen, e.g., inside virtual machines.
+ * To address this issue, we do not use as a reference time interval just
+ * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In
+ * particular we add the minimum number of jiffies for which the filter
+ * seems to be quite precise also in embedded systems and KVM/QEMU virtual
+ * machines.
+ */
+static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd,
+ struct bfq_queue *bfqq)
+{
+ return max(bfqq->last_idle_bklogged +
+ HZ * bfqq->service_from_backlogged /
+ bfqd->bfq_wr_max_softrt_rate,
+ jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4);
+}
+
+/*
+ * Return the farthest future time instant according to jiffies
+ * macros.
+ */
+static unsigned long bfq_greatest_from_now(void)
+{
+ return jiffies + MAX_JIFFY_OFFSET;
+}
+
+/*
+ * Return the farthest past time instant according to jiffies
+ * macros.
+ */
+static unsigned long bfq_smallest_from_now(void)
+{
+ return jiffies - MAX_JIFFY_OFFSET;
+}
+
+/**
+ * bfq_bfqq_expire - expire a queue.
+ * @bfqd: device owning the queue.
+ * @bfqq: the queue to expire.
+ * @compensate: if true, compensate for the time spent idling.
+ * @reason: the reason causing the expiration.
+ *
+ * If the process associated with bfqq does slow I/O (e.g., because it
+ * issues random requests), we charge bfqq with the time it has been
+ * in service instead of the service it has received (see
+ * bfq_bfqq_charge_time for details on how this goal is achieved). As
+ * a consequence, bfqq will typically get higher timestamps upon
+ * reactivation, and hence it will be rescheduled as if it had
+ * received more service than what it has actually received. In the
+ * end, bfqq receives less service in proportion to how slowly its
+ * associated process consumes its budgets (and hence how seriously it
+ * tends to lower the throughput). In addition, this time-charging
+ * strategy guarantees time fairness among slow processes. In
+ * contrast, if the process associated with bfqq is not slow, we
+ * charge bfqq exactly with the service it has received.
+ *
+ * Charging time to the first type of queues and the exact service to
+ * the other has the effect of using the WF2Q+ policy to schedule the
+ * former on a timeslice basis, without violating service domain
+ * guarantees among the latter.
+ */
+void bfq_bfqq_expire(struct bfq_data *bfqd,
+ struct bfq_queue *bfqq,
+ bool compensate,
+ enum bfqq_expiration reason)
+{
+ bool slow;
+ unsigned long delta = 0;
+ struct bfq_entity *entity = &bfqq->entity;
+ int ref;
+
+ /*
+ * Check whether the process is slow (see bfq_bfqq_is_slow).
+ */
+ slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta);
+
+ /*
+ * Increase service_from_backlogged before next statement,
+ * because the possible next invocation of
+ * bfq_bfqq_charge_time would likely inflate
+ * entity->service. In contrast, service_from_backlogged must
+ * contain real service, to enable the soft real-time
+ * heuristic to correctly compute the bandwidth consumed by
+ * bfqq.
+ */
+ bfqq->service_from_backlogged += entity->service;
+
+ /*
+ * As above explained, charge slow (typically seeky) and
+ * timed-out queues with the time and not the service
+ * received, to favor sequential workloads.
+ *
+ * Processes doing I/O in the slower disk zones will tend to
+ * be slow(er) even if not seeky. Therefore, since the
+ * estimated peak rate is actually an average over the disk
+ * surface, these processes may timeout just for bad luck. To
+ * avoid punishing them, do not charge time to processes that
+ * succeeded in consuming at least 2/3 of their budget. This
+ * allows BFQ to preserve enough elasticity to still perform
+ * bandwidth, and not time, distribution with little unlucky
+ * or quasi-sequential processes.
+ */
+ if (bfqq->wr_coeff == 1 &&
+ (slow ||
+ (reason == BFQQE_BUDGET_TIMEOUT &&
+ bfq_bfqq_budget_left(bfqq) >= entity->budget / 3)))
+ bfq_bfqq_charge_time(bfqd, bfqq, delta);
+
+ if (reason == BFQQE_TOO_IDLE &&
+ entity->service <= 2 * entity->budget / 10)
+ bfq_clear_bfqq_IO_bound(bfqq);
+
+ if (bfqd->low_latency && bfqq->wr_coeff == 1)
+ bfqq->last_wr_start_finish = jiffies;
+
+ if (bfqd->low_latency && bfqd->bfq_wr_max_softrt_rate > 0 &&
+ RB_EMPTY_ROOT(&bfqq->sort_list)) {
+ /*
+ * If we get here, and there are no outstanding
+ * requests, then the request pattern is isochronous
+ * (see the comments on the function
+ * bfq_bfqq_softrt_next_start()). Thus we can compute
+ * soft_rt_next_start. If, instead, the queue still
+ * has outstanding requests, then we have to wait for
+ * the completion of all the outstanding requests to
+ * discover whether the request pattern is actually
+ * isochronous.
+ */
+ if (bfqq->dispatched == 0)
+ bfqq->soft_rt_next_start =
+ bfq_bfqq_softrt_next_start(bfqd, bfqq);
+ else {
+ /*
+ * The application is still waiting for the
+ * completion of one or more requests:
+ * prevent it from possibly being incorrectly
+ * deemed as soft real-time by setting its
+ * soft_rt_next_start to infinity. In fact,
+ * without this assignment, the application
+ * would be incorrectly deemed as soft
+ * real-time if:
+ * 1) it issued a new request before the
+ * completion of all its in-flight
+ * requests, and
+ * 2) at that time, its soft_rt_next_start
+ * happened to be in the past.
+ */
+ bfqq->soft_rt_next_start =
+ bfq_greatest_from_now();
+ /*
+ * Schedule an update of soft_rt_next_start to when
+ * the task may be discovered to be isochronous.
+ */
+ bfq_mark_bfqq_softrt_update(bfqq);
+ }
+ }
+
+ bfq_log_bfqq(bfqd, bfqq,
+ "expire (%d, slow %d, num_disp %d, idle_win %d)", reason,
+ slow, bfqq->dispatched, bfq_bfqq_idle_window(bfqq));
+
+ /*
+ * Increase, decrease or leave budget unchanged according to
+ * reason.
+ */
+ __bfq_bfqq_recalc_budget(bfqd, bfqq, reason);
+ ref = bfqq->ref;
+ __bfq_bfqq_expire(bfqd, bfqq);
+
+ /* mark bfqq as waiting a request only if a bic still points to it */
+ if (ref > 1 && !bfq_bfqq_busy(bfqq) &&
+ reason != BFQQE_BUDGET_TIMEOUT &&
+ reason != BFQQE_BUDGET_EXHAUSTED)
+ bfq_mark_bfqq_non_blocking_wait_rq(bfqq);
+}
+
+/*
+ * Budget timeout is not implemented through a dedicated timer, but
+ * just checked on request arrivals and completions, as well as on
+ * idle timer expirations.
+ */
+static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq)
+{
+ return time_is_before_eq_jiffies(bfqq->budget_timeout);
+}
+
+/*
+ * If we expire a queue that is actively waiting (i.e., with the
+ * device idled) for the arrival of a new request, then we may incur
+ * the timestamp misalignment problem described in the body of the
+ * function __bfq_activate_entity. Hence we return true only if this
+ * condition does not hold, or if the queue is slow enough to deserve
+ * only to be kicked off for preserving a high throughput.
+ */
+static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
+{
+ bfq_log_bfqq(bfqq->bfqd, bfqq,
+ "may_budget_timeout: wait_request %d left %d timeout %d",
+ bfq_bfqq_wait_request(bfqq),
+ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3,
+ bfq_bfqq_budget_timeout(bfqq));
+
+ return (!bfq_bfqq_wait_request(bfqq) ||
+ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3)
+ &&
+ bfq_bfqq_budget_timeout(bfqq);
+}
+
+/*
+ * For a queue that becomes empty, device idling is allowed only if
+ * this function returns true for the queue. As a consequence, since
+ * device idling plays a critical role in both throughput boosting and
+ * service guarantees, the return value of this function plays a
+ * critical role in both these aspects as well.
+ *
+ * In a nutshell, this function returns true only if idling is
+ * beneficial for throughput or, even if detrimental for throughput,
+ * idling is however necessary to preserve service guarantees (low
+ * latency, desired throughput distribution, ...). In particular, on
+ * NCQ-capable devices, this function tries to return false, so as to
+ * help keep the drives' internal queues full, whenever this helps the
+ * device boost the throughput without causing any service-guarantee
+ * issue.
+ *
+ * In more detail, the return value of this function is obtained by,
+ * first, computing a number of boolean variables that take into
+ * account throughput and service-guarantee issues, and, then,
+ * combining these variables in a logical expression. Most of the
+ * issues taken into account are not trivial. We discuss these issues
+ * individually while introducing the variables.
+ */
+static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)
+{
+ struct bfq_data *bfqd = bfqq->bfqd;
+ bool idling_boosts_thr, idling_boosts_thr_without_issues,
+ idling_needed_for_service_guarantees,
+ asymmetric_scenario;
+
+ if (bfqd->strict_guarantees)
+ return true;
+
+ /*
+ * The next variable takes into account the cases where idling
+ * boosts the throughput.
+ *
+ * The value of the variable is computed considering, first, that
+ * idling is virtually always beneficial for the throughput if:
+ * (a) the device is not NCQ-capable, or
+ * (b) regardless of the presence of NCQ, the device is rotational
+ * and the request pattern for bfqq is I/O-bound and sequential.
+ *
+ * Secondly, and in contrast to the above item (b), idling an
+ * NCQ-capable flash-based device would not boost the
+ * throughput even with sequential I/O; rather it would lower
+ * the throughput in proportion to how fast the device
+ * is. Accordingly, the next variable is true if any of the
+ * above conditions (a) and (b) is true, and, in particular,
+ * happens to be false if bfqd is an NCQ-capable flash-based
+ * device.
+ */
+ idling_boosts_thr = !bfqd->hw_tag ||
+ (!blk_queue_nonrot(bfqd->queue) && bfq_bfqq_IO_bound(bfqq) &&
+ bfq_bfqq_idle_window(bfqq));
+
+ /*
+ * The value of the next variable,
+ * idling_boosts_thr_without_issues, is equal to that of
+ * idling_boosts_thr, unless a special case holds. In this
+ * special case, described below, idling may cause problems to
+ * weight-raised queues.
+ *
+ * When the request pool is saturated (e.g., in the presence
+ * of write hogs), if the processes associated with
+ * non-weight-raised queues ask for requests at a lower rate,
+ * then processes associated with weight-raised queues have a
+ * higher probability to get a request from the pool
+ * immediately (or at least soon) when they need one. Thus
+ * they have a higher probability to actually get a fraction
+ * of the device throughput proportional to their high
+ * weight. This is especially true with NCQ-capable drives,
+ * which enqueue several requests in advance, and further
+ * reorder internally-queued requests.
+ *
+ * For this reason, we force to false the value of
+ * idling_boosts_thr_without_issues if there are weight-raised
+ * busy queues. In this case, and if bfqq is not weight-raised,
+ * this guarantees that the device is not idled for bfqq (if,
+ * instead, bfqq is weight-raised, then idling will be
+ * guaranteed by another variable, see below). Combined with
+ * the timestamping rules of BFQ (see [1] for details), this
+ * behavior causes bfqq, and hence any sync non-weight-raised
+ * queue, to get a lower number of requests served, and thus
+ * to ask for a lower number of requests from the request
+ * pool, before the busy weight-raised queues get served
+ * again. This often mitigates starvation problems in the
+ * presence of heavy write workloads and NCQ, thereby
+ * guaranteeing a higher application and system responsiveness
+ * in these hostile scenarios.
+ */
+ idling_boosts_thr_without_issues = idling_boosts_thr &&
+ bfqd->wr_busy_queues == 0;
+
+ /*
+ * There is then a case where idling must be performed not
+ * for throughput concerns, but to preserve service
+ * guarantees.
+ *
+ * To introduce this case, we can note that allowing the drive
+ * to enqueue more than one request at a time, and hence
+ * delegating de facto final scheduling decisions to the
+ * drive's internal scheduler, entails loss of control on the
+ * actual request service order. In particular, the critical
+ * situation is when requests from different processes happen
+ * to be present, at the same time, in the internal queue(s)
+ * of the drive. In such a situation, the drive, by deciding
+ * the service order of the internally-queued requests, does
+ * determine also the actual throughput distribution among
+ * these processes. But the drive typically has no notion or
+ * concern about per-process throughput distribution, and
+ * makes its decisions only on a per-request basis. Therefore,
+ * the service distribution enforced by the drive's internal
+ * scheduler is likely to coincide with the desired
+ * device-throughput distribution only in a completely
+ * symmetric scenario where:
+ * (i) each of these processes must get the same throughput as
+ * the others;
+ * (ii) all these processes have the same I/O pattern
+ (either sequential or random).
+ * In fact, in such a scenario, the drive will tend to treat
+ * the requests of each of these processes in about the same
+ * way as the requests of the others, and thus to provide
+ * each of these processes with about the same throughput
+ * (which is exactly the desired throughput distribution). In
+ * contrast, in any asymmetric scenario, device idling is
+ * certainly needed to guarantee that bfqq receives its
+ * assigned fraction of the device throughput (see [1] for
+ * details).
+ *
+ * We address this issue by controlling, actually, only the
+ * symmetry sub-condition (i), i.e., provided that
+ * sub-condition (i) holds, idling is not performed,
+ * regardless of whether sub-condition (ii) holds. In other
+ * words, only if sub-condition (i) holds, then idling is
+ * allowed, and the device tends to be prevented from queueing
+ * many requests, possibly of several processes. The reason
+ * for not controlling also sub-condition (ii) is that we
+ * exploit preemption to preserve guarantees in case of
+ * symmetric scenarios, even if (ii) does not hold, as
+ * explained in the next two paragraphs.
+ *
+ * Even if a queue, say Q, is expired when it remains idle, Q
+ * can still preempt the new in-service queue if the next
+ * request of Q arrives soon (see the comments on
+ * bfq_bfqq_update_budg_for_activation). If all queues and
+ * groups have the same weight, this form of preemption,
+ * combined with the hole-recovery heuristic described in the
+ * comments on function bfq_bfqq_update_budg_for_activation,
+ * are enough to preserve a correct bandwidth distribution in
+ * the mid term, even without idling. In fact, even if not
+ * idling allows the internal queues of the device to contain
+ * many requests, and thus to reorder requests, we can rather
+ * safely assume that the internal scheduler still preserves a
+ * minimum of mid-term fairness. The motivation for using
+ * preemption instead of idling is that, by not idling,
+ * service guarantees are preserved without minimally
+ * sacrificing throughput. In other words, both a high
+ * throughput and its desired distribution are obtained.
+ *
+ * More precisely, this preemption-based, idleless approach
+ * provides fairness in terms of IOPS, and not sectors per
+ * second. This can be seen with a simple example. Suppose
+ * that there are two queues with the same weight, but that
+ * the first queue receives requests of 8 sectors, while the
+ * second queue receives requests of 1024 sectors. In
+ * addition, suppose that each of the two queues contains at
+ * most one request at a time, which implies that each queue
+ * always remains idle after it is served. Finally, after
+ * remaining idle, each queue receives very quickly a new
+ * request. It follows that the two queues are served
+ * alternatively, preempting each other if needed. This
+ * implies that, although both queues have the same weight,
+ * the queue with large requests receives a service that is
+ * 1024/8 times as high as the service received by the other
+ * queue.
+ *
+ * On the other hand, device idling is performed, and thus
+ * pure sector-domain guarantees are provided, for the
+ * following queues, which are likely to need stronger
+ * throughput guarantees: weight-raised queues, and queues
+ * with a higher weight than other queues. When such queues
+ * are active, sub-condition (i) is false, which triggers
+ * device idling.
+ *
+ * According to the above considerations, the next variable is
+ * true (only) if sub-condition (i) holds. To compute the
+ * value of this variable, we not only use the return value of
+ * the function bfq_symmetric_scenario(), but also check
+ * whether bfqq is being weight-raised, because
+ * bfq_symmetric_scenario() does not take into account also
+ * weight-raised queues (see comments on
+ * bfq_weights_tree_add()).
+ *
+ * As a side note, it is worth considering that the above
+ * device-idling countermeasures may however fail in the
+ * following unlucky scenario: if idling is (correctly)
+ * disabled in a time period during which all symmetry
+ * sub-conditions hold, and hence the device is allowed to
+ * enqueue many requests, but at some later point in time some
+ * sub-condition stops to hold, then it may become impossible
+ * to let requests be served in the desired order until all
+ * the requests already queued in the device have been served.
+ */
+ asymmetric_scenario = bfqq->wr_coeff > 1 ||
+ !bfq_symmetric_scenario(bfqd);
+
+ /*
+ * Finally, there is a case where maximizing throughput is the
+ * best choice even if it may cause unfairness toward
+ * bfqq. Such a case is when bfqq became active in a burst of
+ * queue activations. Queues that became active during a large
+ * burst benefit only from throughput, as discussed in the
+ * comments on bfq_handle_burst. Thus, if bfqq became active
+ * in a burst and not idling the device maximizes throughput,
+ * then the device must no be idled, because not idling the
+ * device provides bfqq and all other queues in the burst with
+ * maximum benefit. Combining this and the above case, we can
+ * now establish when idling is actually needed to preserve
+ * service guarantees.
+ */
+ idling_needed_for_service_guarantees =
+ asymmetric_scenario && !bfq_bfqq_in_large_burst(bfqq);
+
+ /*
+ * We have now all the components we need to compute the return
+ * value of the function, which is true only if both the following
+ * conditions hold:
+ * 1) bfqq is sync, because idling make sense only for sync queues;
+ * 2) idling either boosts the throughput (without issues), or
+ * is necessary to preserve service guarantees.
+ */
+ return bfq_bfqq_sync(bfqq) &&
+ (idling_boosts_thr_without_issues ||
+ idling_needed_for_service_guarantees);
+}
+
+/*
+ * If the in-service queue is empty but the function bfq_bfqq_may_idle
+ * returns true, then:
+ * 1) the queue must remain in service and cannot be expired, and
+ * 2) the device must be idled to wait for the possible arrival of a new
+ * request for the queue.
+ * See the comments on the function bfq_bfqq_may_idle for the reasons
+ * why performing device idling is the best choice to boost the throughput
+ * and preserve service guarantees when bfq_bfqq_may_idle itself
+ * returns true.
+ */
+static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)
+{
+ struct bfq_data *bfqd = bfqq->bfqd;
+
+ return RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 &&
+ bfq_bfqq_may_idle(bfqq);
+}
+
+/*
+ * Select a queue for service. If we have a current queue in service,
+ * check whether to continue servicing it, or retrieve and set a new one.
+ */
+static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
+{
+ struct bfq_queue *bfqq;
+ struct request *next_rq;
+ enum bfqq_expiration reason = BFQQE_BUDGET_TIMEOUT;
+
+ bfqq = bfqd->in_service_queue;
+ if (!bfqq)
+ goto new_queue;
+
+ bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue");
+
+ if (bfq_may_expire_for_budg_timeout(bfqq) &&
+ !bfq_bfqq_wait_request(bfqq) &&
+ !bfq_bfqq_must_idle(bfqq))
+ goto expire;
+
+check_queue:
+ /*
+ * This loop is rarely executed more than once. Even when it
+ * happens, it is much more convenient to re-execute this loop
+ * than to return NULL and trigger a new dispatch to get a
+ * request served.
+ */
+ next_rq = bfqq->next_rq;
+ /*
+ * If bfqq has requests queued and it has enough budget left to
+ * serve them, keep the queue, otherwise expire it.
+ */
+ if (next_rq) {
+ if (bfq_serv_to_charge(next_rq, bfqq) >
+ bfq_bfqq_budget_left(bfqq)) {
+ /*
+ * Expire the queue for budget exhaustion,
+ * which makes sure that the next budget is
+ * enough to serve the next request, even if
+ * it comes from the fifo expired path.
+ */
+ reason = BFQQE_BUDGET_EXHAUSTED;
+ goto expire;
+ } else {
+ /*
+ * The idle timer may be pending because we may
+ * not disable disk idling even when a new request
+ * arrives.
+ */
+ if (bfq_bfqq_wait_request(bfqq)) {
+ /*
+ * If we get here: 1) at least a new request
+ * has arrived but we have not disabled the
+ * timer because the request was too small,
+ * 2) then the block layer has unplugged
+ * the device, causing the dispatch to be
+ * invoked.
+ *
+ * Since the device is unplugged, now the
+ * requests are probably large enough to
+ * provide a reasonable throughput.
+ * So we disable idling.
+ */
+ bfq_clear_bfqq_wait_request(bfqq);
+ hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
+ bfqg_stats_update_idle_time(bfqq_group(bfqq));
+ }
+ goto keep_queue;
+ }
+ }
+
+ /*
+ * No requests pending. However, if the in-service queue is idling
+ * for a new request, or has requests waiting for a completion and
+ * may idle after their completion, then keep it anyway.
+ */
+ if (bfq_bfqq_wait_request(bfqq) ||
+ (bfqq->dispatched != 0 && bfq_bfqq_may_idle(bfqq))) {
+ bfqq = NULL;
+ goto keep_queue;
+ }
+
+ reason = BFQQE_NO_MORE_REQUESTS;
+expire:
+ bfq_bfqq_expire(bfqd, bfqq, false, reason);
+new_queue:
+ bfqq = bfq_set_in_service_queue(bfqd);
+ if (bfqq) {
+ bfq_log_bfqq(bfqd, bfqq, "select_queue: checking new queue");
+ goto check_queue;
+ }
+keep_queue:
+ if (bfqq)
+ bfq_log_bfqq(bfqd, bfqq, "select_queue: returned this queue");
+ else
+ bfq_log(bfqd, "select_queue: no queue returned");
+
+ return bfqq;
+}
+
+static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+ struct bfq_entity *entity = &bfqq->entity;
+
+ if (bfqq->wr_coeff > 1) { /* queue is being weight-raised */
+ bfq_log_bfqq(bfqd, bfqq,
+ "raising period dur %u/%u msec, old coeff %u, w %d(%d)",
+ jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish),
+ jiffies_to_msecs(bfqq->wr_cur_max_time),
+ bfqq->wr_coeff,
+ bfqq->entity.weight, bfqq->entity.orig_weight);
+
+ if (entity->prio_changed)
+ bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change");
+
+ /*
+ * If the queue was activated in a burst, or too much
+ * time has elapsed from the beginning of this
+ * weight-raising period, then end weight raising.
+ */
+ if (bfq_bfqq_in_large_burst(bfqq))
+ bfq_bfqq_end_wr(bfqq);
+ else if (time_is_before_jiffies(bfqq->last_wr_start_finish +
+ bfqq->wr_cur_max_time)) {
+ if (bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time ||
+ time_is_before_jiffies(bfqq->wr_start_at_switch_to_srt +
+ bfq_wr_duration(bfqd)))
+ bfq_bfqq_end_wr(bfqq);
+ else {
+ /* switch back to interactive wr */
+ bfqq->wr_coeff = bfqd->bfq_wr_coeff;
+ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
+ bfqq->last_wr_start_finish =
+ bfqq->wr_start_at_switch_to_srt;
+ bfqq->entity.prio_changed = 1;
+ }
+ }
+ }
+ /* Update weight both if it must be raised and if it must be lowered */
+ if ((entity->weight > entity->orig_weight) != (bfqq->wr_coeff > 1))
+ __bfq_entity_update_weight_prio(
+ bfq_entity_service_tree(entity),
+ entity);
+}
+
+/*
+ * Dispatch next request from bfqq.
+ */
+static struct request *bfq_dispatch_rq_from_bfqq(struct bfq_data *bfqd,
+ struct bfq_queue *bfqq)
+{
+ struct request *rq = bfqq->next_rq;
+ unsigned long service_to_charge;
+
+ service_to_charge = bfq_serv_to_charge(rq, bfqq);
+
+ bfq_bfqq_served(bfqq, service_to_charge);
+
+ bfq_dispatch_remove(bfqd->queue, rq);
+
+ /*
+ * If weight raising has to terminate for bfqq, then next
+ * function causes an immediate update of bfqq's weight,
+ * without waiting for next activation. As a consequence, on
+ * expiration, bfqq will be timestamped as if has never been
+ * weight-raised during this service slot, even if it has
+ * received part or even most of the service as a
+ * weight-raised queue. This inflates bfqq's timestamps, which
+ * is beneficial, as bfqq is then more willing to leave the
+ * device immediately to possible other weight-raised queues.
+ */
+ bfq_update_wr_data(bfqd, bfqq);
+
+ /*
+ * Expire bfqq, pretending that its budget expired, if bfqq
+ * belongs to CLASS_IDLE and other queues are waiting for
+ * service.
+ */
+ if (bfqd->busy_queues > 1 && bfq_class_idle(bfqq))
+ goto expire;
+
+ return rq;
+
+expire:
+ bfq_bfqq_expire(bfqd, bfqq, false, BFQQE_BUDGET_EXHAUSTED);
+ return rq;
+}
+
+static bool bfq_has_work(struct blk_mq_hw_ctx *hctx)
+{
+ struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
+
+ /*
+ * Avoiding lock: a race on bfqd->busy_queues should cause at
+ * most a call to dispatch for nothing
+ */
+ return !list_empty_careful(&bfqd->dispatch) ||
+ bfqd->busy_queues > 0;
+}
+
+static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
+{
+ struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
+ struct request *rq = NULL;
+ struct bfq_queue *bfqq = NULL;
+
+ if (!list_empty(&bfqd->dispatch)) {
+ rq = list_first_entry(&bfqd->dispatch, struct request,
+ queuelist);
+ list_del_init(&rq->queuelist);
+
+ bfqq = RQ_BFQQ(rq);
+
+ if (bfqq) {
+ /*
+ * Increment counters here, because this
+ * dispatch does not follow the standard
+ * dispatch flow (where counters are
+ * incremented)
+ */
+ bfqq->dispatched++;
+
+ goto inc_in_driver_start_rq;
+ }
+
+ /*
+ * We exploit the put_rq_private hook to decrement
+ * rq_in_driver, but put_rq_private will not be
+ * invoked on this request. So, to avoid unbalance,
+ * just start this request, without incrementing
+ * rq_in_driver. As a negative consequence,
+ * rq_in_driver is deceptively lower than it should be
+ * while this request is in service. This may cause
+ * bfq_schedule_dispatch to be invoked uselessly.
+ *
+ * As for implementing an exact solution, the
+ * put_request hook, if defined, is probably invoked
+ * also on this request. So, by exploiting this hook,
+ * we could 1) increment rq_in_driver here, and 2)
+ * decrement it in put_request. Such a solution would
+ * let the value of the counter be always accurate,
+ * but it would entail using an extra interface
+ * function. This cost seems higher than the benefit,
+ * being the frequency of non-elevator-private
+ * requests very low.
+ */
+ goto start_rq;
+ }
+
+ bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);
+
+ if (bfqd->busy_queues == 0)
+ goto exit;
+
+ /*
+ * Force device to serve one request at a time if
+ * strict_guarantees is true. Forcing this service scheme is
+ * currently the ONLY way to guarantee that the request
+ * service order enforced by the scheduler is respected by a
+ * queueing device. Otherwise the device is free even to make
+ * some unlucky request wait for as long as the device
+ * wishes.
+ *
+ * Of course, serving one request at at time may cause loss of
+ * throughput.
+ */
+ if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0)
+ goto exit;
+
+ bfqq = bfq_select_queue(bfqd);
+ if (!bfqq)
+ goto exit;
+
+ rq = bfq_dispatch_rq_from_bfqq(bfqd, bfqq);
+
+ if (rq) {
+inc_in_driver_start_rq:
+ bfqd->rq_in_driver++;
+start_rq:
+ rq->rq_flags |= RQF_STARTED;
+ }
+exit:
+ return rq;
+}
+
+static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
+{
+ struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
+ struct request *rq;
+
+ spin_lock_irq(&bfqd->lock);
+
+ rq = __bfq_dispatch_request(hctx);
+ spin_unlock_irq(&bfqd->lock);
+
+ return rq;
+}
+
+/*
+ * Task holds one reference to the queue, dropped when task exits. Each rq
+ * in-flight on this queue also holds a reference, dropped when rq is freed.
+ *
+ * Scheduler lock must be held here. Recall not to use bfqq after calling
+ * this function on it.
+ */
+void bfq_put_queue(struct bfq_queue *bfqq)
+{
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+ struct bfq_group *bfqg = bfqq_group(bfqq);
+#endif
+
+ if (bfqq->bfqd)
+ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d",
+ bfqq, bfqq->ref);
+
+ bfqq->ref--;
+ if (bfqq->ref)
+ return;
+
+ if (bfq_bfqq_sync(bfqq))
+ /*
+ * The fact that this queue is being destroyed does not
+ * invalidate the fact that this queue may have been
+ * activated during the current burst. As a consequence,
+ * although the queue does not exist anymore, and hence
+ * needs to be removed from the burst list if there,
+ * the burst size has not to be decremented.
+ */
+ hlist_del_init(&bfqq->burst_list_node);
+
+ kmem_cache_free(bfq_pool, bfqq);
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+ bfqg_put(bfqg);
+#endif
+}
+
+static void bfq_put_cooperator(struct bfq_queue *bfqq)
+{
+ struct bfq_queue *__bfqq, *next;
+
+ /*
+ * If this queue was scheduled to merge with another queue, be
+ * sure to drop the reference taken on that queue (and others in
+ * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs.
+ */
+ __bfqq = bfqq->new_bfqq;
+ while (__bfqq) {
+ if (__bfqq == bfqq)
+ break;
+ next = __bfqq->new_bfqq;
+ bfq_put_queue(__bfqq);
+ __bfqq = next;
+ }
+}
+
+static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+ if (bfqq == bfqd->in_service_queue) {
+ __bfq_bfqq_expire(bfqd, bfqq);
+ bfq_schedule_dispatch(bfqd);
+ }
+
+ bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, bfqq->ref);
+
+ bfq_put_cooperator(bfqq);
+
+ bfq_put_queue(bfqq); /* release process reference */
+}
+
+static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync)
+{
+ struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync);
+ struct bfq_data *bfqd;
+
+ if (bfqq)
+ bfqd = bfqq->bfqd; /* NULL if scheduler already exited */
+
+ if (bfqq && bfqd) {
+ unsigned long flags;
+
+ spin_lock_irqsave(&bfqd->lock, flags);
+ bfq_exit_bfqq(bfqd, bfqq);
+ bic_set_bfqq(bic, NULL, is_sync);
+ spin_unlock_irqrestore(&bfqd->lock, flags);
+ }
+}
+
+static void bfq_exit_icq(struct io_cq *icq)
+{
+ struct bfq_io_cq *bic = icq_to_bic(icq);
+
+ bfq_exit_icq_bfqq(bic, true);
+ bfq_exit_icq_bfqq(bic, false);
+}
+
+/*
+ * Update the entity prio values; note that the new values will not
+ * be used until the next (re)activation.
+ */
+static void
+bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
+{
+ struct task_struct *tsk = current;
+ int ioprio_class;
+ struct bfq_data *bfqd = bfqq->bfqd;
+
+ if (!bfqd)
+ return;
+
+ ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
+ switch (ioprio_class) {
+ default:
+ dev_err(bfqq->bfqd->queue->backing_dev_info->dev,
+ "bfq: bad prio class %d\n", ioprio_class);
+ case IOPRIO_CLASS_NONE:
+ /*
+ * No prio set, inherit CPU scheduling settings.
+ */
+ bfqq->new_ioprio = task_nice_ioprio(tsk);
+ bfqq->new_ioprio_class = task_nice_ioclass(tsk);
+ break;
+ case IOPRIO_CLASS_RT:
+ bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
+ bfqq->new_ioprio_class = IOPRIO_CLASS_RT;
+ break;
+ case IOPRIO_CLASS_BE:
+ bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
+ bfqq->new_ioprio_class = IOPRIO_CLASS_BE;
+ break;
+ case IOPRIO_CLASS_IDLE:
+ bfqq->new_ioprio_class = IOPRIO_CLASS_IDLE;
+ bfqq->new_ioprio = 7;
+ bfq_clear_bfqq_idle_window(bfqq);
+ break;
+ }
+
+ if (bfqq->new_ioprio >= IOPRIO_BE_NR) {
+ pr_crit("bfq_set_next_ioprio_data: new_ioprio %d\n",
+ bfqq->new_ioprio);
+ bfqq->new_ioprio = IOPRIO_BE_NR;
+ }
+
+ bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio);
+ bfqq->entity.prio_changed = 1;
+}
+
+static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
+ struct bio *bio, bool is_sync,
+ struct bfq_io_cq *bic);
+
+static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio)
+{
+ struct bfq_data *bfqd = bic_to_bfqd(bic);
+ struct bfq_queue *bfqq;
+ int ioprio = bic->icq.ioc->ioprio;
+
+ /*
+ * This condition may trigger on a newly created bic, be sure to
+ * drop the lock before returning.
+ */
+ if (unlikely(!bfqd) || likely(bic->ioprio == ioprio))
+ return;
+
+ bic->ioprio = ioprio;
+
+ bfqq = bic_to_bfqq(bic, false);
+ if (bfqq) {
+ /* release process reference on this queue */
+ bfq_put_queue(bfqq);
+ bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic);
+ bic_set_bfqq(bic, bfqq, false);
+ }
+
+ bfqq = bic_to_bfqq(bic, true);
+ if (bfqq)
+ bfq_set_next_ioprio_data(bfqq, bic);
+}
+
+static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+ struct bfq_io_cq *bic, pid_t pid, int is_sync)
+{
+ RB_CLEAR_NODE(&bfqq->entity.rb_node);
+ INIT_LIST_HEAD(&bfqq->fifo);
+ INIT_HLIST_NODE(&bfqq->burst_list_node);
+
+ bfqq->ref = 0;
+ bfqq->bfqd = bfqd;
+
+ if (bic)
+ bfq_set_next_ioprio_data(bfqq, bic);
+
+ if (is_sync) {
+ if (!bfq_class_idle(bfqq))
+ bfq_mark_bfqq_idle_window(bfqq);
+ bfq_mark_bfqq_sync(bfqq);
+ bfq_mark_bfqq_just_created(bfqq);
+ } else
+ bfq_clear_bfqq_sync(bfqq);
+
+ /* set end request to minus infinity from now */
+ bfqq->ttime.last_end_request = ktime_get_ns() + 1;
+
+ bfq_mark_bfqq_IO_bound(bfqq);
+
+ bfqq->pid = pid;
+
+ /* Tentative initial value to trade off between thr and lat */
+ bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3;
+ bfqq->budget_timeout = bfq_smallest_from_now();
+
+ bfqq->wr_coeff = 1;
+ bfqq->last_wr_start_finish = jiffies;
+ bfqq->wr_start_at_switch_to_srt = bfq_smallest_from_now();
+ bfqq->split_time = bfq_smallest_from_now();
+
+ /*
+ * Set to the value for which bfqq will not be deemed as
+ * soft rt when it becomes backlogged.
+ */
+ bfqq->soft_rt_next_start = bfq_greatest_from_now();
+
+ /* first request is almost certainly seeky */
+ bfqq->seek_history = 1;
+}
+
+static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd,
+ struct bfq_group *bfqg,
+ int ioprio_class, int ioprio)
+{
+ switch (ioprio_class) {
+ case IOPRIO_CLASS_RT:
+ return &bfqg->async_bfqq[0][ioprio];
+ case IOPRIO_CLASS_NONE:
+ ioprio = IOPRIO_NORM;
+ /* fall through */
+ case IOPRIO_CLASS_BE:
+ return &bfqg->async_bfqq[1][ioprio];
+ case IOPRIO_CLASS_IDLE:
+ return &bfqg->async_idle_bfqq;
+ default:
+ return NULL;
+ }
+}
+
+static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
+ struct bio *bio, bool is_sync,
+ struct bfq_io_cq *bic)
+{
+ const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
+ const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
+ struct bfq_queue **async_bfqq = NULL;
+ struct bfq_queue *bfqq;
+ struct bfq_group *bfqg;
+
+ rcu_read_lock();
+
+ bfqg = bfq_find_set_group(bfqd, bio_blkcg(bio));
+ if (!bfqg) {
+ bfqq = &bfqd->oom_bfqq;
+ goto out;
+ }
+
+ if (!is_sync) {
+ async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class,
+ ioprio);
+ bfqq = *async_bfqq;
+ if (bfqq)
+ goto out;
+ }
+
+ bfqq = kmem_cache_alloc_node(bfq_pool,
+ GFP_NOWAIT | __GFP_ZERO | __GFP_NOWARN,
+ bfqd->queue->node);
+
+ if (bfqq) {
+ bfq_init_bfqq(bfqd, bfqq, bic, current->pid,
+ is_sync);
+ bfq_init_entity(&bfqq->entity, bfqg);
+ bfq_log_bfqq(bfqd, bfqq, "allocated");
+ } else {
+ bfqq = &bfqd->oom_bfqq;
+ bfq_log_bfqq(bfqd, bfqq, "using oom bfqq");
+ goto out;
+ }
+
+ /*
+ * Pin the queue now that it's allocated, scheduler exit will
+ * prune it.
+ */
+ if (async_bfqq) {
+ bfqq->ref++; /*
+ * Extra group reference, w.r.t. sync
+ * queue. This extra reference is removed
+ * only if bfqq->bfqg disappears, to
+ * guarantee that this queue is not freed
+ * until its group goes away.
+ */
+ bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d",
+ bfqq, bfqq->ref);
+ *async_bfqq = bfqq;
+ }
+
+out:
+ bfqq->ref++; /* get a process reference to this queue */
+ bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, bfqq->ref);
+ rcu_read_unlock();
+ return bfqq;
+}
+
+static void bfq_update_io_thinktime(struct bfq_data *bfqd,
+ struct bfq_queue *bfqq)
+{
+ struct bfq_ttime *ttime = &bfqq->ttime;
+ u64 elapsed = ktime_get_ns() - bfqq->ttime.last_end_request;
+
+ elapsed = min_t(u64, elapsed, 2ULL * bfqd->bfq_slice_idle);
+
+ ttime->ttime_samples = (7*bfqq->ttime.ttime_samples + 256) / 8;
+ ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed, 8);
+ ttime->ttime_mean = div64_ul(ttime->ttime_total + 128,
+ ttime->ttime_samples);
+}
+
+static void
+bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+ struct request *rq)
+{
+ bfqq->seek_history <<= 1;
+ bfqq->seek_history |=
+ get_sdist(bfqq->last_request_pos, rq) > BFQQ_SEEK_THR &&
+ (!blk_queue_nonrot(bfqd->queue) ||
+ blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT);
+}
+
+/*
+ * Disable idle window if the process thinks too long or seeks so much that
+ * it doesn't matter.
+ */
+static void bfq_update_idle_window(struct bfq_data *bfqd,
+ struct bfq_queue *bfqq,
+ struct bfq_io_cq *bic)
+{
+ int enable_idle;
+
+ /* Don't idle for async or idle io prio class. */
+ if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))
+ return;
+
+ /* Idle window just restored, statistics are meaningless. */
+ if (time_is_after_eq_jiffies(bfqq->split_time +
+ bfqd->bfq_wr_min_idle_time))
+ return;
+
+ enable_idle = bfq_bfqq_idle_window(bfqq);
+
+ if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||
+ bfqd->bfq_slice_idle == 0 ||
+ (bfqd->hw_tag && BFQQ_SEEKY(bfqq) &&
+ bfqq->wr_coeff == 1))
+ enable_idle = 0;
+ else if (bfq_sample_valid(bfqq->ttime.ttime_samples)) {
+ if (bfqq->ttime.ttime_mean > bfqd->bfq_slice_idle &&
+ bfqq->wr_coeff == 1)
+ enable_idle = 0;
+ else
+ enable_idle = 1;
+ }
+ bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d",
+ enable_idle);
+
+ if (enable_idle)
+ bfq_mark_bfqq_idle_window(bfqq);
+ else
+ bfq_clear_bfqq_idle_window(bfqq);
+}
+
+/*
+ * Called when a new fs request (rq) is added to bfqq. Check if there's
+ * something we should do about it.
+ */
+static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+ struct request *rq)
+{
+ struct bfq_io_cq *bic = RQ_BIC(rq);
+
+ if (rq->cmd_flags & REQ_META)
+ bfqq->meta_pending++;
+
+ bfq_update_io_thinktime(bfqd, bfqq);
+ bfq_update_io_seektime(bfqd, bfqq, rq);
+ if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
+ !BFQQ_SEEKY(bfqq))
+ bfq_update_idle_window(bfqd, bfqq, bic);
+
+ bfq_log_bfqq(bfqd, bfqq,
+ "rq_enqueued: idle_window=%d (seeky %d)",
+ bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq));
+
+ bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
+
+ if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) {
+ bool small_req = bfqq->queued[rq_is_sync(rq)] == 1 &&
+ blk_rq_sectors(rq) < 32;
+ bool budget_timeout = bfq_bfqq_budget_timeout(bfqq);
+
+ /*
+ * There is just this request queued: if the request
+ * is small and the queue is not to be expired, then
+ * just exit.
+ *
+ * In this way, if the device is being idled to wait
+ * for a new request from the in-service queue, we
+ * avoid unplugging the device and committing the
+ * device to serve just a small request. On the
+ * contrary, we wait for the block layer to decide
+ * when to unplug the device: hopefully, new requests
+ * will be merged to this one quickly, then the device
+ * will be unplugged and larger requests will be
+ * dispatched.
+ */
+ if (small_req && !budget_timeout)
+ return;
+
+ /*
+ * A large enough request arrived, or the queue is to
+ * be expired: in both cases disk idling is to be
+ * stopped, so clear wait_request flag and reset
+ * timer.
+ */
+ bfq_clear_bfqq_wait_request(bfqq);
+ hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
+ bfqg_stats_update_idle_time(bfqq_group(bfqq));
+
+ /*
+ * The queue is not empty, because a new request just
+ * arrived. Hence we can safely expire the queue, in
+ * case of budget timeout, without risking that the
+ * timestamps of the queue are not updated correctly.
+ * See [1] for more details.
+ */
+ if (budget_timeout)
+ bfq_bfqq_expire(bfqd, bfqq, false,
+ BFQQE_BUDGET_TIMEOUT);
+ }
+}
+
+static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
+{
+ struct bfq_queue *bfqq = RQ_BFQQ(rq),
+ *new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true);
+
+ if (new_bfqq) {
+ if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq)
+ new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1);
+ /*
+ * Release the request's reference to the old bfqq
+ * and make sure one is taken to the shared queue.
+ */
+ new_bfqq->allocated++;
+ bfqq->allocated--;
+ new_bfqq->ref++;
+ bfq_clear_bfqq_just_created(bfqq);
+ /*
+ * If the bic associated with the process
+ * issuing this request still points to bfqq
+ * (and thus has not been already redirected
+ * to new_bfqq or even some other bfq_queue),
+ * then complete the merge and redirect it to
+ * new_bfqq.
+ */
+ if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq)
+ bfq_merge_bfqqs(bfqd, RQ_BIC(rq),
+ bfqq, new_bfqq);
+ /*
+ * rq is about to be enqueued into new_bfqq,
+ * release rq reference on bfqq
+ */
+ bfq_put_queue(bfqq);
+ rq->elv.priv[1] = new_bfqq;
+ bfqq = new_bfqq;
+ }
+
+ bfq_add_request(rq);
+
+ rq->fifo_time = ktime_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)];
+ list_add_tail(&rq->queuelist, &bfqq->fifo);
+
+ bfq_rq_enqueued(bfqd, bfqq, rq);
+}
+
+static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
+ bool at_head)
+{
+ struct request_queue *q = hctx->queue;
+ struct bfq_data *bfqd = q->elevator->elevator_data;
+
+ spin_lock_irq(&bfqd->lock);
+ if (blk_mq_sched_try_insert_merge(q, rq)) {
+ spin_unlock_irq(&bfqd->lock);
+ return;
+ }
+
+ spin_unlock_irq(&bfqd->lock);
+
+ blk_mq_sched_request_inserted(rq);
+
+ spin_lock_irq(&bfqd->lock);
+ if (at_head || blk_rq_is_passthrough(rq)) {
+ if (at_head)
+ list_add(&rq->queuelist, &bfqd->dispatch);
+ else
+ list_add_tail(&rq->queuelist, &bfqd->dispatch);
+ } else {
+ __bfq_insert_request(bfqd, rq);
+
+ if (rq_mergeable(rq)) {
+ elv_rqhash_add(q, rq);
+ if (!q->last_merge)
+ q->last_merge = rq;
+ }
+ }
+
+ spin_unlock_irq(&bfqd->lock);
+}
+
+static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx,
+ struct list_head *list, bool at_head)
+{
+ while (!list_empty(list)) {
+ struct request *rq;
+
+ rq = list_first_entry(list, struct request, queuelist);
+ list_del_init(&rq->queuelist);
+ bfq_insert_request(hctx, rq, at_head);
+ }
+}
+
+static void bfq_update_hw_tag(struct bfq_data *bfqd)
+{
+ bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver,
+ bfqd->rq_in_driver);
+
+ if (bfqd->hw_tag == 1)
+ return;
+
+ /*
+ * This sample is valid if the number of outstanding requests
+ * is large enough to allow a queueing behavior. Note that the
+ * sum is not exact, as it's not taking into account deactivated
+ * requests.
+ */
+ if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD)
+ return;
+
+ if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)
+ return;
+
+ bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD;
+ bfqd->max_rq_in_driver = 0;
+ bfqd->hw_tag_samples = 0;
+}
+
+static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
+{
+ u64 now_ns;
+ u32 delta_us;
+
+ bfq_update_hw_tag(bfqd);
+
+ bfqd->rq_in_driver--;
+ bfqq->dispatched--;
+
+ if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) {
+ /*
+ * Set budget_timeout (which we overload to store the
+ * time at which the queue remains with no backlog and
+ * no outstanding request; used by the weight-raising
+ * mechanism).
+ */
+ bfqq->budget_timeout = jiffies;
+
+ bfq_weights_tree_remove(bfqd, &bfqq->entity,
+ &bfqd->queue_weights_tree);
+ }
+
+ now_ns = ktime_get_ns();
+
+ bfqq->ttime.last_end_request = now_ns;
+
+ /*
+ * Using us instead of ns, to get a reasonable precision in
+ * computing rate in next check.
+ */
+ delta_us = div_u64(now_ns - bfqd->last_completion, NSEC_PER_USEC);
+
+ /*
+ * If the request took rather long to complete, and, according
+ * to the maximum request size recorded, this completion latency
+ * implies that the request was certainly served at a very low
+ * rate (less than 1M sectors/sec), then the whole observation
+ * interval that lasts up to this time instant cannot be a
+ * valid time interval for computing a new peak rate. Invoke
+ * bfq_update_rate_reset to have the following three steps
+ * taken:
+ * - close the observation interval at the last (previous)
+ * request dispatch or completion
+ * - compute rate, if possible, for that observation interval
+ * - reset to zero samples, which will trigger a proper
+ * re-initialization of the observation interval on next
+ * dispatch
+ */
+ if (delta_us > BFQ_MIN_TT/NSEC_PER_USEC &&
+ (bfqd->last_rq_max_size<<BFQ_RATE_SHIFT)/delta_us <
+ 1UL<<(BFQ_RATE_SHIFT - 10))
+ bfq_update_rate_reset(bfqd, NULL);
+ bfqd->last_completion = now_ns;
+
+ /*
+ * If we are waiting to discover whether the request pattern
+ * of the task associated with the queue is actually
+ * isochronous, and both requisites for this condition to hold
+ * are now satisfied, then compute soft_rt_next_start (see the
+ * comments on the function bfq_bfqq_softrt_next_start()). We
+ * schedule this delayed check when bfqq expires, if it still
+ * has in-flight requests.
+ */
+ if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 &&
+ RB_EMPTY_ROOT(&bfqq->sort_list))
+ bfqq->soft_rt_next_start =
+ bfq_bfqq_softrt_next_start(bfqd, bfqq);
+
+ /*
+ * If this is the in-service queue, check if it needs to be expired,
+ * or if we want to idle in case it has no pending requests.
+ */
+ if (bfqd->in_service_queue == bfqq) {
+ if (bfqq->dispatched == 0 && bfq_bfqq_must_idle(bfqq)) {
+ bfq_arm_slice_timer(bfqd);
+ return;
+ } else if (bfq_may_expire_for_budg_timeout(bfqq))
+ bfq_bfqq_expire(bfqd, bfqq, false,
+ BFQQE_BUDGET_TIMEOUT);
+ else if (RB_EMPTY_ROOT(&bfqq->sort_list) &&
+ (bfqq->dispatched == 0 ||
+ !bfq_bfqq_may_idle(bfqq)))
+ bfq_bfqq_expire(bfqd, bfqq, false,
+ BFQQE_NO_MORE_REQUESTS);
+ }
+}
+
+static void bfq_put_rq_priv_body(struct bfq_queue *bfqq)
+{
+ bfqq->allocated--;
+
+ bfq_put_queue(bfqq);
+}
+
+static void bfq_put_rq_private(struct request_queue *q, struct request *rq)
+{
+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
+ struct bfq_data *bfqd = bfqq->bfqd;
+
+ if (rq->rq_flags & RQF_STARTED)
+ bfqg_stats_update_completion(bfqq_group(bfqq),
+ rq_start_time_ns(rq),
+ rq_io_start_time_ns(rq),
+ rq->cmd_flags);
+
+ if (likely(rq->rq_flags & RQF_STARTED)) {
+ unsigned long flags;
+
+ spin_lock_irqsave(&bfqd->lock, flags);
+
+ bfq_completed_request(bfqq, bfqd);
+ bfq_put_rq_priv_body(bfqq);
+
+ spin_unlock_irqrestore(&bfqd->lock, flags);
+ } else {
+ /*
+ * Request rq may be still/already in the scheduler,
+ * in which case we need to remove it. And we cannot
+ * defer such a check and removal, to avoid
+ * inconsistencies in the time interval from the end
+ * of this function to the start of the deferred work.
+ * This situation seems to occur only in process
+ * context, as a consequence of a merge. In the
+ * current version of the code, this implies that the
+ * lock is held.
+ */
+
+ if (!RB_EMPTY_NODE(&rq->rb_node))
+ bfq_remove_request(q, rq);
+ bfq_put_rq_priv_body(bfqq);
+ }
+
+ rq->elv.priv[0] = NULL;
+ rq->elv.priv[1] = NULL;
+}
+
+/*
+ * Returns NULL if a new bfqq should be allocated, or the old bfqq if this
+ * was the last process referring to that bfqq.
+ */
+static struct bfq_queue *
+bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
+{
+ bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");
+
+ if (bfqq_process_refs(bfqq) == 1) {
+ bfqq->pid = current->pid;
+ bfq_clear_bfqq_coop(bfqq);
+ bfq_clear_bfqq_split_coop(bfqq);
+ return bfqq;
+ }
+
+ bic_set_bfqq(bic, NULL, 1);
+
+ bfq_put_cooperator(bfqq);
+
+ bfq_put_queue(bfqq);
+ return NULL;
+}
+
+static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd,
+ struct bfq_io_cq *bic,
+ struct bio *bio,
+ bool split, bool is_sync,
+ bool *new_queue)
+{
+ struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync);
+
+ if (likely(bfqq && bfqq != &bfqd->oom_bfqq))
+ return bfqq;
+
+ if (new_queue)
+ *new_queue = true;
+
+ if (bfqq)
+ bfq_put_queue(bfqq);
+ bfqq = bfq_get_queue(bfqd, bio, is_sync, bic);
+
+ bic_set_bfqq(bic, bfqq, is_sync);
+ if (split && is_sync) {
+ if ((bic->was_in_burst_list && bfqd->large_burst) ||
+ bic->saved_in_large_burst)
+ bfq_mark_bfqq_in_large_burst(bfqq);
+ else {
+ bfq_clear_bfqq_in_large_burst(bfqq);
+ if (bic->was_in_burst_list)
+ hlist_add_head(&bfqq->burst_list_node,
+ &bfqd->burst_list);
+ }
+ bfqq->split_time = jiffies;
+ }
+
+ return bfqq;
+}
+
+/*
+ * Allocate bfq data structures associated with this request.
+ */
+static int bfq_get_rq_private(struct request_queue *q, struct request *rq,
+ struct bio *bio)
+{
+ struct bfq_data *bfqd = q->elevator->elevator_data;
+ struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq);
+ const int is_sync = rq_is_sync(rq);
+ struct bfq_queue *bfqq;
+ bool new_queue = false;
+ bool split = false;
+
+ spin_lock_irq(&bfqd->lock);
+
+ if (!bic)
+ goto queue_fail;
+
+ bfq_check_ioprio_change(bic, bio);
+
+ bfq_bic_update_cgroup(bic, bio);
+
+ bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio, false, is_sync,
+ &new_queue);
+
+ if (likely(!new_queue)) {
+ /* If the queue was seeky for too long, break it apart. */
+ if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
+ bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");
+
+ /* Update bic before losing reference to bfqq */
+ if (bfq_bfqq_in_large_burst(bfqq))
+ bic->saved_in_large_burst = true;
+
+ bfqq = bfq_split_bfqq(bic, bfqq);
+ split = true;
+
+ if (!bfqq)
+ bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio,
+ true, is_sync,
+ NULL);
+ }
+ }
+
+ bfqq->allocated++;
+ bfqq->ref++;
+ bfq_log_bfqq(bfqd, bfqq, "get_request %p: bfqq %p, %d",
+ rq, bfqq, bfqq->ref);
+
+ rq->elv.priv[0] = bic;
+ rq->elv.priv[1] = bfqq;
+
+ /*
+ * If a bfq_queue has only one process reference, it is owned
+ * by only this bic: we can then set bfqq->bic = bic. in
+ * addition, if the queue has also just been split, we have to
+ * resume its state.
+ */
+ if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) {
+ bfqq->bic = bic;
+ if (split) {
+ /*
+ * The queue has just been split from a shared
+ * queue: restore the idle window and the
+ * possible weight raising period.
+ */
+ bfq_bfqq_resume_state(bfqq, bic);
+ }
+ }
+
+ if (unlikely(bfq_bfqq_just_created(bfqq)))
+ bfq_handle_burst(bfqd, bfqq);
+
+ spin_unlock_irq(&bfqd->lock);
+
+ return 0;
+
+queue_fail:
+ spin_unlock_irq(&bfqd->lock);
+
+ return 1;
+}
+
+static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq)
+{
+ struct bfq_data *bfqd = bfqq->bfqd;
+ enum bfqq_expiration reason;
+ unsigned long flags;
+
+ spin_lock_irqsave(&bfqd->lock, flags);
+ bfq_clear_bfqq_wait_request(bfqq);
+
+ if (bfqq != bfqd->in_service_queue) {
+ spin_unlock_irqrestore(&bfqd->lock, flags);
+ return;
+ }
+
+ if (bfq_bfqq_budget_timeout(bfqq))
+ /*
+ * Also here the queue can be safely expired
+ * for budget timeout without wasting
+ * guarantees
+ */
+ reason = BFQQE_BUDGET_TIMEOUT;
+ else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0)
+ /*
+ * The queue may not be empty upon timer expiration,
+ * because we may not disable the timer when the
+ * first request of the in-service queue arrives
+ * during disk idling.
+ */
+ reason = BFQQE_TOO_IDLE;
+ else
+ goto schedule_dispatch;
+
+ bfq_bfqq_expire(bfqd, bfqq, true, reason);
+
+schedule_dispatch:
+ spin_unlock_irqrestore(&bfqd->lock, flags);
+ bfq_schedule_dispatch(bfqd);
+}
+
+/*
+ * Handler of the expiration of the timer running if the in-service queue
+ * is idling inside its time slice.
+ */
+static enum hrtimer_restart bfq_idle_slice_timer(struct hrtimer *timer)
+{
+ struct bfq_data *bfqd = container_of(timer, struct bfq_data,
+ idle_slice_timer);
+ struct bfq_queue *bfqq = bfqd->in_service_queue;
+
+ /*
+ * Theoretical race here: the in-service queue can be NULL or
+ * different from the queue that was idling if a new request
+ * arrives for the current queue and there is a full dispatch
+ * cycle that changes the in-service queue. This can hardly
+ * happen, but in the worst case we just expire a queue too
+ * early.
+ */
+ if (bfqq)
+ bfq_idle_slice_timer_body(bfqq);
+
+ return HRTIMER_NORESTART;
+}
+
+static void __bfq_put_async_bfqq(struct bfq_data *bfqd,
+ struct bfq_queue **bfqq_ptr)
+{
+ struct bfq_queue *bfqq = *bfqq_ptr;
+
+ bfq_log(bfqd, "put_async_bfqq: %p", bfqq);
+ if (bfqq) {
+ bfq_bfqq_move(bfqd, bfqq, bfqd->root_group);
+
+ bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d",
+ bfqq, bfqq->ref);
+ bfq_put_queue(bfqq);
+ *bfqq_ptr = NULL;
+ }
+}
+
+/*
+ * Release all the bfqg references to its async queues. If we are
+ * deallocating the group these queues may still contain requests, so
+ * we reparent them to the root cgroup (i.e., the only one that will
+ * exist for sure until all the requests on a device are gone).
+ */
+void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
+{
+ int i, j;
+
+ for (i = 0; i < 2; i++)
+ for (j = 0; j < IOPRIO_BE_NR; j++)
+ __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]);
+
+ __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);
+}
+
+static void bfq_exit_queue(struct elevator_queue *e)
+{
+ struct bfq_data *bfqd = e->elevator_data;
+ struct bfq_queue *bfqq, *n;
+
+ hrtimer_cancel(&bfqd->idle_slice_timer);
+
+ spin_lock_irq(&bfqd->lock);
+ list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list)
+ bfq_deactivate_bfqq(bfqd, bfqq, false, false);
+ spin_unlock_irq(&bfqd->lock);
+
+ hrtimer_cancel(&bfqd->idle_slice_timer);
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+ blkcg_deactivate_policy(bfqd->queue, &blkcg_policy_bfq);
+#else
+ spin_lock_irq(&bfqd->lock);
+ bfq_put_async_queues(bfqd, bfqd->root_group);
+ kfree(bfqd->root_group);
+ spin_unlock_irq(&bfqd->lock);
+#endif
+
+ kfree(bfqd);
+}
+
+static void bfq_init_root_group(struct bfq_group *root_group,
+ struct bfq_data *bfqd)
+{
+ int i;
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+ root_group->entity.parent = NULL;
+ root_group->my_entity = NULL;
+ root_group->bfqd = bfqd;
+#endif
+ root_group->rq_pos_tree = RB_ROOT;
+ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
+ root_group->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
+ root_group->sched_data.bfq_class_idle_last_service = jiffies;
+}
+
+static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
+{
+ struct bfq_data *bfqd;
+ struct elevator_queue *eq;
+
+ eq = elevator_alloc(q, e);
+ if (!eq)
+ return -ENOMEM;
+
+ bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node);
+ if (!bfqd) {
+ kobject_put(&eq->kobj);
+ return -ENOMEM;
+ }
+ eq->elevator_data = bfqd;
+
+ spin_lock_irq(q->queue_lock);
+ q->elevator = eq;
+ spin_unlock_irq(q->queue_lock);
+
+ /*
+ * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
+ * Grab a permanent reference to it, so that the normal code flow
+ * will not attempt to free it.
+ */
+ bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0);
+ bfqd->oom_bfqq.ref++;
+ bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO;
+ bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE;
+ bfqd->oom_bfqq.entity.new_weight =
+ bfq_ioprio_to_weight(bfqd->oom_bfqq.new_ioprio);
+
+ /* oom_bfqq does not participate to bursts */
+ bfq_clear_bfqq_just_created(&bfqd->oom_bfqq);
+
+ /*
+ * Trigger weight initialization, according to ioprio, at the
+ * oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio
+ * class won't be changed any more.
+ */
+ bfqd->oom_bfqq.entity.prio_changed = 1;
+
+ bfqd->queue = q;
+
+ INIT_LIST_HEAD(&bfqd->dispatch);
+
+ hrtimer_init(&bfqd->idle_slice_timer, CLOCK_MONOTONIC,
+ HRTIMER_MODE_REL);
+ bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
+
+ bfqd->queue_weights_tree = RB_ROOT;
+ bfqd->group_weights_tree = RB_ROOT;
+
+ INIT_LIST_HEAD(&bfqd->active_list);
+ INIT_LIST_HEAD(&bfqd->idle_list);
+ INIT_HLIST_HEAD(&bfqd->burst_list);
+
+ bfqd->hw_tag = -1;
+
+ bfqd->bfq_max_budget = bfq_default_max_budget;
+
+ bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0];
+ bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1];
+ bfqd->bfq_back_max = bfq_back_max;
+ bfqd->bfq_back_penalty = bfq_back_penalty;
+ bfqd->bfq_slice_idle = bfq_slice_idle;
+ bfqd->bfq_timeout = bfq_timeout;
+
+ bfqd->bfq_requests_within_timer = 120;
+
+ bfqd->bfq_large_burst_thresh = 8;
+ bfqd->bfq_burst_interval = msecs_to_jiffies(180);
+
+ bfqd->low_latency = true;
+
+ /*
+ * Trade-off between responsiveness and fairness.
+ */
+ bfqd->bfq_wr_coeff = 30;
+ bfqd->bfq_wr_rt_max_time = msecs_to_jiffies(300);
+ bfqd->bfq_wr_max_time = 0;
+ bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000);
+ bfqd->bfq_wr_min_inter_arr_async = msecs_to_jiffies(500);
+ bfqd->bfq_wr_max_softrt_rate = 7000; /*
+ * Approximate rate required
+ * to playback or record a
+ * high-definition compressed
+ * video.
+ */
+ bfqd->wr_busy_queues = 0;
+
+ /*
+ * Begin by assuming, optimistically, that the device is a
+ * high-speed one, and that its peak rate is equal to 2/3 of
+ * the highest reference rate.
+ */
+ bfqd->RT_prod = R_fast[blk_queue_nonrot(bfqd->queue)] *
+ T_fast[blk_queue_nonrot(bfqd->queue)];
+ bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)] * 2 / 3;
+ bfqd->device_speed = BFQ_BFQD_FAST;
+
+ spin_lock_init(&bfqd->lock);
+
+ /*
+ * The invocation of the next bfq_create_group_hierarchy
+ * function is the head of a chain of function calls
+ * (bfq_create_group_hierarchy->blkcg_activate_policy->
+ * blk_mq_freeze_queue) that may lead to the invocation of the
+ * has_work hook function. For this reason,
+ * bfq_create_group_hierarchy is invoked only after all
+ * scheduler data has been initialized, apart from the fields
+ * that can be initialized only after invoking
+ * bfq_create_group_hierarchy. This, in particular, enables
+ * has_work to correctly return false. Of course, to avoid
+ * other inconsistencies, the blk-mq stack must then refrain
+ * from invoking further scheduler hooks before this init
+ * function is finished.
+ */
+ bfqd->root_group = bfq_create_group_hierarchy(bfqd, q->node);
+ if (!bfqd->root_group)
+ goto out_free;
+ bfq_init_root_group(bfqd->root_group, bfqd);
+ bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group);
+
+
+ return 0;
+
+out_free:
+ kfree(bfqd);
+ kobject_put(&eq->kobj);
+ return -ENOMEM;
+}
+
+static void bfq_slab_kill(void)
+{
+ kmem_cache_destroy(bfq_pool);
+}
+
+static int __init bfq_slab_setup(void)
+{
+ bfq_pool = KMEM_CACHE(bfq_queue, 0);
+ if (!bfq_pool)
+ return -ENOMEM;
+ return 0;
+}
+
+static ssize_t bfq_var_show(unsigned int var, char *page)
+{
+ return sprintf(page, "%u\n", var);
+}
+
+static ssize_t bfq_var_store(unsigned long *var, const char *page,
+ size_t count)
+{
+ unsigned long new_val;
+ int ret = kstrtoul(page, 10, &new_val);
+
+ if (ret == 0)
+ *var = new_val;
+
+ return count;
+}
+
+#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \
+static ssize_t __FUNC(struct elevator_queue *e, char *page) \
+{ \
+ struct bfq_data *bfqd = e->elevator_data; \
+ u64 __data = __VAR; \
+ if (__CONV == 1) \
+ __data = jiffies_to_msecs(__data); \
+ else if (__CONV == 2) \
+ __data = div_u64(__data, NSEC_PER_MSEC); \
+ return bfq_var_show(__data, (page)); \
+}
+SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 2);
+SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 2);
+SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0);
+SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0);
+SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 2);
+SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0);
+SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout, 1);
+SHOW_FUNCTION(bfq_strict_guarantees_show, bfqd->strict_guarantees, 0);
+SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0);
+#undef SHOW_FUNCTION
+
+#define USEC_SHOW_FUNCTION(__FUNC, __VAR) \
+static ssize_t __FUNC(struct elevator_queue *e, char *page) \
+{ \
+ struct bfq_data *bfqd = e->elevator_data; \
+ u64 __data = __VAR; \
+ __data = div_u64(__data, NSEC_PER_USEC); \
+ return bfq_var_show(__data, (page)); \
+}
+USEC_SHOW_FUNCTION(bfq_slice_idle_us_show, bfqd->bfq_slice_idle);
+#undef USEC_SHOW_FUNCTION
+
+#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
+static ssize_t \
+__FUNC(struct elevator_queue *e, const char *page, size_t count) \
+{ \
+ struct bfq_data *bfqd = e->elevator_data; \
+ unsigned long uninitialized_var(__data); \
+ int ret = bfq_var_store(&__data, (page), count); \
+ if (__data < (MIN)) \
+ __data = (MIN); \
+ else if (__data > (MAX)) \
+ __data = (MAX); \
+ if (__CONV == 1) \
+ *(__PTR) = msecs_to_jiffies(__data); \
+ else if (__CONV == 2) \
+ *(__PTR) = (u64)__data * NSEC_PER_MSEC; \
+ else \
+ *(__PTR) = __data; \
+ return ret; \
+}
+STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1,
+ INT_MAX, 2);
+STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1,
+ INT_MAX, 2);
+STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0);
+STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1,
+ INT_MAX, 0);
+STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 2);
+#undef STORE_FUNCTION
+
+#define USEC_STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \
+static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)\
+{ \
+ struct bfq_data *bfqd = e->elevator_data; \
+ unsigned long uninitialized_var(__data); \
+ int ret = bfq_var_store(&__data, (page), count); \
+ if (__data < (MIN)) \
+ __data = (MIN); \
+ else if (__data > (MAX)) \
+ __data = (MAX); \
+ *(__PTR) = (u64)__data * NSEC_PER_USEC; \
+ return ret; \
+}
+USEC_STORE_FUNCTION(bfq_slice_idle_us_store, &bfqd->bfq_slice_idle, 0,
+ UINT_MAX);
+#undef USEC_STORE_FUNCTION
+
+static ssize_t bfq_max_budget_store(struct elevator_queue *e,
+ const char *page, size_t count)
+{
+ struct bfq_data *bfqd = e->elevator_data;
+ unsigned long uninitialized_var(__data);
+ int ret = bfq_var_store(&__data, (page), count);
+
+ if (__data == 0)
+ bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd);
+ else {
+ if (__data > INT_MAX)
+ __data = INT_MAX;
+ bfqd->bfq_max_budget = __data;
+ }
+
+ bfqd->bfq_user_max_budget = __data;
+
+ return ret;
+}
+
+/*
+ * Leaving this name to preserve name compatibility with cfq
+ * parameters, but this timeout is used for both sync and async.
+ */
+static ssize_t bfq_timeout_sync_store(struct elevator_queue *e,
+ const char *page, size_t count)
+{
+ struct bfq_data *bfqd = e->elevator_data;
+ unsigned long uninitialized_var(__data);
+ int ret = bfq_var_store(&__data, (page), count);
+
+ if (__data < 1)
+ __data = 1;
+ else if (__data > INT_MAX)
+ __data = INT_MAX;
+
+ bfqd->bfq_timeout = msecs_to_jiffies(__data);
+ if (bfqd->bfq_user_max_budget == 0)
+ bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd);
+
+ return ret;
+}
+
+static ssize_t bfq_strict_guarantees_store(struct elevator_queue *e,
+ const char *page, size_t count)
+{
+ struct bfq_data *bfqd = e->elevator_data;
+ unsigned long uninitialized_var(__data);
+ int ret = bfq_var_store(&__data, (page), count);
+
+ if (__data > 1)
+ __data = 1;
+ if (!bfqd->strict_guarantees && __data == 1
+ && bfqd->bfq_slice_idle < 8 * NSEC_PER_MSEC)
+ bfqd->bfq_slice_idle = 8 * NSEC_PER_MSEC;
+
+ bfqd->strict_guarantees = __data;
+
+ return ret;
+}
+
+static ssize_t bfq_low_latency_store(struct elevator_queue *e,
+ const char *page, size_t count)
+{
+ struct bfq_data *bfqd = e->elevator_data;
+ unsigned long uninitialized_var(__data);
+ int ret = bfq_var_store(&__data, (page), count);
+
+ if (__data > 1)
+ __data = 1;
+ if (__data == 0 && bfqd->low_latency != 0)
+ bfq_end_wr(bfqd);
+ bfqd->low_latency = __data;
+
+ return ret;
+}
+
+#define BFQ_ATTR(name) \
+ __ATTR(name, 0644, bfq_##name##_show, bfq_##name##_store)
+
+static struct elv_fs_entry bfq_attrs[] = {
+ BFQ_ATTR(fifo_expire_sync),
+ BFQ_ATTR(fifo_expire_async),
+ BFQ_ATTR(back_seek_max),
+ BFQ_ATTR(back_seek_penalty),
+ BFQ_ATTR(slice_idle),
+ BFQ_ATTR(slice_idle_us),
+ BFQ_ATTR(max_budget),
+ BFQ_ATTR(timeout_sync),
+ BFQ_ATTR(strict_guarantees),
+ BFQ_ATTR(low_latency),
+ __ATTR_NULL
+};
+
+static struct elevator_type iosched_bfq_mq = {
+ .ops.mq = {
+ .get_rq_priv = bfq_get_rq_private,
+ .put_rq_priv = bfq_put_rq_private,
+ .exit_icq = bfq_exit_icq,
+ .insert_requests = bfq_insert_requests,
+ .dispatch_request = bfq_dispatch_request,
+ .next_request = elv_rb_latter_request,
+ .former_request = elv_rb_former_request,
+ .allow_merge = bfq_allow_bio_merge,
+ .bio_merge = bfq_bio_merge,
+ .request_merge = bfq_request_merge,
+ .requests_merged = bfq_requests_merged,
+ .request_merged = bfq_request_merged,
+ .has_work = bfq_has_work,
+ .init_sched = bfq_init_queue,
+ .exit_sched = bfq_exit_queue,
+ },
+
+ .uses_mq = true,
+ .icq_size = sizeof(struct bfq_io_cq),
+ .icq_align = __alignof__(struct bfq_io_cq),
+ .elevator_attrs = bfq_attrs,
+ .elevator_name = "bfq",
+ .elevator_owner = THIS_MODULE,
+};
+
+static int __init bfq_init(void)
+{
+ int ret;
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+ ret = blkcg_policy_register(&blkcg_policy_bfq);
+ if (ret)
+ return ret;
+#endif
+
+ ret = -ENOMEM;
+ if (bfq_slab_setup())
+ goto err_pol_unreg;
+
+ /*
+ * Times to load large popular applications for the typical
+ * systems installed on the reference devices (see the
+ * comments before the definitions of the next two
+ * arrays). Actually, we use slightly slower values, as the
+ * estimated peak rate tends to be smaller than the actual
+ * peak rate. The reason for this last fact is that estimates
+ * are computed over much shorter time intervals than the long
+ * intervals typically used for benchmarking. Why? First, to
+ * adapt more quickly to variations. Second, because an I/O
+ * scheduler cannot rely on a peak-rate-evaluation workload to
+ * be run for a long time.
+ */
+ T_slow[0] = msecs_to_jiffies(3500); /* actually 4 sec */
+ T_slow[1] = msecs_to_jiffies(6000); /* actually 6.5 sec */
+ T_fast[0] = msecs_to_jiffies(7000); /* actually 8 sec */
+ T_fast[1] = msecs_to_jiffies(2500); /* actually 3 sec */
+
+ /*
+ * Thresholds that determine the switch between speed classes
+ * (see the comments before the definition of the array
+ * device_speed_thresh). These thresholds are biased towards
+ * transitions to the fast class. This is safer than the
+ * opposite bias. In fact, a wrong transition to the slow
+ * class results in short weight-raising periods, because the
+ * speed of the device then tends to be higher that the
+ * reference peak rate. On the opposite end, a wrong
+ * transition to the fast class tends to increase
+ * weight-raising periods, because of the opposite reason.
+ */
+ device_speed_thresh[0] = (4 * R_slow[0]) / 3;
+ device_speed_thresh[1] = (4 * R_slow[1]) / 3;
+
+ ret = elv_register(&iosched_bfq_mq);
+ if (ret)
+ goto err_pol_unreg;
+
+ return 0;
+
+err_pol_unreg:
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+ blkcg_policy_unregister(&blkcg_policy_bfq);
+#endif
+ return ret;
+}
+
+static void __exit bfq_exit(void)
+{
+ elv_unregister(&iosched_bfq_mq);
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+ blkcg_policy_unregister(&blkcg_policy_bfq);
+#endif
+ bfq_slab_kill();
+}
+
+module_init(bfq_init);
+module_exit(bfq_exit);
+
+MODULE_AUTHOR("Paolo Valente");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("MQ Budget Fair Queueing I/O Scheduler");
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
new file mode 100644
index 000000000000..ae783c06dfd9
--- /dev/null
+++ b/block/bfq-iosched.h
@@ -0,0 +1,941 @@
+/*
+ * Header file for the BFQ I/O scheduler: data structures and
+ * prototypes of interface functions among BFQ components.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#ifndef _BFQ_H
+#define _BFQ_H
+
+#include <linux/blktrace_api.h>
+#include <linux/hrtimer.h>
+#include <linux/blk-cgroup.h>
+
+#define BFQ_IOPRIO_CLASSES 3
+#define BFQ_CL_IDLE_TIMEOUT (HZ/5)
+
+#define BFQ_MIN_WEIGHT 1
+#define BFQ_MAX_WEIGHT 1000
+#define BFQ_WEIGHT_CONVERSION_COEFF 10
+
+#define BFQ_DEFAULT_QUEUE_IOPRIO 4
+
+#define BFQ_WEIGHT_LEGACY_DFL 100
+#define BFQ_DEFAULT_GRP_IOPRIO 0
+#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE
+
+/*
+ * Soft real-time applications are extremely more latency sensitive
+ * than interactive ones. Over-raise the weight of the former to
+ * privilege them against the latter.
+ */
+#define BFQ_SOFTRT_WEIGHT_FACTOR 100
+
+struct bfq_entity;
+
+/**
+ * struct bfq_service_tree - per ioprio_class service tree.
+ *
+ * Each service tree represents a B-WF2Q+ scheduler on its own. Each
+ * ioprio_class has its own independent scheduler, and so its own
+ * bfq_service_tree. All the fields are protected by the queue lock
+ * of the containing bfqd.
+ */
+struct bfq_service_tree {
+ /* tree for active entities (i.e., those backlogged) */
+ struct rb_root active;
+ /* tree for idle entities (i.e., not backlogged, with V <= F_i)*/
+ struct rb_root idle;
+
+ /* idle entity with minimum F_i */
+ struct bfq_entity *first_idle;
+ /* idle entity with maximum F_i */
+ struct bfq_entity *last_idle;
+
+ /* scheduler virtual time */
+ u64 vtime;
+ /* scheduler weight sum; active and idle entities contribute to it */
+ unsigned long wsum;
+};
+
+/**
+ * struct bfq_sched_data - multi-class scheduler.
+ *
+ * bfq_sched_data is the basic scheduler queue. It supports three
+ * ioprio_classes, and can be used either as a toplevel queue or as an
+ * intermediate queue on a hierarchical setup. @next_in_service
+ * points to the active entity of the sched_data service trees that
+ * will be scheduled next. It is used to reduce the number of steps
+ * needed for each hierarchical-schedule update.
+ *
+ * The supported ioprio_classes are the same as in CFQ, in descending
+ * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.
+ * Requests from higher priority queues are served before all the
+ * requests from lower priority queues; among requests of the same
+ * queue requests are served according to B-WF2Q+.
+ * All the fields are protected by the queue lock of the containing bfqd.
+ */
+struct bfq_sched_data {
+ /* entity in service */
+ struct bfq_entity *in_service_entity;
+ /* head-of-line entity (see comments above) */
+ struct bfq_entity *next_in_service;
+ /* array of service trees, one per ioprio_class */
+ struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES];
+ /* last time CLASS_IDLE was served */
+ unsigned long bfq_class_idle_last_service;
+
+};
+
+/**
+ * struct bfq_weight_counter - counter of the number of all active entities
+ * with a given weight.
+ */
+struct bfq_weight_counter {
+ unsigned int weight; /* weight of the entities this counter refers to */
+ unsigned int num_active; /* nr of active entities with this weight */
+ /*
+ * Weights tree member (see bfq_data's @queue_weights_tree and
+ * @group_weights_tree)
+ */
+ struct rb_node weights_node;
+};
+
+/**
+ * struct bfq_entity - schedulable entity.
+ *
+ * A bfq_entity is used to represent either a bfq_queue (leaf node in the
+ * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each
+ * entity belongs to the sched_data of the parent group in the cgroup
+ * hierarchy. Non-leaf entities have also their own sched_data, stored
+ * in @my_sched_data.
+ *
+ * Each entity stores independently its priority values; this would
+ * allow different weights on different devices, but this
+ * functionality is not exported to userspace by now. Priorities and
+ * weights are updated lazily, first storing the new values into the
+ * new_* fields, then setting the @prio_changed flag. As soon as
+ * there is a transition in the entity state that allows the priority
+ * update to take place the effective and the requested priority
+ * values are synchronized.
+ *
+ * Unless cgroups are used, the weight value is calculated from the
+ * ioprio to export the same interface as CFQ. When dealing with
+ * ``well-behaved'' queues (i.e., queues that do not spend too much
+ * time to consume their budget and have true sequential behavior, and
+ * when there are no external factors breaking anticipation) the
+ * relative weights at each level of the cgroups hierarchy should be
+ * guaranteed. All the fields are protected by the queue lock of the
+ * containing bfqd.
+ */
+struct bfq_entity {
+ /* service_tree member */
+ struct rb_node rb_node;
+ /* pointer to the weight counter associated with this entity */
+ struct bfq_weight_counter *weight_counter;
+
+ /*
+ * Flag, true if the entity is on a tree (either the active or
+ * the idle one of its service_tree) or is in service.
+ */
+ bool on_st;
+
+ /* B-WF2Q+ start and finish timestamps [sectors/weight] */
+ u64 start, finish;
+
+ /* tree the entity is enqueued into; %NULL if not on a tree */
+ struct rb_root *tree;
+
+ /*
+ * minimum start time of the (active) subtree rooted at this
+ * entity; used for O(log N) lookups into active trees
+ */
+ u64 min_start;
+
+ /* amount of service received during the last service slot */
+ int service;
+
+ /* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */
+ int budget;
+
+ /* weight of the queue */
+ int weight;
+ /* next weight if a change is in progress */
+ int new_weight;
+
+ /* original weight, used to implement weight boosting */
+ int orig_weight;
+
+ /* parent entity, for hierarchical scheduling */
+ struct bfq_entity *parent;
+
+ /*
+ * For non-leaf nodes in the hierarchy, the associated
+ * scheduler queue, %NULL on leaf nodes.
+ */
+ struct bfq_sched_data *my_sched_data;
+ /* the scheduler queue this entity belongs to */
+ struct bfq_sched_data *sched_data;
+
+ /* flag, set to request a weight, ioprio or ioprio_class change */
+ int prio_changed;
+};
+
+struct bfq_group;
+
+/**
+ * struct bfq_ttime - per process thinktime stats.
+ */
+struct bfq_ttime {
+ /* completion time of the last request */
+ u64 last_end_request;
+
+ /* total process thinktime */
+ u64 ttime_total;
+ /* number of thinktime samples */
+ unsigned long ttime_samples;
+ /* average process thinktime */
+ u64 ttime_mean;
+};
+
+/**
+ * struct bfq_queue - leaf schedulable entity.
+ *
+ * A bfq_queue is a leaf request queue; it can be associated with an
+ * io_context or more, if it is async or shared between cooperating
+ * processes. @cgroup holds a reference to the cgroup, to be sure that it
+ * does not disappear while a bfqq still references it (mostly to avoid
+ * races between request issuing and task migration followed by cgroup
+ * destruction).
+ * All the fields are protected by the queue lock of the containing bfqd.
+ */
+struct bfq_queue {
+ /* reference counter */
+ int ref;
+ /* parent bfq_data */
+ struct bfq_data *bfqd;
+
+ /* current ioprio and ioprio class */
+ unsigned short ioprio, ioprio_class;
+ /* next ioprio and ioprio class if a change is in progress */
+ unsigned short new_ioprio, new_ioprio_class;
+
+ /*
+ * Shared bfq_queue if queue is cooperating with one or more
+ * other queues.
+ */
+ struct bfq_queue *new_bfqq;
+ /* request-position tree member (see bfq_group's @rq_pos_tree) */
+ struct rb_node pos_node;
+ /* request-position tree root (see bfq_group's @rq_pos_tree) */
+ struct rb_root *pos_root;
+
+ /* sorted list of pending requests */
+ struct rb_root sort_list;
+ /* if fifo isn't expired, next request to serve */
+ struct request *next_rq;
+ /* number of sync and async requests queued */
+ int queued[2];
+ /* number of requests currently allocated */
+ int allocated;
+ /* number of pending metadata requests */
+ int meta_pending;
+ /* fifo list of requests in sort_list */
+ struct list_head fifo;
+
+ /* entity representing this queue in the scheduler */
+ struct bfq_entity entity;
+
+ /* maximum budget allowed from the feedback mechanism */
+ int max_budget;
+ /* budget expiration (in jiffies) */
+ unsigned long budget_timeout;
+
+ /* number of requests on the dispatch list or inside driver */
+ int dispatched;
+
+ /* status flags */
+ unsigned long flags;
+
+ /* node for active/idle bfqq list inside parent bfqd */
+ struct list_head bfqq_list;
+
+ /* associated @bfq_ttime struct */
+ struct bfq_ttime ttime;
+
+ /* bit vector: a 1 for each seeky requests in history */
+ u32 seek_history;
+
+ /* node for the device's burst list */
+ struct hlist_node burst_list_node;
+
+ /* position of the last request enqueued */
+ sector_t last_request_pos;
+
+ /* Number of consecutive pairs of request completion and
+ * arrival, such that the queue becomes idle after the
+ * completion, but the next request arrives within an idle
+ * time slice; used only if the queue's IO_bound flag has been
+ * cleared.
+ */
+ unsigned int requests_within_timer;
+
+ /* pid of the process owning the queue, used for logging purposes */
+ pid_t pid;
+
+ /*
+ * Pointer to the bfq_io_cq owning the bfq_queue, set to %NULL
+ * if the queue is shared.
+ */
+ struct bfq_io_cq *bic;
+
+ /* current maximum weight-raising time for this queue */
+ unsigned long wr_cur_max_time;
+ /*
+ * Minimum time instant such that, only if a new request is
+ * enqueued after this time instant in an idle @bfq_queue with
+ * no outstanding requests, then the task associated with the
+ * queue it is deemed as soft real-time (see the comments on
+ * the function bfq_bfqq_softrt_next_start())
+ */
+ unsigned long soft_rt_next_start;
+ /*
+ * Start time of the current weight-raising period if
+ * the @bfq-queue is being weight-raised, otherwise
+ * finish time of the last weight-raising period.
+ */
+ unsigned long last_wr_start_finish;
+ /* factor by which the weight of this queue is multiplied */
+ unsigned int wr_coeff;
+ /*
+ * Time of the last transition of the @bfq_queue from idle to
+ * backlogged.
+ */
+ unsigned long last_idle_bklogged;
+ /*
+ * Cumulative service received from the @bfq_queue since the
+ * last transition from idle to backlogged.
+ */
+ unsigned long service_from_backlogged;
+
+ /*
+ * Value of wr start time when switching to soft rt
+ */
+ unsigned long wr_start_at_switch_to_srt;
+
+ unsigned long split_time; /* time of last split */
+};
+
+/**
+ * struct bfq_io_cq - per (request_queue, io_context) structure.
+ */
+struct bfq_io_cq {
+ /* associated io_cq structure */
+ struct io_cq icq; /* must be the first member */
+ /* array of two process queues, the sync and the async */
+ struct bfq_queue *bfqq[2];
+ /* per (request_queue, blkcg) ioprio */
+ int ioprio;
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+ uint64_t blkcg_serial_nr; /* the current blkcg serial */
+#endif
+ /*
+ * Snapshot of the idle window before merging; taken to
+ * remember this value while the queue is merged, so as to be
+ * able to restore it in case of split.
+ */
+ bool saved_idle_window;
+ /*
+ * Same purpose as the previous two fields for the I/O bound
+ * classification of a queue.
+ */
+ bool saved_IO_bound;
+
+ /*
+ * Same purpose as the previous fields for the value of the
+ * field keeping the queue's belonging to a large burst
+ */
+ bool saved_in_large_burst;
+ /*
+ * True if the queue belonged to a burst list before its merge
+ * with another cooperating queue.
+ */
+ bool was_in_burst_list;
+
+ /*
+ * Similar to previous fields: save wr information.
+ */
+ unsigned long saved_wr_coeff;
+ unsigned long saved_last_wr_start_finish;
+ unsigned long saved_wr_start_at_switch_to_srt;
+ unsigned int saved_wr_cur_max_time;
+ struct bfq_ttime saved_ttime;
+};
+
+enum bfq_device_speed {
+ BFQ_BFQD_FAST,
+ BFQ_BFQD_SLOW,
+};
+
+/**
+ * struct bfq_data - per-device data structure.
+ *
+ * All the fields are protected by @lock.
+ */
+struct bfq_data {
+ /* device request queue */
+ struct request_queue *queue;
+ /* dispatch queue */
+ struct list_head dispatch;
+
+ /* root bfq_group for the device */
+ struct bfq_group *root_group;
+
+ /*
+ * rbtree of weight counters of @bfq_queues, sorted by
+ * weight. Used to keep track of whether all @bfq_queues have
+ * the same weight. The tree contains one counter for each
+ * distinct weight associated to some active and not
+ * weight-raised @bfq_queue (see the comments to the functions
+ * bfq_weights_tree_[add|remove] for further details).
+ */
+ struct rb_root queue_weights_tree;
+ /*
+ * rbtree of non-queue @bfq_entity weight counters, sorted by
+ * weight. Used to keep track of whether all @bfq_groups have
+ * the same weight. The tree contains one counter for each
+ * distinct weight associated to some active @bfq_group (see
+ * the comments to the functions bfq_weights_tree_[add|remove]
+ * for further details).
+ */
+ struct rb_root group_weights_tree;
+
+ /*
+ * Number of bfq_queues containing requests (including the
+ * queue in service, even if it is idling).
+ */
+ int busy_queues;
+ /* number of weight-raised busy @bfq_queues */
+ int wr_busy_queues;
+ /* number of queued requests */
+ int queued;
+ /* number of requests dispatched and waiting for completion */
+ int rq_in_driver;
+
+ /*
+ * Maximum number of requests in driver in the last
+ * @hw_tag_samples completed requests.
+ */
+ int max_rq_in_driver;
+ /* number of samples used to calculate hw_tag */
+ int hw_tag_samples;
+ /* flag set to one if the driver is showing a queueing behavior */
+ int hw_tag;
+
+ /* number of budgets assigned */
+ int budgets_assigned;
+
+ /*
+ * Timer set when idling (waiting) for the next request from
+ * the queue in service.
+ */
+ struct hrtimer idle_slice_timer;
+
+ /* bfq_queue in service */
+ struct bfq_queue *in_service_queue;
+
+ /* on-disk position of the last served request */
+ sector_t last_position;
+
+ /* time of last request completion (ns) */
+ u64 last_completion;
+
+ /* time of first rq dispatch in current observation interval (ns) */
+ u64 first_dispatch;
+ /* time of last rq dispatch in current observation interval (ns) */
+ u64 last_dispatch;
+
+ /* beginning of the last budget */
+ ktime_t last_budget_start;
+ /* beginning of the last idle slice */
+ ktime_t last_idling_start;
+
+ /* number of samples in current observation interval */
+ int peak_rate_samples;
+ /* num of samples of seq dispatches in current observation interval */
+ u32 sequential_samples;
+ /* total num of sectors transferred in current observation interval */
+ u64 tot_sectors_dispatched;
+ /* max rq size seen during current observation interval (sectors) */
+ u32 last_rq_max_size;
+ /* time elapsed from first dispatch in current observ. interval (us) */
+ u64 delta_from_first;
+ /*
+ * Current estimate of the device peak rate, measured in
+ * [BFQ_RATE_SHIFT * sectors/usec]. The left-shift by
+ * BFQ_RATE_SHIFT is performed to increase precision in
+ * fixed-point calculations.
+ */
+ u32 peak_rate;
+
+ /* maximum budget allotted to a bfq_queue before rescheduling */
+ int bfq_max_budget;
+
+ /* list of all the bfq_queues active on the device */
+ struct list_head active_list;
+ /* list of all the bfq_queues idle on the device */
+ struct list_head idle_list;
+
+ /*
+ * Timeout for async/sync requests; when it fires, requests
+ * are served in fifo order.
+ */
+ u64 bfq_fifo_expire[2];
+ /* weight of backward seeks wrt forward ones */
+ unsigned int bfq_back_penalty;
+ /* maximum allowed backward seek */
+ unsigned int bfq_back_max;
+ /* maximum idling time */
+ u32 bfq_slice_idle;
+
+ /* user-configured max budget value (0 for auto-tuning) */
+ int bfq_user_max_budget;
+ /*
+ * Timeout for bfq_queues to consume their budget; used to
+ * prevent seeky queues from imposing long latencies to
+ * sequential or quasi-sequential ones (this also implies that
+ * seeky queues cannot receive guarantees in the service
+ * domain; after a timeout they are charged for the time they
+ * have been in service, to preserve fairness among them, but
+ * without service-domain guarantees).
+ */
+ unsigned int bfq_timeout;
+
+ /*
+ * Number of consecutive requests that must be issued within
+ * the idle time slice to set again idling to a queue which
+ * was marked as non-I/O-bound (see the definition of the
+ * IO_bound flag for further details).
+ */
+ unsigned int bfq_requests_within_timer;
+
+ /*
+ * Force device idling whenever needed to provide accurate
+ * service guarantees, without caring about throughput
+ * issues. CAVEAT: this may even increase latencies, in case
+ * of useless idling for processes that did stop doing I/O.
+ */
+ bool strict_guarantees;
+
+ /*
+ * Last time at which a queue entered the current burst of
+ * queues being activated shortly after each other; for more
+ * details about this and the following parameters related to
+ * a burst of activations, see the comments on the function
+ * bfq_handle_burst.
+ */
+ unsigned long last_ins_in_burst;
+ /*
+ * Reference time interval used to decide whether a queue has
+ * been activated shortly after @last_ins_in_burst.
+ */
+ unsigned long bfq_burst_interval;
+ /* number of queues in the current burst of queue activations */
+ int burst_size;
+
+ /* common parent entity for the queues in the burst */
+ struct bfq_entity *burst_parent_entity;
+ /* Maximum burst size above which the current queue-activation
+ * burst is deemed as 'large'.
+ */
+ unsigned long bfq_large_burst_thresh;
+ /* true if a large queue-activation burst is in progress */
+ bool large_burst;
+ /*
+ * Head of the burst list (as for the above fields, more
+ * details in the comments on the function bfq_handle_burst).
+ */
+ struct hlist_head burst_list;
+
+ /* if set to true, low-latency heuristics are enabled */
+ bool low_latency;
+ /*
+ * Maximum factor by which the weight of a weight-raised queue
+ * is multiplied.
+ */
+ unsigned int bfq_wr_coeff;
+ /* maximum duration of a weight-raising period (jiffies) */
+ unsigned int bfq_wr_max_time;
+
+ /* Maximum weight-raising duration for soft real-time processes */
+ unsigned int bfq_wr_rt_max_time;
+ /*
+ * Minimum idle period after which weight-raising may be
+ * reactivated for a queue (in jiffies).
+ */
+ unsigned int bfq_wr_min_idle_time;
+ /*
+ * Minimum period between request arrivals after which
+ * weight-raising may be reactivated for an already busy async
+ * queue (in jiffies).
+ */
+ unsigned long bfq_wr_min_inter_arr_async;
+
+ /* Max service-rate for a soft real-time queue, in sectors/sec */
+ unsigned int bfq_wr_max_softrt_rate;
+ /*
+ * Cached value of the product R*T, used for computing the
+ * maximum duration of weight raising automatically.
+ */
+ u64 RT_prod;
+ /* device-speed class for the low-latency heuristic */
+ enum bfq_device_speed device_speed;
+
+ /* fallback dummy bfqq for extreme OOM conditions */
+ struct bfq_queue oom_bfqq;
+
+ spinlock_t lock;
+
+ /*
+ * bic associated with the task issuing current bio for
+ * merging. This and the next field are used as a support to
+ * be able to perform the bic lookup, needed by bio-merge
+ * functions, before the scheduler lock is taken, and thus
+ * avoid taking the request-queue lock while the scheduler
+ * lock is being held.
+ */
+ struct bfq_io_cq *bio_bic;
+ /* bfqq associated with the task issuing current bio for merging */
+ struct bfq_queue *bio_bfqq;
+};
+
+enum bfqq_state_flags {
+ BFQQF_just_created = 0, /* queue just allocated */
+ BFQQF_busy, /* has requests or is in service */
+ BFQQF_wait_request, /* waiting for a request */
+ BFQQF_non_blocking_wait_rq, /*
+ * waiting for a request
+ * without idling the device
+ */
+ BFQQF_fifo_expire, /* FIFO checked in this slice */
+ BFQQF_idle_window, /* slice idling enabled */
+ BFQQF_sync, /* synchronous queue */
+ BFQQF_IO_bound, /*
+ * bfqq has timed-out at least once
+ * having consumed at most 2/10 of
+ * its budget
+ */
+ BFQQF_in_large_burst, /*
+ * bfqq activated in a large burst,
+ * see comments to bfq_handle_burst.
+ */
+ BFQQF_softrt_update, /*
+ * may need softrt-next-start
+ * update
+ */
+ BFQQF_coop, /* bfqq is shared */
+ BFQQF_split_coop /* shared bfqq will be split */
+};
+
+#define BFQ_BFQQ_FNS(name) \
+void bfq_mark_bfqq_##name(struct bfq_queue *bfqq); \
+void bfq_clear_bfqq_##name(struct bfq_queue *bfqq); \
+int bfq_bfqq_##name(const struct bfq_queue *bfqq);
+
+BFQ_BFQQ_FNS(just_created);
+BFQ_BFQQ_FNS(busy);
+BFQ_BFQQ_FNS(wait_request);
+BFQ_BFQQ_FNS(non_blocking_wait_rq);
+BFQ_BFQQ_FNS(fifo_expire);
+BFQ_BFQQ_FNS(idle_window);
+BFQ_BFQQ_FNS(sync);
+BFQ_BFQQ_FNS(IO_bound);
+BFQ_BFQQ_FNS(in_large_burst);
+BFQ_BFQQ_FNS(coop);
+BFQ_BFQQ_FNS(split_coop);
+BFQ_BFQQ_FNS(softrt_update);
+#undef BFQ_BFQQ_FNS
+
+/* Expiration reasons. */
+enum bfqq_expiration {
+ BFQQE_TOO_IDLE = 0, /*
+ * queue has been idling for
+ * too long
+ */
+ BFQQE_BUDGET_TIMEOUT, /* budget took too long to be used */
+ BFQQE_BUDGET_EXHAUSTED, /* budget consumed */
+ BFQQE_NO_MORE_REQUESTS, /* the queue has no more requests */
+ BFQQE_PREEMPTED /* preemption in progress */
+};
+
+struct bfqg_stats {
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+ /* number of ios merged */
+ struct blkg_rwstat merged;
+ /* total time spent on device in ns, may not be accurate w/ queueing */
+ struct blkg_rwstat service_time;
+ /* total time spent waiting in scheduler queue in ns */
+ struct blkg_rwstat wait_time;
+ /* number of IOs queued up */
+ struct blkg_rwstat queued;
+ /* total disk time and nr sectors dispatched by this group */
+ struct blkg_stat time;
+ /* sum of number of ios queued across all samples */
+ struct blkg_stat avg_queue_size_sum;
+ /* count of samples taken for average */
+ struct blkg_stat avg_queue_size_samples;
+ /* how many times this group has been removed from service tree */
+ struct blkg_stat dequeue;
+ /* total time spent waiting for it to be assigned a timeslice. */
+ struct blkg_stat group_wait_time;
+ /* time spent idling for this blkcg_gq */
+ struct blkg_stat idle_time;
+ /* total time with empty current active q with other requests queued */
+ struct blkg_stat empty_time;
+ /* fields after this shouldn't be cleared on stat reset */
+ uint64_t start_group_wait_time;
+ uint64_t start_idle_time;
+ uint64_t start_empty_time;
+ uint16_t flags;
+#endif /* CONFIG_BFQ_GROUP_IOSCHED */
+};
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+
+/*
+ * struct bfq_group_data - per-blkcg storage for the blkio subsystem.
+ *
+ * @ps: @blkcg_policy_storage that this structure inherits
+ * @weight: weight of the bfq_group
+ */
+struct bfq_group_data {
+ /* must be the first member */
+ struct blkcg_policy_data pd;
+
+ unsigned int weight;
+};
+
+/**
+ * struct bfq_group - per (device, cgroup) data structure.
+ * @entity: schedulable entity to insert into the parent group sched_data.
+ * @sched_data: own sched_data, to contain child entities (they may be
+ * both bfq_queues and bfq_groups).
+ * @bfqd: the bfq_data for the device this group acts upon.
+ * @async_bfqq: array of async queues for all the tasks belonging to
+ * the group, one queue per ioprio value per ioprio_class,
+ * except for the idle class that has only one queue.
+ * @async_idle_bfqq: async queue for the idle class (ioprio is ignored).
+ * @my_entity: pointer to @entity, %NULL for the toplevel group; used
+ * to avoid too many special cases during group creation/
+ * migration.
+ * @stats: stats for this bfqg.
+ * @active_entities: number of active entities belonging to the group;
+ * unused for the root group. Used to know whether there
+ * are groups with more than one active @bfq_entity
+ * (see the comments to the function
+ * bfq_bfqq_may_idle()).
+ * @rq_pos_tree: rbtree sorted by next_request position, used when
+ * determining if two or more queues have interleaving
+ * requests (see bfq_find_close_cooperator()).
+ *
+ * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup
+ * there is a set of bfq_groups, each one collecting the lower-level
+ * entities belonging to the group that are acting on the same device.
+ *
+ * Locking works as follows:
+ * o @bfqd is protected by the queue lock, RCU is used to access it
+ * from the readers.
+ * o All the other fields are protected by the @bfqd queue lock.
+ */
+struct bfq_group {
+ /* must be the first member */
+ struct blkg_policy_data pd;
+
+ struct bfq_entity entity;
+ struct bfq_sched_data sched_data;
+
+ void *bfqd;
+
+ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
+ struct bfq_queue *async_idle_bfqq;
+
+ struct bfq_entity *my_entity;
+
+ int active_entities;
+
+ struct rb_root rq_pos_tree;
+
+ struct bfqg_stats stats;
+};
+
+#else
+struct bfq_group {
+ struct bfq_sched_data sched_data;
+
+ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
+ struct bfq_queue *async_idle_bfqq;
+
+ struct rb_root rq_pos_tree;
+};
+#endif
+
+struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity);
+
+/* --------------- main algorithm interface ----------------- */
+
+#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \
+ { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })
+
+extern const int bfq_timeout;
+
+struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync);
+void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync);
+struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic);
+void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);
+void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq);
+void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_entity *entity,
+ struct rb_root *root);
+void bfq_weights_tree_remove(struct bfq_data *bfqd, struct bfq_entity *entity,
+ struct rb_root *root);
+void bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+ bool compensate, enum bfqq_expiration reason);
+void bfq_put_queue(struct bfq_queue *bfqq);
+void bfq_end_wr_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
+void bfq_schedule_dispatch(struct bfq_data *bfqd);
+void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
+
+/* ------------ end of main algorithm interface -------------- */
+
+/* ---------------- cgroups-support interface ---------------- */
+
+void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq,
+ unsigned int op);
+void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op);
+void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op);
+void bfqg_stats_update_completion(struct bfq_group *bfqg, uint64_t start_time,
+ uint64_t io_start_time, unsigned int op);
+void bfqg_stats_update_dequeue(struct bfq_group *bfqg);
+void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg);
+void bfqg_stats_update_idle_time(struct bfq_group *bfqg);
+void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg);
+void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg);
+void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+ struct bfq_group *bfqg);
+
+void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg);
+void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio);
+void bfq_end_wr_async(struct bfq_data *bfqd);
+struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd,
+ struct blkcg *blkcg);
+struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg);
+struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
+struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node);
+void bfqg_put(struct bfq_group *bfqg);
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+extern struct cftype bfq_blkcg_legacy_files[];
+extern struct cftype bfq_blkg_files[];
+extern struct blkcg_policy blkcg_policy_bfq;
+#endif
+
+/* ------------- end of cgroups-support interface ------------- */
+
+/* - interface of the internal hierarchical B-WF2Q+ scheduler - */
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+/* both next loops stop at one of the child entities of the root group */
+#define for_each_entity(entity) \
+ for (; entity ; entity = entity->parent)
+
+/*
+ * For each iteration, compute parent in advance, so as to be safe if
+ * entity is deallocated during the iteration. Such a deallocation may
+ * happen as a consequence of a bfq_put_queue that frees the bfq_queue
+ * containing entity.
+ */
+#define for_each_entity_safe(entity, parent) \
+ for (; entity && ({ parent = entity->parent; 1; }); entity = parent)
+
+#else /* CONFIG_BFQ_GROUP_IOSCHED */
+/*
+ * Next two macros are fake loops when cgroups support is not
+ * enabled. I fact, in such a case, there is only one level to go up
+ * (to reach the root group).
+ */
+#define for_each_entity(entity) \
+ for (; entity ; entity = NULL)
+
+#define for_each_entity_safe(entity, parent) \
+ for (parent = NULL; entity ; entity = parent)
+#endif /* CONFIG_BFQ_GROUP_IOSCHED */
+
+struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq);
+struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity);
+struct bfq_service_tree *bfq_entity_service_tree(struct bfq_entity *entity);
+struct bfq_entity *bfq_entity_of(struct rb_node *node);
+unsigned short bfq_ioprio_to_weight(int ioprio);
+void bfq_put_idle_entity(struct bfq_service_tree *st,
+ struct bfq_entity *entity);
+struct bfq_service_tree *
+__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
+ struct bfq_entity *entity);
+void bfq_bfqq_served(struct bfq_queue *bfqq, int served);
+void bfq_bfqq_charge_time(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+ unsigned long time_ms);
+bool __bfq_deactivate_entity(struct bfq_entity *entity,
+ bool ins_into_idle_tree);
+bool next_queue_may_preempt(struct bfq_data *bfqd);
+struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd);
+void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd);
+void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+ bool ins_into_idle_tree, bool expiration);
+void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);
+void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);
+void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+ bool expiration);
+void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq);
+
+/* --------------- end of interface of B-WF2Q+ ---------------- */
+
+/* Logging facilities. */
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
+
+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \
+ char __pbuf[128]; \
+ \
+ blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \
+ blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, (bfqq)->pid, \
+ bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \
+ __pbuf, ##args); \
+} while (0)
+
+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \
+ char __pbuf[128]; \
+ \
+ blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \
+ blk_add_trace_msg((bfqd)->queue, "%s " fmt, __pbuf, ##args); \
+} while (0)
+
+#else /* CONFIG_BFQ_GROUP_IOSCHED */
+
+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \
+ blk_add_trace_msg((bfqd)->queue, "bfq%d%c " fmt, (bfqq)->pid, \
+ bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \
+ ##args)
+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0)
+
+#endif /* CONFIG_BFQ_GROUP_IOSCHED */
+
+#define bfq_log(bfqd, fmt, args...) \
+ blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args)
+
+#endif /* _BFQ_H */
diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c
new file mode 100644
index 000000000000..b4fc3e4260b7
--- /dev/null
+++ b/block/bfq-wf2q.c
@@ -0,0 +1,1616 @@
+/*
+ * Hierarchical Budget Worst-case Fair Weighted Fair Queueing
+ * (B-WF2Q+): hierarchical scheduling algorithm by which the BFQ I/O
+ * scheduler schedules generic entities. The latter can represent
+ * either single bfq queues (associated with processes) or groups of
+ * bfq queues (associated with cgroups).
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include "bfq-iosched.h"
+
+/**
+ * bfq_gt - compare two timestamps.
+ * @a: first ts.
+ * @b: second ts.
+ *
+ * Return @a > @b, dealing with wrapping correctly.
+ */
+static int bfq_gt(u64 a, u64 b)
+{
+ return (s64)(a - b) > 0;
+}
+
+static struct bfq_entity *bfq_root_active_entity(struct rb_root *tree)
+{
+ struct rb_node *node = tree->rb_node;
+
+ return rb_entry(node, struct bfq_entity, rb_node);
+}
+
+static unsigned int bfq_class_idx(struct bfq_entity *entity)
+{
+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+
+ return bfqq ? bfqq->ioprio_class - 1 :
+ BFQ_DEFAULT_GRP_CLASS - 1;
+}
+
+static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd);
+
+static bool bfq_update_parent_budget(struct bfq_entity *next_in_service);
+
+/**
+ * bfq_update_next_in_service - update sd->next_in_service
+ * @sd: sched_data for which to perform the update.
+ * @new_entity: if not NULL, pointer to the entity whose activation,
+ * requeueing or repositionig triggered the invocation of
+ * this function.
+ *
+ * This function is called to update sd->next_in_service, which, in
+ * its turn, may change as a consequence of the insertion or
+ * extraction of an entity into/from one of the active trees of
+ * sd. These insertions/extractions occur as a consequence of
+ * activations/deactivations of entities, with some activations being
+ * 'true' activations, and other activations being requeueings (i.e.,
+ * implementing the second, requeueing phase of the mechanism used to
+ * reposition an entity in its active tree; see comments on
+ * __bfq_activate_entity and __bfq_requeue_entity for details). In
+ * both the last two activation sub-cases, new_entity points to the
+ * just activated or requeued entity.
+ *
+ * Returns true if sd->next_in_service changes in such a way that
+ * entity->parent may become the next_in_service for its parent
+ * entity.
+ */
+static bool bfq_update_next_in_service(struct bfq_sched_data *sd,
+ struct bfq_entity *new_entity)
+{
+ struct bfq_entity *next_in_service = sd->next_in_service;
+ bool parent_sched_may_change = false;
+
+ /*
+ * If this update is triggered by the activation, requeueing
+ * or repositiong of an entity that does not coincide with
+ * sd->next_in_service, then a full lookup in the active tree
+ * can be avoided. In fact, it is enough to check whether the
+ * just-modified entity has a higher priority than
+ * sd->next_in_service, or, even if it has the same priority
+ * as sd->next_in_service, is eligible and has a lower virtual
+ * finish time than sd->next_in_service. If this compound
+ * condition holds, then the new entity becomes the new
+ * next_in_service. Otherwise no change is needed.
+ */
+ if (new_entity && new_entity != sd->next_in_service) {
+ /*
+ * Flag used to decide whether to replace
+ * sd->next_in_service with new_entity. Tentatively
+ * set to true, and left as true if
+ * sd->next_in_service is NULL.
+ */
+ bool replace_next = true;
+
+ /*
+ * If there is already a next_in_service candidate
+ * entity, then compare class priorities or timestamps
+ * to decide whether to replace sd->service_tree with
+ * new_entity.
+ */
+ if (next_in_service) {
+ unsigned int new_entity_class_idx =
+ bfq_class_idx(new_entity);
+ struct bfq_service_tree *st =
+ sd->service_tree + new_entity_class_idx;
+
+ /*
+ * For efficiency, evaluate the most likely
+ * sub-condition first.
+ */
+ replace_next =
+ (new_entity_class_idx ==
+ bfq_class_idx(next_in_service)
+ &&
+ !bfq_gt(new_entity->start, st->vtime)
+ &&
+ bfq_gt(next_in_service->finish,
+ new_entity->finish))
+ ||
+ new_entity_class_idx <
+ bfq_class_idx(next_in_service);
+ }
+
+ if (replace_next)
+ next_in_service = new_entity;
+ } else /* invoked because of a deactivation: lookup needed */
+ next_in_service = bfq_lookup_next_entity(sd);
+
+ if (next_in_service) {
+ parent_sched_may_change = !sd->next_in_service ||
+ bfq_update_parent_budget(next_in_service);
+ }
+
+ sd->next_in_service = next_in_service;
+
+ if (!next_in_service)
+ return parent_sched_may_change;
+
+ return parent_sched_may_change;
+}
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+
+struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq)
+{
+ struct bfq_entity *group_entity = bfqq->entity.parent;
+
+ if (!group_entity)
+ group_entity = &bfqq->bfqd->root_group->entity;
+
+ return container_of(group_entity, struct bfq_group, entity);
+}
+
+/*
+ * Returns true if this budget changes may let next_in_service->parent
+ * become the next_in_service entity for its parent entity.
+ */
+static bool bfq_update_parent_budget(struct bfq_entity *next_in_service)
+{
+ struct bfq_entity *bfqg_entity;
+ struct bfq_group *bfqg;
+ struct bfq_sched_data *group_sd;
+ bool ret = false;
+
+ group_sd = next_in_service->sched_data;
+
+ bfqg = container_of(group_sd, struct bfq_group, sched_data);
+ /*
+ * bfq_group's my_entity field is not NULL only if the group
+ * is not the root group. We must not touch the root entity
+ * as it must never become an in-service entity.
+ */
+ bfqg_entity = bfqg->my_entity;
+ if (bfqg_entity) {
+ if (bfqg_entity->budget > next_in_service->budget)
+ ret = true;
+ bfqg_entity->budget = next_in_service->budget;
+ }
+
+ return ret;
+}
+
+/*
+ * This function tells whether entity stops being a candidate for next
+ * service, according to the following logic.
+ *
+ * This function is invoked for an entity that is about to be set in
+ * service. If such an entity is a queue, then the entity is no longer
+ * a candidate for next service (i.e, a candidate entity to serve
+ * after the in-service entity is expired). The function then returns
+ * true.
+ *
+ * In contrast, the entity could stil be a candidate for next service
+ * if it is not a queue, and has more than one child. In fact, even if
+ * one of its children is about to be set in service, other children
+ * may still be the next to serve. As a consequence, a non-queue
+ * entity is not a candidate for next-service only if it has only one
+ * child. And only if this condition holds, then the function returns
+ * true for a non-queue entity.
+ */
+static bool bfq_no_longer_next_in_service(struct bfq_entity *entity)
+{
+ struct bfq_group *bfqg;
+
+ if (bfq_entity_to_bfqq(entity))
+ return true;
+
+ bfqg = container_of(entity, struct bfq_group, entity);
+
+ if (bfqg->active_entities == 1)
+ return true;
+
+ return false;
+}
+
+#else /* CONFIG_BFQ_GROUP_IOSCHED */
+
+struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq)
+{
+ return bfqq->bfqd->root_group;
+}
+
+static bool bfq_update_parent_budget(struct bfq_entity *next_in_service)
+{
+ return false;
+}
+
+static bool bfq_no_longer_next_in_service(struct bfq_entity *entity)
+{
+ return true;
+}
+
+#endif /* CONFIG_BFQ_GROUP_IOSCHED */
+
+/*
+ * Shift for timestamp calculations. This actually limits the maximum
+ * service allowed in one timestamp delta (small shift values increase it),
+ * the maximum total weight that can be used for the queues in the system
+ * (big shift values increase it), and the period of virtual time
+ * wraparounds.
+ */
+#define WFQ_SERVICE_SHIFT 22
+
+struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity)
+{
+ struct bfq_queue *bfqq = NULL;
+
+ if (!entity->my_sched_data)
+ bfqq = container_of(entity, struct bfq_queue, entity);
+
+ return bfqq;
+}
+
+
+/**
+ * bfq_delta - map service into the virtual time domain.
+ * @service: amount of service.
+ * @weight: scale factor (weight of an entity or weight sum).
+ */
+static u64 bfq_delta(unsigned long service, unsigned long weight)
+{
+ u64 d = (u64)service << WFQ_SERVICE_SHIFT;
+
+ do_div(d, weight);
+ return d;
+}
+
+/**
+ * bfq_calc_finish - assign the finish time to an entity.
+ * @entity: the entity to act upon.
+ * @service: the service to be charged to the entity.
+ */
+static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service)
+{
+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+
+ entity->finish = entity->start +
+ bfq_delta(service, entity->weight);
+
+ if (bfqq) {
+ bfq_log_bfqq(bfqq->bfqd, bfqq,
+ "calc_finish: serv %lu, w %d",
+ service, entity->weight);
+ bfq_log_bfqq(bfqq->bfqd, bfqq,
+ "calc_finish: start %llu, finish %llu, delta %llu",
+ entity->start, entity->finish,
+ bfq_delta(service, entity->weight));
+ }
+}
+
+/**
+ * bfq_entity_of - get an entity from a node.
+ * @node: the node field of the entity.
+ *
+ * Convert a node pointer to the relative entity. This is used only
+ * to simplify the logic of some functions and not as the generic
+ * conversion mechanism because, e.g., in the tree walking functions,
+ * the check for a %NULL value would be redundant.
+ */
+struct bfq_entity *bfq_entity_of(struct rb_node *node)
+{
+ struct bfq_entity *entity = NULL;
+
+ if (node)
+ entity = rb_entry(node, struct bfq_entity, rb_node);
+
+ return entity;
+}
+
+/**
+ * bfq_extract - remove an entity from a tree.
+ * @root: the tree root.
+ * @entity: the entity to remove.
+ */
+static void bfq_extract(struct rb_root *root, struct bfq_entity *entity)
+{
+ entity->tree = NULL;
+ rb_erase(&entity->rb_node, root);
+}
+
+/**
+ * bfq_idle_extract - extract an entity from the idle tree.
+ * @st: the service tree of the owning @entity.
+ * @entity: the entity being removed.
+ */
+static void bfq_idle_extract(struct bfq_service_tree *st,
+ struct bfq_entity *entity)
+{
+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+ struct rb_node *next;
+
+ if (entity == st->first_idle) {
+ next = rb_next(&entity->rb_node);
+ st->first_idle = bfq_entity_of(next);
+ }
+
+ if (entity == st->last_idle) {
+ next = rb_prev(&entity->rb_node);
+ st->last_idle = bfq_entity_of(next);
+ }
+
+ bfq_extract(&st->idle, entity);
+
+ if (bfqq)
+ list_del(&bfqq->bfqq_list);
+}
+
+/**
+ * bfq_insert - generic tree insertion.
+ * @root: tree root.
+ * @entity: entity to insert.
+ *
+ * This is used for the idle and the active tree, since they are both
+ * ordered by finish time.
+ */
+static void bfq_insert(struct rb_root *root, struct bfq_entity *entity)
+{
+ struct bfq_entity *entry;
+ struct rb_node **node = &root->rb_node;
+ struct rb_node *parent = NULL;
+
+ while (*node) {
+ parent = *node;
+ entry = rb_entry(parent, struct bfq_entity, rb_node);
+
+ if (bfq_gt(entry->finish, entity->finish))
+ node = &parent->rb_left;
+ else
+ node = &parent->rb_right;
+ }
+
+ rb_link_node(&entity->rb_node, parent, node);
+ rb_insert_color(&entity->rb_node, root);
+
+ entity->tree = root;
+}
+
+/**
+ * bfq_update_min - update the min_start field of a entity.
+ * @entity: the entity to update.
+ * @node: one of its children.
+ *
+ * This function is called when @entity may store an invalid value for
+ * min_start due to updates to the active tree. The function assumes
+ * that the subtree rooted at @node (which may be its left or its right
+ * child) has a valid min_start value.
+ */
+static void bfq_update_min(struct bfq_entity *entity, struct rb_node *node)
+{
+ struct bfq_entity *child;
+
+ if (node) {
+ child = rb_entry(node, struct bfq_entity, rb_node);
+ if (bfq_gt(entity->min_start, child->min_start))
+ entity->min_start = child->min_start;
+ }
+}
+
+/**
+ * bfq_update_active_node - recalculate min_start.
+ * @node: the node to update.
+ *
+ * @node may have changed position or one of its children may have moved,
+ * this function updates its min_start value. The left and right subtrees
+ * are assumed to hold a correct min_start value.
+ */
+static void bfq_update_active_node(struct rb_node *node)
+{
+ struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node);
+
+ entity->min_start = entity->start;
+ bfq_update_min(entity, node->rb_right);
+ bfq_update_min(entity, node->rb_left);
+}
+
+/**
+ * bfq_update_active_tree - update min_start for the whole active tree.
+ * @node: the starting node.
+ *
+ * @node must be the deepest modified node after an update. This function
+ * updates its min_start using the values held by its children, assuming
+ * that they did not change, and then updates all the nodes that may have
+ * changed in the path to the root. The only nodes that may have changed
+ * are the ones in the path or their siblings.
+ */
+static void bfq_update_active_tree(struct rb_node *node)
+{
+ struct rb_node *parent;
+
+up:
+ bfq_update_active_node(node);
+
+ parent = rb_parent(node);
+ if (!parent)
+ return;
+
+ if (node == parent->rb_left && parent->rb_right)
+ bfq_update_active_node(parent->rb_right);
+ else if (parent->rb_left)
+ bfq_update_active_node(parent->rb_left);
+
+ node = parent;
+ goto up;
+}
+
+/**
+ * bfq_active_insert - insert an entity in the active tree of its
+ * group/device.
+ * @st: the service tree of the entity.
+ * @entity: the entity being inserted.
+ *
+ * The active tree is ordered by finish time, but an extra key is kept
+ * per each node, containing the minimum value for the start times of
+ * its children (and the node itself), so it's possible to search for
+ * the eligible node with the lowest finish time in logarithmic time.
+ */
+static void bfq_active_insert(struct bfq_service_tree *st,
+ struct bfq_entity *entity)
+{
+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+ struct rb_node *node = &entity->rb_node;
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+ struct bfq_sched_data *sd = NULL;
+ struct bfq_group *bfqg = NULL;
+ struct bfq_data *bfqd = NULL;
+#endif
+
+ bfq_insert(&st->active, entity);
+
+ if (node->rb_left)
+ node = node->rb_left;
+ else if (node->rb_right)
+ node = node->rb_right;
+
+ bfq_update_active_tree(node);
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+ sd = entity->sched_data;
+ bfqg = container_of(sd, struct bfq_group, sched_data);
+ bfqd = (struct bfq_data *)bfqg->bfqd;
+#endif
+ if (bfqq)
+ list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+ else /* bfq_group */
+ bfq_weights_tree_add(bfqd, entity, &bfqd->group_weights_tree);
+
+ if (bfqg != bfqd->root_group)
+ bfqg->active_entities++;
+#endif
+}
+
+/**
+ * bfq_ioprio_to_weight - calc a weight from an ioprio.
+ * @ioprio: the ioprio value to convert.
+ */
+unsigned short bfq_ioprio_to_weight(int ioprio)
+{
+ return (IOPRIO_BE_NR - ioprio) * BFQ_WEIGHT_CONVERSION_COEFF;
+}
+
+/**
+ * bfq_weight_to_ioprio - calc an ioprio from a weight.
+ * @weight: the weight value to convert.
+ *
+ * To preserve as much as possible the old only-ioprio user interface,
+ * 0 is used as an escape ioprio value for weights (numerically) equal or
+ * larger than IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF.
+ */
+static unsigned short bfq_weight_to_ioprio(int weight)
+{
+ return max_t(int, 0,
+ IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight);
+}
+
+static void bfq_get_entity(struct bfq_entity *entity)
+{
+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+
+ if (bfqq) {
+ bfqq->ref++;
+ bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d",
+ bfqq, bfqq->ref);
+ }
+}
+
+/**
+ * bfq_find_deepest - find the deepest node that an extraction can modify.
+ * @node: the node being removed.
+ *
+ * Do the first step of an extraction in an rb tree, looking for the
+ * node that will replace @node, and returning the deepest node that
+ * the following modifications to the tree can touch. If @node is the
+ * last node in the tree return %NULL.
+ */
+static struct rb_node *bfq_find_deepest(struct rb_node *node)
+{
+ struct rb_node *deepest;
+
+ if (!node->rb_right && !node->rb_left)
+ deepest = rb_parent(node);
+ else if (!node->rb_right)
+ deepest = node->rb_left;
+ else if (!node->rb_left)
+ deepest = node->rb_right;
+ else {
+ deepest = rb_next(node);
+ if (deepest->rb_right)
+ deepest = deepest->rb_right;
+ else if (rb_parent(deepest) != node)
+ deepest = rb_parent(deepest);
+ }
+
+ return deepest;
+}
+
+/**
+ * bfq_active_extract - remove an entity from the active tree.
+ * @st: the service_tree containing the tree.
+ * @entity: the entity being removed.
+ */
+static void bfq_active_extract(struct bfq_service_tree *st,
+ struct bfq_entity *entity)
+{
+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+ struct rb_node *node;
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+ struct bfq_sched_data *sd = NULL;
+ struct bfq_group *bfqg = NULL;
+ struct bfq_data *bfqd = NULL;
+#endif
+
+ node = bfq_find_deepest(&entity->rb_node);
+ bfq_extract(&st->active, entity);
+
+ if (node)
+ bfq_update_active_tree(node);
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+ sd = entity->sched_data;
+ bfqg = container_of(sd, struct bfq_group, sched_data);
+ bfqd = (struct bfq_data *)bfqg->bfqd;
+#endif
+ if (bfqq)
+ list_del(&bfqq->bfqq_list);
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+ else /* bfq_group */
+ bfq_weights_tree_remove(bfqd, entity,
+ &bfqd->group_weights_tree);
+
+ if (bfqg != bfqd->root_group)
+ bfqg->active_entities--;
+#endif
+}
+
+/**
+ * bfq_idle_insert - insert an entity into the idle tree.
+ * @st: the service tree containing the tree.
+ * @entity: the entity to insert.
+ */
+static void bfq_idle_insert(struct bfq_service_tree *st,
+ struct bfq_entity *entity)
+{
+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+ struct bfq_entity *first_idle = st->first_idle;
+ struct bfq_entity *last_idle = st->last_idle;
+
+ if (!first_idle || bfq_gt(first_idle->finish, entity->finish))
+ st->first_idle = entity;
+ if (!last_idle || bfq_gt(entity->finish, last_idle->finish))
+ st->last_idle = entity;
+
+ bfq_insert(&st->idle, entity);
+
+ if (bfqq)
+ list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list);
+}
+
+/**
+ * bfq_forget_entity - do not consider entity any longer for scheduling
+ * @st: the service tree.
+ * @entity: the entity being removed.
+ * @is_in_service: true if entity is currently the in-service entity.
+ *
+ * Forget everything about @entity. In addition, if entity represents
+ * a queue, and the latter is not in service, then release the service
+ * reference to the queue (the one taken through bfq_get_entity). In
+ * fact, in this case, there is really no more service reference to
+ * the queue, as the latter is also outside any service tree. If,
+ * instead, the queue is in service, then __bfq_bfqd_reset_in_service
+ * will take care of putting the reference when the queue finally
+ * stops being served.
+ */
+static void bfq_forget_entity(struct bfq_service_tree *st,
+ struct bfq_entity *entity,
+ bool is_in_service)
+{
+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+
+ entity->on_st = false;
+ st->wsum -= entity->weight;
+ if (bfqq && !is_in_service)
+ bfq_put_queue(bfqq);
+}
+
+/**
+ * bfq_put_idle_entity - release the idle tree ref of an entity.
+ * @st: service tree for the entity.
+ * @entity: the entity being released.
+ */
+void bfq_put_idle_entity(struct bfq_service_tree *st, struct bfq_entity *entity)
+{
+ bfq_idle_extract(st, entity);
+ bfq_forget_entity(st, entity,
+ entity == entity->sched_data->in_service_entity);
+}
+
+/**
+ * bfq_forget_idle - update the idle tree if necessary.
+ * @st: the service tree to act upon.
+ *
+ * To preserve the global O(log N) complexity we only remove one entry here;
+ * as the idle tree will not grow indefinitely this can be done safely.
+ */
+static void bfq_forget_idle(struct bfq_service_tree *st)
+{
+ struct bfq_entity *first_idle = st->first_idle;
+ struct bfq_entity *last_idle = st->last_idle;
+
+ if (RB_EMPTY_ROOT(&st->active) && last_idle &&
+ !bfq_gt(last_idle->finish, st->vtime)) {
+ /*
+ * Forget the whole idle tree, increasing the vtime past
+ * the last finish time of idle entities.
+ */
+ st->vtime = last_idle->finish;
+ }
+
+ if (first_idle && !bfq_gt(first_idle->finish, st->vtime))
+ bfq_put_idle_entity(st, first_idle);
+}
+
+struct bfq_service_tree *bfq_entity_service_tree(struct bfq_entity *entity)
+{
+ struct bfq_sched_data *sched_data = entity->sched_data;
+ unsigned int idx = bfq_class_idx(entity);
+
+ return sched_data->service_tree + idx;
+}
+
+
+struct bfq_service_tree *
+__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
+ struct bfq_entity *entity)
+{
+ struct bfq_service_tree *new_st = old_st;
+
+ if (entity->prio_changed) {
+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+ unsigned int prev_weight, new_weight;
+ struct bfq_data *bfqd = NULL;
+ struct rb_root *root;
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+ struct bfq_sched_data *sd;
+ struct bfq_group *bfqg;
+#endif
+
+ if (bfqq)
+ bfqd = bfqq->bfqd;
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+ else {
+ sd = entity->my_sched_data;
+ bfqg = container_of(sd, struct bfq_group, sched_data);
+ bfqd = (struct bfq_data *)bfqg->bfqd;
+ }
+#endif
+
+ old_st->wsum -= entity->weight;
+
+ if (entity->new_weight != entity->orig_weight) {
+ if (entity->new_weight < BFQ_MIN_WEIGHT ||
+ entity->new_weight > BFQ_MAX_WEIGHT) {
+ pr_crit("update_weight_prio: new_weight %d\n",
+ entity->new_weight);
+ if (entity->new_weight < BFQ_MIN_WEIGHT)
+ entity->new_weight = BFQ_MIN_WEIGHT;
+ else
+ entity->new_weight = BFQ_MAX_WEIGHT;
+ }
+ entity->orig_weight = entity->new_weight;
+ if (bfqq)
+ bfqq->ioprio =
+ bfq_weight_to_ioprio(entity->orig_weight);
+ }
+
+ if (bfqq)
+ bfqq->ioprio_class = bfqq->new_ioprio_class;
+ entity->prio_changed = 0;
+
+ /*
+ * NOTE: here we may be changing the weight too early,
+ * this will cause unfairness. The correct approach
+ * would have required additional complexity to defer
+ * weight changes to the proper time instants (i.e.,
+ * when entity->finish <= old_st->vtime).
+ */
+ new_st = bfq_entity_service_tree(entity);
+
+ prev_weight = entity->weight;
+ new_weight = entity->orig_weight *
+ (bfqq ? bfqq->wr_coeff : 1);
+ /*
+ * If the weight of the entity changes, remove the entity
+ * from its old weight counter (if there is a counter
+ * associated with the entity), and add it to the counter
+ * associated with its new weight.
+ */
+ if (prev_weight != new_weight) {
+ root = bfqq ? &bfqd->queue_weights_tree :
+ &bfqd->group_weights_tree;
+ bfq_weights_tree_remove(bfqd, entity, root);
+ }
+ entity->weight = new_weight;
+ /*
+ * Add the entity to its weights tree only if it is
+ * not associated with a weight-raised queue.
+ */
+ if (prev_weight != new_weight &&
+ (bfqq ? bfqq->wr_coeff == 1 : 1))
+ /* If we get here, root has been initialized. */
+ bfq_weights_tree_add(bfqd, entity, root);
+
+ new_st->wsum += entity->weight;
+
+ if (new_st != old_st)
+ entity->start = new_st->vtime;
+ }
+
+ return new_st;
+}
+
+/**
+ * bfq_bfqq_served - update the scheduler status after selection for
+ * service.
+ * @bfqq: the queue being served.
+ * @served: bytes to transfer.
+ *
+ * NOTE: this can be optimized, as the timestamps of upper level entities
+ * are synchronized every time a new bfqq is selected for service. By now,
+ * we keep it to better check consistency.
+ */
+void bfq_bfqq_served(struct bfq_queue *bfqq, int served)
+{
+ struct bfq_entity *entity = &bfqq->entity;
+ struct bfq_service_tree *st;
+
+ for_each_entity(entity) {
+ st = bfq_entity_service_tree(entity);
+
+ entity->service += served;
+
+ st->vtime += bfq_delta(served, st->wsum);
+ bfq_forget_idle(st);
+ }
+ bfqg_stats_set_start_empty_time(bfqq_group(bfqq));
+ bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs", served);
+}
+
+/**
+ * bfq_bfqq_charge_time - charge an amount of service equivalent to the length
+ * of the time interval during which bfqq has been in
+ * service.
+ * @bfqd: the device
+ * @bfqq: the queue that needs a service update.
+ * @time_ms: the amount of time during which the queue has received service
+ *
+ * If a queue does not consume its budget fast enough, then providing
+ * the queue with service fairness may impair throughput, more or less
+ * severely. For this reason, queues that consume their budget slowly
+ * are provided with time fairness instead of service fairness. This
+ * goal is achieved through the BFQ scheduling engine, even if such an
+ * engine works in the service, and not in the time domain. The trick
+ * is charging these queues with an inflated amount of service, equal
+ * to the amount of service that they would have received during their
+ * service slot if they had been fast, i.e., if their requests had
+ * been dispatched at a rate equal to the estimated peak rate.
+ *
+ * It is worth noting that time fairness can cause important
+ * distortions in terms of bandwidth distribution, on devices with
+ * internal queueing. The reason is that I/O requests dispatched
+ * during the service slot of a queue may be served after that service
+ * slot is finished, and may have a total processing time loosely
+ * correlated with the duration of the service slot. This is
+ * especially true for short service slots.
+ */
+void bfq_bfqq_charge_time(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+ unsigned long time_ms)
+{
+ struct bfq_entity *entity = &bfqq->entity;
+ int tot_serv_to_charge = entity->service;
+ unsigned int timeout_ms = jiffies_to_msecs(bfq_timeout);
+
+ if (time_ms > 0 && time_ms < timeout_ms)
+ tot_serv_to_charge =
+ (bfqd->bfq_max_budget * time_ms) / timeout_ms;
+
+ if (tot_serv_to_charge < entity->service)
+ tot_serv_to_charge = entity->service;
+
+ /* Increase budget to avoid inconsistencies */
+ if (tot_serv_to_charge > entity->budget)
+ entity->budget = tot_serv_to_charge;
+
+ bfq_bfqq_served(bfqq,
+ max_t(int, 0, tot_serv_to_charge - entity->service));
+}
+
+static void bfq_update_fin_time_enqueue(struct bfq_entity *entity,
+ struct bfq_service_tree *st,
+ bool backshifted)
+{
+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+
+ st = __bfq_entity_update_weight_prio(st, entity);
+ bfq_calc_finish(entity, entity->budget);
+
+ /*
+ * If some queues enjoy backshifting for a while, then their
+ * (virtual) finish timestamps may happen to become lower and
+ * lower than the system virtual time. In particular, if
+ * these queues often happen to be idle for short time
+ * periods, and during such time periods other queues with
+ * higher timestamps happen to be busy, then the backshifted
+ * timestamps of the former queues can become much lower than
+ * the system virtual time. In fact, to serve the queues with
+ * higher timestamps while the ones with lower timestamps are
+ * idle, the system virtual time may be pushed-up to much
+ * higher values than the finish timestamps of the idle
+ * queues. As a consequence, the finish timestamps of all new
+ * or newly activated queues may end up being much larger than
+ * those of lucky queues with backshifted timestamps. The
+ * latter queues may then monopolize the device for a lot of
+ * time. This would simply break service guarantees.
+ *
+ * To reduce this problem, push up a little bit the
+ * backshifted timestamps of the queue associated with this
+ * entity (only a queue can happen to have the backshifted
+ * flag set): just enough to let the finish timestamp of the
+ * queue be equal to the current value of the system virtual
+ * time. This may introduce a little unfairness among queues
+ * with backshifted timestamps, but it does not break
+ * worst-case fairness guarantees.
+ *
+ * As a special case, if bfqq is weight-raised, push up
+ * timestamps much less, to keep very low the probability that
+ * this push up causes the backshifted finish timestamps of
+ * weight-raised queues to become higher than the backshifted
+ * finish timestamps of non weight-raised queues.
+ */
+ if (backshifted && bfq_gt(st->vtime, entity->finish)) {
+ unsigned long delta = st->vtime - entity->finish;
+
+ if (bfqq)
+ delta /= bfqq->wr_coeff;
+
+ entity->start += delta;
+ entity->finish += delta;
+ }
+
+ bfq_active_insert(st, entity);
+}
+
+/**
+ * __bfq_activate_entity - handle activation of entity.
+ * @entity: the entity being activated.
+ * @non_blocking_wait_rq: true if entity was waiting for a request
+ *
+ * Called for a 'true' activation, i.e., if entity is not active and
+ * one of its children receives a new request.
+ *
+ * Basically, this function updates the timestamps of entity and
+ * inserts entity into its active tree, ater possible extracting it
+ * from its idle tree.
+ */
+static void __bfq_activate_entity(struct bfq_entity *entity,
+ bool non_blocking_wait_rq)
+{
+ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
+ bool backshifted = false;
+ unsigned long long min_vstart;
+
+ /* See comments on bfq_fqq_update_budg_for_activation */
+ if (non_blocking_wait_rq && bfq_gt(st->vtime, entity->finish)) {
+ backshifted = true;
+ min_vstart = entity->finish;
+ } else
+ min_vstart = st->vtime;
+
+ if (entity->tree == &st->idle) {
+ /*
+ * Must be on the idle tree, bfq_idle_extract() will
+ * check for that.
+ */
+ bfq_idle_extract(st, entity);
+ entity->start = bfq_gt(min_vstart, entity->finish) ?
+ min_vstart : entity->finish;
+ } else {
+ /*
+ * The finish time of the entity may be invalid, and
+ * it is in the past for sure, otherwise the queue
+ * would have been on the idle tree.
+ */
+ entity->start = min_vstart;
+ st->wsum += entity->weight;
+ /*
+ * entity is about to be inserted into a service tree,
+ * and then set in service: get a reference to make
+ * sure entity does not disappear until it is no
+ * longer in service or scheduled for service.
+ */
+ bfq_get_entity(entity);
+
+ entity->on_st = true;
+ }
+
+ bfq_update_fin_time_enqueue(entity, st, backshifted);
+}
+
+/**
+ * __bfq_requeue_entity - handle requeueing or repositioning of an entity.
+ * @entity: the entity being requeued or repositioned.
+ *
+ * Requeueing is needed if this entity stops being served, which
+ * happens if a leaf descendant entity has expired. On the other hand,
+ * repositioning is needed if the next_inservice_entity for the child
+ * entity has changed. See the comments inside the function for
+ * details.
+ *
+ * Basically, this function: 1) removes entity from its active tree if
+ * present there, 2) updates the timestamps of entity and 3) inserts
+ * entity back into its active tree (in the new, right position for
+ * the new values of the timestamps).
+ */
+static void __bfq_requeue_entity(struct bfq_entity *entity)
+{
+ struct bfq_sched_data *sd = entity->sched_data;
+ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
+
+ if (entity == sd->in_service_entity) {
+ /*
+ * We are requeueing the current in-service entity,
+ * which may have to be done for one of the following
+ * reasons:
+ * - entity represents the in-service queue, and the
+ * in-service queue is being requeued after an
+ * expiration;
+ * - entity represents a group, and its budget has
+ * changed because one of its child entities has
+ * just been either activated or requeued for some
+ * reason; the timestamps of the entity need then to
+ * be updated, and the entity needs to be enqueued
+ * or repositioned accordingly.
+ *
+ * In particular, before requeueing, the start time of
+ * the entity must be moved forward to account for the
+ * service that the entity has received while in
+ * service. This is done by the next instructions. The
+ * finish time will then be updated according to this
+ * new value of the start time, and to the budget of
+ * the entity.
+ */
+ bfq_calc_finish(entity, entity->service);
+ entity->start = entity->finish;
+ /*
+ * In addition, if the entity had more than one child
+ * when set in service, then was not extracted from
+ * the active tree. This implies that the position of
+ * the entity in the active tree may need to be
+ * changed now, because we have just updated the start
+ * time of the entity, and we will update its finish
+ * time in a moment (the requeueing is then, more
+ * precisely, a repositioning in this case). To
+ * implement this repositioning, we: 1) dequeue the
+ * entity here, 2) update the finish time and
+ * requeue the entity according to the new
+ * timestamps below.
+ */
+ if (entity->tree)
+ bfq_active_extract(st, entity);
+ } else { /* The entity is already active, and not in service */
+ /*
+ * In this case, this function gets called only if the
+ * next_in_service entity below this entity has
+ * changed, and this change has caused the budget of
+ * this entity to change, which, finally implies that
+ * the finish time of this entity must be
+ * updated. Such an update may cause the scheduling,
+ * i.e., the position in the active tree, of this
+ * entity to change. We handle this change by: 1)
+ * dequeueing the entity here, 2) updating the finish
+ * time and requeueing the entity according to the new
+ * timestamps below. This is the same approach as the
+ * non-extracted-entity sub-case above.
+ */
+ bfq_active_extract(st, entity);
+ }
+
+ bfq_update_fin_time_enqueue(entity, st, false);
+}
+
+static void __bfq_activate_requeue_entity(struct bfq_entity *entity,
+ struct bfq_sched_data *sd,
+ bool non_blocking_wait_rq)
+{
+ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
+
+ if (sd->in_service_entity == entity || entity->tree == &st->active)
+ /*
+ * in service or already queued on the active tree,
+ * requeue or reposition
+ */
+ __bfq_requeue_entity(entity);
+ else
+ /*
+ * Not in service and not queued on its active tree:
+ * the activity is idle and this is a true activation.
+ */
+ __bfq_activate_entity(entity, non_blocking_wait_rq);
+}
+
+
+/**
+ * bfq_activate_entity - activate or requeue an entity representing a bfq_queue,
+ * and activate, requeue or reposition all ancestors
+ * for which such an update becomes necessary.
+ * @entity: the entity to activate.
+ * @non_blocking_wait_rq: true if this entity was waiting for a request
+ * @requeue: true if this is a requeue, which implies that bfqq is
+ * being expired; thus ALL its ancestors stop being served and must
+ * therefore be requeued
+ */
+static void bfq_activate_requeue_entity(struct bfq_entity *entity,
+ bool non_blocking_wait_rq,
+ bool requeue)
+{
+ struct bfq_sched_data *sd;
+
+ for_each_entity(entity) {
+ sd = entity->sched_data;
+ __bfq_activate_requeue_entity(entity, sd, non_blocking_wait_rq);
+
+ if (!bfq_update_next_in_service(sd, entity) && !requeue)
+ break;
+ }
+}
+
+/**
+ * __bfq_deactivate_entity - deactivate an entity from its service tree.
+ * @entity: the entity to deactivate.
+ * @ins_into_idle_tree: if false, the entity will not be put into the
+ * idle tree.
+ *
+ * Deactivates an entity, independently from its previous state. Must
+ * be invoked only if entity is on a service tree. Extracts the entity
+ * from that tree, and if necessary and allowed, puts it on the idle
+ * tree.
+ */
+bool __bfq_deactivate_entity(struct bfq_entity *entity, bool ins_into_idle_tree)
+{
+ struct bfq_sched_data *sd = entity->sched_data;
+ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
+ int is_in_service = entity == sd->in_service_entity;
+
+ if (!entity->on_st) /* entity never activated, or already inactive */
+ return false;
+
+ if (is_in_service)
+ bfq_calc_finish(entity, entity->service);
+
+ if (entity->tree == &st->active)
+ bfq_active_extract(st, entity);
+ else if (!is_in_service && entity->tree == &st->idle)
+ bfq_idle_extract(st, entity);
+
+ if (!ins_into_idle_tree || !bfq_gt(entity->finish, st->vtime))
+ bfq_forget_entity(st, entity, is_in_service);
+ else
+ bfq_idle_insert(st, entity);
+
+ return true;
+}
+
+/**
+ * bfq_deactivate_entity - deactivate an entity representing a bfq_queue.
+ * @entity: the entity to deactivate.
+ * @ins_into_idle_tree: true if the entity can be put on the idle tree
+ */
+static void bfq_deactivate_entity(struct bfq_entity *entity,
+ bool ins_into_idle_tree,
+ bool expiration)
+{
+ struct bfq_sched_data *sd;
+ struct bfq_entity *parent = NULL;
+
+ for_each_entity_safe(entity, parent) {
+ sd = entity->sched_data;
+
+ if (!__bfq_deactivate_entity(entity, ins_into_idle_tree)) {
+ /*
+ * entity is not in any tree any more, so
+ * this deactivation is a no-op, and there is
+ * nothing to change for upper-level entities
+ * (in case of expiration, this can never
+ * happen).
+ */
+ return;
+ }
+
+ if (sd->next_in_service == entity)
+ /*
+ * entity was the next_in_service entity,
+ * then, since entity has just been
+ * deactivated, a new one must be found.
+ */
+ bfq_update_next_in_service(sd, NULL);
+
+ if (sd->next_in_service)
+ /*
+ * The parent entity is still backlogged,
+ * because next_in_service is not NULL. So, no
+ * further upwards deactivation must be
+ * performed. Yet, next_in_service has
+ * changed. Then the schedule does need to be
+ * updated upwards.
+ */
+ break;
+
+ /*
+ * If we get here, then the parent is no more
+ * backlogged and we need to propagate the
+ * deactivation upwards. Thus let the loop go on.
+ */
+
+ /*
+ * Also let parent be queued into the idle tree on
+ * deactivation, to preserve service guarantees, and
+ * assuming that who invoked this function does not
+ * need parent entities too to be removed completely.
+ */
+ ins_into_idle_tree = true;
+ }
+
+ /*
+ * If the deactivation loop is fully executed, then there are
+ * no more entities to touch and next loop is not executed at
+ * all. Otherwise, requeue remaining entities if they are
+ * about to stop receiving service, or reposition them if this
+ * is not the case.
+ */
+ entity = parent;
+ for_each_entity(entity) {
+ /*
+ * Invoke __bfq_requeue_entity on entity, even if
+ * already active, to requeue/reposition it in the
+ * active tree (because sd->next_in_service has
+ * changed)
+ */
+ __bfq_requeue_entity(entity);
+
+ sd = entity->sched_data;
+ if (!bfq_update_next_in_service(sd, entity) &&
+ !expiration)
+ /*
+ * next_in_service unchanged or not causing
+ * any change in entity->parent->sd, and no
+ * requeueing needed for expiration: stop
+ * here.
+ */
+ break;
+ }
+}
+
+/**
+ * bfq_calc_vtime_jump - compute the value to which the vtime should jump,
+ * if needed, to have at least one entity eligible.
+ * @st: the service tree to act upon.
+ *
+ * Assumes that st is not empty.
+ */
+static u64 bfq_calc_vtime_jump(struct bfq_service_tree *st)
+{
+ struct bfq_entity *root_entity = bfq_root_active_entity(&st->active);
+
+ if (bfq_gt(root_entity->min_start, st->vtime))
+ return root_entity->min_start;
+
+ return st->vtime;
+}
+
+static void bfq_update_vtime(struct bfq_service_tree *st, u64 new_value)
+{
+ if (new_value > st->vtime) {
+ st->vtime = new_value;
+ bfq_forget_idle(st);
+ }
+}
+
+/**
+ * bfq_first_active_entity - find the eligible entity with
+ * the smallest finish time
+ * @st: the service tree to select from.
+ * @vtime: the system virtual to use as a reference for eligibility
+ *
+ * This function searches the first schedulable entity, starting from the
+ * root of the tree and going on the left every time on this side there is
+ * a subtree with at least one eligible (start >= vtime) entity. The path on
+ * the right is followed only if a) the left subtree contains no eligible
+ * entities and b) no eligible entity has been found yet.
+ */
+static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st,
+ u64 vtime)
+{
+ struct bfq_entity *entry, *first = NULL;
+ struct rb_node *node = st->active.rb_node;
+
+ while (node) {
+ entry = rb_entry(node, struct bfq_entity, rb_node);
+left:
+ if (!bfq_gt(entry->start, vtime))
+ first = entry;
+
+ if (node->rb_left) {
+ entry = rb_entry(node->rb_left,
+ struct bfq_entity, rb_node);
+ if (!bfq_gt(entry->min_start, vtime)) {
+ node = node->rb_left;
+ goto left;
+ }
+ }
+ if (first)
+ break;
+ node = node->rb_right;
+ }
+
+ return first;
+}
+
+/**
+ * __bfq_lookup_next_entity - return the first eligible entity in @st.
+ * @st: the service tree.
+ *
+ * If there is no in-service entity for the sched_data st belongs to,
+ * then return the entity that will be set in service if:
+ * 1) the parent entity this st belongs to is set in service;
+ * 2) no entity belonging to such parent entity undergoes a state change
+ * that would influence the timestamps of the entity (e.g., becomes idle,
+ * becomes backlogged, changes its budget, ...).
+ *
+ * In this first case, update the virtual time in @st too (see the
+ * comments on this update inside the function).
+ *
+ * In constrast, if there is an in-service entity, then return the
+ * entity that would be set in service if not only the above
+ * conditions, but also the next one held true: the currently
+ * in-service entity, on expiration,
+ * 1) gets a finish time equal to the current one, or
+ * 2) is not eligible any more, or
+ * 3) is idle.
+ */
+static struct bfq_entity *
+__bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service)
+{
+ struct bfq_entity *entity;
+ u64 new_vtime;
+
+ if (RB_EMPTY_ROOT(&st->active))
+ return NULL;
+
+ /*
+ * Get the value of the system virtual time for which at
+ * least one entity is eligible.
+ */
+ new_vtime = bfq_calc_vtime_jump(st);
+
+ /*
+ * If there is no in-service entity for the sched_data this
+ * active tree belongs to, then push the system virtual time
+ * up to the value that guarantees that at least one entity is
+ * eligible. If, instead, there is an in-service entity, then
+ * do not make any such update, because there is already an
+ * eligible entity, namely the in-service one (even if the
+ * entity is not on st, because it was extracted when set in
+ * service).
+ */
+ if (!in_service)
+ bfq_update_vtime(st, new_vtime);
+
+ entity = bfq_first_active_entity(st, new_vtime);
+
+ return entity;
+}
+
+/**
+ * bfq_lookup_next_entity - return the first eligible entity in @sd.
+ * @sd: the sched_data.
+ *
+ * This function is invoked when there has been a change in the trees
+ * for sd, and we need know what is the new next entity after this
+ * change.
+ */
+static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd)
+{
+ struct bfq_service_tree *st = sd->service_tree;
+ struct bfq_service_tree *idle_class_st = st + (BFQ_IOPRIO_CLASSES - 1);
+ struct bfq_entity *entity = NULL;
+ int class_idx = 0;
+
+ /*
+ * Choose from idle class, if needed to guarantee a minimum
+ * bandwidth to this class (and if there is some active entity
+ * in idle class). This should also mitigate
+ * priority-inversion problems in case a low priority task is
+ * holding file system resources.
+ */
+ if (time_is_before_jiffies(sd->bfq_class_idle_last_service +
+ BFQ_CL_IDLE_TIMEOUT)) {
+ if (!RB_EMPTY_ROOT(&idle_class_st->active))
+ class_idx = BFQ_IOPRIO_CLASSES - 1;
+ /* About to be served if backlogged, or not yet backlogged */
+ sd->bfq_class_idle_last_service = jiffies;
+ }
+
+ /*
+ * Find the next entity to serve for the highest-priority
+ * class, unless the idle class needs to be served.
+ */
+ for (; class_idx < BFQ_IOPRIO_CLASSES; class_idx++) {
+ entity = __bfq_lookup_next_entity(st + class_idx,
+ sd->in_service_entity);
+
+ if (entity)
+ break;
+ }
+
+ if (!entity)
+ return NULL;
+
+ return entity;
+}
+
+bool next_queue_may_preempt(struct bfq_data *bfqd)
+{
+ struct bfq_sched_data *sd = &bfqd->root_group->sched_data;
+
+ return sd->next_in_service != sd->in_service_entity;
+}
+
+/*
+ * Get next queue for service.
+ */
+struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
+{
+ struct bfq_entity *entity = NULL;
+ struct bfq_sched_data *sd;
+ struct bfq_queue *bfqq;
+
+ if (bfqd->busy_queues == 0)
+ return NULL;
+
+ /*
+ * Traverse the path from the root to the leaf entity to
+ * serve. Set in service all the entities visited along the
+ * way.
+ */
+ sd = &bfqd->root_group->sched_data;
+ for (; sd ; sd = entity->my_sched_data) {
+ /*
+ * WARNING. We are about to set the in-service entity
+ * to sd->next_in_service, i.e., to the (cached) value
+ * returned by bfq_lookup_next_entity(sd) the last
+ * time it was invoked, i.e., the last time when the
+ * service order in sd changed as a consequence of the
+ * activation or deactivation of an entity. In this
+ * respect, if we execute bfq_lookup_next_entity(sd)
+ * in this very moment, it may, although with low
+ * probability, yield a different entity than that
+ * pointed to by sd->next_in_service. This rare event
+ * happens in case there was no CLASS_IDLE entity to
+ * serve for sd when bfq_lookup_next_entity(sd) was
+ * invoked for the last time, while there is now one
+ * such entity.
+ *
+ * If the above event happens, then the scheduling of
+ * such entity in CLASS_IDLE is postponed until the
+ * service of the sd->next_in_service entity
+ * finishes. In fact, when the latter is expired,
+ * bfq_lookup_next_entity(sd) gets called again,
+ * exactly to update sd->next_in_service.
+ */
+
+ /* Make next_in_service entity become in_service_entity */
+ entity = sd->next_in_service;
+ sd->in_service_entity = entity;
+
+ /*
+ * Reset the accumulator of the amount of service that
+ * the entity is about to receive.
+ */
+ entity->service = 0;
+
+ /*
+ * If entity is no longer a candidate for next
+ * service, then we extract it from its active tree,
+ * for the following reason. To further boost the
+ * throughput in some special case, BFQ needs to know
+ * which is the next candidate entity to serve, while
+ * there is already an entity in service. In this
+ * respect, to make it easy to compute/update the next
+ * candidate entity to serve after the current
+ * candidate has been set in service, there is a case
+ * where it is necessary to extract the current
+ * candidate from its service tree. Such a case is
+ * when the entity just set in service cannot be also
+ * a candidate for next service. Details about when
+ * this conditions holds are reported in the comments
+ * on the function bfq_no_longer_next_in_service()
+ * invoked below.
+ */
+ if (bfq_no_longer_next_in_service(entity))
+ bfq_active_extract(bfq_entity_service_tree(entity),
+ entity);
+
+ /*
+ * For the same reason why we may have just extracted
+ * entity from its active tree, we may need to update
+ * next_in_service for the sched_data of entity too,
+ * regardless of whether entity has been extracted.
+ * In fact, even if entity has not been extracted, a
+ * descendant entity may get extracted. Such an event
+ * would cause a change in next_in_service for the
+ * level of the descendant entity, and thus possibly
+ * back to upper levels.
+ *
+ * We cannot perform the resulting needed update
+ * before the end of this loop, because, to know which
+ * is the correct next-to-serve candidate entity for
+ * each level, we need first to find the leaf entity
+ * to set in service. In fact, only after we know
+ * which is the next-to-serve leaf entity, we can
+ * discover whether the parent entity of the leaf
+ * entity becomes the next-to-serve, and so on.
+ */
+
+ }
+
+ bfqq = bfq_entity_to_bfqq(entity);
+
+ /*
+ * We can finally update all next-to-serve entities along the
+ * path from the leaf entity just set in service to the root.
+ */
+ for_each_entity(entity) {
+ struct bfq_sched_data *sd = entity->sched_data;
+
+ if (!bfq_update_next_in_service(sd, NULL))
+ break;
+ }
+
+ return bfqq;
+}
+
+void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
+{
+ struct bfq_queue *in_serv_bfqq = bfqd->in_service_queue;
+ struct bfq_entity *in_serv_entity = &in_serv_bfqq->entity;
+ struct bfq_entity *entity = in_serv_entity;
+
+ bfq_clear_bfqq_wait_request(in_serv_bfqq);
+ hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
+ bfqd->in_service_queue = NULL;
+
+ /*
+ * When this function is called, all in-service entities have
+ * been properly deactivated or requeued, so we can safely
+ * execute the final step: reset in_service_entity along the
+ * path from entity to the root.
+ */
+ for_each_entity(entity)
+ entity->sched_data->in_service_entity = NULL;
+
+ /*
+ * in_serv_entity is no longer in service, so, if it is in no
+ * service tree either, then release the service reference to
+ * the queue it represents (taken with bfq_get_entity).
+ */
+ if (!in_serv_entity->on_st)
+ bfq_put_queue(in_serv_bfqq);
+}
+
+void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+ bool ins_into_idle_tree, bool expiration)
+{
+ struct bfq_entity *entity = &bfqq->entity;
+
+ bfq_deactivate_entity(entity, ins_into_idle_tree, expiration);
+}
+
+void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+ struct bfq_entity *entity = &bfqq->entity;
+
+ bfq_activate_requeue_entity(entity, bfq_bfqq_non_blocking_wait_rq(bfqq),
+ false);
+ bfq_clear_bfqq_non_blocking_wait_rq(bfqq);
+}
+
+void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+ struct bfq_entity *entity = &bfqq->entity;
+
+ bfq_activate_requeue_entity(entity, false,
+ bfqq == bfqd->in_service_queue);
+}
+
+/*
+ * Called when the bfqq no longer has requests pending, remove it from
+ * the service tree. As a special case, it can be invoked during an
+ * expiration.
+ */
+void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+ bool expiration)
+{
+ bfq_log_bfqq(bfqd, bfqq, "del from busy");
+
+ bfq_clear_bfqq_busy(bfqq);
+
+ bfqd->busy_queues--;
+
+ if (!bfqq->dispatched)
+ bfq_weights_tree_remove(bfqd, &bfqq->entity,
+ &bfqd->queue_weights_tree);
+
+ if (bfqq->wr_coeff > 1)
+ bfqd->wr_busy_queues--;
+
+ bfqg_stats_update_dequeue(bfqq_group(bfqq));
+
+ bfq_deactivate_bfqq(bfqd, bfqq, true, expiration);
+}
+
+/*
+ * Called when an inactive queue receives a new request.
+ */
+void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+ bfq_log_bfqq(bfqd, bfqq, "add to busy");
+
+ bfq_activate_bfqq(bfqd, bfqq);
+
+ bfq_mark_bfqq_busy(bfqq);
+ bfqd->busy_queues++;
+
+ if (!bfqq->dispatched)
+ if (bfqq->wr_coeff == 1)
+ bfq_weights_tree_add(bfqd, &bfqq->entity,
+ &bfqd->queue_weights_tree);
+
+ if (bfqq->wr_coeff > 1)
+ bfqd->wr_busy_queues++;
+}
diff --git a/block/bio.c b/block/bio.c
index e75878f8b14a..f4d207180266 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -30,6 +30,7 @@
#include <linux/cgroup.h>
#include <trace/events/block.h>
+#include "blk.h"
/*
* Test patch to inline a certain number of bi_io_vec's inside the bio
@@ -427,7 +428,8 @@ static void punt_bios_to_rescuer(struct bio_set *bs)
* RETURNS:
* Pointer to new bio on success, NULL on failure.
*/
-struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
+struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned int nr_iovecs,
+ struct bio_set *bs)
{
gfp_t saved_gfp = gfp_mask;
unsigned front_pad;
@@ -1824,6 +1826,11 @@ static inline bool bio_remaining_done(struct bio *bio)
* bio_endio() will end I/O on the whole bio. bio_endio() is the preferred
* way to end I/O on a bio. No one should call bi_end_io() directly on a
* bio unless they own it and thus know that it has an end_io function.
+ *
+ * bio_endio() can be called several times on a bio that has been chained
+ * using bio_chain(). The ->bi_end_io() function will only be called the
+ * last time. At this point the BLK_TA_COMPLETE tracing event will be
+ * generated if BIO_TRACE_COMPLETION is set.
**/
void bio_endio(struct bio *bio)
{
@@ -1844,6 +1851,13 @@ again:
goto again;
}
+ if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) {
+ trace_block_bio_complete(bdev_get_queue(bio->bi_bdev),
+ bio, bio->bi_error);
+ bio_clear_flag(bio, BIO_TRACE_COMPLETION);
+ }
+
+ blk_throtl_bio_endio(bio);
if (bio->bi_end_io)
bio->bi_end_io(bio);
}
@@ -1882,6 +1896,9 @@ struct bio *bio_split(struct bio *bio, int sectors,
bio_advance(bio, split->bi_iter.bi_size);
+ if (bio_flagged(bio, BIO_TRACE_COMPLETION))
+ bio_set_flag(bio, BIO_TRACE_COMPLETION);
+
return split;
}
EXPORT_SYMBOL(bio_split);
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index bbe7ee00bd3d..7c2947128f58 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -772,6 +772,27 @@ struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg,
}
EXPORT_SYMBOL_GPL(blkg_rwstat_recursive_sum);
+/* Performs queue bypass and policy enabled checks then looks up blkg. */
+static struct blkcg_gq *blkg_lookup_check(struct blkcg *blkcg,
+ const struct blkcg_policy *pol,
+ struct request_queue *q)
+{
+ WARN_ON_ONCE(!rcu_read_lock_held());
+ lockdep_assert_held(q->queue_lock);
+
+ if (!blkcg_policy_enabled(q, pol))
+ return ERR_PTR(-EOPNOTSUPP);
+
+ /*
+ * This could be the first entry point of blkcg implementation and
+ * we shouldn't allow anything to go through for a bypassing queue.
+ */
+ if (unlikely(blk_queue_bypass(q)))
+ return ERR_PTR(blk_queue_dying(q) ? -ENODEV : -EBUSY);
+
+ return __blkg_lookup(blkcg, q, true /* update_hint */);
+}
+
/**
* blkg_conf_prep - parse and prepare for per-blkg config update
* @blkcg: target block cgroup
@@ -789,6 +810,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
__acquires(rcu) __acquires(disk->queue->queue_lock)
{
struct gendisk *disk;
+ struct request_queue *q;
struct blkcg_gq *blkg;
struct module *owner;
unsigned int major, minor;
@@ -807,44 +829,95 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
if (!disk)
return -ENODEV;
if (part) {
- owner = disk->fops->owner;
- put_disk(disk);
- module_put(owner);
- return -ENODEV;
+ ret = -ENODEV;
+ goto fail;
}
- rcu_read_lock();
- spin_lock_irq(disk->queue->queue_lock);
+ q = disk->queue;
- if (blkcg_policy_enabled(disk->queue, pol))
- blkg = blkg_lookup_create(blkcg, disk->queue);
- else
- blkg = ERR_PTR(-EOPNOTSUPP);
+ rcu_read_lock();
+ spin_lock_irq(q->queue_lock);
+ blkg = blkg_lookup_check(blkcg, pol, q);
if (IS_ERR(blkg)) {
ret = PTR_ERR(blkg);
+ goto fail_unlock;
+ }
+
+ if (blkg)
+ goto success;
+
+ /*
+ * Create blkgs walking down from blkcg_root to @blkcg, so that all
+ * non-root blkgs have access to their parents.
+ */
+ while (true) {
+ struct blkcg *pos = blkcg;
+ struct blkcg *parent;
+ struct blkcg_gq *new_blkg;
+
+ parent = blkcg_parent(blkcg);
+ while (parent && !__blkg_lookup(parent, q, false)) {
+ pos = parent;
+ parent = blkcg_parent(parent);
+ }
+
+ /* Drop locks to do new blkg allocation with GFP_KERNEL. */
+ spin_unlock_irq(q->queue_lock);
rcu_read_unlock();
- spin_unlock_irq(disk->queue->queue_lock);
- owner = disk->fops->owner;
- put_disk(disk);
- module_put(owner);
- /*
- * If queue was bypassing, we should retry. Do so after a
- * short msleep(). It isn't strictly necessary but queue
- * can be bypassing for some time and it's always nice to
- * avoid busy looping.
- */
- if (ret == -EBUSY) {
- msleep(10);
- ret = restart_syscall();
+
+ new_blkg = blkg_alloc(pos, q, GFP_KERNEL);
+ if (unlikely(!new_blkg)) {
+ ret = -ENOMEM;
+ goto fail;
}
- return ret;
- }
+ rcu_read_lock();
+ spin_lock_irq(q->queue_lock);
+
+ blkg = blkg_lookup_check(pos, pol, q);
+ if (IS_ERR(blkg)) {
+ ret = PTR_ERR(blkg);
+ goto fail_unlock;
+ }
+
+ if (blkg) {
+ blkg_free(new_blkg);
+ } else {
+ blkg = blkg_create(pos, q, new_blkg);
+ if (unlikely(IS_ERR(blkg))) {
+ ret = PTR_ERR(blkg);
+ goto fail_unlock;
+ }
+ }
+
+ if (pos == blkcg)
+ goto success;
+ }
+success:
ctx->disk = disk;
ctx->blkg = blkg;
ctx->body = body;
return 0;
+
+fail_unlock:
+ spin_unlock_irq(q->queue_lock);
+ rcu_read_unlock();
+fail:
+ owner = disk->fops->owner;
+ put_disk(disk);
+ module_put(owner);
+ /*
+ * If queue was bypassing, we should retry. Do so after a
+ * short msleep(). It isn't strictly necessary but queue
+ * can be bypassing for some time and it's always nice to
+ * avoid busy looping.
+ */
+ if (ret == -EBUSY) {
+ msleep(10);
+ ret = restart_syscall();
+ }
+ return ret;
}
EXPORT_SYMBOL_GPL(blkg_conf_prep);
diff --git a/block/blk-core.c b/block/blk-core.c
index d772c221cc17..24886b69690f 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -268,10 +268,8 @@ void blk_sync_queue(struct request_queue *q)
struct blk_mq_hw_ctx *hctx;
int i;
- queue_for_each_hw_ctx(q, hctx, i) {
- cancel_work_sync(&hctx->run_work);
- cancel_delayed_work_sync(&hctx->delay_work);
- }
+ queue_for_each_hw_ctx(q, hctx, i)
+ cancel_delayed_work_sync(&hctx->run_work);
} else {
cancel_delayed_work_sync(&q->delay_work);
}
@@ -500,6 +498,13 @@ void blk_set_queue_dying(struct request_queue *q)
queue_flag_set(QUEUE_FLAG_DYING, q);
spin_unlock_irq(q->queue_lock);
+ /*
+ * When queue DYING flag is set, we need to block new req
+ * entering queue, so we call blk_freeze_queue_start() to
+ * prevent I/O from crossing blk_queue_enter().
+ */
+ blk_freeze_queue_start(q);
+
if (q->mq_ops)
blk_mq_wake_waiters(q);
else {
@@ -556,9 +561,13 @@ void blk_cleanup_queue(struct request_queue *q)
* prevent that q->request_fn() gets invoked after draining finished.
*/
blk_freeze_queue(q);
- spin_lock_irq(lock);
- if (!q->mq_ops)
+ if (!q->mq_ops) {
+ spin_lock_irq(lock);
__blk_drain_queue(q, true);
+ } else {
+ blk_mq_debugfs_unregister_mq(q);
+ spin_lock_irq(lock);
+ }
queue_flag_set(QUEUE_FLAG_DEAD, q);
spin_unlock_irq(lock);
@@ -669,6 +678,15 @@ int blk_queue_enter(struct request_queue *q, bool nowait)
if (nowait)
return -EBUSY;
+ /*
+ * read pair of barrier in blk_freeze_queue_start(),
+ * we need to order reading __PERCPU_REF_DEAD flag of
+ * .q_usage_counter and reading .mq_freeze_depth or
+ * queue dying flag, otherwise the following wait may
+ * never return if the two reads are reordered.
+ */
+ smp_rmb();
+
ret = wait_event_interruptible(q->mq_freeze_wq,
!atomic_read(&q->mq_freeze_depth) ||
blk_queue_dying(q));
@@ -720,6 +738,10 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
if (!q->backing_dev_info)
goto fail_split;
+ q->stats = blk_alloc_queue_stats();
+ if (!q->stats)
+ goto fail_stats;
+
q->backing_dev_info->ra_pages =
(VM_MAX_READAHEAD * 1024) / PAGE_SIZE;
q->backing_dev_info->capabilities = BDI_CAP_CGROUP_WRITEBACK;
@@ -776,6 +798,8 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
fail_ref:
percpu_ref_exit(&q->q_usage_counter);
fail_bdi:
+ blk_free_queue_stats(q->stats);
+fail_stats:
bdi_put(q->backing_dev_info);
fail_split:
bioset_free(q->bio_split);
@@ -889,7 +913,6 @@ out_exit_flush_rq:
q->exit_rq_fn(q, q->fq->flush_rq);
out_free_flush_queue:
blk_free_flush_queue(q->fq);
- wbt_exit(q);
return -ENOMEM;
}
EXPORT_SYMBOL(blk_init_allocated_queue);
@@ -1128,7 +1151,6 @@ static struct request *__get_request(struct request_list *rl, unsigned int op,
blk_rq_init(q, rq);
blk_rq_set_rl(rq, rl);
- blk_rq_set_prio(rq, ioc);
rq->cmd_flags = op;
rq->rq_flags = rq_flags;
@@ -1608,17 +1630,23 @@ out:
return ret;
}
-void init_request_from_bio(struct request *req, struct bio *bio)
+void blk_init_request_from_bio(struct request *req, struct bio *bio)
{
+ struct io_context *ioc = rq_ioc(bio);
+
if (bio->bi_opf & REQ_RAHEAD)
req->cmd_flags |= REQ_FAILFAST_MASK;
- req->errors = 0;
req->__sector = bio->bi_iter.bi_sector;
if (ioprio_valid(bio_prio(bio)))
req->ioprio = bio_prio(bio);
+ else if (ioc)
+ req->ioprio = ioc->ioprio;
+ else
+ req->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0);
blk_rq_bio_prep(req->q, req, bio);
}
+EXPORT_SYMBOL_GPL(blk_init_request_from_bio);
static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio)
{
@@ -1709,7 +1737,7 @@ get_rq:
* We don't worry about that case for efficiency. It won't happen
* often, and the elevators are able to handle it.
*/
- init_request_from_bio(req, bio);
+ blk_init_request_from_bio(req, bio);
if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags))
req->cpu = raw_smp_processor_id();
@@ -1936,7 +1964,13 @@ generic_make_request_checks(struct bio *bio)
if (!blkcg_bio_issue_check(q, bio))
return false;
- trace_block_bio_queue(q, bio);
+ if (!bio_flagged(bio, BIO_TRACE_COMPLETION)) {
+ trace_block_bio_queue(q, bio);
+ /* Now that enqueuing has been traced, we need to trace
+ * completion as well.
+ */
+ bio_set_flag(bio, BIO_TRACE_COMPLETION);
+ }
return true;
not_supported:
@@ -2478,7 +2512,7 @@ void blk_start_request(struct request *req)
blk_dequeue_request(req);
if (test_bit(QUEUE_FLAG_STATS, &req->q->queue_flags)) {
- blk_stat_set_issue_time(&req->issue_stat);
+ blk_stat_set_issue(&req->issue_stat, blk_rq_sectors(req));
req->rq_flags |= RQF_STATS;
wbt_issue(req->q->rq_wb, &req->issue_stat);
}
@@ -2540,22 +2574,11 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
{
int total_bytes;
- trace_block_rq_complete(req->q, req, nr_bytes);
+ trace_block_rq_complete(req, error, nr_bytes);
if (!req->bio)
return false;
- /*
- * For fs requests, rq is just carrier of independent bio's
- * and each partial completion should be handled separately.
- * Reset per-request error on each partial completion.
- *
- * TODO: tj: This is too subtle. It would be better to let
- * low level drivers do what they see fit.
- */
- if (!blk_rq_is_passthrough(req))
- req->errors = 0;
-
if (error && !blk_rq_is_passthrough(req) &&
!(req->rq_flags & RQF_QUIET)) {
char *error_type;
@@ -2601,6 +2624,8 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
if (bio_bytes == bio->bi_iter.bi_size)
req->bio = bio->bi_next;
+ /* Completion has already been traced */
+ bio_clear_flag(bio, BIO_TRACE_COMPLETION);
req_bio_endio(req, bio, bio_bytes, error);
total_bytes += bio_bytes;
@@ -2699,7 +2724,7 @@ void blk_finish_request(struct request *req, int error)
struct request_queue *q = req->q;
if (req->rq_flags & RQF_STATS)
- blk_stat_add(&q->rq_stats[rq_data_dir(req)], req);
+ blk_stat_add(req);
if (req->rq_flags & RQF_QUEUED)
blk_queue_end_tag(q, req);
@@ -2776,7 +2801,7 @@ static bool blk_end_bidi_request(struct request *rq, int error,
* %false - we are done with this request
* %true - still buffers pending for this request
**/
-bool __blk_end_bidi_request(struct request *rq, int error,
+static bool __blk_end_bidi_request(struct request *rq, int error,
unsigned int nr_bytes, unsigned int bidi_bytes)
{
if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes))
@@ -2829,43 +2854,6 @@ void blk_end_request_all(struct request *rq, int error)
EXPORT_SYMBOL(blk_end_request_all);
/**
- * blk_end_request_cur - Helper function to finish the current request chunk.
- * @rq: the request to finish the current chunk for
- * @error: %0 for success, < %0 for error
- *
- * Description:
- * Complete the current consecutively mapped chunk from @rq.
- *
- * Return:
- * %false - we are done with this request
- * %true - still buffers pending for this request
- */
-bool blk_end_request_cur(struct request *rq, int error)
-{
- return blk_end_request(rq, error, blk_rq_cur_bytes(rq));
-}
-EXPORT_SYMBOL(blk_end_request_cur);
-
-/**
- * blk_end_request_err - Finish a request till the next failure boundary.
- * @rq: the request to finish till the next failure boundary for
- * @error: must be negative errno
- *
- * Description:
- * Complete @rq till the next failure boundary.
- *
- * Return:
- * %false - we are done with this request
- * %true - still buffers pending for this request
- */
-bool blk_end_request_err(struct request *rq, int error)
-{
- WARN_ON(error >= 0);
- return blk_end_request(rq, error, blk_rq_err_bytes(rq));
-}
-EXPORT_SYMBOL_GPL(blk_end_request_err);
-
-/**
* __blk_end_request - Helper function for drivers to complete the request.
* @rq: the request being processed
* @error: %0 for success, < %0 for error
@@ -2924,26 +2912,6 @@ bool __blk_end_request_cur(struct request *rq, int error)
}
EXPORT_SYMBOL(__blk_end_request_cur);
-/**
- * __blk_end_request_err - Finish a request till the next failure boundary.
- * @rq: the request to finish till the next failure boundary for
- * @error: must be negative errno
- *
- * Description:
- * Complete @rq till the next failure boundary. Must be called
- * with queue lock held.
- *
- * Return:
- * %false - we are done with this request
- * %true - still buffers pending for this request
- */
-bool __blk_end_request_err(struct request *rq, int error)
-{
- WARN_ON(error >= 0);
- return __blk_end_request(rq, error, blk_rq_err_bytes(rq));
-}
-EXPORT_SYMBOL_GPL(__blk_end_request_err);
-
void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
struct bio *bio)
{
@@ -3106,6 +3074,13 @@ int kblockd_schedule_work_on(int cpu, struct work_struct *work)
}
EXPORT_SYMBOL(kblockd_schedule_work_on);
+int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork,
+ unsigned long delay)
+{
+ return mod_delayed_work_on(cpu, kblockd_workqueue, dwork, delay);
+}
+EXPORT_SYMBOL(kblockd_mod_delayed_work_on);
+
int kblockd_schedule_delayed_work(struct delayed_work *dwork,
unsigned long delay)
{
diff --git a/block/blk-exec.c b/block/blk-exec.c
index 8cd0e9bc8dc8..a9451e3b8587 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -69,8 +69,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
if (unlikely(blk_queue_dying(q))) {
rq->rq_flags |= RQF_QUIET;
- rq->errors = -ENXIO;
- __blk_end_request_all(rq, rq->errors);
+ __blk_end_request_all(rq, -ENXIO);
spin_unlock_irq(q->queue_lock);
return;
}
@@ -92,11 +91,10 @@ EXPORT_SYMBOL_GPL(blk_execute_rq_nowait);
* Insert a fully prepared request at the back of the I/O scheduler queue
* for execution and wait for completion.
*/
-int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk,
+void blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk,
struct request *rq, int at_head)
{
DECLARE_COMPLETION_ONSTACK(wait);
- int err = 0;
unsigned long hang_check;
rq->end_io_data = &wait;
@@ -108,10 +106,5 @@ int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk,
while (!wait_for_completion_io_timeout(&wait, hang_check * (HZ/2)));
else
wait_for_completion_io(&wait);
-
- if (rq->errors)
- err = -EIO;
-
- return err;
}
EXPORT_SYMBOL(blk_execute_rq);
diff --git a/block/blk-flush.c b/block/blk-flush.c
index 0d5a9c1da1fc..c4e0880b54bb 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -447,7 +447,7 @@ void blk_insert_flush(struct request *rq)
if (q->mq_ops)
blk_mq_end_request(rq, 0);
else
- __blk_end_bidi_request(rq, 0, 0, 0);
+ __blk_end_request(rq, 0, 0);
return;
}
@@ -497,8 +497,7 @@ void blk_insert_flush(struct request *rq)
* Description:
* Issue a flush for the block device in question. Caller can supply
* room for storing the error offset in case of a flush error, if they
- * wish to. If WAIT flag is not passed then caller may check only what
- * request was pushed in some internal queue for later handling.
+ * wish to.
*/
int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
sector_t *error_sector)
diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index 9f0ff5ba4f84..0f891a9aff4d 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -389,7 +389,7 @@ static int blk_integrity_nop_fn(struct blk_integrity_iter *iter)
return 0;
}
-static struct blk_integrity_profile nop_profile = {
+static const struct blk_integrity_profile nop_profile = {
.name = "nop",
.generate_fn = blk_integrity_nop_fn,
.verify_fn = blk_integrity_nop_fn,
@@ -412,12 +412,13 @@ void blk_integrity_register(struct gendisk *disk, struct blk_integrity *template
bi->flags = BLK_INTEGRITY_VERIFY | BLK_INTEGRITY_GENERATE |
template->flags;
- bi->interval_exp = ilog2(queue_logical_block_size(disk->queue));
+ bi->interval_exp = template->interval_exp ? :
+ ilog2(queue_logical_block_size(disk->queue));
bi->profile = template->profile ? template->profile : &nop_profile;
bi->tuple_size = template->tuple_size;
bi->tag_size = template->tag_size;
- blk_integrity_revalidate(disk);
+ disk->queue->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES;
}
EXPORT_SYMBOL(blk_integrity_register);
@@ -430,26 +431,11 @@ EXPORT_SYMBOL(blk_integrity_register);
*/
void blk_integrity_unregister(struct gendisk *disk)
{
- blk_integrity_revalidate(disk);
+ disk->queue->backing_dev_info->capabilities &= ~BDI_CAP_STABLE_WRITES;
memset(&disk->queue->integrity, 0, sizeof(struct blk_integrity));
}
EXPORT_SYMBOL(blk_integrity_unregister);
-void blk_integrity_revalidate(struct gendisk *disk)
-{
- struct blk_integrity *bi = &disk->queue->integrity;
-
- if (!(disk->flags & GENHD_FL_UP))
- return;
-
- if (bi->profile)
- disk->queue->backing_dev_info->capabilities |=
- BDI_CAP_STABLE_WRITES;
- else
- disk->queue->backing_dev_info->capabilities &=
- ~BDI_CAP_STABLE_WRITES;
-}
-
void blk_integrity_add(struct gendisk *disk)
{
if (kobject_init_and_add(&disk->integrity_kobj, &integrity_ktype,
diff --git a/block/blk-lib.c b/block/blk-lib.c
index ed1e78e24db0..e8caecd71688 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -37,17 +37,12 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
return -ENXIO;
if (flags & BLKDEV_DISCARD_SECURE) {
- if (flags & BLKDEV_DISCARD_ZERO)
- return -EOPNOTSUPP;
if (!blk_queue_secure_erase(q))
return -EOPNOTSUPP;
op = REQ_OP_SECURE_ERASE;
} else {
if (!blk_queue_discard(q))
return -EOPNOTSUPP;
- if ((flags & BLKDEV_DISCARD_ZERO) &&
- !q->limits.discard_zeroes_data)
- return -EOPNOTSUPP;
op = REQ_OP_DISCARD;
}
@@ -109,7 +104,7 @@ EXPORT_SYMBOL(__blkdev_issue_discard);
* @sector: start sector
* @nr_sects: number of sectors to discard
* @gfp_mask: memory allocation flags (for bio_alloc)
- * @flags: BLKDEV_IFL_* flags to control behaviour
+ * @flags: BLKDEV_DISCARD_* flags to control behaviour
*
* Description:
* Issue a discard request for the sectors in question.
@@ -126,7 +121,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
&bio);
if (!ret && bio) {
ret = submit_bio_wait(bio);
- if (ret == -EOPNOTSUPP && !(flags & BLKDEV_DISCARD_ZERO))
+ if (ret == -EOPNOTSUPP)
ret = 0;
bio_put(bio);
}
@@ -226,20 +221,9 @@ int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
}
EXPORT_SYMBOL(blkdev_issue_write_same);
-/**
- * __blkdev_issue_write_zeroes - generate number of bios with WRITE ZEROES
- * @bdev: blockdev to issue
- * @sector: start sector
- * @nr_sects: number of sectors to write
- * @gfp_mask: memory allocation flags (for bio_alloc)
- * @biop: pointer to anchor bio
- *
- * Description:
- * Generate and issue number of bios(REQ_OP_WRITE_ZEROES) with zerofiled pages.
- */
static int __blkdev_issue_write_zeroes(struct block_device *bdev,
sector_t sector, sector_t nr_sects, gfp_t gfp_mask,
- struct bio **biop)
+ struct bio **biop, unsigned flags)
{
struct bio *bio = *biop;
unsigned int max_write_zeroes_sectors;
@@ -258,7 +242,9 @@ static int __blkdev_issue_write_zeroes(struct block_device *bdev,
bio = next_bio(bio, 0, gfp_mask);
bio->bi_iter.bi_sector = sector;
bio->bi_bdev = bdev;
- bio_set_op_attrs(bio, REQ_OP_WRITE_ZEROES, 0);
+ bio->bi_opf = REQ_OP_WRITE_ZEROES;
+ if (flags & BLKDEV_ZERO_NOUNMAP)
+ bio->bi_opf |= REQ_NOUNMAP;
if (nr_sects > max_write_zeroes_sectors) {
bio->bi_iter.bi_size = max_write_zeroes_sectors << 9;
@@ -282,14 +268,27 @@ static int __blkdev_issue_write_zeroes(struct block_device *bdev,
* @nr_sects: number of sectors to write
* @gfp_mask: memory allocation flags (for bio_alloc)
* @biop: pointer to anchor bio
- * @discard: discard flag
+ * @flags: controls detailed behavior
*
* Description:
- * Generate and issue number of bios with zerofiled pages.
+ * Zero-fill a block range, either using hardware offload or by explicitly
+ * writing zeroes to the device.
+ *
+ * Note that this function may fail with -EOPNOTSUPP if the driver signals
+ * zeroing offload support, but the device fails to process the command (for
+ * some devices there is no non-destructive way to verify whether this
+ * operation is actually supported). In this case the caller should call
+ * retry the call to blkdev_issue_zeroout() and the fallback path will be used.
+ *
+ * If a device is using logical block provisioning, the underlying space will
+ * not be released if %flags contains BLKDEV_ZERO_NOUNMAP.
+ *
+ * If %flags contains BLKDEV_ZERO_NOFALLBACK, the function will return
+ * -EOPNOTSUPP if no explicit hardware offload for zeroing is provided.
*/
int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
sector_t nr_sects, gfp_t gfp_mask, struct bio **biop,
- bool discard)
+ unsigned flags)
{
int ret;
int bi_size = 0;
@@ -302,8 +301,8 @@ int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
return -EINVAL;
ret = __blkdev_issue_write_zeroes(bdev, sector, nr_sects, gfp_mask,
- biop);
- if (ret == 0 || (ret && ret != -EOPNOTSUPP))
+ biop, flags);
+ if (ret != -EOPNOTSUPP || (flags & BLKDEV_ZERO_NOFALLBACK))
goto out;
ret = 0;
@@ -337,40 +336,23 @@ EXPORT_SYMBOL(__blkdev_issue_zeroout);
* @sector: start sector
* @nr_sects: number of sectors to write
* @gfp_mask: memory allocation flags (for bio_alloc)
- * @discard: whether to discard the block range
+ * @flags: controls detailed behavior
*
* Description:
- * Zero-fill a block range. If the discard flag is set and the block
- * device guarantees that subsequent READ operations to the block range
- * in question will return zeroes, the blocks will be discarded. Should
- * the discard request fail, if the discard flag is not set, or if
- * discard_zeroes_data is not supported, this function will resort to
- * zeroing the blocks manually, thus provisioning (allocating,
- * anchoring) them. If the block device supports WRITE ZEROES or WRITE SAME
- * command(s), blkdev_issue_zeroout() will use it to optimize the process of
- * clearing the block range. Otherwise the zeroing will be performed
- * using regular WRITE calls.
+ * Zero-fill a block range, either using hardware offload or by explicitly
+ * writing zeroes to the device. See __blkdev_issue_zeroout() for the
+ * valid values for %flags.
*/
int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
- sector_t nr_sects, gfp_t gfp_mask, bool discard)
+ sector_t nr_sects, gfp_t gfp_mask, unsigned flags)
{
int ret;
struct bio *bio = NULL;
struct blk_plug plug;
- if (discard) {
- if (!blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask,
- BLKDEV_DISCARD_ZERO))
- return 0;
- }
-
- if (!blkdev_issue_write_same(bdev, sector, nr_sects, gfp_mask,
- ZERO_PAGE(0)))
- return 0;
-
blk_start_plug(&plug);
ret = __blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask,
- &bio, discard);
+ &bio, flags);
if (ret == 0 && bio) {
ret = submit_bio_wait(bio);
bio_put(bio);
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 2afa262425d1..3990ae406341 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -54,6 +54,20 @@ static struct bio *blk_bio_discard_split(struct request_queue *q,
return bio_split(bio, split_sectors, GFP_NOIO, bs);
}
+static struct bio *blk_bio_write_zeroes_split(struct request_queue *q,
+ struct bio *bio, struct bio_set *bs, unsigned *nsegs)
+{
+ *nsegs = 1;
+
+ if (!q->limits.max_write_zeroes_sectors)
+ return NULL;
+
+ if (bio_sectors(bio) <= q->limits.max_write_zeroes_sectors)
+ return NULL;
+
+ return bio_split(bio, q->limits.max_write_zeroes_sectors, GFP_NOIO, bs);
+}
+
static struct bio *blk_bio_write_same_split(struct request_queue *q,
struct bio *bio,
struct bio_set *bs,
@@ -200,8 +214,7 @@ void blk_queue_split(struct request_queue *q, struct bio **bio,
split = blk_bio_discard_split(q, *bio, bs, &nsegs);
break;
case REQ_OP_WRITE_ZEROES:
- split = NULL;
- nsegs = (*bio)->bi_phys_segments;
+ split = blk_bio_write_zeroes_split(q, *bio, bs, &nsegs);
break;
case REQ_OP_WRITE_SAME:
split = blk_bio_write_same_split(q, *bio, bs, &nsegs);
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index f6d917977b33..bcd2a7d4a3a5 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -43,11 +43,157 @@ static int blk_mq_debugfs_seq_open(struct inode *inode, struct file *file,
return ret;
}
+static int blk_flags_show(struct seq_file *m, const unsigned long flags,
+ const char *const *flag_name, int flag_name_count)
+{
+ bool sep = false;
+ int i;
+
+ for (i = 0; i < sizeof(flags) * BITS_PER_BYTE; i++) {
+ if (!(flags & BIT(i)))
+ continue;
+ if (sep)
+ seq_puts(m, " ");
+ sep = true;
+ if (i < flag_name_count && flag_name[i])
+ seq_puts(m, flag_name[i]);
+ else
+ seq_printf(m, "%d", i);
+ }
+ return 0;
+}
+
+static const char *const blk_queue_flag_name[] = {
+ [QUEUE_FLAG_QUEUED] = "QUEUED",
+ [QUEUE_FLAG_STOPPED] = "STOPPED",
+ [QUEUE_FLAG_SYNCFULL] = "SYNCFULL",
+ [QUEUE_FLAG_ASYNCFULL] = "ASYNCFULL",
+ [QUEUE_FLAG_DYING] = "DYING",
+ [QUEUE_FLAG_BYPASS] = "BYPASS",
+ [QUEUE_FLAG_BIDI] = "BIDI",
+ [QUEUE_FLAG_NOMERGES] = "NOMERGES",
+ [QUEUE_FLAG_SAME_COMP] = "SAME_COMP",
+ [QUEUE_FLAG_FAIL_IO] = "FAIL_IO",
+ [QUEUE_FLAG_STACKABLE] = "STACKABLE",
+ [QUEUE_FLAG_NONROT] = "NONROT",
+ [QUEUE_FLAG_IO_STAT] = "IO_STAT",
+ [QUEUE_FLAG_DISCARD] = "DISCARD",
+ [QUEUE_FLAG_NOXMERGES] = "NOXMERGES",
+ [QUEUE_FLAG_ADD_RANDOM] = "ADD_RANDOM",
+ [QUEUE_FLAG_SECERASE] = "SECERASE",
+ [QUEUE_FLAG_SAME_FORCE] = "SAME_FORCE",
+ [QUEUE_FLAG_DEAD] = "DEAD",
+ [QUEUE_FLAG_INIT_DONE] = "INIT_DONE",
+ [QUEUE_FLAG_NO_SG_MERGE] = "NO_SG_MERGE",
+ [QUEUE_FLAG_POLL] = "POLL",
+ [QUEUE_FLAG_WC] = "WC",
+ [QUEUE_FLAG_FUA] = "FUA",
+ [QUEUE_FLAG_FLUSH_NQ] = "FLUSH_NQ",
+ [QUEUE_FLAG_DAX] = "DAX",
+ [QUEUE_FLAG_STATS] = "STATS",
+ [QUEUE_FLAG_POLL_STATS] = "POLL_STATS",
+ [QUEUE_FLAG_REGISTERED] = "REGISTERED",
+};
+
+static int blk_queue_flags_show(struct seq_file *m, void *v)
+{
+ struct request_queue *q = m->private;
+
+ blk_flags_show(m, q->queue_flags, blk_queue_flag_name,
+ ARRAY_SIZE(blk_queue_flag_name));
+ seq_puts(m, "\n");
+ return 0;
+}
+
+static ssize_t blk_queue_flags_store(struct file *file, const char __user *ubuf,
+ size_t len, loff_t *offp)
+{
+ struct request_queue *q = file_inode(file)->i_private;
+ char op[16] = { }, *s;
+
+ len = min(len, sizeof(op) - 1);
+ if (copy_from_user(op, ubuf, len))
+ return -EFAULT;
+ s = op;
+ strsep(&s, " \t\n"); /* strip trailing whitespace */
+ if (strcmp(op, "run") == 0) {
+ blk_mq_run_hw_queues(q, true);
+ } else if (strcmp(op, "start") == 0) {
+ blk_mq_start_stopped_hw_queues(q, true);
+ } else {
+ pr_err("%s: unsupported operation %s. Use either 'run' or 'start'\n",
+ __func__, op);
+ return -EINVAL;
+ }
+ return len;
+}
+
+static int blk_queue_flags_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, blk_queue_flags_show, inode->i_private);
+}
+
+static const struct file_operations blk_queue_flags_fops = {
+ .open = blk_queue_flags_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+ .write = blk_queue_flags_store,
+};
+
+static void print_stat(struct seq_file *m, struct blk_rq_stat *stat)
+{
+ if (stat->nr_samples) {
+ seq_printf(m, "samples=%d, mean=%lld, min=%llu, max=%llu",
+ stat->nr_samples, stat->mean, stat->min, stat->max);
+ } else {
+ seq_puts(m, "samples=0");
+ }
+}
+
+static int queue_poll_stat_show(struct seq_file *m, void *v)
+{
+ struct request_queue *q = m->private;
+ int bucket;
+
+ for (bucket = 0; bucket < BLK_MQ_POLL_STATS_BKTS/2; bucket++) {
+ seq_printf(m, "read (%d Bytes): ", 1 << (9+bucket));
+ print_stat(m, &q->poll_stat[2*bucket]);
+ seq_puts(m, "\n");
+
+ seq_printf(m, "write (%d Bytes): ", 1 << (9+bucket));
+ print_stat(m, &q->poll_stat[2*bucket+1]);
+ seq_puts(m, "\n");
+ }
+ return 0;
+}
+
+static int queue_poll_stat_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, queue_poll_stat_show, inode->i_private);
+}
+
+static const struct file_operations queue_poll_stat_fops = {
+ .open = queue_poll_stat_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static const char *const hctx_state_name[] = {
+ [BLK_MQ_S_STOPPED] = "STOPPED",
+ [BLK_MQ_S_TAG_ACTIVE] = "TAG_ACTIVE",
+ [BLK_MQ_S_SCHED_RESTART] = "SCHED_RESTART",
+ [BLK_MQ_S_TAG_WAITING] = "TAG_WAITING",
+
+};
static int hctx_state_show(struct seq_file *m, void *v)
{
struct blk_mq_hw_ctx *hctx = m->private;
- seq_printf(m, "0x%lx\n", hctx->state);
+ blk_flags_show(m, hctx->state, hctx_state_name,
+ ARRAY_SIZE(hctx_state_name));
+ seq_puts(m, "\n");
return 0;
}
@@ -63,11 +209,35 @@ static const struct file_operations hctx_state_fops = {
.release = single_release,
};
+static const char *const alloc_policy_name[] = {
+ [BLK_TAG_ALLOC_FIFO] = "fifo",
+ [BLK_TAG_ALLOC_RR] = "rr",
+};
+
+static const char *const hctx_flag_name[] = {
+ [ilog2(BLK_MQ_F_SHOULD_MERGE)] = "SHOULD_MERGE",
+ [ilog2(BLK_MQ_F_TAG_SHARED)] = "TAG_SHARED",
+ [ilog2(BLK_MQ_F_SG_MERGE)] = "SG_MERGE",
+ [ilog2(BLK_MQ_F_BLOCKING)] = "BLOCKING",
+ [ilog2(BLK_MQ_F_NO_SCHED)] = "NO_SCHED",
+};
+
static int hctx_flags_show(struct seq_file *m, void *v)
{
struct blk_mq_hw_ctx *hctx = m->private;
-
- seq_printf(m, "0x%lx\n", hctx->flags);
+ const int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(hctx->flags);
+
+ seq_puts(m, "alloc_policy=");
+ if (alloc_policy < ARRAY_SIZE(alloc_policy_name) &&
+ alloc_policy_name[alloc_policy])
+ seq_puts(m, alloc_policy_name[alloc_policy]);
+ else
+ seq_printf(m, "%d", alloc_policy);
+ seq_puts(m, " ");
+ blk_flags_show(m,
+ hctx->flags ^ BLK_ALLOC_POLICY_TO_MQ_FLAG(alloc_policy),
+ hctx_flag_name, ARRAY_SIZE(hctx_flag_name));
+ seq_puts(m, "\n");
return 0;
}
@@ -83,13 +253,83 @@ static const struct file_operations hctx_flags_fops = {
.release = single_release,
};
+static const char *const op_name[] = {
+ [REQ_OP_READ] = "READ",
+ [REQ_OP_WRITE] = "WRITE",
+ [REQ_OP_FLUSH] = "FLUSH",
+ [REQ_OP_DISCARD] = "DISCARD",
+ [REQ_OP_ZONE_REPORT] = "ZONE_REPORT",
+ [REQ_OP_SECURE_ERASE] = "SECURE_ERASE",
+ [REQ_OP_ZONE_RESET] = "ZONE_RESET",
+ [REQ_OP_WRITE_SAME] = "WRITE_SAME",
+ [REQ_OP_WRITE_ZEROES] = "WRITE_ZEROES",
+ [REQ_OP_SCSI_IN] = "SCSI_IN",
+ [REQ_OP_SCSI_OUT] = "SCSI_OUT",
+ [REQ_OP_DRV_IN] = "DRV_IN",
+ [REQ_OP_DRV_OUT] = "DRV_OUT",
+};
+
+static const char *const cmd_flag_name[] = {
+ [__REQ_FAILFAST_DEV] = "FAILFAST_DEV",
+ [__REQ_FAILFAST_TRANSPORT] = "FAILFAST_TRANSPORT",
+ [__REQ_FAILFAST_DRIVER] = "FAILFAST_DRIVER",
+ [__REQ_SYNC] = "SYNC",
+ [__REQ_META] = "META",
+ [__REQ_PRIO] = "PRIO",
+ [__REQ_NOMERGE] = "NOMERGE",
+ [__REQ_IDLE] = "IDLE",
+ [__REQ_INTEGRITY] = "INTEGRITY",
+ [__REQ_FUA] = "FUA",
+ [__REQ_PREFLUSH] = "PREFLUSH",
+ [__REQ_RAHEAD] = "RAHEAD",
+ [__REQ_BACKGROUND] = "BACKGROUND",
+ [__REQ_NR_BITS] = "NR_BITS",
+};
+
+static const char *const rqf_name[] = {
+ [ilog2((__force u32)RQF_SORTED)] = "SORTED",
+ [ilog2((__force u32)RQF_STARTED)] = "STARTED",
+ [ilog2((__force u32)RQF_QUEUED)] = "QUEUED",
+ [ilog2((__force u32)RQF_SOFTBARRIER)] = "SOFTBARRIER",
+ [ilog2((__force u32)RQF_FLUSH_SEQ)] = "FLUSH_SEQ",
+ [ilog2((__force u32)RQF_MIXED_MERGE)] = "MIXED_MERGE",
+ [ilog2((__force u32)RQF_MQ_INFLIGHT)] = "MQ_INFLIGHT",
+ [ilog2((__force u32)RQF_DONTPREP)] = "DONTPREP",
+ [ilog2((__force u32)RQF_PREEMPT)] = "PREEMPT",
+ [ilog2((__force u32)RQF_COPY_USER)] = "COPY_USER",
+ [ilog2((__force u32)RQF_FAILED)] = "FAILED",
+ [ilog2((__force u32)RQF_QUIET)] = "QUIET",
+ [ilog2((__force u32)RQF_ELVPRIV)] = "ELVPRIV",
+ [ilog2((__force u32)RQF_IO_STAT)] = "IO_STAT",
+ [ilog2((__force u32)RQF_ALLOCED)] = "ALLOCED",
+ [ilog2((__force u32)RQF_PM)] = "PM",
+ [ilog2((__force u32)RQF_HASHED)] = "HASHED",
+ [ilog2((__force u32)RQF_STATS)] = "STATS",
+ [ilog2((__force u32)RQF_SPECIAL_PAYLOAD)] = "SPECIAL_PAYLOAD",
+};
+
static int blk_mq_debugfs_rq_show(struct seq_file *m, void *v)
{
struct request *rq = list_entry_rq(v);
-
- seq_printf(m, "%p {.cmd_flags=0x%x, .rq_flags=0x%x, .tag=%d, .internal_tag=%d}\n",
- rq, rq->cmd_flags, (__force unsigned int)rq->rq_flags,
- rq->tag, rq->internal_tag);
+ const struct blk_mq_ops *const mq_ops = rq->q->mq_ops;
+ const unsigned int op = rq->cmd_flags & REQ_OP_MASK;
+
+ seq_printf(m, "%p {.op=", rq);
+ if (op < ARRAY_SIZE(op_name) && op_name[op])
+ seq_printf(m, "%s", op_name[op]);
+ else
+ seq_printf(m, "%d", op);
+ seq_puts(m, ", .cmd_flags=");
+ blk_flags_show(m, rq->cmd_flags & ~REQ_OP_MASK, cmd_flag_name,
+ ARRAY_SIZE(cmd_flag_name));
+ seq_puts(m, ", .rq_flags=");
+ blk_flags_show(m, (__force unsigned int)rq->rq_flags, rqf_name,
+ ARRAY_SIZE(rqf_name));
+ seq_printf(m, ", .tag=%d, .internal_tag=%d", rq->tag,
+ rq->internal_tag);
+ if (mq_ops->show_rq)
+ mq_ops->show_rq(m, rq);
+ seq_puts(m, "}\n");
return 0;
}
@@ -322,60 +562,6 @@ static const struct file_operations hctx_io_poll_fops = {
.release = single_release,
};
-static void print_stat(struct seq_file *m, struct blk_rq_stat *stat)
-{
- seq_printf(m, "samples=%d, mean=%lld, min=%llu, max=%llu",
- stat->nr_samples, stat->mean, stat->min, stat->max);
-}
-
-static int hctx_stats_show(struct seq_file *m, void *v)
-{
- struct blk_mq_hw_ctx *hctx = m->private;
- struct blk_rq_stat stat[2];
-
- blk_stat_init(&stat[BLK_STAT_READ]);
- blk_stat_init(&stat[BLK_STAT_WRITE]);
-
- blk_hctx_stat_get(hctx, stat);
-
- seq_puts(m, "read: ");
- print_stat(m, &stat[BLK_STAT_READ]);
- seq_puts(m, "\n");
-
- seq_puts(m, "write: ");
- print_stat(m, &stat[BLK_STAT_WRITE]);
- seq_puts(m, "\n");
- return 0;
-}
-
-static int hctx_stats_open(struct inode *inode, struct file *file)
-{
- return single_open(file, hctx_stats_show, inode->i_private);
-}
-
-static ssize_t hctx_stats_write(struct file *file, const char __user *buf,
- size_t count, loff_t *ppos)
-{
- struct seq_file *m = file->private_data;
- struct blk_mq_hw_ctx *hctx = m->private;
- struct blk_mq_ctx *ctx;
- int i;
-
- hctx_for_each_ctx(hctx, ctx, i) {
- blk_stat_init(&ctx->stat[BLK_STAT_READ]);
- blk_stat_init(&ctx->stat[BLK_STAT_WRITE]);
- }
- return count;
-}
-
-static const struct file_operations hctx_stats_fops = {
- .open = hctx_stats_open,
- .read = seq_read,
- .write = hctx_stats_write,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
static int hctx_dispatched_show(struct seq_file *m, void *v)
{
struct blk_mq_hw_ctx *hctx = m->private;
@@ -636,6 +822,12 @@ static const struct file_operations ctx_completed_fops = {
.release = single_release,
};
+static const struct blk_mq_debugfs_attr blk_mq_debugfs_queue_attrs[] = {
+ {"poll_stat", 0400, &queue_poll_stat_fops},
+ {"state", 0600, &blk_queue_flags_fops},
+ {},
+};
+
static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = {
{"state", 0400, &hctx_state_fops},
{"flags", 0400, &hctx_flags_fops},
@@ -646,7 +838,6 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = {
{"sched_tags", 0400, &hctx_sched_tags_fops},
{"sched_tags_bitmap", 0400, &hctx_sched_tags_bitmap_fops},
{"io_poll", 0600, &hctx_io_poll_fops},
- {"stats", 0600, &hctx_stats_fops},
{"dispatched", 0600, &hctx_dispatched_fops},
{"queued", 0600, &hctx_queued_fops},
{"run", 0600, &hctx_run_fops},
@@ -662,16 +853,17 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_ctx_attrs[] = {
{},
};
-int blk_mq_debugfs_register(struct request_queue *q, const char *name)
+int blk_mq_debugfs_register(struct request_queue *q)
{
if (!blk_debugfs_root)
return -ENOENT;
- q->debugfs_dir = debugfs_create_dir(name, blk_debugfs_root);
+ q->debugfs_dir = debugfs_create_dir(kobject_name(q->kobj.parent),
+ blk_debugfs_root);
if (!q->debugfs_dir)
goto err;
- if (blk_mq_debugfs_register_hctxs(q))
+ if (blk_mq_debugfs_register_mq(q))
goto err;
return 0;
@@ -741,7 +933,7 @@ static int blk_mq_debugfs_register_hctx(struct request_queue *q,
return 0;
}
-int blk_mq_debugfs_register_hctxs(struct request_queue *q)
+int blk_mq_debugfs_register_mq(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
int i;
@@ -753,6 +945,9 @@ int blk_mq_debugfs_register_hctxs(struct request_queue *q)
if (!q->mq_debugfs_dir)
goto err;
+ if (!debugfs_create_files(q->mq_debugfs_dir, q, blk_mq_debugfs_queue_attrs))
+ goto err;
+
queue_for_each_hw_ctx(q, hctx, i) {
if (blk_mq_debugfs_register_hctx(q, hctx))
goto err;
@@ -761,11 +956,11 @@ int blk_mq_debugfs_register_hctxs(struct request_queue *q)
return 0;
err:
- blk_mq_debugfs_unregister_hctxs(q);
+ blk_mq_debugfs_unregister_mq(q);
return -ENOMEM;
}
-void blk_mq_debugfs_unregister_hctxs(struct request_queue *q)
+void blk_mq_debugfs_unregister_mq(struct request_queue *q)
{
debugfs_remove_recursive(q->mq_debugfs_dir);
q->mq_debugfs_dir = NULL;
diff --git a/block/blk-mq-pci.c b/block/blk-mq-pci.c
index 966c2169762e..0c3354cf3552 100644
--- a/block/blk-mq-pci.c
+++ b/block/blk-mq-pci.c
@@ -23,7 +23,7 @@
* @pdev: PCI device associated with @set.
*
* This function assumes the PCI device @pdev has at least as many available
- * interrupt vetors as @set has queues. It will then queuery the vector
+ * interrupt vectors as @set has queues. It will then query the vector
* corresponding to each queue for it's affinity mask and built queue mapping
* that maps a queue to the CPUs that have irq affinity for the corresponding
* vector.
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index c974a1bbf4cb..8b361e192e8a 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -30,43 +30,6 @@ void blk_mq_sched_free_hctx_data(struct request_queue *q,
}
EXPORT_SYMBOL_GPL(blk_mq_sched_free_hctx_data);
-int blk_mq_sched_init_hctx_data(struct request_queue *q, size_t size,
- int (*init)(struct blk_mq_hw_ctx *),
- void (*exit)(struct blk_mq_hw_ctx *))
-{
- struct blk_mq_hw_ctx *hctx;
- int ret;
- int i;
-
- queue_for_each_hw_ctx(q, hctx, i) {
- hctx->sched_data = kmalloc_node(size, GFP_KERNEL, hctx->numa_node);
- if (!hctx->sched_data) {
- ret = -ENOMEM;
- goto error;
- }
-
- if (init) {
- ret = init(hctx);
- if (ret) {
- /*
- * We don't want to give exit() a partially
- * initialized sched_data. init() must clean up
- * if it fails.
- */
- kfree(hctx->sched_data);
- hctx->sched_data = NULL;
- goto error;
- }
- }
- }
-
- return 0;
-error:
- blk_mq_sched_free_hctx_data(q, exit);
- return ret;
-}
-EXPORT_SYMBOL_GPL(blk_mq_sched_init_hctx_data);
-
static void __blk_mq_sched_assign_ioc(struct request_queue *q,
struct request *rq,
struct bio *bio,
@@ -119,7 +82,11 @@ struct request *blk_mq_sched_get_request(struct request_queue *q,
if (likely(!data->hctx))
data->hctx = blk_mq_map_queue(q, data->ctx->cpu);
- if (e) {
+ /*
+ * For a reserved tag, allocate a normal request since we might
+ * have driver dependencies on the value of the internal tag.
+ */
+ if (e && !(data->flags & BLK_MQ_REQ_RESERVED)) {
data->flags |= BLK_MQ_REQ_INTERNAL;
/*
@@ -227,22 +194,6 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
}
}
-void blk_mq_sched_move_to_dispatch(struct blk_mq_hw_ctx *hctx,
- struct list_head *rq_list,
- struct request *(*get_rq)(struct blk_mq_hw_ctx *))
-{
- do {
- struct request *rq;
-
- rq = get_rq(hctx);
- if (!rq)
- break;
-
- list_add_tail(&rq->queuelist, rq_list);
- } while (1);
-}
-EXPORT_SYMBOL_GPL(blk_mq_sched_move_to_dispatch);
-
bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
struct request **merged_request)
{
@@ -508,11 +459,24 @@ int blk_mq_sched_init_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
unsigned int hctx_idx)
{
struct elevator_queue *e = q->elevator;
+ int ret;
if (!e)
return 0;
- return blk_mq_sched_alloc_tags(q, hctx, hctx_idx);
+ ret = blk_mq_sched_alloc_tags(q, hctx, hctx_idx);
+ if (ret)
+ return ret;
+
+ if (e->type->ops.mq.init_hctx) {
+ ret = e->type->ops.mq.init_hctx(hctx, hctx_idx);
+ if (ret) {
+ blk_mq_sched_free_tags(q->tag_set, hctx, hctx_idx);
+ return ret;
+ }
+ }
+
+ return 0;
}
void blk_mq_sched_exit_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
@@ -523,12 +487,18 @@ void blk_mq_sched_exit_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
if (!e)
return;
+ if (e->type->ops.mq.exit_hctx && hctx->sched_data) {
+ e->type->ops.mq.exit_hctx(hctx, hctx_idx);
+ hctx->sched_data = NULL;
+ }
+
blk_mq_sched_free_tags(q->tag_set, hctx, hctx_idx);
}
int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
{
struct blk_mq_hw_ctx *hctx;
+ struct elevator_queue *eq;
unsigned int i;
int ret;
@@ -553,6 +523,18 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
if (ret)
goto err;
+ if (e->ops.mq.init_hctx) {
+ queue_for_each_hw_ctx(q, hctx, i) {
+ ret = e->ops.mq.init_hctx(hctx, i);
+ if (ret) {
+ eq = q->elevator;
+ blk_mq_exit_sched(q, eq);
+ kobject_put(&eq->kobj);
+ return ret;
+ }
+ }
+ }
+
return 0;
err:
@@ -563,6 +545,17 @@ err:
void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
{
+ struct blk_mq_hw_ctx *hctx;
+ unsigned int i;
+
+ if (e->type->ops.mq.exit_hctx) {
+ queue_for_each_hw_ctx(q, hctx, i) {
+ if (hctx->sched_data) {
+ e->type->ops.mq.exit_hctx(hctx, i);
+ hctx->sched_data = NULL;
+ }
+ }
+ }
if (e->type->ops.mq.exit_sched)
e->type->ops.mq.exit_sched(e);
blk_mq_sched_tags_teardown(q);
diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
index 3a9e6e40558b..edafb5383b7b 100644
--- a/block/blk-mq-sched.h
+++ b/block/blk-mq-sched.h
@@ -4,10 +4,6 @@
#include "blk-mq.h"
#include "blk-mq-tag.h"
-int blk_mq_sched_init_hctx_data(struct request_queue *q, size_t size,
- int (*init)(struct blk_mq_hw_ctx *),
- void (*exit)(struct blk_mq_hw_ctx *));
-
void blk_mq_sched_free_hctx_data(struct request_queue *q,
void (*exit)(struct blk_mq_hw_ctx *));
@@ -28,9 +24,6 @@ void blk_mq_sched_insert_requests(struct request_queue *q,
struct list_head *list, bool run_queue_async);
void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx);
-void blk_mq_sched_move_to_dispatch(struct blk_mq_hw_ctx *hctx,
- struct list_head *rq_list,
- struct request *(*get_rq)(struct blk_mq_hw_ctx *));
int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e);
void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e);
@@ -86,17 +79,12 @@ blk_mq_sched_allow_merge(struct request_queue *q, struct request *rq,
return true;
}
-static inline void
-blk_mq_sched_completed_request(struct blk_mq_hw_ctx *hctx, struct request *rq)
+static inline void blk_mq_sched_completed_request(struct request *rq)
{
- struct elevator_queue *e = hctx->queue->elevator;
+ struct elevator_queue *e = rq->q->elevator;
if (e && e->type->ops.mq.completed_request)
- e->type->ops.mq.completed_request(hctx, rq);
-
- BUG_ON(rq->internal_tag == -1);
-
- blk_mq_put_tag(hctx, hctx->sched_tags, rq->mq_ctx, rq->internal_tag);
+ e->type->ops.mq.completed_request(rq);
}
static inline void blk_mq_sched_started_request(struct request *rq)
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index d745ab81033a..ec0afdf765e3 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -253,10 +253,12 @@ static void __blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
struct blk_mq_hw_ctx *hctx;
int i;
+ lockdep_assert_held(&q->sysfs_lock);
+
queue_for_each_hw_ctx(q, hctx, i)
blk_mq_unregister_hctx(hctx);
- blk_mq_debugfs_unregister_hctxs(q);
+ blk_mq_debugfs_unregister_mq(q);
kobject_uevent(&q->mq_kobj, KOBJ_REMOVE);
kobject_del(&q->mq_kobj);
@@ -267,9 +269,9 @@ static void __blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
void blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
{
- blk_mq_disable_hotplug();
+ mutex_lock(&q->sysfs_lock);
__blk_mq_unregister_dev(dev, q);
- blk_mq_enable_hotplug();
+ mutex_unlock(&q->sysfs_lock);
}
void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx)
@@ -302,12 +304,13 @@ void blk_mq_sysfs_init(struct request_queue *q)
}
}
-int blk_mq_register_dev(struct device *dev, struct request_queue *q)
+int __blk_mq_register_dev(struct device *dev, struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
int ret, i;
- blk_mq_disable_hotplug();
+ WARN_ON_ONCE(!q->kobj.parent);
+ lockdep_assert_held(&q->sysfs_lock);
ret = kobject_add(&q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq");
if (ret < 0)
@@ -315,20 +318,38 @@ int blk_mq_register_dev(struct device *dev, struct request_queue *q)
kobject_uevent(&q->mq_kobj, KOBJ_ADD);
- blk_mq_debugfs_register(q, kobject_name(&dev->kobj));
+ blk_mq_debugfs_register(q);
queue_for_each_hw_ctx(q, hctx, i) {
ret = blk_mq_register_hctx(hctx);
if (ret)
- break;
+ goto unreg;
}
- if (ret)
- __blk_mq_unregister_dev(dev, q);
- else
- q->mq_sysfs_init_done = true;
+ q->mq_sysfs_init_done = true;
+
out:
- blk_mq_enable_hotplug();
+ return ret;
+
+unreg:
+ while (--i >= 0)
+ blk_mq_unregister_hctx(q->queue_hw_ctx[i]);
+
+ blk_mq_debugfs_unregister_mq(q);
+
+ kobject_uevent(&q->mq_kobj, KOBJ_REMOVE);
+ kobject_del(&q->mq_kobj);
+ kobject_put(&dev->kobj);
+ return ret;
+}
+
+int blk_mq_register_dev(struct device *dev, struct request_queue *q)
+{
+ int ret;
+
+ mutex_lock(&q->sysfs_lock);
+ ret = __blk_mq_register_dev(dev, q);
+ mutex_unlock(&q->sysfs_lock);
return ret;
}
@@ -339,13 +360,17 @@ void blk_mq_sysfs_unregister(struct request_queue *q)
struct blk_mq_hw_ctx *hctx;
int i;
+ mutex_lock(&q->sysfs_lock);
if (!q->mq_sysfs_init_done)
- return;
+ goto unlock;
- blk_mq_debugfs_unregister_hctxs(q);
+ blk_mq_debugfs_unregister_mq(q);
queue_for_each_hw_ctx(q, hctx, i)
blk_mq_unregister_hctx(hctx);
+
+unlock:
+ mutex_unlock(&q->sysfs_lock);
}
int blk_mq_sysfs_register(struct request_queue *q)
@@ -353,10 +378,11 @@ int blk_mq_sysfs_register(struct request_queue *q)
struct blk_mq_hw_ctx *hctx;
int i, ret = 0;
+ mutex_lock(&q->sysfs_lock);
if (!q->mq_sysfs_init_done)
- return ret;
+ goto unlock;
- blk_mq_debugfs_register_hctxs(q);
+ blk_mq_debugfs_register_mq(q);
queue_for_each_hw_ctx(q, hctx, i) {
ret = blk_mq_register_hctx(hctx);
@@ -364,5 +390,8 @@ int blk_mq_sysfs_register(struct request_queue *q)
break;
}
+unlock:
+ mutex_unlock(&q->sysfs_lock);
+
return ret;
}
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 9d97bfc4d465..d0be72ccb091 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -96,7 +96,10 @@ static int __blk_mq_get_tag(struct blk_mq_alloc_data *data,
if (!(data->flags & BLK_MQ_REQ_INTERNAL) &&
!hctx_may_queue(data->hctx, bt))
return -1;
- return __sbitmap_queue_get(bt);
+ if (data->shallow_depth)
+ return __sbitmap_queue_get_shallow(bt, data->shallow_depth);
+ else
+ return __sbitmap_queue_get(bt);
}
unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index c7836a1ded97..bf90684a007a 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -39,6 +39,26 @@
static DEFINE_MUTEX(all_q_mutex);
static LIST_HEAD(all_q_list);
+static void blk_mq_poll_stats_start(struct request_queue *q);
+static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
+
+static int blk_mq_poll_stats_bkt(const struct request *rq)
+{
+ int ddir, bytes, bucket;
+
+ ddir = rq_data_dir(rq);
+ bytes = blk_rq_bytes(rq);
+
+ bucket = ddir + 2*(ilog2(bytes) - 9);
+
+ if (bucket < 0)
+ return -1;
+ else if (bucket >= BLK_MQ_POLL_STATS_BKTS)
+ return ddir + BLK_MQ_POLL_STATS_BKTS - 2;
+
+ return bucket;
+}
+
/*
* Check if any of the ctx's have pending work in this hardware queue
*/
@@ -65,7 +85,7 @@ static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
sbitmap_clear_bit(&hctx->ctx_map, ctx->index_hw);
}
-void blk_mq_freeze_queue_start(struct request_queue *q)
+void blk_freeze_queue_start(struct request_queue *q)
{
int freeze_depth;
@@ -75,7 +95,7 @@ void blk_mq_freeze_queue_start(struct request_queue *q)
blk_mq_run_hw_queues(q, false);
}
}
-EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_start);
+EXPORT_SYMBOL_GPL(blk_freeze_queue_start);
void blk_mq_freeze_queue_wait(struct request_queue *q)
{
@@ -105,7 +125,7 @@ void blk_freeze_queue(struct request_queue *q)
* no blk_unfreeze_queue(), and blk_freeze_queue() is not
* exported to drivers as the only user for unfreeze is blk_mq.
*/
- blk_mq_freeze_queue_start(q);
+ blk_freeze_queue_start(q);
blk_mq_freeze_queue_wait(q);
}
@@ -210,7 +230,6 @@ void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
#endif
rq->special = NULL;
/* tag was already set */
- rq->errors = 0;
rq->extra_len = 0;
INIT_LIST_HEAD(&rq->timeout_list);
@@ -347,7 +366,7 @@ void __blk_mq_finish_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
if (rq->tag != -1)
blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag);
if (sched_tag != -1)
- blk_mq_sched_completed_request(hctx, rq);
+ blk_mq_put_tag(hctx, hctx->sched_tags, ctx, sched_tag);
blk_mq_sched_restart(hctx);
blk_queue_exit(q);
}
@@ -365,6 +384,7 @@ void blk_mq_finish_request(struct request *rq)
{
blk_mq_finish_hctx_request(blk_mq_map_queue(rq->q, rq->mq_ctx->cpu), rq);
}
+EXPORT_SYMBOL_GPL(blk_mq_finish_request);
void blk_mq_free_request(struct request *rq)
{
@@ -402,12 +422,19 @@ static void __blk_mq_complete_request_remote(void *data)
rq->q->softirq_done_fn(rq);
}
-static void blk_mq_ipi_complete_request(struct request *rq)
+static void __blk_mq_complete_request(struct request *rq)
{
struct blk_mq_ctx *ctx = rq->mq_ctx;
bool shared = false;
int cpu;
+ if (rq->internal_tag != -1)
+ blk_mq_sched_completed_request(rq);
+ if (rq->rq_flags & RQF_STATS) {
+ blk_mq_poll_stats_start(rq->q);
+ blk_stat_add(rq);
+ }
+
if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) {
rq->q->softirq_done_fn(rq);
return;
@@ -428,33 +455,6 @@ static void blk_mq_ipi_complete_request(struct request *rq)
put_cpu();
}
-static void blk_mq_stat_add(struct request *rq)
-{
- if (rq->rq_flags & RQF_STATS) {
- /*
- * We could rq->mq_ctx here, but there's less of a risk
- * of races if we have the completion event add the stats
- * to the local software queue.
- */
- struct blk_mq_ctx *ctx;
-
- ctx = __blk_mq_get_ctx(rq->q, raw_smp_processor_id());
- blk_stat_add(&ctx->stat[rq_data_dir(rq)], rq);
- }
-}
-
-static void __blk_mq_complete_request(struct request *rq)
-{
- struct request_queue *q = rq->q;
-
- blk_mq_stat_add(rq);
-
- if (!q->softirq_done_fn)
- blk_mq_end_request(rq, rq->errors);
- else
- blk_mq_ipi_complete_request(rq);
-}
-
/**
* blk_mq_complete_request - end I/O on a request
* @rq: the request being processed
@@ -463,16 +463,14 @@ static void __blk_mq_complete_request(struct request *rq)
* Ends all I/O on a request. It does not handle partial completions.
* The actual completion happens out-of-order, through a IPI handler.
**/
-void blk_mq_complete_request(struct request *rq, int error)
+void blk_mq_complete_request(struct request *rq)
{
struct request_queue *q = rq->q;
if (unlikely(blk_should_fake_timeout(q)))
return;
- if (!blk_mark_rq_complete(rq)) {
- rq->errors = error;
+ if (!blk_mark_rq_complete(rq))
__blk_mq_complete_request(rq);
- }
}
EXPORT_SYMBOL(blk_mq_complete_request);
@@ -491,7 +489,7 @@ void blk_mq_start_request(struct request *rq)
trace_block_rq_issue(q, rq);
if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
- blk_stat_set_issue_time(&rq->issue_stat);
+ blk_stat_set_issue(&rq->issue_stat, blk_rq_sectors(rq));
rq->rq_flags |= RQF_STATS;
wbt_issue(q->rq_wb, &rq->issue_stat);
}
@@ -526,6 +524,15 @@ void blk_mq_start_request(struct request *rq)
}
EXPORT_SYMBOL(blk_mq_start_request);
+/*
+ * When we reach here because queue is busy, REQ_ATOM_COMPLETE
+ * flag isn't set yet, so there may be race with timeout handler,
+ * but given rq->deadline is just set in .queue_rq() under
+ * this situation, the race won't be possible in reality because
+ * rq->timeout should be set as big enough to cover the window
+ * between blk_mq_start_request() called from .queue_rq() and
+ * clearing REQ_ATOM_STARTED here.
+ */
static void __blk_mq_requeue_request(struct request *rq)
{
struct request_queue *q = rq->q;
@@ -633,8 +640,7 @@ void blk_mq_abort_requeue_list(struct request_queue *q)
rq = list_first_entry(&rq_list, struct request, queuelist);
list_del_init(&rq->queuelist);
- rq->errors = -EIO;
- blk_mq_end_request(rq, rq->errors);
+ blk_mq_end_request(rq, -EIO);
}
}
EXPORT_SYMBOL(blk_mq_abort_requeue_list);
@@ -666,7 +672,7 @@ void blk_mq_rq_timed_out(struct request *req, bool reserved)
* just be ignored. This can happen due to the bitflag ordering.
* Timeout first checks if STARTED is set, and if it is, assumes
* the request is active. But if we race with completion, then
- * we both flags will get cleared. So check here again, and ignore
+ * both flags will get cleared. So check here again, and ignore
* a timeout event with a request that isn't active.
*/
if (!test_bit(REQ_ATOM_STARTED, &req->atomic_flags))
@@ -699,6 +705,19 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
return;
+ /*
+ * The rq being checked may have been freed and reallocated
+ * out already here, we avoid this race by checking rq->deadline
+ * and REQ_ATOM_COMPLETE flag together:
+ *
+ * - if rq->deadline is observed as new value because of
+ * reusing, the rq won't be timed out because of timing.
+ * - if rq->deadline is observed as previous value,
+ * REQ_ATOM_COMPLETE flag won't be cleared in reuse path
+ * because we put a barrier between setting rq->deadline
+ * and clearing the flag in blk_mq_start_request(), so
+ * this rq won't be timed out too.
+ */
if (time_after_eq(jiffies, rq->deadline)) {
if (!blk_mark_rq_complete(rq))
blk_mq_rq_timed_out(rq, reserved);
@@ -727,7 +746,7 @@ static void blk_mq_timeout_work(struct work_struct *work)
* percpu_ref_tryget directly, because we need to be able to
* obtain a reference even in the short window between the queue
* starting to freeze, by dropping the first reference in
- * blk_mq_freeze_queue_start, and the moment the last request is
+ * blk_freeze_queue_start, and the moment the last request is
* consumed, marked by the instant q_usage_counter reaches
* zero.
*/
@@ -845,6 +864,8 @@ bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,
.flags = wait ? 0 : BLK_MQ_REQ_NOWAIT,
};
+ might_sleep_if(wait);
+
if (rq->tag != -1)
goto done;
@@ -964,20 +985,12 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list)
{
struct blk_mq_hw_ctx *hctx;
struct request *rq;
- LIST_HEAD(driver_list);
- struct list_head *dptr;
int errors, queued, ret = BLK_MQ_RQ_QUEUE_OK;
if (list_empty(list))
return false;
/*
- * Start off with dptr being NULL, so we start the first request
- * immediately, even if we have more pending.
- */
- dptr = NULL;
-
- /*
* Now process all the entries, sending them to the driver.
*/
errors = queued = 0;
@@ -993,23 +1006,21 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list)
* The initial allocation attempt failed, so we need to
* rerun the hardware queue when a tag is freed.
*/
- if (blk_mq_dispatch_wait_add(hctx)) {
- /*
- * It's possible that a tag was freed in the
- * window between the allocation failure and
- * adding the hardware queue to the wait queue.
- */
- if (!blk_mq_get_driver_tag(rq, &hctx, false))
- break;
- } else {
+ if (!blk_mq_dispatch_wait_add(hctx))
+ break;
+
+ /*
+ * It's possible that a tag was freed in the window
+ * between the allocation failure and adding the
+ * hardware queue to the wait queue.
+ */
+ if (!blk_mq_get_driver_tag(rq, &hctx, false))
break;
- }
}
list_del_init(&rq->queuelist);
bd.rq = rq;
- bd.list = dptr;
/*
* Flag last if we have no more requests, or if we have more
@@ -1038,20 +1049,12 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list)
pr_err("blk-mq: bad return on queue: %d\n", ret);
case BLK_MQ_RQ_QUEUE_ERROR:
errors++;
- rq->errors = -EIO;
- blk_mq_end_request(rq, rq->errors);
+ blk_mq_end_request(rq, -EIO);
break;
}
if (ret == BLK_MQ_RQ_QUEUE_BUSY)
break;
-
- /*
- * We've done the first request. If we have more than 1
- * left in the list, set dptr to defer issue.
- */
- if (!dptr && list->next != list->prev)
- dptr = &driver_list;
} while (!list_empty(list));
hctx->dispatched[queued_to_index(queued)]++;
@@ -1062,8 +1065,8 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list)
*/
if (!list_empty(list)) {
/*
- * If we got a driver tag for the next request already,
- * free it again.
+ * If an I/O scheduler has been configured and we got a driver
+ * tag for the next request already, free it again.
*/
rq = list_first_entry(list, struct request, queuelist);
blk_mq_put_driver_tag(rq);
@@ -1073,16 +1076,24 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list)
spin_unlock(&hctx->lock);
/*
- * the queue is expected stopped with BLK_MQ_RQ_QUEUE_BUSY, but
- * it's possible the queue is stopped and restarted again
- * before this. Queue restart will dispatch requests. And since
- * requests in rq_list aren't added into hctx->dispatch yet,
- * the requests in rq_list might get lost.
+ * If SCHED_RESTART was set by the caller of this function and
+ * it is no longer set that means that it was cleared by another
+ * thread and hence that a queue rerun is needed.
*
- * blk_mq_run_hw_queue() already checks the STOPPED bit
+ * If TAG_WAITING is set that means that an I/O scheduler has
+ * been configured and another thread is waiting for a driver
+ * tag. To guarantee fairness, do not rerun this hardware queue
+ * but let the other thread grab the driver tag.
*
- * If RESTART or TAG_WAITING is set, then let completion restart
- * the queue instead of potentially looping here.
+ * If no I/O scheduler has been configured it is possible that
+ * the hardware queue got stopped and restarted before requests
+ * were pushed back onto the dispatch list. Rerun the queue to
+ * avoid starvation. Notes:
+ * - blk_mq_run_hw_queue() checks whether or not a queue has
+ * been stopped before rerunning a queue.
+ * - Some but not all block drivers stop a queue before
+ * returning BLK_MQ_RQ_QUEUE_BUSY. Two exceptions are scsi-mq
+ * and dm-rq.
*/
if (!blk_mq_sched_needs_restart(hctx) &&
!test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state))
@@ -1104,6 +1115,8 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
blk_mq_sched_dispatch_requests(hctx);
rcu_read_unlock();
} else {
+ might_sleep();
+
srcu_idx = srcu_read_lock(&hctx->queue_rq_srcu);
blk_mq_sched_dispatch_requests(hctx);
srcu_read_unlock(&hctx->queue_rq_srcu, srcu_idx);
@@ -1153,13 +1166,9 @@ static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async,
put_cpu();
}
- if (msecs == 0)
- kblockd_schedule_work_on(blk_mq_hctx_next_cpu(hctx),
- &hctx->run_work);
- else
- kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
- &hctx->delayed_run_work,
- msecs_to_jiffies(msecs));
+ kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
+ &hctx->run_work,
+ msecs_to_jiffies(msecs));
}
void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
@@ -1172,6 +1181,7 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
{
__blk_mq_delay_run_hw_queue(hctx, async, 0);
}
+EXPORT_SYMBOL(blk_mq_run_hw_queue);
void blk_mq_run_hw_queues(struct request_queue *q, bool async)
{
@@ -1210,8 +1220,7 @@ EXPORT_SYMBOL(blk_mq_queue_stopped);
void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
{
- cancel_work(&hctx->run_work);
- cancel_delayed_work(&hctx->delay_work);
+ cancel_delayed_work_sync(&hctx->run_work);
set_bit(BLK_MQ_S_STOPPED, &hctx->state);
}
EXPORT_SYMBOL(blk_mq_stop_hw_queue);
@@ -1268,38 +1277,40 @@ static void blk_mq_run_work_fn(struct work_struct *work)
{
struct blk_mq_hw_ctx *hctx;
- hctx = container_of(work, struct blk_mq_hw_ctx, run_work);
-
- __blk_mq_run_hw_queue(hctx);
-}
+ hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work);
-static void blk_mq_delayed_run_work_fn(struct work_struct *work)
-{
- struct blk_mq_hw_ctx *hctx;
+ /*
+ * If we are stopped, don't run the queue. The exception is if
+ * BLK_MQ_S_START_ON_RUN is set. For that case, we auto-clear
+ * the STOPPED bit and run it.
+ */
+ if (test_bit(BLK_MQ_S_STOPPED, &hctx->state)) {
+ if (!test_bit(BLK_MQ_S_START_ON_RUN, &hctx->state))
+ return;
- hctx = container_of(work, struct blk_mq_hw_ctx, delayed_run_work.work);
+ clear_bit(BLK_MQ_S_START_ON_RUN, &hctx->state);
+ clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
+ }
__blk_mq_run_hw_queue(hctx);
}
-static void blk_mq_delay_work_fn(struct work_struct *work)
-{
- struct blk_mq_hw_ctx *hctx;
-
- hctx = container_of(work, struct blk_mq_hw_ctx, delay_work.work);
-
- if (test_and_clear_bit(BLK_MQ_S_STOPPED, &hctx->state))
- __blk_mq_run_hw_queue(hctx);
-}
void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
{
if (unlikely(!blk_mq_hw_queue_mapped(hctx)))
return;
+ /*
+ * Stop the hw queue, then modify currently delayed work.
+ * This should prevent us from running the queue prematurely.
+ * Mark the queue as auto-clearing STOPPED when it runs.
+ */
blk_mq_stop_hw_queue(hctx);
- kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
- &hctx->delay_work, msecs_to_jiffies(msecs));
+ set_bit(BLK_MQ_S_START_ON_RUN, &hctx->state);
+ kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
+ &hctx->run_work,
+ msecs_to_jiffies(msecs));
}
EXPORT_SYMBOL(blk_mq_delay_queue);
@@ -1408,7 +1419,7 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
static void blk_mq_bio_to_request(struct request *rq, struct bio *bio)
{
- init_request_from_bio(rq, bio);
+ blk_init_request_from_bio(rq, bio);
blk_account_io_start(rq, true);
}
@@ -1453,14 +1464,13 @@ static blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, struct request *rq)
return blk_tag_to_qc_t(rq->internal_tag, hctx->queue_num, true);
}
-static void blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie,
+static void __blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie,
bool may_sleep)
{
struct request_queue *q = rq->q;
struct blk_mq_queue_data bd = {
.rq = rq,
- .list = NULL,
- .last = 1
+ .last = true,
};
struct blk_mq_hw_ctx *hctx;
blk_qc_t new_cookie;
@@ -1485,31 +1495,42 @@ static void blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie,
return;
}
- __blk_mq_requeue_request(rq);
-
if (ret == BLK_MQ_RQ_QUEUE_ERROR) {
*cookie = BLK_QC_T_NONE;
- rq->errors = -EIO;
- blk_mq_end_request(rq, rq->errors);
+ blk_mq_end_request(rq, -EIO);
return;
}
+ __blk_mq_requeue_request(rq);
insert:
blk_mq_sched_insert_request(rq, false, true, false, may_sleep);
}
-/*
- * Multiple hardware queue variant. This will not use per-process plugs,
- * but will attempt to bypass the hctx queueing if we can go straight to
- * hardware for SYNC IO.
- */
+static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
+ struct request *rq, blk_qc_t *cookie)
+{
+ if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
+ rcu_read_lock();
+ __blk_mq_try_issue_directly(rq, cookie, false);
+ rcu_read_unlock();
+ } else {
+ unsigned int srcu_idx;
+
+ might_sleep();
+
+ srcu_idx = srcu_read_lock(&hctx->queue_rq_srcu);
+ __blk_mq_try_issue_directly(rq, cookie, true);
+ srcu_read_unlock(&hctx->queue_rq_srcu, srcu_idx);
+ }
+}
+
static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
{
const int is_sync = op_is_sync(bio->bi_opf);
const int is_flush_fua = op_is_flush(bio->bi_opf);
struct blk_mq_alloc_data data = { .flags = 0 };
struct request *rq;
- unsigned int request_count = 0, srcu_idx;
+ unsigned int request_count = 0;
struct blk_plug *plug;
struct request *same_queue_rq = NULL;
blk_qc_t cookie;
@@ -1545,147 +1566,21 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
cookie = request_to_qc_t(data.hctx, rq);
- if (unlikely(is_flush_fua)) {
- if (q->elevator)
- goto elv_insert;
- blk_mq_bio_to_request(rq, bio);
- blk_insert_flush(rq);
- goto run_queue;
- }
-
plug = current->plug;
- /*
- * If the driver supports defer issued based on 'last', then
- * queue it up like normal since we can potentially save some
- * CPU this way.
- */
- if (((plug && !blk_queue_nomerges(q)) || is_sync) &&
- !(data.hctx->flags & BLK_MQ_F_DEFER_ISSUE)) {
- struct request *old_rq = NULL;
-
- blk_mq_bio_to_request(rq, bio);
-
- /*
- * We do limited plugging. If the bio can be merged, do that.
- * Otherwise the existing request in the plug list will be
- * issued. So the plug list will have one request at most
- */
- if (plug) {
- /*
- * The plug list might get flushed before this. If that
- * happens, same_queue_rq is invalid and plug list is
- * empty
- */
- if (same_queue_rq && !list_empty(&plug->mq_list)) {
- old_rq = same_queue_rq;
- list_del_init(&old_rq->queuelist);
- }
- list_add_tail(&rq->queuelist, &plug->mq_list);
- } else /* is_sync */
- old_rq = rq;
+ if (unlikely(is_flush_fua)) {
blk_mq_put_ctx(data.ctx);
- if (!old_rq)
- goto done;
-
- if (!(data.hctx->flags & BLK_MQ_F_BLOCKING)) {
- rcu_read_lock();
- blk_mq_try_issue_directly(old_rq, &cookie, false);
- rcu_read_unlock();
+ blk_mq_bio_to_request(rq, bio);
+ if (q->elevator) {
+ blk_mq_sched_insert_request(rq, false, true, true,
+ true);
} else {
- srcu_idx = srcu_read_lock(&data.hctx->queue_rq_srcu);
- blk_mq_try_issue_directly(old_rq, &cookie, true);
- srcu_read_unlock(&data.hctx->queue_rq_srcu, srcu_idx);
+ blk_insert_flush(rq);
+ blk_mq_run_hw_queue(data.hctx, true);
}
- goto done;
- }
-
- if (q->elevator) {
-elv_insert:
- blk_mq_put_ctx(data.ctx);
- blk_mq_bio_to_request(rq, bio);
- blk_mq_sched_insert_request(rq, false, true,
- !is_sync || is_flush_fua, true);
- goto done;
- }
- if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
- /*
- * For a SYNC request, send it to the hardware immediately. For
- * an ASYNC request, just ensure that we run it later on. The
- * latter allows for merging opportunities and more efficient
- * dispatching.
- */
-run_queue:
- blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua);
- }
- blk_mq_put_ctx(data.ctx);
-done:
- return cookie;
-}
-
-/*
- * Single hardware queue variant. This will attempt to use any per-process
- * plug for merging and IO deferral.
- */
-static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
-{
- const int is_sync = op_is_sync(bio->bi_opf);
- const int is_flush_fua = op_is_flush(bio->bi_opf);
- struct blk_plug *plug;
- unsigned int request_count = 0;
- struct blk_mq_alloc_data data = { .flags = 0 };
- struct request *rq;
- blk_qc_t cookie;
- unsigned int wb_acct;
-
- blk_queue_bounce(q, &bio);
-
- if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
- bio_io_error(bio);
- return BLK_QC_T_NONE;
- }
-
- blk_queue_split(q, &bio, q->bio_split);
-
- if (!is_flush_fua && !blk_queue_nomerges(q)) {
- if (blk_attempt_plug_merge(q, bio, &request_count, NULL))
- return BLK_QC_T_NONE;
- } else
- request_count = blk_plug_queued_count(q);
-
- if (blk_mq_sched_bio_merge(q, bio))
- return BLK_QC_T_NONE;
-
- wb_acct = wbt_wait(q->rq_wb, bio, NULL);
-
- trace_block_getrq(q, bio, bio->bi_opf);
-
- rq = blk_mq_sched_get_request(q, bio, bio->bi_opf, &data);
- if (unlikely(!rq)) {
- __wbt_done(q->rq_wb, wb_acct);
- return BLK_QC_T_NONE;
- }
-
- wbt_track(&rq->issue_stat, wb_acct);
-
- cookie = request_to_qc_t(data.hctx, rq);
-
- if (unlikely(is_flush_fua)) {
- if (q->elevator)
- goto elv_insert;
- blk_mq_bio_to_request(rq, bio);
- blk_insert_flush(rq);
- goto run_queue;
- }
-
- /*
- * A task plug currently exists. Since this is completely lockless,
- * utilize that to temporarily store requests until the task is
- * either done or scheduled away.
- */
- plug = current->plug;
- if (plug) {
+ } else if (plug && q->nr_hw_queues == 1) {
struct request *last = NULL;
+ blk_mq_put_ctx(data.ctx);
blk_mq_bio_to_request(rq, bio);
/*
@@ -1694,13 +1589,14 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
*/
if (list_empty(&plug->mq_list))
request_count = 0;
+ else if (blk_queue_nomerges(q))
+ request_count = blk_plug_queued_count(q);
+
if (!request_count)
trace_block_plug(q);
else
last = list_entry_rq(plug->mq_list.prev);
- blk_mq_put_ctx(data.ctx);
-
if (request_count >= BLK_MAX_REQUEST_COUNT || (last &&
blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) {
blk_flush_plug_list(plug, false);
@@ -1708,30 +1604,41 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
}
list_add_tail(&rq->queuelist, &plug->mq_list);
- return cookie;
- }
-
- if (q->elevator) {
-elv_insert:
- blk_mq_put_ctx(data.ctx);
+ } else if (plug && !blk_queue_nomerges(q)) {
blk_mq_bio_to_request(rq, bio);
- blk_mq_sched_insert_request(rq, false, true,
- !is_sync || is_flush_fua, true);
- goto done;
- }
- if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
+
/*
- * For a SYNC request, send it to the hardware immediately. For
- * an ASYNC request, just ensure that we run it later on. The
- * latter allows for merging opportunities and more efficient
- * dispatching.
+ * We do limited plugging. If the bio can be merged, do that.
+ * Otherwise the existing request in the plug list will be
+ * issued. So the plug list will have one request at most
+ * The plug list might get flushed before this. If that happens,
+ * the plug list is empty, and same_queue_rq is invalid.
*/
-run_queue:
- blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua);
- }
+ if (list_empty(&plug->mq_list))
+ same_queue_rq = NULL;
+ if (same_queue_rq)
+ list_del_init(&same_queue_rq->queuelist);
+ list_add_tail(&rq->queuelist, &plug->mq_list);
+
+ blk_mq_put_ctx(data.ctx);
+
+ if (same_queue_rq)
+ blk_mq_try_issue_directly(data.hctx, same_queue_rq,
+ &cookie);
+ } else if (q->nr_hw_queues > 1 && is_sync) {
+ blk_mq_put_ctx(data.ctx);
+ blk_mq_bio_to_request(rq, bio);
+ blk_mq_try_issue_directly(data.hctx, rq, &cookie);
+ } else if (q->elevator) {
+ blk_mq_put_ctx(data.ctx);
+ blk_mq_bio_to_request(rq, bio);
+ blk_mq_sched_insert_request(rq, false, true, true, true);
+ } else if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
+ blk_mq_put_ctx(data.ctx);
+ blk_mq_run_hw_queue(data.hctx, true);
+ } else
+ blk_mq_put_ctx(data.ctx);
- blk_mq_put_ctx(data.ctx);
-done:
return cookie;
}
@@ -1988,9 +1895,7 @@ static int blk_mq_init_hctx(struct request_queue *q,
if (node == NUMA_NO_NODE)
node = hctx->numa_node = set->numa_node;
- INIT_WORK(&hctx->run_work, blk_mq_run_work_fn);
- INIT_DELAYED_WORK(&hctx->delayed_run_work, blk_mq_delayed_run_work_fn);
- INIT_DELAYED_WORK(&hctx->delay_work, blk_mq_delay_work_fn);
+ INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn);
spin_lock_init(&hctx->lock);
INIT_LIST_HEAD(&hctx->dispatch);
hctx->queue = q;
@@ -2067,8 +1972,6 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
spin_lock_init(&__ctx->lock);
INIT_LIST_HEAD(&__ctx->rq_list);
__ctx->queue = q;
- blk_stat_init(&__ctx->stat[BLK_STAT_READ]);
- blk_stat_init(&__ctx->stat[BLK_STAT_WRITE]);
/* If the cpu isn't online, the cpu is mapped to first hctx */
if (!cpu_online(i))
@@ -2215,6 +2118,8 @@ static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set, bool shared)
{
struct request_queue *q;
+ lockdep_assert_held(&set->tag_list_lock);
+
list_for_each_entry(q, &set->tag_list, tag_set_list) {
blk_mq_freeze_queue(q);
queue_set_hctx_shared(q, shared);
@@ -2227,7 +2132,8 @@ static void blk_mq_del_queue_tag_set(struct request_queue *q)
struct blk_mq_tag_set *set = q->tag_set;
mutex_lock(&set->tag_list_lock);
- list_del_init(&q->tag_set_list);
+ list_del_rcu(&q->tag_set_list);
+ INIT_LIST_HEAD(&q->tag_set_list);
if (list_is_singular(&set->tag_list)) {
/* just transitioned to unshared */
set->flags &= ~BLK_MQ_F_TAG_SHARED;
@@ -2235,6 +2141,8 @@ static void blk_mq_del_queue_tag_set(struct request_queue *q)
blk_mq_update_tag_set_depth(set, false);
}
mutex_unlock(&set->tag_list_lock);
+
+ synchronize_rcu();
}
static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
@@ -2252,7 +2160,7 @@ static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
}
if (set->flags & BLK_MQ_F_TAG_SHARED)
queue_set_hctx_shared(q, true);
- list_add_tail(&q->tag_set_list, &set->tag_list);
+ list_add_tail_rcu(&q->tag_set_list, &set->tag_list);
mutex_unlock(&set->tag_list_lock);
}
@@ -2364,6 +2272,12 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
/* mark the queue as mq asap */
q->mq_ops = set->ops;
+ q->poll_cb = blk_stat_alloc_callback(blk_mq_poll_stats_fn,
+ blk_mq_poll_stats_bkt,
+ BLK_MQ_POLL_STATS_BKTS, q);
+ if (!q->poll_cb)
+ goto err_exit;
+
q->queue_ctx = alloc_percpu(struct blk_mq_ctx);
if (!q->queue_ctx)
goto err_exit;
@@ -2398,10 +2312,7 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
INIT_LIST_HEAD(&q->requeue_list);
spin_lock_init(&q->requeue_lock);
- if (q->nr_hw_queues > 1)
- blk_queue_make_request(q, blk_mq_make_request);
- else
- blk_queue_make_request(q, blk_sq_make_request);
+ blk_queue_make_request(q, blk_mq_make_request);
/*
* Do this after blk_queue_make_request() overrides it...
@@ -2456,8 +2367,6 @@ void blk_mq_free_queue(struct request_queue *q)
list_del_init(&q->all_q_node);
mutex_unlock(&all_q_mutex);
- wbt_exit(q);
-
blk_mq_del_queue_tag_set(q);
blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
@@ -2502,7 +2411,7 @@ static void blk_mq_queue_reinit_work(void)
* take place in parallel.
*/
list_for_each_entry(q, &all_q_list, all_q_node)
- blk_mq_freeze_queue_start(q);
+ blk_freeze_queue_start(q);
list_for_each_entry(q, &all_q_list, all_q_node)
blk_mq_freeze_queue_wait(q);
@@ -2743,6 +2652,8 @@ void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
{
struct request_queue *q;
+ lockdep_assert_held(&set->tag_list_lock);
+
if (nr_hw_queues > nr_cpu_ids)
nr_hw_queues = nr_cpu_ids;
if (nr_hw_queues < 1 || nr_hw_queues == set->nr_hw_queues)
@@ -2755,16 +2666,6 @@ void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
blk_mq_update_queue_map(set);
list_for_each_entry(q, &set->tag_list, tag_set_list) {
blk_mq_realloc_hw_ctxs(set, q);
-
- /*
- * Manually set the make_request_fn as blk_queue_make_request
- * resets a lot of the queue settings.
- */
- if (q->nr_hw_queues > 1)
- q->make_request_fn = blk_mq_make_request;
- else
- q->make_request_fn = blk_sq_make_request;
-
blk_mq_queue_reinit(q, cpu_online_mask);
}
@@ -2773,39 +2674,69 @@ void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
}
EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues);
+/* Enable polling stats and return whether they were already enabled. */
+static bool blk_poll_stats_enable(struct request_queue *q)
+{
+ if (test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) ||
+ test_and_set_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags))
+ return true;
+ blk_stat_add_callback(q, q->poll_cb);
+ return false;
+}
+
+static void blk_mq_poll_stats_start(struct request_queue *q)
+{
+ /*
+ * We don't arm the callback if polling stats are not enabled or the
+ * callback is already active.
+ */
+ if (!test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) ||
+ blk_stat_is_active(q->poll_cb))
+ return;
+
+ blk_stat_activate_msecs(q->poll_cb, 100);
+}
+
+static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb)
+{
+ struct request_queue *q = cb->data;
+ int bucket;
+
+ for (bucket = 0; bucket < BLK_MQ_POLL_STATS_BKTS; bucket++) {
+ if (cb->stat[bucket].nr_samples)
+ q->poll_stat[bucket] = cb->stat[bucket];
+ }
+}
+
static unsigned long blk_mq_poll_nsecs(struct request_queue *q,
struct blk_mq_hw_ctx *hctx,
struct request *rq)
{
- struct blk_rq_stat stat[2];
unsigned long ret = 0;
+ int bucket;
/*
* If stats collection isn't on, don't sleep but turn it on for
* future users
*/
- if (!blk_stat_enable(q))
+ if (!blk_poll_stats_enable(q))
return 0;
/*
- * We don't have to do this once per IO, should optimize this
- * to just use the current window of stats until it changes
- */
- memset(&stat, 0, sizeof(stat));
- blk_hctx_stat_get(hctx, stat);
-
- /*
* As an optimistic guess, use half of the mean service time
* for this type of request. We can (and should) make this smarter.
* For instance, if the completion latencies are tight, we can
* get closer than just half the mean. This is especially
* important on devices where the completion latencies are longer
- * than ~10 usec.
+ * than ~10 usec. We do use the stats for the relevant IO size
+ * if available which does lead to better estimates.
*/
- if (req_op(rq) == REQ_OP_READ && stat[BLK_STAT_READ].nr_samples)
- ret = (stat[BLK_STAT_READ].mean + 1) / 2;
- else if (req_op(rq) == REQ_OP_WRITE && stat[BLK_STAT_WRITE].nr_samples)
- ret = (stat[BLK_STAT_WRITE].mean + 1) / 2;
+ bucket = blk_mq_poll_stats_bkt(rq);
+ if (bucket < 0)
+ return ret;
+
+ if (q->poll_stat[bucket].nr_samples)
+ ret = (q->poll_stat[bucket].mean + 1) / 2;
return ret;
}
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 660a17e1d033..2814a14e529c 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -20,7 +20,6 @@ struct blk_mq_ctx {
/* incremented at completion time */
unsigned long ____cacheline_aligned_in_smp rq_completed[2];
- struct blk_rq_stat stat[2];
struct request_queue *queue;
struct kobject kobj;
@@ -79,6 +78,7 @@ static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q,
*/
extern void blk_mq_sysfs_init(struct request_queue *q);
extern void blk_mq_sysfs_deinit(struct request_queue *q);
+extern int __blk_mq_register_dev(struct device *dev, struct request_queue *q);
extern int blk_mq_sysfs_register(struct request_queue *q);
extern void blk_mq_sysfs_unregister(struct request_queue *q);
extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx);
@@ -87,13 +87,12 @@ extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx);
* debugfs helpers
*/
#ifdef CONFIG_BLK_DEBUG_FS
-int blk_mq_debugfs_register(struct request_queue *q, const char *name);
+int blk_mq_debugfs_register(struct request_queue *q);
void blk_mq_debugfs_unregister(struct request_queue *q);
-int blk_mq_debugfs_register_hctxs(struct request_queue *q);
-void blk_mq_debugfs_unregister_hctxs(struct request_queue *q);
+int blk_mq_debugfs_register_mq(struct request_queue *q);
+void blk_mq_debugfs_unregister_mq(struct request_queue *q);
#else
-static inline int blk_mq_debugfs_register(struct request_queue *q,
- const char *name)
+static inline int blk_mq_debugfs_register(struct request_queue *q)
{
return 0;
}
@@ -102,12 +101,12 @@ static inline void blk_mq_debugfs_unregister(struct request_queue *q)
{
}
-static inline int blk_mq_debugfs_register_hctxs(struct request_queue *q)
+static inline int blk_mq_debugfs_register_mq(struct request_queue *q)
{
return 0;
}
-static inline void blk_mq_debugfs_unregister_hctxs(struct request_queue *q)
+static inline void blk_mq_debugfs_unregister_mq(struct request_queue *q)
{
}
#endif
@@ -142,6 +141,7 @@ struct blk_mq_alloc_data {
/* input parameter */
struct request_queue *q;
unsigned int flags;
+ unsigned int shallow_depth;
/* input & output parameter */
struct blk_mq_ctx *ctx;
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 1e7174ffc9d4..4fa81ed383ca 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -103,7 +103,6 @@ void blk_set_default_limits(struct queue_limits *lim)
lim->discard_granularity = 0;
lim->discard_alignment = 0;
lim->discard_misaligned = 0;
- lim->discard_zeroes_data = 0;
lim->logical_block_size = lim->physical_block_size = lim->io_min = 512;
lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT);
lim->alignment_offset = 0;
@@ -127,7 +126,6 @@ void blk_set_stacking_limits(struct queue_limits *lim)
blk_set_default_limits(lim);
/* Inherit limits from component devices */
- lim->discard_zeroes_data = 1;
lim->max_segments = USHRT_MAX;
lim->max_discard_segments = 1;
lim->max_hw_sectors = UINT_MAX;
@@ -609,7 +607,6 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
t->io_opt = lcm_not_zero(t->io_opt, b->io_opt);
t->cluster &= b->cluster;
- t->discard_zeroes_data &= b->discard_zeroes_data;
/* Physical block size a multiple of the logical block size? */
if (t->physical_block_size & (t->logical_block_size - 1)) {
diff --git a/block/blk-stat.c b/block/blk-stat.c
index 186fcb981e9b..6c2f40940439 100644
--- a/block/blk-stat.c
+++ b/block/blk-stat.c
@@ -4,10 +4,27 @@
* Copyright (C) 2016 Jens Axboe
*/
#include <linux/kernel.h>
+#include <linux/rculist.h>
#include <linux/blk-mq.h>
#include "blk-stat.h"
#include "blk-mq.h"
+#include "blk.h"
+
+#define BLK_RQ_STAT_BATCH 64
+
+struct blk_queue_stats {
+ struct list_head callbacks;
+ spinlock_t lock;
+ bool enable_accounting;
+};
+
+static void blk_stat_init(struct blk_rq_stat *stat)
+{
+ stat->min = -1ULL;
+ stat->max = stat->nr_samples = stat->mean = 0;
+ stat->batch = stat->nr_batch = 0;
+}
static void blk_stat_flush_batch(struct blk_rq_stat *stat)
{
@@ -48,209 +65,185 @@ static void blk_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src)
dst->nr_samples += src->nr_samples;
}
-static void blk_mq_stat_get(struct request_queue *q, struct blk_rq_stat *dst)
+static void __blk_stat_add(struct blk_rq_stat *stat, u64 value)
{
- struct blk_mq_hw_ctx *hctx;
- struct blk_mq_ctx *ctx;
- uint64_t latest = 0;
- int i, j, nr;
-
- blk_stat_init(&dst[BLK_STAT_READ]);
- blk_stat_init(&dst[BLK_STAT_WRITE]);
-
- nr = 0;
- do {
- uint64_t newest = 0;
-
- queue_for_each_hw_ctx(q, hctx, i) {
- hctx_for_each_ctx(hctx, ctx, j) {
- blk_stat_flush_batch(&ctx->stat[BLK_STAT_READ]);
- blk_stat_flush_batch(&ctx->stat[BLK_STAT_WRITE]);
-
- if (!ctx->stat[BLK_STAT_READ].nr_samples &&
- !ctx->stat[BLK_STAT_WRITE].nr_samples)
- continue;
- if (ctx->stat[BLK_STAT_READ].time > newest)
- newest = ctx->stat[BLK_STAT_READ].time;
- if (ctx->stat[BLK_STAT_WRITE].time > newest)
- newest = ctx->stat[BLK_STAT_WRITE].time;
- }
- }
+ stat->min = min(stat->min, value);
+ stat->max = max(stat->max, value);
- /*
- * No samples
- */
- if (!newest)
- break;
-
- if (newest > latest)
- latest = newest;
-
- queue_for_each_hw_ctx(q, hctx, i) {
- hctx_for_each_ctx(hctx, ctx, j) {
- if (ctx->stat[BLK_STAT_READ].time == newest) {
- blk_stat_sum(&dst[BLK_STAT_READ],
- &ctx->stat[BLK_STAT_READ]);
- nr++;
- }
- if (ctx->stat[BLK_STAT_WRITE].time == newest) {
- blk_stat_sum(&dst[BLK_STAT_WRITE],
- &ctx->stat[BLK_STAT_WRITE]);
- nr++;
- }
- }
- }
- /*
- * If we race on finding an entry, just loop back again.
- * Should be very rare.
- */
- } while (!nr);
+ if (stat->batch + value < stat->batch ||
+ stat->nr_batch + 1 == BLK_RQ_STAT_BATCH)
+ blk_stat_flush_batch(stat);
- dst[BLK_STAT_READ].time = dst[BLK_STAT_WRITE].time = latest;
+ stat->batch += value;
+ stat->nr_batch++;
}
-void blk_queue_stat_get(struct request_queue *q, struct blk_rq_stat *dst)
+void blk_stat_add(struct request *rq)
{
- if (q->mq_ops)
- blk_mq_stat_get(q, dst);
- else {
- blk_stat_flush_batch(&q->rq_stats[BLK_STAT_READ]);
- blk_stat_flush_batch(&q->rq_stats[BLK_STAT_WRITE]);
- memcpy(&dst[BLK_STAT_READ], &q->rq_stats[BLK_STAT_READ],
- sizeof(struct blk_rq_stat));
- memcpy(&dst[BLK_STAT_WRITE], &q->rq_stats[BLK_STAT_WRITE],
- sizeof(struct blk_rq_stat));
+ struct request_queue *q = rq->q;
+ struct blk_stat_callback *cb;
+ struct blk_rq_stat *stat;
+ int bucket;
+ s64 now, value;
+
+ now = __blk_stat_time(ktime_to_ns(ktime_get()));
+ if (now < blk_stat_time(&rq->issue_stat))
+ return;
+
+ value = now - blk_stat_time(&rq->issue_stat);
+
+ blk_throtl_stat_add(rq, value);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(cb, &q->stats->callbacks, list) {
+ if (blk_stat_is_active(cb)) {
+ bucket = cb->bucket_fn(rq);
+ if (bucket < 0)
+ continue;
+ stat = &this_cpu_ptr(cb->cpu_stat)[bucket];
+ __blk_stat_add(stat, value);
+ }
}
+ rcu_read_unlock();
}
-void blk_hctx_stat_get(struct blk_mq_hw_ctx *hctx, struct blk_rq_stat *dst)
+static void blk_stat_timer_fn(unsigned long data)
{
- struct blk_mq_ctx *ctx;
- unsigned int i, nr;
+ struct blk_stat_callback *cb = (void *)data;
+ unsigned int bucket;
+ int cpu;
- nr = 0;
- do {
- uint64_t newest = 0;
+ for (bucket = 0; bucket < cb->buckets; bucket++)
+ blk_stat_init(&cb->stat[bucket]);
- hctx_for_each_ctx(hctx, ctx, i) {
- blk_stat_flush_batch(&ctx->stat[BLK_STAT_READ]);
- blk_stat_flush_batch(&ctx->stat[BLK_STAT_WRITE]);
+ for_each_online_cpu(cpu) {
+ struct blk_rq_stat *cpu_stat;
- if (!ctx->stat[BLK_STAT_READ].nr_samples &&
- !ctx->stat[BLK_STAT_WRITE].nr_samples)
- continue;
-
- if (ctx->stat[BLK_STAT_READ].time > newest)
- newest = ctx->stat[BLK_STAT_READ].time;
- if (ctx->stat[BLK_STAT_WRITE].time > newest)
- newest = ctx->stat[BLK_STAT_WRITE].time;
+ cpu_stat = per_cpu_ptr(cb->cpu_stat, cpu);
+ for (bucket = 0; bucket < cb->buckets; bucket++) {
+ blk_stat_sum(&cb->stat[bucket], &cpu_stat[bucket]);
+ blk_stat_init(&cpu_stat[bucket]);
}
+ }
- if (!newest)
- break;
-
- hctx_for_each_ctx(hctx, ctx, i) {
- if (ctx->stat[BLK_STAT_READ].time == newest) {
- blk_stat_sum(&dst[BLK_STAT_READ],
- &ctx->stat[BLK_STAT_READ]);
- nr++;
- }
- if (ctx->stat[BLK_STAT_WRITE].time == newest) {
- blk_stat_sum(&dst[BLK_STAT_WRITE],
- &ctx->stat[BLK_STAT_WRITE]);
- nr++;
- }
- }
- /*
- * If we race on finding an entry, just loop back again.
- * Should be very rare, as the window is only updated
- * occasionally
- */
- } while (!nr);
+ cb->timer_fn(cb);
}
-static void __blk_stat_init(struct blk_rq_stat *stat, s64 time_now)
+struct blk_stat_callback *
+blk_stat_alloc_callback(void (*timer_fn)(struct blk_stat_callback *),
+ int (*bucket_fn)(const struct request *),
+ unsigned int buckets, void *data)
{
- stat->min = -1ULL;
- stat->max = stat->nr_samples = stat->mean = 0;
- stat->batch = stat->nr_batch = 0;
- stat->time = time_now & BLK_STAT_NSEC_MASK;
-}
+ struct blk_stat_callback *cb;
-void blk_stat_init(struct blk_rq_stat *stat)
-{
- __blk_stat_init(stat, ktime_to_ns(ktime_get()));
-}
+ cb = kmalloc(sizeof(*cb), GFP_KERNEL);
+ if (!cb)
+ return NULL;
-static bool __blk_stat_is_current(struct blk_rq_stat *stat, s64 now)
-{
- return (now & BLK_STAT_NSEC_MASK) == (stat->time & BLK_STAT_NSEC_MASK);
+ cb->stat = kmalloc_array(buckets, sizeof(struct blk_rq_stat),
+ GFP_KERNEL);
+ if (!cb->stat) {
+ kfree(cb);
+ return NULL;
+ }
+ cb->cpu_stat = __alloc_percpu(buckets * sizeof(struct blk_rq_stat),
+ __alignof__(struct blk_rq_stat));
+ if (!cb->cpu_stat) {
+ kfree(cb->stat);
+ kfree(cb);
+ return NULL;
+ }
+
+ cb->timer_fn = timer_fn;
+ cb->bucket_fn = bucket_fn;
+ cb->data = data;
+ cb->buckets = buckets;
+ setup_timer(&cb->timer, blk_stat_timer_fn, (unsigned long)cb);
+
+ return cb;
}
+EXPORT_SYMBOL_GPL(blk_stat_alloc_callback);
-bool blk_stat_is_current(struct blk_rq_stat *stat)
+void blk_stat_add_callback(struct request_queue *q,
+ struct blk_stat_callback *cb)
{
- return __blk_stat_is_current(stat, ktime_to_ns(ktime_get()));
+ unsigned int bucket;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct blk_rq_stat *cpu_stat;
+
+ cpu_stat = per_cpu_ptr(cb->cpu_stat, cpu);
+ for (bucket = 0; bucket < cb->buckets; bucket++)
+ blk_stat_init(&cpu_stat[bucket]);
+ }
+
+ spin_lock(&q->stats->lock);
+ list_add_tail_rcu(&cb->list, &q->stats->callbacks);
+ set_bit(QUEUE_FLAG_STATS, &q->queue_flags);
+ spin_unlock(&q->stats->lock);
}
+EXPORT_SYMBOL_GPL(blk_stat_add_callback);
-void blk_stat_add(struct blk_rq_stat *stat, struct request *rq)
+void blk_stat_remove_callback(struct request_queue *q,
+ struct blk_stat_callback *cb)
{
- s64 now, value;
+ spin_lock(&q->stats->lock);
+ list_del_rcu(&cb->list);
+ if (list_empty(&q->stats->callbacks) && !q->stats->enable_accounting)
+ clear_bit(QUEUE_FLAG_STATS, &q->queue_flags);
+ spin_unlock(&q->stats->lock);
- now = __blk_stat_time(ktime_to_ns(ktime_get()));
- if (now < blk_stat_time(&rq->issue_stat))
- return;
-
- if (!__blk_stat_is_current(stat, now))
- __blk_stat_init(stat, now);
+ del_timer_sync(&cb->timer);
+}
+EXPORT_SYMBOL_GPL(blk_stat_remove_callback);
- value = now - blk_stat_time(&rq->issue_stat);
- if (value > stat->max)
- stat->max = value;
- if (value < stat->min)
- stat->min = value;
+static void blk_stat_free_callback_rcu(struct rcu_head *head)
+{
+ struct blk_stat_callback *cb;
- if (stat->batch + value < stat->batch ||
- stat->nr_batch + 1 == BLK_RQ_STAT_BATCH)
- blk_stat_flush_batch(stat);
+ cb = container_of(head, struct blk_stat_callback, rcu);
+ free_percpu(cb->cpu_stat);
+ kfree(cb->stat);
+ kfree(cb);
+}
- stat->batch += value;
- stat->nr_batch++;
+void blk_stat_free_callback(struct blk_stat_callback *cb)
+{
+ if (cb)
+ call_rcu(&cb->rcu, blk_stat_free_callback_rcu);
}
+EXPORT_SYMBOL_GPL(blk_stat_free_callback);
-void blk_stat_clear(struct request_queue *q)
+void blk_stat_enable_accounting(struct request_queue *q)
{
- if (q->mq_ops) {
- struct blk_mq_hw_ctx *hctx;
- struct blk_mq_ctx *ctx;
- int i, j;
-
- queue_for_each_hw_ctx(q, hctx, i) {
- hctx_for_each_ctx(hctx, ctx, j) {
- blk_stat_init(&ctx->stat[BLK_STAT_READ]);
- blk_stat_init(&ctx->stat[BLK_STAT_WRITE]);
- }
- }
- } else {
- blk_stat_init(&q->rq_stats[BLK_STAT_READ]);
- blk_stat_init(&q->rq_stats[BLK_STAT_WRITE]);
- }
+ spin_lock(&q->stats->lock);
+ q->stats->enable_accounting = true;
+ set_bit(QUEUE_FLAG_STATS, &q->queue_flags);
+ spin_unlock(&q->stats->lock);
}
-void blk_stat_set_issue_time(struct blk_issue_stat *stat)
+struct blk_queue_stats *blk_alloc_queue_stats(void)
{
- stat->time = (stat->time & BLK_STAT_MASK) |
- (ktime_to_ns(ktime_get()) & BLK_STAT_TIME_MASK);
+ struct blk_queue_stats *stats;
+
+ stats = kmalloc(sizeof(*stats), GFP_KERNEL);
+ if (!stats)
+ return NULL;
+
+ INIT_LIST_HEAD(&stats->callbacks);
+ spin_lock_init(&stats->lock);
+ stats->enable_accounting = false;
+
+ return stats;
}
-/*
- * Enable stat tracking, return whether it was enabled
- */
-bool blk_stat_enable(struct request_queue *q)
+void blk_free_queue_stats(struct blk_queue_stats *stats)
{
- if (!test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
- set_bit(QUEUE_FLAG_STATS, &q->queue_flags);
- return false;
- }
+ if (!stats)
+ return;
+
+ WARN_ON(!list_empty(&stats->callbacks));
- return true;
+ kfree(stats);
}
diff --git a/block/blk-stat.h b/block/blk-stat.h
index a2050a0a5314..2fb20d1a341a 100644
--- a/block/blk-stat.h
+++ b/block/blk-stat.h
@@ -1,33 +1,85 @@
#ifndef BLK_STAT_H
#define BLK_STAT_H
-/*
- * ~0.13s window as a power-of-2 (2^27 nsecs)
- */
-#define BLK_STAT_NSEC 134217728ULL
-#define BLK_STAT_NSEC_MASK ~(BLK_STAT_NSEC - 1)
+#include <linux/kernel.h>
+#include <linux/blkdev.h>
+#include <linux/ktime.h>
+#include <linux/rcupdate.h>
+#include <linux/timer.h>
/*
- * Upper 3 bits can be used elsewhere
+ * from upper:
+ * 3 bits: reserved for other usage
+ * 12 bits: size
+ * 49 bits: time
*/
#define BLK_STAT_RES_BITS 3
-#define BLK_STAT_SHIFT (64 - BLK_STAT_RES_BITS)
-#define BLK_STAT_TIME_MASK ((1ULL << BLK_STAT_SHIFT) - 1)
-#define BLK_STAT_MASK ~BLK_STAT_TIME_MASK
+#define BLK_STAT_SIZE_BITS 12
+#define BLK_STAT_RES_SHIFT (64 - BLK_STAT_RES_BITS)
+#define BLK_STAT_SIZE_SHIFT (BLK_STAT_RES_SHIFT - BLK_STAT_SIZE_BITS)
+#define BLK_STAT_TIME_MASK ((1ULL << BLK_STAT_SIZE_SHIFT) - 1)
+#define BLK_STAT_SIZE_MASK \
+ (((1ULL << BLK_STAT_SIZE_BITS) - 1) << BLK_STAT_SIZE_SHIFT)
+#define BLK_STAT_RES_MASK (~((1ULL << BLK_STAT_RES_SHIFT) - 1))
+
+/**
+ * struct blk_stat_callback - Block statistics callback.
+ *
+ * A &struct blk_stat_callback is associated with a &struct request_queue. While
+ * @timer is active, that queue's request completion latencies are sorted into
+ * buckets by @bucket_fn and added to a per-cpu buffer, @cpu_stat. When the
+ * timer fires, @cpu_stat is flushed to @stat and @timer_fn is invoked.
+ */
+struct blk_stat_callback {
+ /*
+ * @list: RCU list of callbacks for a &struct request_queue.
+ */
+ struct list_head list;
+
+ /**
+ * @timer: Timer for the next callback invocation.
+ */
+ struct timer_list timer;
+
+ /**
+ * @cpu_stat: Per-cpu statistics buckets.
+ */
+ struct blk_rq_stat __percpu *cpu_stat;
+
+ /**
+ * @bucket_fn: Given a request, returns which statistics bucket it
+ * should be accounted under. Return -1 for no bucket for this
+ * request.
+ */
+ int (*bucket_fn)(const struct request *);
+
+ /**
+ * @buckets: Number of statistics buckets.
+ */
+ unsigned int buckets;
+
+ /**
+ * @stat: Array of statistics buckets.
+ */
+ struct blk_rq_stat *stat;
+
+ /**
+ * @fn: Callback function.
+ */
+ void (*timer_fn)(struct blk_stat_callback *);
+
+ /**
+ * @data: Private pointer for the user.
+ */
+ void *data;
-enum {
- BLK_STAT_READ = 0,
- BLK_STAT_WRITE,
+ struct rcu_head rcu;
};
-void blk_stat_add(struct blk_rq_stat *, struct request *);
-void blk_hctx_stat_get(struct blk_mq_hw_ctx *, struct blk_rq_stat *);
-void blk_queue_stat_get(struct request_queue *, struct blk_rq_stat *);
-void blk_stat_clear(struct request_queue *);
-void blk_stat_init(struct blk_rq_stat *);
-bool blk_stat_is_current(struct blk_rq_stat *);
-void blk_stat_set_issue_time(struct blk_issue_stat *);
-bool blk_stat_enable(struct request_queue *);
+struct blk_queue_stats *blk_alloc_queue_stats(void);
+void blk_free_queue_stats(struct blk_queue_stats *);
+
+void blk_stat_add(struct request *);
static inline u64 __blk_stat_time(u64 time)
{
@@ -36,7 +88,117 @@ static inline u64 __blk_stat_time(u64 time)
static inline u64 blk_stat_time(struct blk_issue_stat *stat)
{
- return __blk_stat_time(stat->time);
+ return __blk_stat_time(stat->stat);
+}
+
+static inline sector_t blk_capped_size(sector_t size)
+{
+ return size & ((1ULL << BLK_STAT_SIZE_BITS) - 1);
+}
+
+static inline sector_t blk_stat_size(struct blk_issue_stat *stat)
+{
+ return (stat->stat & BLK_STAT_SIZE_MASK) >> BLK_STAT_SIZE_SHIFT;
+}
+
+static inline void blk_stat_set_issue(struct blk_issue_stat *stat,
+ sector_t size)
+{
+ stat->stat = (stat->stat & BLK_STAT_RES_MASK) |
+ (ktime_to_ns(ktime_get()) & BLK_STAT_TIME_MASK) |
+ (((u64)blk_capped_size(size)) << BLK_STAT_SIZE_SHIFT);
+}
+
+/* record time/size info in request but not add a callback */
+void blk_stat_enable_accounting(struct request_queue *q);
+
+/**
+ * blk_stat_alloc_callback() - Allocate a block statistics callback.
+ * @timer_fn: Timer callback function.
+ * @bucket_fn: Bucket callback function.
+ * @buckets: Number of statistics buckets.
+ * @data: Value for the @data field of the &struct blk_stat_callback.
+ *
+ * See &struct blk_stat_callback for details on the callback functions.
+ *
+ * Return: &struct blk_stat_callback on success or NULL on ENOMEM.
+ */
+struct blk_stat_callback *
+blk_stat_alloc_callback(void (*timer_fn)(struct blk_stat_callback *),
+ int (*bucket_fn)(const struct request *),
+ unsigned int buckets, void *data);
+
+/**
+ * blk_stat_add_callback() - Add a block statistics callback to be run on a
+ * request queue.
+ * @q: The request queue.
+ * @cb: The callback.
+ *
+ * Note that a single &struct blk_stat_callback can only be added to a single
+ * &struct request_queue.
+ */
+void blk_stat_add_callback(struct request_queue *q,
+ struct blk_stat_callback *cb);
+
+/**
+ * blk_stat_remove_callback() - Remove a block statistics callback from a
+ * request queue.
+ * @q: The request queue.
+ * @cb: The callback.
+ *
+ * When this returns, the callback is not running on any CPUs and will not be
+ * called again unless readded.
+ */
+void blk_stat_remove_callback(struct request_queue *q,
+ struct blk_stat_callback *cb);
+
+/**
+ * blk_stat_free_callback() - Free a block statistics callback.
+ * @cb: The callback.
+ *
+ * @cb may be NULL, in which case this does nothing. If it is not NULL, @cb must
+ * not be associated with a request queue. I.e., if it was previously added with
+ * blk_stat_add_callback(), it must also have been removed since then with
+ * blk_stat_remove_callback().
+ */
+void blk_stat_free_callback(struct blk_stat_callback *cb);
+
+/**
+ * blk_stat_is_active() - Check if a block statistics callback is currently
+ * gathering statistics.
+ * @cb: The callback.
+ */
+static inline bool blk_stat_is_active(struct blk_stat_callback *cb)
+{
+ return timer_pending(&cb->timer);
+}
+
+/**
+ * blk_stat_activate_nsecs() - Gather block statistics during a time window in
+ * nanoseconds.
+ * @cb: The callback.
+ * @nsecs: Number of nanoseconds to gather statistics for.
+ *
+ * The timer callback will be called when the window expires.
+ */
+static inline void blk_stat_activate_nsecs(struct blk_stat_callback *cb,
+ u64 nsecs)
+{
+ mod_timer(&cb->timer, jiffies + nsecs_to_jiffies(nsecs));
+}
+
+/**
+ * blk_stat_activate_msecs() - Gather block statistics during a time window in
+ * milliseconds.
+ * @cb: The callback.
+ * @msecs: Number of milliseconds to gather statistics for.
+ *
+ * The timer callback will be called when the window expires.
+ */
+static inline void blk_stat_activate_msecs(struct blk_stat_callback *cb,
+ unsigned int msecs)
+{
+ mod_timer(&cb->timer, jiffies + msecs_to_jiffies(msecs));
}
#endif
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 37f0b3ad635e..3f37813ccbaf 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -208,7 +208,7 @@ static ssize_t queue_discard_max_store(struct request_queue *q,
static ssize_t queue_discard_zeroes_data_show(struct request_queue *q, char *page)
{
- return queue_var_show(queue_discard_zeroes_data(q), page);
+ return queue_var_show(0, page);
}
static ssize_t queue_write_same_max_show(struct request_queue *q, char *page)
@@ -503,26 +503,6 @@ static ssize_t queue_dax_show(struct request_queue *q, char *page)
return queue_var_show(blk_queue_dax(q), page);
}
-static ssize_t print_stat(char *page, struct blk_rq_stat *stat, const char *pre)
-{
- return sprintf(page, "%s samples=%llu, mean=%lld, min=%lld, max=%lld\n",
- pre, (long long) stat->nr_samples,
- (long long) stat->mean, (long long) stat->min,
- (long long) stat->max);
-}
-
-static ssize_t queue_stats_show(struct request_queue *q, char *page)
-{
- struct blk_rq_stat stat[2];
- ssize_t ret;
-
- blk_queue_stat_get(q, stat);
-
- ret = print_stat(page, &stat[BLK_STAT_READ], "read :");
- ret += print_stat(page + ret, &stat[BLK_STAT_WRITE], "write:");
- return ret;
-}
-
static struct queue_sysfs_entry queue_requests_entry = {
.attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR },
.show = queue_requests_show,
@@ -691,17 +671,20 @@ static struct queue_sysfs_entry queue_dax_entry = {
.show = queue_dax_show,
};
-static struct queue_sysfs_entry queue_stats_entry = {
- .attr = {.name = "stats", .mode = S_IRUGO },
- .show = queue_stats_show,
-};
-
static struct queue_sysfs_entry queue_wb_lat_entry = {
.attr = {.name = "wbt_lat_usec", .mode = S_IRUGO | S_IWUSR },
.show = queue_wb_lat_show,
.store = queue_wb_lat_store,
};
+#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
+static struct queue_sysfs_entry throtl_sample_time_entry = {
+ .attr = {.name = "throttle_sample_time", .mode = S_IRUGO | S_IWUSR },
+ .show = blk_throtl_sample_time_show,
+ .store = blk_throtl_sample_time_store,
+};
+#endif
+
static struct attribute *default_attrs[] = {
&queue_requests_entry.attr,
&queue_ra_entry.attr,
@@ -733,9 +716,11 @@ static struct attribute *default_attrs[] = {
&queue_poll_entry.attr,
&queue_wc_entry.attr,
&queue_dax_entry.attr,
- &queue_stats_entry.attr,
&queue_wb_lat_entry.attr,
&queue_poll_delay_entry.attr,
+#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
+ &throtl_sample_time_entry.attr,
+#endif
NULL,
};
@@ -810,7 +795,9 @@ static void blk_release_queue(struct kobject *kobj)
struct request_queue *q =
container_of(kobj, struct request_queue, kobj);
- wbt_exit(q);
+ if (test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags))
+ blk_stat_remove_callback(q, q->poll_cb);
+ blk_stat_free_callback(q->poll_cb);
bdi_put(q->backing_dev_info);
blkcg_exit_queue(q);
@@ -819,6 +806,8 @@ static void blk_release_queue(struct kobject *kobj)
elevator_exit(q, q->elevator);
}
+ blk_free_queue_stats(q->stats);
+
blk_exit_rl(&q->root_rl);
if (q->queue_tags)
@@ -855,23 +844,6 @@ struct kobj_type blk_queue_ktype = {
.release = blk_release_queue,
};
-static void blk_wb_init(struct request_queue *q)
-{
-#ifndef CONFIG_BLK_WBT_MQ
- if (q->mq_ops)
- return;
-#endif
-#ifndef CONFIG_BLK_WBT_SQ
- if (q->request_fn)
- return;
-#endif
-
- /*
- * If this fails, we don't get throttling
- */
- wbt_init(q);
-}
-
int blk_register_queue(struct gendisk *disk)
{
int ret;
@@ -881,6 +853,11 @@ int blk_register_queue(struct gendisk *disk)
if (WARN_ON(!q))
return -ENXIO;
+ WARN_ONCE(test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags),
+ "%s is registering an already registered queue\n",
+ kobject_name(&dev->kobj));
+ queue_flag_set_unlocked(QUEUE_FLAG_REGISTERED, q);
+
/*
* SCSI probing may synchronously create and destroy a lot of
* request_queues for non-existent devices. Shutting down a fully
@@ -900,9 +877,6 @@ int blk_register_queue(struct gendisk *disk)
if (ret)
return ret;
- if (q->mq_ops)
- blk_mq_register_dev(dev, q);
-
/* Prevent changes through sysfs until registration is completed. */
mutex_lock(&q->sysfs_lock);
@@ -912,9 +886,14 @@ int blk_register_queue(struct gendisk *disk)
goto unlock;
}
+ if (q->mq_ops)
+ __blk_mq_register_dev(dev, q);
+
kobject_uevent(&q->kobj, KOBJ_ADD);
- blk_wb_init(q);
+ wbt_enable_default(q);
+
+ blk_throtl_register_queue(q);
if (q->request_fn || (q->mq_ops && q->elevator)) {
ret = elv_register_queue(q);
@@ -939,6 +918,11 @@ void blk_unregister_queue(struct gendisk *disk)
if (WARN_ON(!q))
return;
+ queue_flag_clear_unlocked(QUEUE_FLAG_REGISTERED, q);
+
+ wbt_exit(q);
+
+
if (q->mq_ops)
blk_mq_unregister_dev(disk_to_dev(disk), q);
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 8fab716e4059..b78db2e5fdff 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -18,8 +18,17 @@ static int throtl_grp_quantum = 8;
/* Total max dispatch from all groups in one round */
static int throtl_quantum = 32;
-/* Throttling is performed over 100ms slice and after that slice is renewed */
-static unsigned long throtl_slice = HZ/10; /* 100 ms */
+/* Throttling is performed over a slice and after that slice is renewed */
+#define DFL_THROTL_SLICE_HD (HZ / 10)
+#define DFL_THROTL_SLICE_SSD (HZ / 50)
+#define MAX_THROTL_SLICE (HZ)
+#define DFL_IDLE_THRESHOLD_SSD (1000L) /* 1 ms */
+#define DFL_IDLE_THRESHOLD_HD (100L * 1000) /* 100 ms */
+#define MAX_IDLE_TIME (5L * 1000 * 1000) /* 5 s */
+/* default latency target is 0, eg, guarantee IO latency by default */
+#define DFL_LATENCY_TARGET (0)
+
+#define SKIP_LATENCY (((u64)1) << BLK_STAT_RES_SHIFT)
static struct blkcg_policy blkcg_policy_throtl;
@@ -83,6 +92,12 @@ enum tg_state_flags {
#define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node)
+enum {
+ LIMIT_LOW,
+ LIMIT_MAX,
+ LIMIT_CNT,
+};
+
struct throtl_grp {
/* must be the first member */
struct blkg_policy_data pd;
@@ -119,20 +134,54 @@ struct throtl_grp {
/* are there any throtl rules between this group and td? */
bool has_rules[2];
- /* bytes per second rate limits */
- uint64_t bps[2];
+ /* internally used bytes per second rate limits */
+ uint64_t bps[2][LIMIT_CNT];
+ /* user configured bps limits */
+ uint64_t bps_conf[2][LIMIT_CNT];
- /* IOPS limits */
- unsigned int iops[2];
+ /* internally used IOPS limits */
+ unsigned int iops[2][LIMIT_CNT];
+ /* user configured IOPS limits */
+ unsigned int iops_conf[2][LIMIT_CNT];
/* Number of bytes disptached in current slice */
uint64_t bytes_disp[2];
/* Number of bio's dispatched in current slice */
unsigned int io_disp[2];
+ unsigned long last_low_overflow_time[2];
+
+ uint64_t last_bytes_disp[2];
+ unsigned int last_io_disp[2];
+
+ unsigned long last_check_time;
+
+ unsigned long latency_target; /* us */
/* When did we start a new slice */
unsigned long slice_start[2];
unsigned long slice_end[2];
+
+ unsigned long last_finish_time; /* ns / 1024 */
+ unsigned long checked_last_finish_time; /* ns / 1024 */
+ unsigned long avg_idletime; /* ns / 1024 */
+ unsigned long idletime_threshold; /* us */
+
+ unsigned int bio_cnt; /* total bios */
+ unsigned int bad_bio_cnt; /* bios exceeding latency threshold */
+ unsigned long bio_cnt_reset_time;
+};
+
+/* We measure latency for request size from <= 4k to >= 1M */
+#define LATENCY_BUCKET_SIZE 9
+
+struct latency_bucket {
+ unsigned long total_latency; /* ns / 1024 */
+ int samples;
+};
+
+struct avg_latency_bucket {
+ unsigned long latency; /* ns / 1024 */
+ bool valid;
};
struct throtl_data
@@ -145,8 +194,26 @@ struct throtl_data
/* Total Number of queued bios on READ and WRITE lists */
unsigned int nr_queued[2];
+ unsigned int throtl_slice;
+
/* Work for dispatching throttled bios */
struct work_struct dispatch_work;
+ unsigned int limit_index;
+ bool limit_valid[LIMIT_CNT];
+
+ unsigned long dft_idletime_threshold; /* us */
+
+ unsigned long low_upgrade_time;
+ unsigned long low_downgrade_time;
+
+ unsigned int scale;
+
+ struct latency_bucket tmp_buckets[LATENCY_BUCKET_SIZE];
+ struct avg_latency_bucket avg_buckets[LATENCY_BUCKET_SIZE];
+ struct latency_bucket __percpu *latency_buckets;
+ unsigned long last_calculate_time;
+
+ bool track_bio_latency;
};
static void throtl_pending_timer_fn(unsigned long arg);
@@ -198,6 +265,76 @@ static struct throtl_data *sq_to_td(struct throtl_service_queue *sq)
return container_of(sq, struct throtl_data, service_queue);
}
+/*
+ * cgroup's limit in LIMIT_MAX is scaled if low limit is set. This scale is to
+ * make the IO dispatch more smooth.
+ * Scale up: linearly scale up according to lapsed time since upgrade. For
+ * every throtl_slice, the limit scales up 1/2 .low limit till the
+ * limit hits .max limit
+ * Scale down: exponentially scale down if a cgroup doesn't hit its .low limit
+ */
+static uint64_t throtl_adjusted_limit(uint64_t low, struct throtl_data *td)
+{
+ /* arbitrary value to avoid too big scale */
+ if (td->scale < 4096 && time_after_eq(jiffies,
+ td->low_upgrade_time + td->scale * td->throtl_slice))
+ td->scale = (jiffies - td->low_upgrade_time) / td->throtl_slice;
+
+ return low + (low >> 1) * td->scale;
+}
+
+static uint64_t tg_bps_limit(struct throtl_grp *tg, int rw)
+{
+ struct blkcg_gq *blkg = tg_to_blkg(tg);
+ struct throtl_data *td;
+ uint64_t ret;
+
+ if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent)
+ return U64_MAX;
+
+ td = tg->td;
+ ret = tg->bps[rw][td->limit_index];
+ if (ret == 0 && td->limit_index == LIMIT_LOW)
+ return tg->bps[rw][LIMIT_MAX];
+
+ if (td->limit_index == LIMIT_MAX && tg->bps[rw][LIMIT_LOW] &&
+ tg->bps[rw][LIMIT_LOW] != tg->bps[rw][LIMIT_MAX]) {
+ uint64_t adjusted;
+
+ adjusted = throtl_adjusted_limit(tg->bps[rw][LIMIT_LOW], td);
+ ret = min(tg->bps[rw][LIMIT_MAX], adjusted);
+ }
+ return ret;
+}
+
+static unsigned int tg_iops_limit(struct throtl_grp *tg, int rw)
+{
+ struct blkcg_gq *blkg = tg_to_blkg(tg);
+ struct throtl_data *td;
+ unsigned int ret;
+
+ if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent)
+ return UINT_MAX;
+ td = tg->td;
+ ret = tg->iops[rw][td->limit_index];
+ if (ret == 0 && tg->td->limit_index == LIMIT_LOW)
+ return tg->iops[rw][LIMIT_MAX];
+
+ if (td->limit_index == LIMIT_MAX && tg->iops[rw][LIMIT_LOW] &&
+ tg->iops[rw][LIMIT_LOW] != tg->iops[rw][LIMIT_MAX]) {
+ uint64_t adjusted;
+
+ adjusted = throtl_adjusted_limit(tg->iops[rw][LIMIT_LOW], td);
+ if (adjusted > UINT_MAX)
+ adjusted = UINT_MAX;
+ ret = min_t(unsigned int, tg->iops[rw][LIMIT_MAX], adjusted);
+ }
+ return ret;
+}
+
+#define request_bucket_index(sectors) \
+ clamp_t(int, order_base_2(sectors) - 3, 0, LATENCY_BUCKET_SIZE - 1)
+
/**
* throtl_log - log debug message via blktrace
* @sq: the service_queue being reported
@@ -334,10 +471,17 @@ static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node)
}
RB_CLEAR_NODE(&tg->rb_node);
- tg->bps[READ] = -1;
- tg->bps[WRITE] = -1;
- tg->iops[READ] = -1;
- tg->iops[WRITE] = -1;
+ tg->bps[READ][LIMIT_MAX] = U64_MAX;
+ tg->bps[WRITE][LIMIT_MAX] = U64_MAX;
+ tg->iops[READ][LIMIT_MAX] = UINT_MAX;
+ tg->iops[WRITE][LIMIT_MAX] = UINT_MAX;
+ tg->bps_conf[READ][LIMIT_MAX] = U64_MAX;
+ tg->bps_conf[WRITE][LIMIT_MAX] = U64_MAX;
+ tg->iops_conf[READ][LIMIT_MAX] = UINT_MAX;
+ tg->iops_conf[WRITE][LIMIT_MAX] = UINT_MAX;
+ /* LIMIT_LOW will have default value 0 */
+
+ tg->latency_target = DFL_LATENCY_TARGET;
return &tg->pd;
}
@@ -366,6 +510,8 @@ static void throtl_pd_init(struct blkg_policy_data *pd)
if (cgroup_subsys_on_dfl(io_cgrp_subsys) && blkg->parent)
sq->parent_sq = &blkg_to_tg(blkg->parent)->service_queue;
tg->td = td;
+
+ tg->idletime_threshold = td->dft_idletime_threshold;
}
/*
@@ -376,20 +522,59 @@ static void throtl_pd_init(struct blkg_policy_data *pd)
static void tg_update_has_rules(struct throtl_grp *tg)
{
struct throtl_grp *parent_tg = sq_to_tg(tg->service_queue.parent_sq);
+ struct throtl_data *td = tg->td;
int rw;
for (rw = READ; rw <= WRITE; rw++)
tg->has_rules[rw] = (parent_tg && parent_tg->has_rules[rw]) ||
- (tg->bps[rw] != -1 || tg->iops[rw] != -1);
+ (td->limit_valid[td->limit_index] &&
+ (tg_bps_limit(tg, rw) != U64_MAX ||
+ tg_iops_limit(tg, rw) != UINT_MAX));
}
static void throtl_pd_online(struct blkg_policy_data *pd)
{
+ struct throtl_grp *tg = pd_to_tg(pd);
/*
* We don't want new groups to escape the limits of its ancestors.
* Update has_rules[] after a new group is brought online.
*/
- tg_update_has_rules(pd_to_tg(pd));
+ tg_update_has_rules(tg);
+}
+
+static void blk_throtl_update_limit_valid(struct throtl_data *td)
+{
+ struct cgroup_subsys_state *pos_css;
+ struct blkcg_gq *blkg;
+ bool low_valid = false;
+
+ rcu_read_lock();
+ blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) {
+ struct throtl_grp *tg = blkg_to_tg(blkg);
+
+ if (tg->bps[READ][LIMIT_LOW] || tg->bps[WRITE][LIMIT_LOW] ||
+ tg->iops[READ][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW])
+ low_valid = true;
+ }
+ rcu_read_unlock();
+
+ td->limit_valid[LIMIT_LOW] = low_valid;
+}
+
+static void throtl_upgrade_state(struct throtl_data *td);
+static void throtl_pd_offline(struct blkg_policy_data *pd)
+{
+ struct throtl_grp *tg = pd_to_tg(pd);
+
+ tg->bps[READ][LIMIT_LOW] = 0;
+ tg->bps[WRITE][LIMIT_LOW] = 0;
+ tg->iops[READ][LIMIT_LOW] = 0;
+ tg->iops[WRITE][LIMIT_LOW] = 0;
+
+ blk_throtl_update_limit_valid(tg->td);
+
+ if (!tg->td->limit_valid[tg->td->limit_index])
+ throtl_upgrade_state(tg->td);
}
static void throtl_pd_free(struct blkg_policy_data *pd)
@@ -499,6 +684,17 @@ static void throtl_dequeue_tg(struct throtl_grp *tg)
static void throtl_schedule_pending_timer(struct throtl_service_queue *sq,
unsigned long expires)
{
+ unsigned long max_expire = jiffies + 8 * sq_to_tg(sq)->td->throtl_slice;
+
+ /*
+ * Since we are adjusting the throttle limit dynamically, the sleep
+ * time calculated according to previous limit might be invalid. It's
+ * possible the cgroup sleep time is very long and no other cgroups
+ * have IO running so notify the limit changes. Make sure the cgroup
+ * doesn't sleep too long to avoid the missed notification.
+ */
+ if (time_after(expires, max_expire))
+ expires = max_expire;
mod_timer(&sq->pending_timer, expires);
throtl_log(sq, "schedule timer. delay=%lu jiffies=%lu",
expires - jiffies, jiffies);
@@ -556,7 +752,7 @@ static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg,
if (time_after_eq(start, tg->slice_start[rw]))
tg->slice_start[rw] = start;
- tg->slice_end[rw] = jiffies + throtl_slice;
+ tg->slice_end[rw] = jiffies + tg->td->throtl_slice;
throtl_log(&tg->service_queue,
"[%c] new slice with credit start=%lu end=%lu jiffies=%lu",
rw == READ ? 'R' : 'W', tg->slice_start[rw],
@@ -568,7 +764,7 @@ static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw)
tg->bytes_disp[rw] = 0;
tg->io_disp[rw] = 0;
tg->slice_start[rw] = jiffies;
- tg->slice_end[rw] = jiffies + throtl_slice;
+ tg->slice_end[rw] = jiffies + tg->td->throtl_slice;
throtl_log(&tg->service_queue,
"[%c] new slice start=%lu end=%lu jiffies=%lu",
rw == READ ? 'R' : 'W', tg->slice_start[rw],
@@ -578,13 +774,13 @@ static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw)
static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw,
unsigned long jiffy_end)
{
- tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
+ tg->slice_end[rw] = roundup(jiffy_end, tg->td->throtl_slice);
}
static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw,
unsigned long jiffy_end)
{
- tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
+ tg->slice_end[rw] = roundup(jiffy_end, tg->td->throtl_slice);
throtl_log(&tg->service_queue,
"[%c] extend slice start=%lu end=%lu jiffies=%lu",
rw == READ ? 'R' : 'W', tg->slice_start[rw],
@@ -624,19 +820,20 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw)
* is bad because it does not allow new slice to start.
*/
- throtl_set_slice_end(tg, rw, jiffies + throtl_slice);
+ throtl_set_slice_end(tg, rw, jiffies + tg->td->throtl_slice);
time_elapsed = jiffies - tg->slice_start[rw];
- nr_slices = time_elapsed / throtl_slice;
+ nr_slices = time_elapsed / tg->td->throtl_slice;
if (!nr_slices)
return;
- tmp = tg->bps[rw] * throtl_slice * nr_slices;
+ tmp = tg_bps_limit(tg, rw) * tg->td->throtl_slice * nr_slices;
do_div(tmp, HZ);
bytes_trim = tmp;
- io_trim = (tg->iops[rw] * throtl_slice * nr_slices)/HZ;
+ io_trim = (tg_iops_limit(tg, rw) * tg->td->throtl_slice * nr_slices) /
+ HZ;
if (!bytes_trim && !io_trim)
return;
@@ -651,7 +848,7 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw)
else
tg->io_disp[rw] = 0;
- tg->slice_start[rw] += nr_slices * throtl_slice;
+ tg->slice_start[rw] += nr_slices * tg->td->throtl_slice;
throtl_log(&tg->service_queue,
"[%c] trim slice nr=%lu bytes=%llu io=%lu start=%lu end=%lu jiffies=%lu",
@@ -671,9 +868,9 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
/* Slice has just started. Consider one slice interval */
if (!jiffy_elapsed)
- jiffy_elapsed_rnd = throtl_slice;
+ jiffy_elapsed_rnd = tg->td->throtl_slice;
- jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);
+ jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice);
/*
* jiffy_elapsed_rnd should not be a big value as minimum iops can be
@@ -682,7 +879,7 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
* have been trimmed.
*/
- tmp = (u64)tg->iops[rw] * jiffy_elapsed_rnd;
+ tmp = (u64)tg_iops_limit(tg, rw) * jiffy_elapsed_rnd;
do_div(tmp, HZ);
if (tmp > UINT_MAX)
@@ -697,7 +894,7 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
}
/* Calc approx time to dispatch */
- jiffy_wait = ((tg->io_disp[rw] + 1) * HZ)/tg->iops[rw] + 1;
+ jiffy_wait = ((tg->io_disp[rw] + 1) * HZ) / tg_iops_limit(tg, rw) + 1;
if (jiffy_wait > jiffy_elapsed)
jiffy_wait = jiffy_wait - jiffy_elapsed;
@@ -720,11 +917,11 @@ static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio,
/* Slice has just started. Consider one slice interval */
if (!jiffy_elapsed)
- jiffy_elapsed_rnd = throtl_slice;
+ jiffy_elapsed_rnd = tg->td->throtl_slice;
- jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);
+ jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice);
- tmp = tg->bps[rw] * jiffy_elapsed_rnd;
+ tmp = tg_bps_limit(tg, rw) * jiffy_elapsed_rnd;
do_div(tmp, HZ);
bytes_allowed = tmp;
@@ -736,7 +933,7 @@ static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio,
/* Calc approx time to dispatch */
extra_bytes = tg->bytes_disp[rw] + bio->bi_iter.bi_size - bytes_allowed;
- jiffy_wait = div64_u64(extra_bytes * HZ, tg->bps[rw]);
+ jiffy_wait = div64_u64(extra_bytes * HZ, tg_bps_limit(tg, rw));
if (!jiffy_wait)
jiffy_wait = 1;
@@ -771,7 +968,8 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
bio != throtl_peek_queued(&tg->service_queue.queued[rw]));
/* If tg->bps = -1, then BW is unlimited */
- if (tg->bps[rw] == -1 && tg->iops[rw] == -1) {
+ if (tg_bps_limit(tg, rw) == U64_MAX &&
+ tg_iops_limit(tg, rw) == UINT_MAX) {
if (wait)
*wait = 0;
return true;
@@ -787,8 +985,10 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
if (throtl_slice_used(tg, rw) && !(tg->service_queue.nr_queued[rw]))
throtl_start_new_slice(tg, rw);
else {
- if (time_before(tg->slice_end[rw], jiffies + throtl_slice))
- throtl_extend_slice(tg, rw, jiffies + throtl_slice);
+ if (time_before(tg->slice_end[rw],
+ jiffies + tg->td->throtl_slice))
+ throtl_extend_slice(tg, rw,
+ jiffies + tg->td->throtl_slice);
}
if (tg_with_in_bps_limit(tg, bio, &bps_wait) &&
@@ -816,6 +1016,8 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
/* Charge the bio to the group */
tg->bytes_disp[rw] += bio->bi_iter.bi_size;
tg->io_disp[rw]++;
+ tg->last_bytes_disp[rw] += bio->bi_iter.bi_size;
+ tg->last_io_disp[rw]++;
/*
* BIO_THROTTLED is used to prevent the same bio to be throttled
@@ -999,6 +1201,8 @@ static int throtl_select_dispatch(struct throtl_service_queue *parent_sq)
return nr_disp;
}
+static bool throtl_can_upgrade(struct throtl_data *td,
+ struct throtl_grp *this_tg);
/**
* throtl_pending_timer_fn - timer function for service_queue->pending_timer
* @arg: the throtl_service_queue being serviced
@@ -1025,6 +1229,9 @@ static void throtl_pending_timer_fn(unsigned long arg)
int ret;
spin_lock_irq(q->queue_lock);
+ if (throtl_can_upgrade(td, NULL))
+ throtl_upgrade_state(td);
+
again:
parent_sq = sq->parent_sq;
dispatched = false;
@@ -1112,7 +1319,7 @@ static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd,
struct throtl_grp *tg = pd_to_tg(pd);
u64 v = *(u64 *)((void *)tg + off);
- if (v == -1)
+ if (v == U64_MAX)
return 0;
return __blkg_prfill_u64(sf, pd, v);
}
@@ -1123,7 +1330,7 @@ static u64 tg_prfill_conf_uint(struct seq_file *sf, struct blkg_policy_data *pd,
struct throtl_grp *tg = pd_to_tg(pd);
unsigned int v = *(unsigned int *)((void *)tg + off);
- if (v == -1)
+ if (v == UINT_MAX)
return 0;
return __blkg_prfill_u64(sf, pd, v);
}
@@ -1150,8 +1357,8 @@ static void tg_conf_updated(struct throtl_grp *tg)
throtl_log(&tg->service_queue,
"limit change rbps=%llu wbps=%llu riops=%u wiops=%u",
- tg->bps[READ], tg->bps[WRITE],
- tg->iops[READ], tg->iops[WRITE]);
+ tg_bps_limit(tg, READ), tg_bps_limit(tg, WRITE),
+ tg_iops_limit(tg, READ), tg_iops_limit(tg, WRITE));
/*
* Update has_rules[] flags for the updated tg's subtree. A tg is
@@ -1197,7 +1404,7 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of,
if (sscanf(ctx.body, "%llu", &v) != 1)
goto out_finish;
if (!v)
- v = -1;
+ v = U64_MAX;
tg = blkg_to_tg(ctx.blkg);
@@ -1228,25 +1435,25 @@ static ssize_t tg_set_conf_uint(struct kernfs_open_file *of,
static struct cftype throtl_legacy_files[] = {
{
.name = "throttle.read_bps_device",
- .private = offsetof(struct throtl_grp, bps[READ]),
+ .private = offsetof(struct throtl_grp, bps[READ][LIMIT_MAX]),
.seq_show = tg_print_conf_u64,
.write = tg_set_conf_u64,
},
{
.name = "throttle.write_bps_device",
- .private = offsetof(struct throtl_grp, bps[WRITE]),
+ .private = offsetof(struct throtl_grp, bps[WRITE][LIMIT_MAX]),
.seq_show = tg_print_conf_u64,
.write = tg_set_conf_u64,
},
{
.name = "throttle.read_iops_device",
- .private = offsetof(struct throtl_grp, iops[READ]),
+ .private = offsetof(struct throtl_grp, iops[READ][LIMIT_MAX]),
.seq_show = tg_print_conf_uint,
.write = tg_set_conf_uint,
},
{
.name = "throttle.write_iops_device",
- .private = offsetof(struct throtl_grp, iops[WRITE]),
+ .private = offsetof(struct throtl_grp, iops[WRITE][LIMIT_MAX]),
.seq_show = tg_print_conf_uint,
.write = tg_set_conf_uint,
},
@@ -1263,48 +1470,87 @@ static struct cftype throtl_legacy_files[] = {
{ } /* terminate */
};
-static u64 tg_prfill_max(struct seq_file *sf, struct blkg_policy_data *pd,
+static u64 tg_prfill_limit(struct seq_file *sf, struct blkg_policy_data *pd,
int off)
{
struct throtl_grp *tg = pd_to_tg(pd);
const char *dname = blkg_dev_name(pd->blkg);
char bufs[4][21] = { "max", "max", "max", "max" };
+ u64 bps_dft;
+ unsigned int iops_dft;
+ char idle_time[26] = "";
+ char latency_time[26] = "";
if (!dname)
return 0;
- if (tg->bps[READ] == -1 && tg->bps[WRITE] == -1 &&
- tg->iops[READ] == -1 && tg->iops[WRITE] == -1)
+
+ if (off == LIMIT_LOW) {
+ bps_dft = 0;
+ iops_dft = 0;
+ } else {
+ bps_dft = U64_MAX;
+ iops_dft = UINT_MAX;
+ }
+
+ if (tg->bps_conf[READ][off] == bps_dft &&
+ tg->bps_conf[WRITE][off] == bps_dft &&
+ tg->iops_conf[READ][off] == iops_dft &&
+ tg->iops_conf[WRITE][off] == iops_dft &&
+ (off != LIMIT_LOW ||
+ (tg->idletime_threshold == tg->td->dft_idletime_threshold &&
+ tg->latency_target == DFL_LATENCY_TARGET)))
return 0;
- if (tg->bps[READ] != -1)
- snprintf(bufs[0], sizeof(bufs[0]), "%llu", tg->bps[READ]);
- if (tg->bps[WRITE] != -1)
- snprintf(bufs[1], sizeof(bufs[1]), "%llu", tg->bps[WRITE]);
- if (tg->iops[READ] != -1)
- snprintf(bufs[2], sizeof(bufs[2]), "%u", tg->iops[READ]);
- if (tg->iops[WRITE] != -1)
- snprintf(bufs[3], sizeof(bufs[3]), "%u", tg->iops[WRITE]);
-
- seq_printf(sf, "%s rbps=%s wbps=%s riops=%s wiops=%s\n",
- dname, bufs[0], bufs[1], bufs[2], bufs[3]);
+ if (tg->bps_conf[READ][off] != bps_dft)
+ snprintf(bufs[0], sizeof(bufs[0]), "%llu",
+ tg->bps_conf[READ][off]);
+ if (tg->bps_conf[WRITE][off] != bps_dft)
+ snprintf(bufs[1], sizeof(bufs[1]), "%llu",
+ tg->bps_conf[WRITE][off]);
+ if (tg->iops_conf[READ][off] != iops_dft)
+ snprintf(bufs[2], sizeof(bufs[2]), "%u",
+ tg->iops_conf[READ][off]);
+ if (tg->iops_conf[WRITE][off] != iops_dft)
+ snprintf(bufs[3], sizeof(bufs[3]), "%u",
+ tg->iops_conf[WRITE][off]);
+ if (off == LIMIT_LOW) {
+ if (tg->idletime_threshold == ULONG_MAX)
+ strcpy(idle_time, " idle=max");
+ else
+ snprintf(idle_time, sizeof(idle_time), " idle=%lu",
+ tg->idletime_threshold);
+
+ if (tg->latency_target == ULONG_MAX)
+ strcpy(latency_time, " latency=max");
+ else
+ snprintf(latency_time, sizeof(latency_time),
+ " latency=%lu", tg->latency_target);
+ }
+
+ seq_printf(sf, "%s rbps=%s wbps=%s riops=%s wiops=%s%s%s\n",
+ dname, bufs[0], bufs[1], bufs[2], bufs[3], idle_time,
+ latency_time);
return 0;
}
-static int tg_print_max(struct seq_file *sf, void *v)
+static int tg_print_limit(struct seq_file *sf, void *v)
{
- blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_max,
+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_limit,
&blkcg_policy_throtl, seq_cft(sf)->private, false);
return 0;
}
-static ssize_t tg_set_max(struct kernfs_open_file *of,
+static ssize_t tg_set_limit(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
struct blkcg *blkcg = css_to_blkcg(of_css(of));
struct blkg_conf_ctx ctx;
struct throtl_grp *tg;
u64 v[4];
+ unsigned long idle_time;
+ unsigned long latency_time;
int ret;
+ int index = of_cft(of)->private;
ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
if (ret)
@@ -1312,15 +1558,17 @@ static ssize_t tg_set_max(struct kernfs_open_file *of,
tg = blkg_to_tg(ctx.blkg);
- v[0] = tg->bps[READ];
- v[1] = tg->bps[WRITE];
- v[2] = tg->iops[READ];
- v[3] = tg->iops[WRITE];
+ v[0] = tg->bps_conf[READ][index];
+ v[1] = tg->bps_conf[WRITE][index];
+ v[2] = tg->iops_conf[READ][index];
+ v[3] = tg->iops_conf[WRITE][index];
+ idle_time = tg->idletime_threshold;
+ latency_time = tg->latency_target;
while (true) {
char tok[27]; /* wiops=18446744073709551616 */
char *p;
- u64 val = -1;
+ u64 val = U64_MAX;
int len;
if (sscanf(ctx.body, "%26s%n", tok, &len) != 1)
@@ -1348,15 +1596,43 @@ static ssize_t tg_set_max(struct kernfs_open_file *of,
v[2] = min_t(u64, val, UINT_MAX);
else if (!strcmp(tok, "wiops"))
v[3] = min_t(u64, val, UINT_MAX);
+ else if (off == LIMIT_LOW && !strcmp(tok, "idle"))
+ idle_time = val;
+ else if (off == LIMIT_LOW && !strcmp(tok, "latency"))
+ latency_time = val;
else
goto out_finish;
}
- tg->bps[READ] = v[0];
- tg->bps[WRITE] = v[1];
- tg->iops[READ] = v[2];
- tg->iops[WRITE] = v[3];
+ tg->bps_conf[READ][index] = v[0];
+ tg->bps_conf[WRITE][index] = v[1];
+ tg->iops_conf[READ][index] = v[2];
+ tg->iops_conf[WRITE][index] = v[3];
+ if (index == LIMIT_MAX) {
+ tg->bps[READ][index] = v[0];
+ tg->bps[WRITE][index] = v[1];
+ tg->iops[READ][index] = v[2];
+ tg->iops[WRITE][index] = v[3];
+ }
+ tg->bps[READ][LIMIT_LOW] = min(tg->bps_conf[READ][LIMIT_LOW],
+ tg->bps_conf[READ][LIMIT_MAX]);
+ tg->bps[WRITE][LIMIT_LOW] = min(tg->bps_conf[WRITE][LIMIT_LOW],
+ tg->bps_conf[WRITE][LIMIT_MAX]);
+ tg->iops[READ][LIMIT_LOW] = min(tg->iops_conf[READ][LIMIT_LOW],
+ tg->iops_conf[READ][LIMIT_MAX]);
+ tg->iops[WRITE][LIMIT_LOW] = min(tg->iops_conf[WRITE][LIMIT_LOW],
+ tg->iops_conf[WRITE][LIMIT_MAX]);
+
+ if (index == LIMIT_LOW) {
+ blk_throtl_update_limit_valid(tg->td);
+ if (tg->td->limit_valid[LIMIT_LOW])
+ tg->td->limit_index = LIMIT_LOW;
+ tg->idletime_threshold = (idle_time == ULONG_MAX) ?
+ ULONG_MAX : idle_time;
+ tg->latency_target = (latency_time == ULONG_MAX) ?
+ ULONG_MAX : latency_time;
+ }
tg_conf_updated(tg);
ret = 0;
out_finish:
@@ -1365,11 +1641,21 @@ out_finish:
}
static struct cftype throtl_files[] = {
+#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
+ {
+ .name = "low",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = tg_print_limit,
+ .write = tg_set_limit,
+ .private = LIMIT_LOW,
+ },
+#endif
{
.name = "max",
.flags = CFTYPE_NOT_ON_ROOT,
- .seq_show = tg_print_max,
- .write = tg_set_max,
+ .seq_show = tg_print_limit,
+ .write = tg_set_limit,
+ .private = LIMIT_MAX,
},
{ } /* terminate */
};
@@ -1388,9 +1674,376 @@ static struct blkcg_policy blkcg_policy_throtl = {
.pd_alloc_fn = throtl_pd_alloc,
.pd_init_fn = throtl_pd_init,
.pd_online_fn = throtl_pd_online,
+ .pd_offline_fn = throtl_pd_offline,
.pd_free_fn = throtl_pd_free,
};
+static unsigned long __tg_last_low_overflow_time(struct throtl_grp *tg)
+{
+ unsigned long rtime = jiffies, wtime = jiffies;
+
+ if (tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW])
+ rtime = tg->last_low_overflow_time[READ];
+ if (tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW])
+ wtime = tg->last_low_overflow_time[WRITE];
+ return min(rtime, wtime);
+}
+
+/* tg should not be an intermediate node */
+static unsigned long tg_last_low_overflow_time(struct throtl_grp *tg)
+{
+ struct throtl_service_queue *parent_sq;
+ struct throtl_grp *parent = tg;
+ unsigned long ret = __tg_last_low_overflow_time(tg);
+
+ while (true) {
+ parent_sq = parent->service_queue.parent_sq;
+ parent = sq_to_tg(parent_sq);
+ if (!parent)
+ break;
+
+ /*
+ * The parent doesn't have low limit, it always reaches low
+ * limit. Its overflow time is useless for children
+ */
+ if (!parent->bps[READ][LIMIT_LOW] &&
+ !parent->iops[READ][LIMIT_LOW] &&
+ !parent->bps[WRITE][LIMIT_LOW] &&
+ !parent->iops[WRITE][LIMIT_LOW])
+ continue;
+ if (time_after(__tg_last_low_overflow_time(parent), ret))
+ ret = __tg_last_low_overflow_time(parent);
+ }
+ return ret;
+}
+
+static bool throtl_tg_is_idle(struct throtl_grp *tg)
+{
+ /*
+ * cgroup is idle if:
+ * - single idle is too long, longer than a fixed value (in case user
+ * configure a too big threshold) or 4 times of slice
+ * - average think time is more than threshold
+ * - IO latency is largely below threshold
+ */
+ unsigned long time = jiffies_to_usecs(4 * tg->td->throtl_slice);
+
+ time = min_t(unsigned long, MAX_IDLE_TIME, time);
+ return (ktime_get_ns() >> 10) - tg->last_finish_time > time ||
+ tg->avg_idletime > tg->idletime_threshold ||
+ (tg->latency_target && tg->bio_cnt &&
+ tg->bad_bio_cnt * 5 < tg->bio_cnt);
+}
+
+static bool throtl_tg_can_upgrade(struct throtl_grp *tg)
+{
+ struct throtl_service_queue *sq = &tg->service_queue;
+ bool read_limit, write_limit;
+
+ /*
+ * if cgroup reaches low limit (if low limit is 0, the cgroup always
+ * reaches), it's ok to upgrade to next limit
+ */
+ read_limit = tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW];
+ write_limit = tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW];
+ if (!read_limit && !write_limit)
+ return true;
+ if (read_limit && sq->nr_queued[READ] &&
+ (!write_limit || sq->nr_queued[WRITE]))
+ return true;
+ if (write_limit && sq->nr_queued[WRITE] &&
+ (!read_limit || sq->nr_queued[READ]))
+ return true;
+
+ if (time_after_eq(jiffies,
+ tg_last_low_overflow_time(tg) + tg->td->throtl_slice) &&
+ throtl_tg_is_idle(tg))
+ return true;
+ return false;
+}
+
+static bool throtl_hierarchy_can_upgrade(struct throtl_grp *tg)
+{
+ while (true) {
+ if (throtl_tg_can_upgrade(tg))
+ return true;
+ tg = sq_to_tg(tg->service_queue.parent_sq);
+ if (!tg || !tg_to_blkg(tg)->parent)
+ return false;
+ }
+ return false;
+}
+
+static bool throtl_can_upgrade(struct throtl_data *td,
+ struct throtl_grp *this_tg)
+{
+ struct cgroup_subsys_state *pos_css;
+ struct blkcg_gq *blkg;
+
+ if (td->limit_index != LIMIT_LOW)
+ return false;
+
+ if (time_before(jiffies, td->low_downgrade_time + td->throtl_slice))
+ return false;
+
+ rcu_read_lock();
+ blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) {
+ struct throtl_grp *tg = blkg_to_tg(blkg);
+
+ if (tg == this_tg)
+ continue;
+ if (!list_empty(&tg_to_blkg(tg)->blkcg->css.children))
+ continue;
+ if (!throtl_hierarchy_can_upgrade(tg)) {
+ rcu_read_unlock();
+ return false;
+ }
+ }
+ rcu_read_unlock();
+ return true;
+}
+
+static void throtl_upgrade_check(struct throtl_grp *tg)
+{
+ unsigned long now = jiffies;
+
+ if (tg->td->limit_index != LIMIT_LOW)
+ return;
+
+ if (time_after(tg->last_check_time + tg->td->throtl_slice, now))
+ return;
+
+ tg->last_check_time = now;
+
+ if (!time_after_eq(now,
+ __tg_last_low_overflow_time(tg) + tg->td->throtl_slice))
+ return;
+
+ if (throtl_can_upgrade(tg->td, NULL))
+ throtl_upgrade_state(tg->td);
+}
+
+static void throtl_upgrade_state(struct throtl_data *td)
+{
+ struct cgroup_subsys_state *pos_css;
+ struct blkcg_gq *blkg;
+
+ td->limit_index = LIMIT_MAX;
+ td->low_upgrade_time = jiffies;
+ td->scale = 0;
+ rcu_read_lock();
+ blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) {
+ struct throtl_grp *tg = blkg_to_tg(blkg);
+ struct throtl_service_queue *sq = &tg->service_queue;
+
+ tg->disptime = jiffies - 1;
+ throtl_select_dispatch(sq);
+ throtl_schedule_next_dispatch(sq, false);
+ }
+ rcu_read_unlock();
+ throtl_select_dispatch(&td->service_queue);
+ throtl_schedule_next_dispatch(&td->service_queue, false);
+ queue_work(kthrotld_workqueue, &td->dispatch_work);
+}
+
+static void throtl_downgrade_state(struct throtl_data *td, int new)
+{
+ td->scale /= 2;
+
+ if (td->scale) {
+ td->low_upgrade_time = jiffies - td->scale * td->throtl_slice;
+ return;
+ }
+
+ td->limit_index = new;
+ td->low_downgrade_time = jiffies;
+}
+
+static bool throtl_tg_can_downgrade(struct throtl_grp *tg)
+{
+ struct throtl_data *td = tg->td;
+ unsigned long now = jiffies;
+
+ /*
+ * If cgroup is below low limit, consider downgrade and throttle other
+ * cgroups
+ */
+ if (time_after_eq(now, td->low_upgrade_time + td->throtl_slice) &&
+ time_after_eq(now, tg_last_low_overflow_time(tg) +
+ td->throtl_slice) &&
+ (!throtl_tg_is_idle(tg) ||
+ !list_empty(&tg_to_blkg(tg)->blkcg->css.children)))
+ return true;
+ return false;
+}
+
+static bool throtl_hierarchy_can_downgrade(struct throtl_grp *tg)
+{
+ while (true) {
+ if (!throtl_tg_can_downgrade(tg))
+ return false;
+ tg = sq_to_tg(tg->service_queue.parent_sq);
+ if (!tg || !tg_to_blkg(tg)->parent)
+ break;
+ }
+ return true;
+}
+
+static void throtl_downgrade_check(struct throtl_grp *tg)
+{
+ uint64_t bps;
+ unsigned int iops;
+ unsigned long elapsed_time;
+ unsigned long now = jiffies;
+
+ if (tg->td->limit_index != LIMIT_MAX ||
+ !tg->td->limit_valid[LIMIT_LOW])
+ return;
+ if (!list_empty(&tg_to_blkg(tg)->blkcg->css.children))
+ return;
+ if (time_after(tg->last_check_time + tg->td->throtl_slice, now))
+ return;
+
+ elapsed_time = now - tg->last_check_time;
+ tg->last_check_time = now;
+
+ if (time_before(now, tg_last_low_overflow_time(tg) +
+ tg->td->throtl_slice))
+ return;
+
+ if (tg->bps[READ][LIMIT_LOW]) {
+ bps = tg->last_bytes_disp[READ] * HZ;
+ do_div(bps, elapsed_time);
+ if (bps >= tg->bps[READ][LIMIT_LOW])
+ tg->last_low_overflow_time[READ] = now;
+ }
+
+ if (tg->bps[WRITE][LIMIT_LOW]) {
+ bps = tg->last_bytes_disp[WRITE] * HZ;
+ do_div(bps, elapsed_time);
+ if (bps >= tg->bps[WRITE][LIMIT_LOW])
+ tg->last_low_overflow_time[WRITE] = now;
+ }
+
+ if (tg->iops[READ][LIMIT_LOW]) {
+ iops = tg->last_io_disp[READ] * HZ / elapsed_time;
+ if (iops >= tg->iops[READ][LIMIT_LOW])
+ tg->last_low_overflow_time[READ] = now;
+ }
+
+ if (tg->iops[WRITE][LIMIT_LOW]) {
+ iops = tg->last_io_disp[WRITE] * HZ / elapsed_time;
+ if (iops >= tg->iops[WRITE][LIMIT_LOW])
+ tg->last_low_overflow_time[WRITE] = now;
+ }
+
+ /*
+ * If cgroup is below low limit, consider downgrade and throttle other
+ * cgroups
+ */
+ if (throtl_hierarchy_can_downgrade(tg))
+ throtl_downgrade_state(tg->td, LIMIT_LOW);
+
+ tg->last_bytes_disp[READ] = 0;
+ tg->last_bytes_disp[WRITE] = 0;
+ tg->last_io_disp[READ] = 0;
+ tg->last_io_disp[WRITE] = 0;
+}
+
+static void blk_throtl_update_idletime(struct throtl_grp *tg)
+{
+ unsigned long now = ktime_get_ns() >> 10;
+ unsigned long last_finish_time = tg->last_finish_time;
+
+ if (now <= last_finish_time || last_finish_time == 0 ||
+ last_finish_time == tg->checked_last_finish_time)
+ return;
+
+ tg->avg_idletime = (tg->avg_idletime * 7 + now - last_finish_time) >> 3;
+ tg->checked_last_finish_time = last_finish_time;
+}
+
+#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
+static void throtl_update_latency_buckets(struct throtl_data *td)
+{
+ struct avg_latency_bucket avg_latency[LATENCY_BUCKET_SIZE];
+ int i, cpu;
+ unsigned long last_latency = 0;
+ unsigned long latency;
+
+ if (!blk_queue_nonrot(td->queue))
+ return;
+ if (time_before(jiffies, td->last_calculate_time + HZ))
+ return;
+ td->last_calculate_time = jiffies;
+
+ memset(avg_latency, 0, sizeof(avg_latency));
+ for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
+ struct latency_bucket *tmp = &td->tmp_buckets[i];
+
+ for_each_possible_cpu(cpu) {
+ struct latency_bucket *bucket;
+
+ /* this isn't race free, but ok in practice */
+ bucket = per_cpu_ptr(td->latency_buckets, cpu);
+ tmp->total_latency += bucket[i].total_latency;
+ tmp->samples += bucket[i].samples;
+ bucket[i].total_latency = 0;
+ bucket[i].samples = 0;
+ }
+
+ if (tmp->samples >= 32) {
+ int samples = tmp->samples;
+
+ latency = tmp->total_latency;
+
+ tmp->total_latency = 0;
+ tmp->samples = 0;
+ latency /= samples;
+ if (latency == 0)
+ continue;
+ avg_latency[i].latency = latency;
+ }
+ }
+
+ for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
+ if (!avg_latency[i].latency) {
+ if (td->avg_buckets[i].latency < last_latency)
+ td->avg_buckets[i].latency = last_latency;
+ continue;
+ }
+
+ if (!td->avg_buckets[i].valid)
+ latency = avg_latency[i].latency;
+ else
+ latency = (td->avg_buckets[i].latency * 7 +
+ avg_latency[i].latency) >> 3;
+
+ td->avg_buckets[i].latency = max(latency, last_latency);
+ td->avg_buckets[i].valid = true;
+ last_latency = td->avg_buckets[i].latency;
+ }
+}
+#else
+static inline void throtl_update_latency_buckets(struct throtl_data *td)
+{
+}
+#endif
+
+static void blk_throtl_assoc_bio(struct throtl_grp *tg, struct bio *bio)
+{
+#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
+ int ret;
+
+ ret = bio_associate_current(bio);
+ if (ret == 0 || ret == -EBUSY)
+ bio->bi_cg_private = tg;
+ blk_stat_set_issue(&bio->bi_issue_stat, bio_sectors(bio));
+#else
+ bio_associate_current(bio);
+#endif
+}
+
bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
struct bio *bio)
{
@@ -1399,6 +2052,7 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
struct throtl_service_queue *sq;
bool rw = bio_data_dir(bio);
bool throttled = false;
+ struct throtl_data *td = tg->td;
WARN_ON_ONCE(!rcu_read_lock_held());
@@ -1408,19 +2062,35 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
spin_lock_irq(q->queue_lock);
+ throtl_update_latency_buckets(td);
+
if (unlikely(blk_queue_bypass(q)))
goto out_unlock;
+ blk_throtl_assoc_bio(tg, bio);
+ blk_throtl_update_idletime(tg);
+
sq = &tg->service_queue;
+again:
while (true) {
+ if (tg->last_low_overflow_time[rw] == 0)
+ tg->last_low_overflow_time[rw] = jiffies;
+ throtl_downgrade_check(tg);
+ throtl_upgrade_check(tg);
/* throtl is FIFO - if bios are already queued, should queue */
if (sq->nr_queued[rw])
break;
/* if above limits, break to queue */
- if (!tg_may_dispatch(tg, bio, NULL))
+ if (!tg_may_dispatch(tg, bio, NULL)) {
+ tg->last_low_overflow_time[rw] = jiffies;
+ if (throtl_can_upgrade(td, tg)) {
+ throtl_upgrade_state(td);
+ goto again;
+ }
break;
+ }
/* within limits, let's charge and dispatch directly */
throtl_charge_bio(tg, bio);
@@ -1453,12 +2123,14 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
/* out-of-limit, queue to @tg */
throtl_log(sq, "[%c] bio. bdisp=%llu sz=%u bps=%llu iodisp=%u iops=%u queued=%d/%d",
rw == READ ? 'R' : 'W',
- tg->bytes_disp[rw], bio->bi_iter.bi_size, tg->bps[rw],
- tg->io_disp[rw], tg->iops[rw],
+ tg->bytes_disp[rw], bio->bi_iter.bi_size,
+ tg_bps_limit(tg, rw),
+ tg->io_disp[rw], tg_iops_limit(tg, rw),
sq->nr_queued[READ], sq->nr_queued[WRITE]);
- bio_associate_current(bio);
- tg->td->nr_queued[rw]++;
+ tg->last_low_overflow_time[rw] = jiffies;
+
+ td->nr_queued[rw]++;
throtl_add_bio_tg(bio, qn, tg);
throttled = true;
@@ -1483,9 +2155,94 @@ out:
*/
if (!throttled)
bio_clear_flag(bio, BIO_THROTTLED);
+
+#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
+ if (throttled || !td->track_bio_latency)
+ bio->bi_issue_stat.stat |= SKIP_LATENCY;
+#endif
return throttled;
}
+#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
+static void throtl_track_latency(struct throtl_data *td, sector_t size,
+ int op, unsigned long time)
+{
+ struct latency_bucket *latency;
+ int index;
+
+ if (!td || td->limit_index != LIMIT_LOW || op != REQ_OP_READ ||
+ !blk_queue_nonrot(td->queue))
+ return;
+
+ index = request_bucket_index(size);
+
+ latency = get_cpu_ptr(td->latency_buckets);
+ latency[index].total_latency += time;
+ latency[index].samples++;
+ put_cpu_ptr(td->latency_buckets);
+}
+
+void blk_throtl_stat_add(struct request *rq, u64 time_ns)
+{
+ struct request_queue *q = rq->q;
+ struct throtl_data *td = q->td;
+
+ throtl_track_latency(td, blk_stat_size(&rq->issue_stat),
+ req_op(rq), time_ns >> 10);
+}
+
+void blk_throtl_bio_endio(struct bio *bio)
+{
+ struct throtl_grp *tg;
+ u64 finish_time_ns;
+ unsigned long finish_time;
+ unsigned long start_time;
+ unsigned long lat;
+
+ tg = bio->bi_cg_private;
+ if (!tg)
+ return;
+ bio->bi_cg_private = NULL;
+
+ finish_time_ns = ktime_get_ns();
+ tg->last_finish_time = finish_time_ns >> 10;
+
+ start_time = blk_stat_time(&bio->bi_issue_stat) >> 10;
+ finish_time = __blk_stat_time(finish_time_ns) >> 10;
+ if (!start_time || finish_time <= start_time)
+ return;
+
+ lat = finish_time - start_time;
+ /* this is only for bio based driver */
+ if (!(bio->bi_issue_stat.stat & SKIP_LATENCY))
+ throtl_track_latency(tg->td, blk_stat_size(&bio->bi_issue_stat),
+ bio_op(bio), lat);
+
+ if (tg->latency_target) {
+ int bucket;
+ unsigned int threshold;
+
+ bucket = request_bucket_index(
+ blk_stat_size(&bio->bi_issue_stat));
+ threshold = tg->td->avg_buckets[bucket].latency +
+ tg->latency_target;
+ if (lat > threshold)
+ tg->bad_bio_cnt++;
+ /*
+ * Not race free, could get wrong count, which means cgroups
+ * will be throttled
+ */
+ tg->bio_cnt++;
+ }
+
+ if (time_after(jiffies, tg->bio_cnt_reset_time) || tg->bio_cnt > 1024) {
+ tg->bio_cnt_reset_time = tg->td->throtl_slice + jiffies;
+ tg->bio_cnt /= 2;
+ tg->bad_bio_cnt /= 2;
+ }
+}
+#endif
+
/*
* Dispatch all bios from all children tg's queued on @parent_sq. On
* return, @parent_sq is guaranteed to not have any active children tg's
@@ -1558,6 +2315,12 @@ int blk_throtl_init(struct request_queue *q)
td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
if (!td)
return -ENOMEM;
+ td->latency_buckets = __alloc_percpu(sizeof(struct latency_bucket) *
+ LATENCY_BUCKET_SIZE, __alignof__(u64));
+ if (!td->latency_buckets) {
+ kfree(td);
+ return -ENOMEM;
+ }
INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn);
throtl_service_queue_init(&td->service_queue);
@@ -1565,10 +2328,17 @@ int blk_throtl_init(struct request_queue *q)
q->td = td;
td->queue = q;
+ td->limit_valid[LIMIT_MAX] = true;
+ td->limit_index = LIMIT_MAX;
+ td->low_upgrade_time = jiffies;
+ td->low_downgrade_time = jiffies;
+
/* activate policy */
ret = blkcg_activate_policy(q, &blkcg_policy_throtl);
- if (ret)
+ if (ret) {
+ free_percpu(td->latency_buckets);
kfree(td);
+ }
return ret;
}
@@ -1577,9 +2347,74 @@ void blk_throtl_exit(struct request_queue *q)
BUG_ON(!q->td);
throtl_shutdown_wq(q);
blkcg_deactivate_policy(q, &blkcg_policy_throtl);
+ free_percpu(q->td->latency_buckets);
kfree(q->td);
}
+void blk_throtl_register_queue(struct request_queue *q)
+{
+ struct throtl_data *td;
+ struct cgroup_subsys_state *pos_css;
+ struct blkcg_gq *blkg;
+
+ td = q->td;
+ BUG_ON(!td);
+
+ if (blk_queue_nonrot(q)) {
+ td->throtl_slice = DFL_THROTL_SLICE_SSD;
+ td->dft_idletime_threshold = DFL_IDLE_THRESHOLD_SSD;
+ } else {
+ td->throtl_slice = DFL_THROTL_SLICE_HD;
+ td->dft_idletime_threshold = DFL_IDLE_THRESHOLD_HD;
+ }
+#ifndef CONFIG_BLK_DEV_THROTTLING_LOW
+ /* if no low limit, use previous default */
+ td->throtl_slice = DFL_THROTL_SLICE_HD;
+#endif
+
+ td->track_bio_latency = !q->mq_ops && !q->request_fn;
+ if (!td->track_bio_latency)
+ blk_stat_enable_accounting(q);
+
+ /*
+ * some tg are created before queue is fully initialized, eg, nonrot
+ * isn't initialized yet
+ */
+ rcu_read_lock();
+ blkg_for_each_descendant_post(blkg, pos_css, q->root_blkg) {
+ struct throtl_grp *tg = blkg_to_tg(blkg);
+
+ tg->idletime_threshold = td->dft_idletime_threshold;
+ }
+ rcu_read_unlock();
+}
+
+#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
+ssize_t blk_throtl_sample_time_show(struct request_queue *q, char *page)
+{
+ if (!q->td)
+ return -EINVAL;
+ return sprintf(page, "%u\n", jiffies_to_msecs(q->td->throtl_slice));
+}
+
+ssize_t blk_throtl_sample_time_store(struct request_queue *q,
+ const char *page, size_t count)
+{
+ unsigned long v;
+ unsigned long t;
+
+ if (!q->td)
+ return -EINVAL;
+ if (kstrtoul(page, 10, &v))
+ return -EINVAL;
+ t = msecs_to_jiffies(v);
+ if (t == 0 || t > MAX_THROTL_SLICE)
+ return -EINVAL;
+ q->td->throtl_slice = t;
+ return count;
+}
+#endif
+
static int __init throtl_init(void)
{
kthrotld_workqueue = alloc_workqueue("kthrotld", WQ_MEM_RECLAIM, 0);
diff --git a/block/blk-timeout.c b/block/blk-timeout.c
index a30441a200c0..cbff183f3d9f 100644
--- a/block/blk-timeout.c
+++ b/block/blk-timeout.c
@@ -89,7 +89,6 @@ static void blk_rq_timed_out(struct request *req)
ret = q->rq_timed_out_fn(req);
switch (ret) {
case BLK_EH_HANDLED:
- /* Can we use req->errors here? */
__blk_complete_request(req);
break;
case BLK_EH_RESET_TIMER:
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 1aedb1f7ee0c..17676f4d7fd1 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -255,8 +255,8 @@ static inline bool stat_sample_valid(struct blk_rq_stat *stat)
* that it's writes impacting us, and not just some sole read on
* a device that is in a lower power state.
*/
- return stat[BLK_STAT_READ].nr_samples >= 1 &&
- stat[BLK_STAT_WRITE].nr_samples >= RWB_MIN_WRITE_SAMPLES;
+ return (stat[READ].nr_samples >= 1 &&
+ stat[WRITE].nr_samples >= RWB_MIN_WRITE_SAMPLES);
}
static u64 rwb_sync_issue_lat(struct rq_wb *rwb)
@@ -277,7 +277,7 @@ enum {
LAT_EXCEEDED,
};
-static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
+static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
{
struct backing_dev_info *bdi = rwb->queue->backing_dev_info;
u64 thislat;
@@ -293,7 +293,7 @@ static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
*/
thislat = rwb_sync_issue_lat(rwb);
if (thislat > rwb->cur_win_nsec ||
- (thislat > rwb->min_lat_nsec && !stat[BLK_STAT_READ].nr_samples)) {
+ (thislat > rwb->min_lat_nsec && !stat[READ].nr_samples)) {
trace_wbt_lat(bdi, thislat);
return LAT_EXCEEDED;
}
@@ -308,8 +308,8 @@ static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
* waited or still has writes in flights, consider us doing
* just writes as well.
*/
- if ((stat[BLK_STAT_WRITE].nr_samples && blk_stat_is_current(stat)) ||
- wb_recent_wait(rwb) || wbt_inflight(rwb))
+ if (stat[WRITE].nr_samples || wb_recent_wait(rwb) ||
+ wbt_inflight(rwb))
return LAT_UNKNOWN_WRITES;
return LAT_UNKNOWN;
}
@@ -317,8 +317,8 @@ static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
/*
* If the 'min' latency exceeds our target, step down.
*/
- if (stat[BLK_STAT_READ].min > rwb->min_lat_nsec) {
- trace_wbt_lat(bdi, stat[BLK_STAT_READ].min);
+ if (stat[READ].min > rwb->min_lat_nsec) {
+ trace_wbt_lat(bdi, stat[READ].min);
trace_wbt_stat(bdi, stat);
return LAT_EXCEEDED;
}
@@ -329,14 +329,6 @@ static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
return LAT_OK;
}
-static int latency_exceeded(struct rq_wb *rwb)
-{
- struct blk_rq_stat stat[2];
-
- blk_queue_stat_get(rwb->queue, stat);
- return __latency_exceeded(rwb, stat);
-}
-
static void rwb_trace_step(struct rq_wb *rwb, const char *msg)
{
struct backing_dev_info *bdi = rwb->queue->backing_dev_info;
@@ -355,7 +347,6 @@ static void scale_up(struct rq_wb *rwb)
rwb->scale_step--;
rwb->unknown_cnt = 0;
- blk_stat_clear(rwb->queue);
rwb->scaled_max = calc_wb_limits(rwb);
@@ -385,15 +376,12 @@ static void scale_down(struct rq_wb *rwb, bool hard_throttle)
rwb->scaled_max = false;
rwb->unknown_cnt = 0;
- blk_stat_clear(rwb->queue);
calc_wb_limits(rwb);
rwb_trace_step(rwb, "step down");
}
static void rwb_arm_timer(struct rq_wb *rwb)
{
- unsigned long expires;
-
if (rwb->scale_step > 0) {
/*
* We should speed this up, using some variant of a fast
@@ -411,17 +399,16 @@ static void rwb_arm_timer(struct rq_wb *rwb)
rwb->cur_win_nsec = rwb->win_nsec;
}
- expires = jiffies + nsecs_to_jiffies(rwb->cur_win_nsec);
- mod_timer(&rwb->window_timer, expires);
+ blk_stat_activate_nsecs(rwb->cb, rwb->cur_win_nsec);
}
-static void wb_timer_fn(unsigned long data)
+static void wb_timer_fn(struct blk_stat_callback *cb)
{
- struct rq_wb *rwb = (struct rq_wb *) data;
+ struct rq_wb *rwb = cb->data;
unsigned int inflight = wbt_inflight(rwb);
int status;
- status = latency_exceeded(rwb);
+ status = latency_exceeded(rwb, cb->stat);
trace_wbt_timer(rwb->queue->backing_dev_info, status, rwb->scale_step,
inflight);
@@ -614,7 +601,7 @@ enum wbt_flags wbt_wait(struct rq_wb *rwb, struct bio *bio, spinlock_t *lock)
__wbt_wait(rwb, bio->bi_opf, lock);
- if (!timer_pending(&rwb->window_timer))
+ if (!blk_stat_is_active(rwb->cb))
rwb_arm_timer(rwb);
if (current_is_kswapd())
@@ -666,22 +653,37 @@ void wbt_set_write_cache(struct rq_wb *rwb, bool write_cache_on)
rwb->wc = write_cache_on;
}
- /*
- * Disable wbt, if enabled by default. Only called from CFQ, if we have
- * cgroups enabled
+/*
+ * Disable wbt, if enabled by default. Only called from CFQ.
*/
void wbt_disable_default(struct request_queue *q)
{
struct rq_wb *rwb = q->rq_wb;
- if (rwb && rwb->enable_state == WBT_STATE_ON_DEFAULT) {
- del_timer_sync(&rwb->window_timer);
- rwb->win_nsec = rwb->min_lat_nsec = 0;
- wbt_update_limits(rwb);
- }
+ if (rwb && rwb->enable_state == WBT_STATE_ON_DEFAULT)
+ wbt_exit(q);
}
EXPORT_SYMBOL_GPL(wbt_disable_default);
+/*
+ * Enable wbt if defaults are configured that way
+ */
+void wbt_enable_default(struct request_queue *q)
+{
+ /* Throttling already enabled? */
+ if (q->rq_wb)
+ return;
+
+ /* Queue not registered? Maybe shutting down... */
+ if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
+ return;
+
+ if ((q->mq_ops && IS_ENABLED(CONFIG_BLK_WBT_MQ)) ||
+ (q->request_fn && IS_ENABLED(CONFIG_BLK_WBT_SQ)))
+ wbt_init(q);
+}
+EXPORT_SYMBOL_GPL(wbt_enable_default);
+
u64 wbt_default_latency_nsec(struct request_queue *q)
{
/*
@@ -694,29 +696,33 @@ u64 wbt_default_latency_nsec(struct request_queue *q)
return 75000000ULL;
}
+static int wbt_data_dir(const struct request *rq)
+{
+ return rq_data_dir(rq);
+}
+
int wbt_init(struct request_queue *q)
{
struct rq_wb *rwb;
int i;
- /*
- * For now, we depend on the stats window being larger than
- * our monitoring window. Ensure that this isn't inadvertently
- * violated.
- */
- BUILD_BUG_ON(RWB_WINDOW_NSEC > BLK_STAT_NSEC);
BUILD_BUG_ON(WBT_NR_BITS > BLK_STAT_RES_BITS);
rwb = kzalloc(sizeof(*rwb), GFP_KERNEL);
if (!rwb)
return -ENOMEM;
+ rwb->cb = blk_stat_alloc_callback(wb_timer_fn, wbt_data_dir, 2, rwb);
+ if (!rwb->cb) {
+ kfree(rwb);
+ return -ENOMEM;
+ }
+
for (i = 0; i < WBT_NUM_RWQ; i++) {
atomic_set(&rwb->rq_wait[i].inflight, 0);
init_waitqueue_head(&rwb->rq_wait[i].wait);
}
- setup_timer(&rwb->window_timer, wb_timer_fn, (unsigned long) rwb);
rwb->wc = 1;
rwb->queue_depth = RWB_DEF_DEPTH;
rwb->last_comp = rwb->last_issue = jiffies;
@@ -726,10 +732,10 @@ int wbt_init(struct request_queue *q)
wbt_update_limits(rwb);
/*
- * Assign rwb, and turn on stats tracking for this queue
+ * Assign rwb and add the stats callback.
*/
q->rq_wb = rwb;
- blk_stat_enable(q);
+ blk_stat_add_callback(q, rwb->cb);
rwb->min_lat_nsec = wbt_default_latency_nsec(q);
@@ -744,7 +750,8 @@ void wbt_exit(struct request_queue *q)
struct rq_wb *rwb = q->rq_wb;
if (rwb) {
- del_timer_sync(&rwb->window_timer);
+ blk_stat_remove_callback(q, rwb->cb);
+ blk_stat_free_callback(rwb->cb);
q->rq_wb = NULL;
kfree(rwb);
}
diff --git a/block/blk-wbt.h b/block/blk-wbt.h
index 65f1de519f67..df6de50c5d59 100644
--- a/block/blk-wbt.h
+++ b/block/blk-wbt.h
@@ -32,27 +32,27 @@ enum {
static inline void wbt_clear_state(struct blk_issue_stat *stat)
{
- stat->time &= BLK_STAT_TIME_MASK;
+ stat->stat &= ~BLK_STAT_RES_MASK;
}
static inline enum wbt_flags wbt_stat_to_mask(struct blk_issue_stat *stat)
{
- return (stat->time & BLK_STAT_MASK) >> BLK_STAT_SHIFT;
+ return (stat->stat & BLK_STAT_RES_MASK) >> BLK_STAT_RES_SHIFT;
}
static inline void wbt_track(struct blk_issue_stat *stat, enum wbt_flags wb_acct)
{
- stat->time |= ((u64) wb_acct) << BLK_STAT_SHIFT;
+ stat->stat |= ((u64) wb_acct) << BLK_STAT_RES_SHIFT;
}
static inline bool wbt_is_tracked(struct blk_issue_stat *stat)
{
- return (stat->time >> BLK_STAT_SHIFT) & WBT_TRACKED;
+ return (stat->stat >> BLK_STAT_RES_SHIFT) & WBT_TRACKED;
}
static inline bool wbt_is_read(struct blk_issue_stat *stat)
{
- return (stat->time >> BLK_STAT_SHIFT) & WBT_READ;
+ return (stat->stat >> BLK_STAT_RES_SHIFT) & WBT_READ;
}
struct rq_wait {
@@ -81,7 +81,7 @@ struct rq_wb {
u64 win_nsec; /* default window size */
u64 cur_win_nsec; /* current window size */
- struct timer_list window_timer;
+ struct blk_stat_callback *cb;
s64 sync_issue;
void *sync_cookie;
@@ -117,6 +117,7 @@ void wbt_update_limits(struct rq_wb *);
void wbt_requeue(struct rq_wb *, struct blk_issue_stat *);
void wbt_issue(struct rq_wb *, struct blk_issue_stat *);
void wbt_disable_default(struct request_queue *);
+void wbt_enable_default(struct request_queue *);
void wbt_set_queue_depth(struct rq_wb *, unsigned int);
void wbt_set_write_cache(struct rq_wb *, bool);
@@ -155,6 +156,9 @@ static inline void wbt_issue(struct rq_wb *rwb, struct blk_issue_stat *stat)
static inline void wbt_disable_default(struct request_queue *q)
{
}
+static inline void wbt_enable_default(struct request_queue *q)
+{
+}
static inline void wbt_set_queue_depth(struct rq_wb *rwb, unsigned int depth)
{
}
diff --git a/block/blk.h b/block/blk.h
index d1ea4bd9b9a3..2ed70228e44f 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -60,15 +60,12 @@ void blk_free_flush_queue(struct blk_flush_queue *q);
int blk_init_rl(struct request_list *rl, struct request_queue *q,
gfp_t gfp_mask);
void blk_exit_rl(struct request_list *rl);
-void init_request_from_bio(struct request *req, struct bio *bio);
void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
struct bio *bio);
void blk_queue_bypass_start(struct request_queue *q);
void blk_queue_bypass_end(struct request_queue *q);
void blk_dequeue_request(struct request *rq);
void __blk_queue_free_tags(struct request_queue *q);
-bool __blk_end_bidi_request(struct request *rq, int error,
- unsigned int nr_bytes, unsigned int bidi_bytes);
void blk_freeze_queue(struct request_queue *q);
static inline void blk_queue_enter_live(struct request_queue *q)
@@ -319,10 +316,22 @@ static inline struct io_context *create_io_context(gfp_t gfp_mask, int node)
extern void blk_throtl_drain(struct request_queue *q);
extern int blk_throtl_init(struct request_queue *q);
extern void blk_throtl_exit(struct request_queue *q);
+extern void blk_throtl_register_queue(struct request_queue *q);
#else /* CONFIG_BLK_DEV_THROTTLING */
static inline void blk_throtl_drain(struct request_queue *q) { }
static inline int blk_throtl_init(struct request_queue *q) { return 0; }
static inline void blk_throtl_exit(struct request_queue *q) { }
+static inline void blk_throtl_register_queue(struct request_queue *q) { }
#endif /* CONFIG_BLK_DEV_THROTTLING */
+#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
+extern ssize_t blk_throtl_sample_time_show(struct request_queue *q, char *page);
+extern ssize_t blk_throtl_sample_time_store(struct request_queue *q,
+ const char *page, size_t count);
+extern void blk_throtl_bio_endio(struct bio *bio);
+extern void blk_throtl_stat_add(struct request *rq, u64 time);
+#else
+static inline void blk_throtl_bio_endio(struct bio *bio) { }
+static inline void blk_throtl_stat_add(struct request *rq, u64 time) { }
+#endif
#endif /* BLK_INTERNAL_H */
diff --git a/block/bsg-lib.c b/block/bsg-lib.c
index cd15f9dbb147..0a23dbba2d30 100644
--- a/block/bsg-lib.c
+++ b/block/bsg-lib.c
@@ -37,7 +37,7 @@ static void bsg_destroy_job(struct kref *kref)
struct bsg_job *job = container_of(kref, struct bsg_job, kref);
struct request *rq = job->req;
- blk_end_request_all(rq, rq->errors);
+ blk_end_request_all(rq, scsi_req(rq)->result);
put_device(job->dev); /* release reference for the request */
@@ -74,7 +74,7 @@ void bsg_job_done(struct bsg_job *job, int result,
struct scsi_request *rq = scsi_req(req);
int err;
- err = job->req->errors = result;
+ err = scsi_req(job->req)->result = result;
if (err < 0)
/* we're only returning the result field in the reply */
rq->sense_len = sizeof(u32);
@@ -177,7 +177,7 @@ failjob_rls_job:
* @q: request queue to manage
*
* On error the create_bsg_job function should return a -Exyz error value
- * that will be set to the req->errors.
+ * that will be set to ->result.
*
* Drivers/subsys should pass this to the queue init function.
*/
@@ -201,7 +201,7 @@ static void bsg_request_fn(struct request_queue *q)
ret = bsg_create_job(dev, req);
if (ret) {
- req->errors = ret;
+ scsi_req(req)->result = ret;
blk_end_request_all(req, ret);
spin_lock_irq(q->queue_lock);
continue;
diff --git a/block/bsg.c b/block/bsg.c
index 74835dbf0c47..d9da1b613ced 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -391,13 +391,13 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr,
struct scsi_request *req = scsi_req(rq);
int ret = 0;
- dprintk("rq %p bio %p 0x%x\n", rq, bio, rq->errors);
+ dprintk("rq %p bio %p 0x%x\n", rq, bio, req->result);
/*
* fill in all the output members
*/
- hdr->device_status = rq->errors & 0xff;
- hdr->transport_status = host_byte(rq->errors);
- hdr->driver_status = driver_byte(rq->errors);
+ hdr->device_status = req->result & 0xff;
+ hdr->transport_status = host_byte(req->result);
+ hdr->driver_status = driver_byte(req->result);
hdr->info = 0;
if (hdr->device_status || hdr->transport_status || hdr->driver_status)
hdr->info |= SG_INFO_CHECK;
@@ -431,8 +431,8 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr,
* just a protocol response (i.e. non negative), that gets
* processed above.
*/
- if (!ret && rq->errors < 0)
- ret = rq->errors;
+ if (!ret && req->result < 0)
+ ret = req->result;
blk_rq_unmap_user(bio);
scsi_req_free_cmd(req);
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 440b95ee593c..da69b079725f 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -3761,16 +3761,14 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
}
#ifdef CONFIG_CFQ_GROUP_IOSCHED
-static bool check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
+static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
{
struct cfq_data *cfqd = cic_to_cfqd(cic);
struct cfq_queue *cfqq;
uint64_t serial_nr;
- bool nonroot_cg;
rcu_read_lock();
serial_nr = bio_blkcg(bio)->css.serial_nr;
- nonroot_cg = bio_blkcg(bio) != &blkcg_root;
rcu_read_unlock();
/*
@@ -3778,7 +3776,7 @@ static bool check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
* spuriously on a newly created cic but there's no harm.
*/
if (unlikely(!cfqd) || likely(cic->blkcg_serial_nr == serial_nr))
- return nonroot_cg;
+ return;
/*
* Drop reference to queues. New queues will be assigned in new
@@ -3799,12 +3797,10 @@ static bool check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
}
cic->blkcg_serial_nr = serial_nr;
- return nonroot_cg;
}
#else
-static inline bool check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
+static inline void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
{
- return false;
}
#endif /* CONFIG_CFQ_GROUP_IOSCHED */
@@ -4449,12 +4445,11 @@ cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio,
const int rw = rq_data_dir(rq);
const bool is_sync = rq_is_sync(rq);
struct cfq_queue *cfqq;
- bool disable_wbt;
spin_lock_irq(q->queue_lock);
check_ioprio_changed(cic, bio);
- disable_wbt = check_blkcg_changed(cic, bio);
+ check_blkcg_changed(cic, bio);
new_queue:
cfqq = cic_to_cfqq(cic, is_sync);
if (!cfqq || cfqq == &cfqd->oom_cfqq) {
@@ -4491,9 +4486,6 @@ new_queue:
rq->elv.priv[1] = cfqq->cfqg;
spin_unlock_irq(q->queue_lock);
- if (disable_wbt)
- wbt_disable_default(q);
-
return 0;
}
@@ -4706,6 +4698,7 @@ static void cfq_registered_queue(struct request_queue *q)
*/
if (blk_queue_nonrot(q))
cfqd->cfq_slice_idle = 0;
+ wbt_disable_default(q);
}
/*
diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c
index 570021a0dc1c..04325b81c2b4 100644
--- a/block/compat_ioctl.c
+++ b/block/compat_ioctl.c
@@ -685,7 +685,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
case BLKALIGNOFF:
return compat_put_int(arg, bdev_alignment_offset(bdev));
case BLKDISCARDZEROES:
- return compat_put_uint(arg, bdev_discard_zeroes_data(bdev));
+ return compat_put_uint(arg, 0);
case BLKFLSBUF:
case BLKROSET:
case BLKDISCARD:
diff --git a/block/elevator.c b/block/elevator.c
index 4d9084a14c10..bf11e70f008b 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -41,6 +41,7 @@
#include "blk.h"
#include "blk-mq-sched.h"
+#include "blk-wbt.h"
static DEFINE_SPINLOCK(elv_list_lock);
static LIST_HEAD(elv_list);
@@ -877,6 +878,8 @@ void elv_unregister_queue(struct request_queue *q)
kobject_uevent(&e->kobj, KOBJ_REMOVE);
kobject_del(&e->kobj);
e->registered = 0;
+ /* Re-enable throttling in case elevator disabled it */
+ wbt_enable_default(q);
}
}
EXPORT_SYMBOL(elv_unregister_queue);
diff --git a/block/genhd.c b/block/genhd.c
index a9c516a8b37d..9a2d01abfa3b 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1060,8 +1060,19 @@ static struct attribute *disk_attrs[] = {
NULL
};
+static umode_t disk_visible(struct kobject *kobj, struct attribute *a, int n)
+{
+ struct device *dev = container_of(kobj, typeof(*dev), kobj);
+ struct gendisk *disk = dev_to_disk(dev);
+
+ if (a == &dev_attr_badblocks.attr && !disk->bb)
+ return 0;
+ return a->mode;
+}
+
static struct attribute_group disk_attr_group = {
.attrs = disk_attrs,
+ .is_visible = disk_visible,
};
static const struct attribute_group *disk_attr_groups[] = {
@@ -1352,7 +1363,7 @@ struct kobject *get_disk(struct gendisk *disk)
owner = disk->fops->owner;
if (owner && !try_module_get(owner))
return NULL;
- kobj = kobject_get(&disk_to_dev(disk)->kobj);
+ kobj = kobject_get_unless_zero(&disk_to_dev(disk)->kobj);
if (kobj == NULL) {
module_put(owner);
return NULL;
diff --git a/block/ioctl.c b/block/ioctl.c
index 7b88820b93d9..0de02ee67eed 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -255,7 +255,7 @@ static int blk_ioctl_zeroout(struct block_device *bdev, fmode_t mode,
truncate_inode_pages_range(mapping, start, end);
return blkdev_issue_zeroout(bdev, start >> 9, len >> 9, GFP_KERNEL,
- false);
+ BLKDEV_ZERO_NOUNMAP);
}
static int put_ushort(unsigned long arg, unsigned short val)
@@ -547,7 +547,7 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
case BLKALIGNOFF:
return put_int(arg, bdev_alignment_offset(bdev));
case BLKDISCARDZEROES:
- return put_uint(arg, bdev_discard_zeroes_data(bdev));
+ return put_uint(arg, 0);
case BLKSECTGET:
max_sectors = min_t(unsigned int, USHRT_MAX,
queue_max_sectors(bdev_get_queue(bdev)));
diff --git a/block/ioprio.c b/block/ioprio.c
index 0c47a00f92a8..4b120c9cf7e8 100644
--- a/block/ioprio.c
+++ b/block/ioprio.c
@@ -163,22 +163,12 @@ out:
int ioprio_best(unsigned short aprio, unsigned short bprio)
{
- unsigned short aclass;
- unsigned short bclass;
-
if (!ioprio_valid(aprio))
aprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM);
if (!ioprio_valid(bprio))
bprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM);
- aclass = IOPRIO_PRIO_CLASS(aprio);
- bclass = IOPRIO_PRIO_CLASS(bprio);
- if (aclass == bclass)
- return min(aprio, bprio);
- if (aclass > bclass)
- return bprio;
- else
- return aprio;
+ return min(aprio, bprio);
}
SYSCALL_DEFINE2(ioprio_get, int, which, int, who)
diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c
new file mode 100644
index 000000000000..3b0090bc5dd1
--- /dev/null
+++ b/block/kyber-iosched.c
@@ -0,0 +1,719 @@
+/*
+ * The Kyber I/O scheduler. Controls latency by throttling queue depths using
+ * scalable techniques.
+ *
+ * Copyright (C) 2017 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#include <linux/kernel.h>
+#include <linux/blkdev.h>
+#include <linux/blk-mq.h>
+#include <linux/elevator.h>
+#include <linux/module.h>
+#include <linux/sbitmap.h>
+
+#include "blk.h"
+#include "blk-mq.h"
+#include "blk-mq-sched.h"
+#include "blk-mq-tag.h"
+#include "blk-stat.h"
+
+/* Scheduling domains. */
+enum {
+ KYBER_READ,
+ KYBER_SYNC_WRITE,
+ KYBER_OTHER, /* Async writes, discard, etc. */
+ KYBER_NUM_DOMAINS,
+};
+
+enum {
+ KYBER_MIN_DEPTH = 256,
+
+ /*
+ * In order to prevent starvation of synchronous requests by a flood of
+ * asynchronous requests, we reserve 25% of requests for synchronous
+ * operations.
+ */
+ KYBER_ASYNC_PERCENT = 75,
+};
+
+/*
+ * Initial device-wide depths for each scheduling domain.
+ *
+ * Even for fast devices with lots of tags like NVMe, you can saturate
+ * the device with only a fraction of the maximum possible queue depth.
+ * So, we cap these to a reasonable value.
+ */
+static const unsigned int kyber_depth[] = {
+ [KYBER_READ] = 256,
+ [KYBER_SYNC_WRITE] = 128,
+ [KYBER_OTHER] = 64,
+};
+
+/*
+ * Scheduling domain batch sizes. We favor reads.
+ */
+static const unsigned int kyber_batch_size[] = {
+ [KYBER_READ] = 16,
+ [KYBER_SYNC_WRITE] = 8,
+ [KYBER_OTHER] = 8,
+};
+
+struct kyber_queue_data {
+ struct request_queue *q;
+
+ struct blk_stat_callback *cb;
+
+ /*
+ * The device is divided into multiple scheduling domains based on the
+ * request type. Each domain has a fixed number of in-flight requests of
+ * that type device-wide, limited by these tokens.
+ */
+ struct sbitmap_queue domain_tokens[KYBER_NUM_DOMAINS];
+
+ /*
+ * Async request percentage, converted to per-word depth for
+ * sbitmap_get_shallow().
+ */
+ unsigned int async_depth;
+
+ /* Target latencies in nanoseconds. */
+ u64 read_lat_nsec, write_lat_nsec;
+};
+
+struct kyber_hctx_data {
+ spinlock_t lock;
+ struct list_head rqs[KYBER_NUM_DOMAINS];
+ unsigned int cur_domain;
+ unsigned int batching;
+ wait_queue_t domain_wait[KYBER_NUM_DOMAINS];
+ atomic_t wait_index[KYBER_NUM_DOMAINS];
+};
+
+static int rq_sched_domain(const struct request *rq)
+{
+ unsigned int op = rq->cmd_flags;
+
+ if ((op & REQ_OP_MASK) == REQ_OP_READ)
+ return KYBER_READ;
+ else if ((op & REQ_OP_MASK) == REQ_OP_WRITE && op_is_sync(op))
+ return KYBER_SYNC_WRITE;
+ else
+ return KYBER_OTHER;
+}
+
+enum {
+ NONE = 0,
+ GOOD = 1,
+ GREAT = 2,
+ BAD = -1,
+ AWFUL = -2,
+};
+
+#define IS_GOOD(status) ((status) > 0)
+#define IS_BAD(status) ((status) < 0)
+
+static int kyber_lat_status(struct blk_stat_callback *cb,
+ unsigned int sched_domain, u64 target)
+{
+ u64 latency;
+
+ if (!cb->stat[sched_domain].nr_samples)
+ return NONE;
+
+ latency = cb->stat[sched_domain].mean;
+ if (latency >= 2 * target)
+ return AWFUL;
+ else if (latency > target)
+ return BAD;
+ else if (latency <= target / 2)
+ return GREAT;
+ else /* (latency <= target) */
+ return GOOD;
+}
+
+/*
+ * Adjust the read or synchronous write depth given the status of reads and
+ * writes. The goal is that the latencies of the two domains are fair (i.e., if
+ * one is good, then the other is good).
+ */
+static void kyber_adjust_rw_depth(struct kyber_queue_data *kqd,
+ unsigned int sched_domain, int this_status,
+ int other_status)
+{
+ unsigned int orig_depth, depth;
+
+ /*
+ * If this domain had no samples, or reads and writes are both good or
+ * both bad, don't adjust the depth.
+ */
+ if (this_status == NONE ||
+ (IS_GOOD(this_status) && IS_GOOD(other_status)) ||
+ (IS_BAD(this_status) && IS_BAD(other_status)))
+ return;
+
+ orig_depth = depth = kqd->domain_tokens[sched_domain].sb.depth;
+
+ if (other_status == NONE) {
+ depth++;
+ } else {
+ switch (this_status) {
+ case GOOD:
+ if (other_status == AWFUL)
+ depth -= max(depth / 4, 1U);
+ else
+ depth -= max(depth / 8, 1U);
+ break;
+ case GREAT:
+ if (other_status == AWFUL)
+ depth /= 2;
+ else
+ depth -= max(depth / 4, 1U);
+ break;
+ case BAD:
+ depth++;
+ break;
+ case AWFUL:
+ if (other_status == GREAT)
+ depth += 2;
+ else
+ depth++;
+ break;
+ }
+ }
+
+ depth = clamp(depth, 1U, kyber_depth[sched_domain]);
+ if (depth != orig_depth)
+ sbitmap_queue_resize(&kqd->domain_tokens[sched_domain], depth);
+}
+
+/*
+ * Adjust the depth of other requests given the status of reads and synchronous
+ * writes. As long as either domain is doing fine, we don't throttle, but if
+ * both domains are doing badly, we throttle heavily.
+ */
+static void kyber_adjust_other_depth(struct kyber_queue_data *kqd,
+ int read_status, int write_status,
+ bool have_samples)
+{
+ unsigned int orig_depth, depth;
+ int status;
+
+ orig_depth = depth = kqd->domain_tokens[KYBER_OTHER].sb.depth;
+
+ if (read_status == NONE && write_status == NONE) {
+ depth += 2;
+ } else if (have_samples) {
+ if (read_status == NONE)
+ status = write_status;
+ else if (write_status == NONE)
+ status = read_status;
+ else
+ status = max(read_status, write_status);
+ switch (status) {
+ case GREAT:
+ depth += 2;
+ break;
+ case GOOD:
+ depth++;
+ break;
+ case BAD:
+ depth -= max(depth / 4, 1U);
+ break;
+ case AWFUL:
+ depth /= 2;
+ break;
+ }
+ }
+
+ depth = clamp(depth, 1U, kyber_depth[KYBER_OTHER]);
+ if (depth != orig_depth)
+ sbitmap_queue_resize(&kqd->domain_tokens[KYBER_OTHER], depth);
+}
+
+/*
+ * Apply heuristics for limiting queue depths based on gathered latency
+ * statistics.
+ */
+static void kyber_stat_timer_fn(struct blk_stat_callback *cb)
+{
+ struct kyber_queue_data *kqd = cb->data;
+ int read_status, write_status;
+
+ read_status = kyber_lat_status(cb, KYBER_READ, kqd->read_lat_nsec);
+ write_status = kyber_lat_status(cb, KYBER_SYNC_WRITE, kqd->write_lat_nsec);
+
+ kyber_adjust_rw_depth(kqd, KYBER_READ, read_status, write_status);
+ kyber_adjust_rw_depth(kqd, KYBER_SYNC_WRITE, write_status, read_status);
+ kyber_adjust_other_depth(kqd, read_status, write_status,
+ cb->stat[KYBER_OTHER].nr_samples != 0);
+
+ /*
+ * Continue monitoring latencies if we aren't hitting the targets or
+ * we're still throttling other requests.
+ */
+ if (!blk_stat_is_active(kqd->cb) &&
+ ((IS_BAD(read_status) || IS_BAD(write_status) ||
+ kqd->domain_tokens[KYBER_OTHER].sb.depth < kyber_depth[KYBER_OTHER])))
+ blk_stat_activate_msecs(kqd->cb, 100);
+}
+
+static unsigned int kyber_sched_tags_shift(struct kyber_queue_data *kqd)
+{
+ /*
+ * All of the hardware queues have the same depth, so we can just grab
+ * the shift of the first one.
+ */
+ return kqd->q->queue_hw_ctx[0]->sched_tags->bitmap_tags.sb.shift;
+}
+
+static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q)
+{
+ struct kyber_queue_data *kqd;
+ unsigned int max_tokens;
+ unsigned int shift;
+ int ret = -ENOMEM;
+ int i;
+
+ kqd = kmalloc_node(sizeof(*kqd), GFP_KERNEL, q->node);
+ if (!kqd)
+ goto err;
+ kqd->q = q;
+
+ kqd->cb = blk_stat_alloc_callback(kyber_stat_timer_fn, rq_sched_domain,
+ KYBER_NUM_DOMAINS, kqd);
+ if (!kqd->cb)
+ goto err_kqd;
+
+ /*
+ * The maximum number of tokens for any scheduling domain is at least
+ * the queue depth of a single hardware queue. If the hardware doesn't
+ * have many tags, still provide a reasonable number.
+ */
+ max_tokens = max_t(unsigned int, q->tag_set->queue_depth,
+ KYBER_MIN_DEPTH);
+ for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
+ WARN_ON(!kyber_depth[i]);
+ WARN_ON(!kyber_batch_size[i]);
+ ret = sbitmap_queue_init_node(&kqd->domain_tokens[i],
+ max_tokens, -1, false, GFP_KERNEL,
+ q->node);
+ if (ret) {
+ while (--i >= 0)
+ sbitmap_queue_free(&kqd->domain_tokens[i]);
+ goto err_cb;
+ }
+ sbitmap_queue_resize(&kqd->domain_tokens[i], kyber_depth[i]);
+ }
+
+ shift = kyber_sched_tags_shift(kqd);
+ kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U;
+
+ kqd->read_lat_nsec = 2000000ULL;
+ kqd->write_lat_nsec = 10000000ULL;
+
+ return kqd;
+
+err_cb:
+ blk_stat_free_callback(kqd->cb);
+err_kqd:
+ kfree(kqd);
+err:
+ return ERR_PTR(ret);
+}
+
+static int kyber_init_sched(struct request_queue *q, struct elevator_type *e)
+{
+ struct kyber_queue_data *kqd;
+ struct elevator_queue *eq;
+
+ eq = elevator_alloc(q, e);
+ if (!eq)
+ return -ENOMEM;
+
+ kqd = kyber_queue_data_alloc(q);
+ if (IS_ERR(kqd)) {
+ kobject_put(&eq->kobj);
+ return PTR_ERR(kqd);
+ }
+
+ eq->elevator_data = kqd;
+ q->elevator = eq;
+
+ blk_stat_add_callback(q, kqd->cb);
+
+ return 0;
+}
+
+static void kyber_exit_sched(struct elevator_queue *e)
+{
+ struct kyber_queue_data *kqd = e->elevator_data;
+ struct request_queue *q = kqd->q;
+ int i;
+
+ blk_stat_remove_callback(q, kqd->cb);
+
+ for (i = 0; i < KYBER_NUM_DOMAINS; i++)
+ sbitmap_queue_free(&kqd->domain_tokens[i]);
+ blk_stat_free_callback(kqd->cb);
+ kfree(kqd);
+}
+
+static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
+{
+ struct kyber_hctx_data *khd;
+ int i;
+
+ khd = kmalloc_node(sizeof(*khd), GFP_KERNEL, hctx->numa_node);
+ if (!khd)
+ return -ENOMEM;
+
+ spin_lock_init(&khd->lock);
+
+ for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
+ INIT_LIST_HEAD(&khd->rqs[i]);
+ INIT_LIST_HEAD(&khd->domain_wait[i].task_list);
+ atomic_set(&khd->wait_index[i], 0);
+ }
+
+ khd->cur_domain = 0;
+ khd->batching = 0;
+
+ hctx->sched_data = khd;
+
+ return 0;
+}
+
+static void kyber_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
+{
+ kfree(hctx->sched_data);
+}
+
+static int rq_get_domain_token(struct request *rq)
+{
+ return (long)rq->elv.priv[0];
+}
+
+static void rq_set_domain_token(struct request *rq, int token)
+{
+ rq->elv.priv[0] = (void *)(long)token;
+}
+
+static void rq_clear_domain_token(struct kyber_queue_data *kqd,
+ struct request *rq)
+{
+ unsigned int sched_domain;
+ int nr;
+
+ nr = rq_get_domain_token(rq);
+ if (nr != -1) {
+ sched_domain = rq_sched_domain(rq);
+ sbitmap_queue_clear(&kqd->domain_tokens[sched_domain], nr,
+ rq->mq_ctx->cpu);
+ }
+}
+
+static struct request *kyber_get_request(struct request_queue *q,
+ unsigned int op,
+ struct blk_mq_alloc_data *data)
+{
+ struct kyber_queue_data *kqd = q->elevator->elevator_data;
+ struct request *rq;
+
+ /*
+ * We use the scheduler tags as per-hardware queue queueing tokens.
+ * Async requests can be limited at this stage.
+ */
+ if (!op_is_sync(op))
+ data->shallow_depth = kqd->async_depth;
+
+ rq = __blk_mq_alloc_request(data, op);
+ if (rq)
+ rq_set_domain_token(rq, -1);
+ return rq;
+}
+
+static void kyber_put_request(struct request *rq)
+{
+ struct request_queue *q = rq->q;
+ struct kyber_queue_data *kqd = q->elevator->elevator_data;
+
+ rq_clear_domain_token(kqd, rq);
+ blk_mq_finish_request(rq);
+}
+
+static void kyber_completed_request(struct request *rq)
+{
+ struct request_queue *q = rq->q;
+ struct kyber_queue_data *kqd = q->elevator->elevator_data;
+ unsigned int sched_domain;
+ u64 now, latency, target;
+
+ /*
+ * Check if this request met our latency goal. If not, quickly gather
+ * some statistics and start throttling.
+ */
+ sched_domain = rq_sched_domain(rq);
+ switch (sched_domain) {
+ case KYBER_READ:
+ target = kqd->read_lat_nsec;
+ break;
+ case KYBER_SYNC_WRITE:
+ target = kqd->write_lat_nsec;
+ break;
+ default:
+ return;
+ }
+
+ /* If we are already monitoring latencies, don't check again. */
+ if (blk_stat_is_active(kqd->cb))
+ return;
+
+ now = __blk_stat_time(ktime_to_ns(ktime_get()));
+ if (now < blk_stat_time(&rq->issue_stat))
+ return;
+
+ latency = now - blk_stat_time(&rq->issue_stat);
+
+ if (latency > target)
+ blk_stat_activate_msecs(kqd->cb, 10);
+}
+
+static void kyber_flush_busy_ctxs(struct kyber_hctx_data *khd,
+ struct blk_mq_hw_ctx *hctx)
+{
+ LIST_HEAD(rq_list);
+ struct request *rq, *next;
+
+ blk_mq_flush_busy_ctxs(hctx, &rq_list);
+ list_for_each_entry_safe(rq, next, &rq_list, queuelist) {
+ unsigned int sched_domain;
+
+ sched_domain = rq_sched_domain(rq);
+ list_move_tail(&rq->queuelist, &khd->rqs[sched_domain]);
+ }
+}
+
+static int kyber_domain_wake(wait_queue_t *wait, unsigned mode, int flags,
+ void *key)
+{
+ struct blk_mq_hw_ctx *hctx = READ_ONCE(wait->private);
+
+ list_del_init(&wait->task_list);
+ blk_mq_run_hw_queue(hctx, true);
+ return 1;
+}
+
+static int kyber_get_domain_token(struct kyber_queue_data *kqd,
+ struct kyber_hctx_data *khd,
+ struct blk_mq_hw_ctx *hctx)
+{
+ unsigned int sched_domain = khd->cur_domain;
+ struct sbitmap_queue *domain_tokens = &kqd->domain_tokens[sched_domain];
+ wait_queue_t *wait = &khd->domain_wait[sched_domain];
+ struct sbq_wait_state *ws;
+ int nr;
+
+ nr = __sbitmap_queue_get(domain_tokens);
+ if (nr >= 0)
+ return nr;
+
+ /*
+ * If we failed to get a domain token, make sure the hardware queue is
+ * run when one becomes available. Note that this is serialized on
+ * khd->lock, but we still need to be careful about the waker.
+ */
+ if (list_empty_careful(&wait->task_list)) {
+ init_waitqueue_func_entry(wait, kyber_domain_wake);
+ wait->private = hctx;
+ ws = sbq_wait_ptr(domain_tokens,
+ &khd->wait_index[sched_domain]);
+ add_wait_queue(&ws->wait, wait);
+
+ /*
+ * Try again in case a token was freed before we got on the wait
+ * queue.
+ */
+ nr = __sbitmap_queue_get(domain_tokens);
+ }
+ return nr;
+}
+
+static struct request *
+kyber_dispatch_cur_domain(struct kyber_queue_data *kqd,
+ struct kyber_hctx_data *khd,
+ struct blk_mq_hw_ctx *hctx,
+ bool *flushed)
+{
+ struct list_head *rqs;
+ struct request *rq;
+ int nr;
+
+ rqs = &khd->rqs[khd->cur_domain];
+ rq = list_first_entry_or_null(rqs, struct request, queuelist);
+
+ /*
+ * If there wasn't already a pending request and we haven't flushed the
+ * software queues yet, flush the software queues and check again.
+ */
+ if (!rq && !*flushed) {
+ kyber_flush_busy_ctxs(khd, hctx);
+ *flushed = true;
+ rq = list_first_entry_or_null(rqs, struct request, queuelist);
+ }
+
+ if (rq) {
+ nr = kyber_get_domain_token(kqd, khd, hctx);
+ if (nr >= 0) {
+ khd->batching++;
+ rq_set_domain_token(rq, nr);
+ list_del_init(&rq->queuelist);
+ return rq;
+ }
+ }
+
+ /* There were either no pending requests or no tokens. */
+ return NULL;
+}
+
+static struct request *kyber_dispatch_request(struct blk_mq_hw_ctx *hctx)
+{
+ struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data;
+ struct kyber_hctx_data *khd = hctx->sched_data;
+ bool flushed = false;
+ struct request *rq;
+ int i;
+
+ spin_lock(&khd->lock);
+
+ /*
+ * First, if we are still entitled to batch, try to dispatch a request
+ * from the batch.
+ */
+ if (khd->batching < kyber_batch_size[khd->cur_domain]) {
+ rq = kyber_dispatch_cur_domain(kqd, khd, hctx, &flushed);
+ if (rq)
+ goto out;
+ }
+
+ /*
+ * Either,
+ * 1. We were no longer entitled to a batch.
+ * 2. The domain we were batching didn't have any requests.
+ * 3. The domain we were batching was out of tokens.
+ *
+ * Start another batch. Note that this wraps back around to the original
+ * domain if no other domains have requests or tokens.
+ */
+ khd->batching = 0;
+ for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
+ if (khd->cur_domain == KYBER_NUM_DOMAINS - 1)
+ khd->cur_domain = 0;
+ else
+ khd->cur_domain++;
+
+ rq = kyber_dispatch_cur_domain(kqd, khd, hctx, &flushed);
+ if (rq)
+ goto out;
+ }
+
+ rq = NULL;
+out:
+ spin_unlock(&khd->lock);
+ return rq;
+}
+
+static bool kyber_has_work(struct blk_mq_hw_ctx *hctx)
+{
+ struct kyber_hctx_data *khd = hctx->sched_data;
+ int i;
+
+ for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
+ if (!list_empty_careful(&khd->rqs[i]))
+ return true;
+ }
+ return false;
+}
+
+#define KYBER_LAT_SHOW_STORE(op) \
+static ssize_t kyber_##op##_lat_show(struct elevator_queue *e, \
+ char *page) \
+{ \
+ struct kyber_queue_data *kqd = e->elevator_data; \
+ \
+ return sprintf(page, "%llu\n", kqd->op##_lat_nsec); \
+} \
+ \
+static ssize_t kyber_##op##_lat_store(struct elevator_queue *e, \
+ const char *page, size_t count) \
+{ \
+ struct kyber_queue_data *kqd = e->elevator_data; \
+ unsigned long long nsec; \
+ int ret; \
+ \
+ ret = kstrtoull(page, 10, &nsec); \
+ if (ret) \
+ return ret; \
+ \
+ kqd->op##_lat_nsec = nsec; \
+ \
+ return count; \
+}
+KYBER_LAT_SHOW_STORE(read);
+KYBER_LAT_SHOW_STORE(write);
+#undef KYBER_LAT_SHOW_STORE
+
+#define KYBER_LAT_ATTR(op) __ATTR(op##_lat_nsec, 0644, kyber_##op##_lat_show, kyber_##op##_lat_store)
+static struct elv_fs_entry kyber_sched_attrs[] = {
+ KYBER_LAT_ATTR(read),
+ KYBER_LAT_ATTR(write),
+ __ATTR_NULL
+};
+#undef KYBER_LAT_ATTR
+
+static struct elevator_type kyber_sched = {
+ .ops.mq = {
+ .init_sched = kyber_init_sched,
+ .exit_sched = kyber_exit_sched,
+ .init_hctx = kyber_init_hctx,
+ .exit_hctx = kyber_exit_hctx,
+ .get_request = kyber_get_request,
+ .put_request = kyber_put_request,
+ .completed_request = kyber_completed_request,
+ .dispatch_request = kyber_dispatch_request,
+ .has_work = kyber_has_work,
+ },
+ .uses_mq = true,
+ .elevator_attrs = kyber_sched_attrs,
+ .elevator_name = "kyber",
+ .elevator_owner = THIS_MODULE,
+};
+
+static int __init kyber_init(void)
+{
+ return elv_register(&kyber_sched);
+}
+
+static void __exit kyber_exit(void)
+{
+ elv_unregister(&kyber_sched);
+}
+
+module_init(kyber_init);
+module_exit(kyber_exit);
+
+MODULE_AUTHOR("Omar Sandoval");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Kyber I/O scheduler");
diff --git a/block/partition-generic.c b/block/partition-generic.c
index 7afb9907821f..0171a2faad68 100644
--- a/block/partition-generic.c
+++ b/block/partition-generic.c
@@ -497,7 +497,6 @@ rescan:
if (disk->fops->revalidate_disk)
disk->fops->revalidate_disk(disk);
- blk_integrity_revalidate(disk);
check_disk_size_change(disk, bdev);
bdev->bd_invalidated = 0;
if (!get_capacity(disk) || !(state = check_partition(disk, bdev)))
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index 2a2fc768b27a..4a294a5f7fab 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -262,11 +262,11 @@ static int blk_complete_sghdr_rq(struct request *rq, struct sg_io_hdr *hdr,
/*
* fill in all the output members
*/
- hdr->status = rq->errors & 0xff;
- hdr->masked_status = status_byte(rq->errors);
- hdr->msg_status = msg_byte(rq->errors);
- hdr->host_status = host_byte(rq->errors);
- hdr->driver_status = driver_byte(rq->errors);
+ hdr->status = req->result & 0xff;
+ hdr->masked_status = status_byte(req->result);
+ hdr->msg_status = msg_byte(req->result);
+ hdr->host_status = host_byte(req->result);
+ hdr->driver_status = driver_byte(req->result);
hdr->info = 0;
if (hdr->masked_status || hdr->host_status || hdr->driver_status)
hdr->info |= SG_INFO_CHECK;
@@ -362,7 +362,7 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk,
goto out_free_cdb;
bio = rq->bio;
- rq->retries = 0;
+ req->retries = 0;
start_time = jiffies;
@@ -476,13 +476,13 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode,
goto error;
/* default. possible overriden later */
- rq->retries = 5;
+ req->retries = 5;
switch (opcode) {
case SEND_DIAGNOSTIC:
case FORMAT_UNIT:
rq->timeout = FORMAT_UNIT_TIMEOUT;
- rq->retries = 1;
+ req->retries = 1;
break;
case START_STOP:
rq->timeout = START_STOP_TIMEOUT;
@@ -495,7 +495,7 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode,
break;
case READ_DEFECT_DATA:
rq->timeout = READ_DEFECT_DATA_TIMEOUT;
- rq->retries = 1;
+ req->retries = 1;
break;
default:
rq->timeout = BLK_DEFAULT_SG_TIMEOUT;
@@ -509,7 +509,7 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode,
blk_execute_rq(q, disk, rq, 0);
- err = rq->errors & 0xff; /* only 8 bit SCSI status */
+ err = req->result & 0xff; /* only 8 bit SCSI status */
if (err) {
if (req->sense_len && req->sense) {
bytes = (OMAX_SB_LEN > req->sense_len) ?
@@ -547,7 +547,8 @@ static int __blk_send_generic(struct request_queue *q, struct gendisk *bd_disk,
scsi_req(rq)->cmd[0] = cmd;
scsi_req(rq)->cmd[4] = data;
scsi_req(rq)->cmd_len = 6;
- err = blk_execute_rq(q, bd_disk, rq, 0);
+ blk_execute_rq(q, bd_disk, rq, 0);
+ err = scsi_req(rq)->result ? -EIO : 0;
blk_put_request(rq);
return err;
diff --git a/block/sed-opal.c b/block/sed-opal.c
index 14035f826b5e..9b30ae5ab843 100644
--- a/block/sed-opal.c
+++ b/block/sed-opal.c
@@ -275,8 +275,8 @@ static bool check_tper(const void *data)
u8 flags = tper->supported_features;
if (!(flags & TPER_SYNC_SUPPORTED)) {
- pr_err("TPer sync not supported. flags = %d\n",
- tper->supported_features);
+ pr_debug("TPer sync not supported. flags = %d\n",
+ tper->supported_features);
return false;
}
@@ -289,7 +289,7 @@ static bool check_sum(const void *data)
u32 nlo = be32_to_cpu(sum->num_locking_objects);
if (nlo == 0) {
- pr_err("Need at least one locking object.\n");
+ pr_debug("Need at least one locking object.\n");
return false;
}
@@ -385,9 +385,9 @@ static int next(struct opal_dev *dev)
error = step->fn(dev, step->data);
if (error) {
- pr_err("Error on step function: %d with error %d: %s\n",
- state, error,
- opal_error_to_human(error));
+ pr_debug("Error on step function: %d with error %d: %s\n",
+ state, error,
+ opal_error_to_human(error));
/* For each OPAL command we do a discovery0 then we
* start some sort of session.
@@ -419,8 +419,8 @@ static int opal_discovery0_end(struct opal_dev *dev)
print_buffer(dev->resp, hlen);
if (hlen > IO_BUFFER_LENGTH - sizeof(*hdr)) {
- pr_warn("Discovery length overflows buffer (%zu+%u)/%u\n",
- sizeof(*hdr), hlen, IO_BUFFER_LENGTH);
+ pr_debug("Discovery length overflows buffer (%zu+%u)/%u\n",
+ sizeof(*hdr), hlen, IO_BUFFER_LENGTH);
return -EFAULT;
}
@@ -503,7 +503,7 @@ static void add_token_u8(int *err, struct opal_dev *cmd, u8 tok)
if (*err)
return;
if (cmd->pos >= IO_BUFFER_LENGTH - 1) {
- pr_err("Error adding u8: end of buffer.\n");
+ pr_debug("Error adding u8: end of buffer.\n");
*err = -ERANGE;
return;
}
@@ -553,7 +553,7 @@ static void add_token_u64(int *err, struct opal_dev *cmd, u64 number)
len = DIV_ROUND_UP(msb, 4);
if (cmd->pos >= IO_BUFFER_LENGTH - len - 1) {
- pr_err("Error adding u64: end of buffer.\n");
+ pr_debug("Error adding u64: end of buffer.\n");
*err = -ERANGE;
return;
}
@@ -579,7 +579,7 @@ static void add_token_bytestring(int *err, struct opal_dev *cmd,
}
if (len >= IO_BUFFER_LENGTH - cmd->pos - header_len) {
- pr_err("Error adding bytestring: end of buffer.\n");
+ pr_debug("Error adding bytestring: end of buffer.\n");
*err = -ERANGE;
return;
}
@@ -597,7 +597,7 @@ static void add_token_bytestring(int *err, struct opal_dev *cmd,
static int build_locking_range(u8 *buffer, size_t length, u8 lr)
{
if (length > OPAL_UID_LENGTH) {
- pr_err("Can't build locking range. Length OOB\n");
+ pr_debug("Can't build locking range. Length OOB\n");
return -ERANGE;
}
@@ -614,7 +614,7 @@ static int build_locking_range(u8 *buffer, size_t length, u8 lr)
static int build_locking_user(u8 *buffer, size_t length, u8 lr)
{
if (length > OPAL_UID_LENGTH) {
- pr_err("Can't build locking range user, Length OOB\n");
+ pr_debug("Can't build locking range user, Length OOB\n");
return -ERANGE;
}
@@ -648,7 +648,7 @@ static int cmd_finalize(struct opal_dev *cmd, u32 hsn, u32 tsn)
add_token_u8(&err, cmd, OPAL_ENDLIST);
if (err) {
- pr_err("Error finalizing command.\n");
+ pr_debug("Error finalizing command.\n");
return -EFAULT;
}
@@ -660,7 +660,7 @@ static int cmd_finalize(struct opal_dev *cmd, u32 hsn, u32 tsn)
hdr->subpkt.length = cpu_to_be32(cmd->pos - sizeof(*hdr));
while (cmd->pos % 4) {
if (cmd->pos >= IO_BUFFER_LENGTH) {
- pr_err("Error: Buffer overrun\n");
+ pr_debug("Error: Buffer overrun\n");
return -ERANGE;
}
cmd->cmd[cmd->pos++] = 0;
@@ -679,14 +679,14 @@ static const struct opal_resp_tok *response_get_token(
const struct opal_resp_tok *tok;
if (n >= resp->num) {
- pr_err("Token number doesn't exist: %d, resp: %d\n",
- n, resp->num);
+ pr_debug("Token number doesn't exist: %d, resp: %d\n",
+ n, resp->num);
return ERR_PTR(-EINVAL);
}
tok = &resp->toks[n];
if (tok->len == 0) {
- pr_err("Token length must be non-zero\n");
+ pr_debug("Token length must be non-zero\n");
return ERR_PTR(-EINVAL);
}
@@ -727,7 +727,7 @@ static ssize_t response_parse_short(struct opal_resp_tok *tok,
tok->type = OPAL_DTA_TOKENID_UINT;
if (tok->len > 9) {
- pr_warn("uint64 with more than 8 bytes\n");
+ pr_debug("uint64 with more than 8 bytes\n");
return -EINVAL;
}
for (i = tok->len - 1; i > 0; i--) {
@@ -814,8 +814,8 @@ static int response_parse(const u8 *buf, size_t length,
if (clen == 0 || plen == 0 || slen == 0 ||
slen > IO_BUFFER_LENGTH - sizeof(*hdr)) {
- pr_err("Bad header length. cp: %u, pkt: %u, subpkt: %u\n",
- clen, plen, slen);
+ pr_debug("Bad header length. cp: %u, pkt: %u, subpkt: %u\n",
+ clen, plen, slen);
print_buffer(pos, sizeof(*hdr));
return -EINVAL;
}
@@ -848,7 +848,7 @@ static int response_parse(const u8 *buf, size_t length,
}
if (num_entries == 0) {
- pr_err("Couldn't parse response.\n");
+ pr_debug("Couldn't parse response.\n");
return -EINVAL;
}
resp->num = num_entries;
@@ -861,18 +861,18 @@ static size_t response_get_string(const struct parsed_resp *resp, int n,
{
*store = NULL;
if (!resp) {
- pr_err("Response is NULL\n");
+ pr_debug("Response is NULL\n");
return 0;
}
if (n > resp->num) {
- pr_err("Response has %d tokens. Can't access %d\n",
- resp->num, n);
+ pr_debug("Response has %d tokens. Can't access %d\n",
+ resp->num, n);
return 0;
}
if (resp->toks[n].type != OPAL_DTA_TOKENID_BYTESTRING) {
- pr_err("Token is not a byte string!\n");
+ pr_debug("Token is not a byte string!\n");
return 0;
}
@@ -883,26 +883,26 @@ static size_t response_get_string(const struct parsed_resp *resp, int n,
static u64 response_get_u64(const struct parsed_resp *resp, int n)
{
if (!resp) {
- pr_err("Response is NULL\n");
+ pr_debug("Response is NULL\n");
return 0;
}
if (n > resp->num) {
- pr_err("Response has %d tokens. Can't access %d\n",
- resp->num, n);
+ pr_debug("Response has %d tokens. Can't access %d\n",
+ resp->num, n);
return 0;
}
if (resp->toks[n].type != OPAL_DTA_TOKENID_UINT) {
- pr_err("Token is not unsigned it: %d\n",
- resp->toks[n].type);
+ pr_debug("Token is not unsigned it: %d\n",
+ resp->toks[n].type);
return 0;
}
if (!(resp->toks[n].width == OPAL_WIDTH_TINY ||
resp->toks[n].width == OPAL_WIDTH_SHORT)) {
- pr_err("Atom is not short or tiny: %d\n",
- resp->toks[n].width);
+ pr_debug("Atom is not short or tiny: %d\n",
+ resp->toks[n].width);
return 0;
}
@@ -949,7 +949,7 @@ static int parse_and_check_status(struct opal_dev *dev)
error = response_parse(dev->resp, IO_BUFFER_LENGTH, &dev->parsed);
if (error) {
- pr_err("Couldn't parse response.\n");
+ pr_debug("Couldn't parse response.\n");
return error;
}
@@ -975,7 +975,7 @@ static int start_opal_session_cont(struct opal_dev *dev)
tsn = response_get_u64(&dev->parsed, 5);
if (hsn == 0 && tsn == 0) {
- pr_err("Couldn't authenticate session\n");
+ pr_debug("Couldn't authenticate session\n");
return -EPERM;
}
@@ -1012,7 +1012,7 @@ static int finalize_and_send(struct opal_dev *dev, cont_fn cont)
ret = cmd_finalize(dev, dev->hsn, dev->tsn);
if (ret) {
- pr_err("Error finalizing command buffer: %d\n", ret);
+ pr_debug("Error finalizing command buffer: %d\n", ret);
return ret;
}
@@ -1041,7 +1041,7 @@ static int gen_key(struct opal_dev *dev, void *data)
add_token_u8(&err, dev, OPAL_ENDLIST);
if (err) {
- pr_err("Error building gen key command\n");
+ pr_debug("Error building gen key command\n");
return err;
}
@@ -1059,8 +1059,8 @@ static int get_active_key_cont(struct opal_dev *dev)
return error;
keylen = response_get_string(&dev->parsed, 4, &activekey);
if (!activekey) {
- pr_err("%s: Couldn't extract the Activekey from the response\n",
- __func__);
+ pr_debug("%s: Couldn't extract the Activekey from the response\n",
+ __func__);
return OPAL_INVAL_PARAM;
}
dev->prev_data = kmemdup(activekey, keylen, GFP_KERNEL);
@@ -1103,7 +1103,7 @@ static int get_active_key(struct opal_dev *dev, void *data)
add_token_u8(&err, dev, OPAL_ENDLIST);
add_token_u8(&err, dev, OPAL_ENDLIST);
if (err) {
- pr_err("Error building get active key command\n");
+ pr_debug("Error building get active key command\n");
return err;
}
@@ -1159,7 +1159,7 @@ static inline int enable_global_lr(struct opal_dev *dev, u8 *uid,
err = generic_lr_enable_disable(dev, uid, !!setup->RLE, !!setup->WLE,
0, 0);
if (err)
- pr_err("Failed to create enable global lr command\n");
+ pr_debug("Failed to create enable global lr command\n");
return err;
}
@@ -1217,7 +1217,7 @@ static int setup_locking_range(struct opal_dev *dev, void *data)
}
if (err) {
- pr_err("Error building Setup Locking range command.\n");
+ pr_debug("Error building Setup Locking range command.\n");
return err;
}
@@ -1234,11 +1234,8 @@ static int start_generic_opal_session(struct opal_dev *dev,
u32 hsn;
int err = 0;
- if (key == NULL && auth != OPAL_ANYBODY_UID) {
- pr_err("%s: Attempted to open ADMIN_SP Session without a Host" \
- "Challenge, and not as the Anybody UID\n", __func__);
+ if (key == NULL && auth != OPAL_ANYBODY_UID)
return OPAL_INVAL_PARAM;
- }
clear_opal_cmd(dev);
@@ -1273,12 +1270,12 @@ static int start_generic_opal_session(struct opal_dev *dev,
add_token_u8(&err, dev, OPAL_ENDLIST);
break;
default:
- pr_err("Cannot start Admin SP session with auth %d\n", auth);
+ pr_debug("Cannot start Admin SP session with auth %d\n", auth);
return OPAL_INVAL_PARAM;
}
if (err) {
- pr_err("Error building start adminsp session command.\n");
+ pr_debug("Error building start adminsp session command.\n");
return err;
}
@@ -1369,7 +1366,7 @@ static int start_auth_opal_session(struct opal_dev *dev, void *data)
add_token_u8(&err, dev, OPAL_ENDLIST);
if (err) {
- pr_err("Error building STARTSESSION command.\n");
+ pr_debug("Error building STARTSESSION command.\n");
return err;
}
@@ -1391,7 +1388,7 @@ static int revert_tper(struct opal_dev *dev, void *data)
add_token_u8(&err, dev, OPAL_STARTLIST);
add_token_u8(&err, dev, OPAL_ENDLIST);
if (err) {
- pr_err("Error building REVERT TPER command.\n");
+ pr_debug("Error building REVERT TPER command.\n");
return err;
}
@@ -1426,7 +1423,7 @@ static int internal_activate_user(struct opal_dev *dev, void *data)
add_token_u8(&err, dev, OPAL_ENDLIST);
if (err) {
- pr_err("Error building Activate UserN command.\n");
+ pr_debug("Error building Activate UserN command.\n");
return err;
}
@@ -1453,7 +1450,7 @@ static int erase_locking_range(struct opal_dev *dev, void *data)
add_token_u8(&err, dev, OPAL_ENDLIST);
if (err) {
- pr_err("Error building Erase Locking Range Command.\n");
+ pr_debug("Error building Erase Locking Range Command.\n");
return err;
}
return finalize_and_send(dev, parse_and_check_status);
@@ -1484,7 +1481,7 @@ static int set_mbr_done(struct opal_dev *dev, void *data)
add_token_u8(&err, dev, OPAL_ENDLIST);
if (err) {
- pr_err("Error Building set MBR Done command\n");
+ pr_debug("Error Building set MBR Done command\n");
return err;
}
@@ -1516,7 +1513,7 @@ static int set_mbr_enable_disable(struct opal_dev *dev, void *data)
add_token_u8(&err, dev, OPAL_ENDLIST);
if (err) {
- pr_err("Error Building set MBR done command\n");
+ pr_debug("Error Building set MBR done command\n");
return err;
}
@@ -1567,7 +1564,7 @@ static int set_new_pw(struct opal_dev *dev, void *data)
if (generic_pw_cmd(usr->opal_key.key, usr->opal_key.key_len,
cpin_uid, dev)) {
- pr_err("Error building set password command.\n");
+ pr_debug("Error building set password command.\n");
return -ERANGE;
}
@@ -1582,7 +1579,7 @@ static int set_sid_cpin_pin(struct opal_dev *dev, void *data)
memcpy(cpin_uid, opaluid[OPAL_C_PIN_SID], OPAL_UID_LENGTH);
if (generic_pw_cmd(key->key, key->key_len, cpin_uid, dev)) {
- pr_err("Error building Set SID cpin\n");
+ pr_debug("Error building Set SID cpin\n");
return -ERANGE;
}
return finalize_and_send(dev, parse_and_check_status);
@@ -1657,7 +1654,7 @@ static int add_user_to_lr(struct opal_dev *dev, void *data)
add_token_u8(&err, dev, OPAL_ENDLIST);
if (err) {
- pr_err("Error building add user to locking range command.\n");
+ pr_debug("Error building add user to locking range command.\n");
return err;
}
@@ -1691,7 +1688,7 @@ static int lock_unlock_locking_range(struct opal_dev *dev, void *data)
/* vars are initalized to locked */
break;
default:
- pr_err("Tried to set an invalid locking state... returning to uland\n");
+ pr_debug("Tried to set an invalid locking state... returning to uland\n");
return OPAL_INVAL_PARAM;
}
@@ -1718,7 +1715,7 @@ static int lock_unlock_locking_range(struct opal_dev *dev, void *data)
add_token_u8(&err, dev, OPAL_ENDLIST);
if (err) {
- pr_err("Error building SET command.\n");
+ pr_debug("Error building SET command.\n");
return err;
}
return finalize_and_send(dev, parse_and_check_status);
@@ -1752,14 +1749,14 @@ static int lock_unlock_locking_range_sum(struct opal_dev *dev, void *data)
/* vars are initalized to locked */
break;
default:
- pr_err("Tried to set an invalid locking state.\n");
+ pr_debug("Tried to set an invalid locking state.\n");
return OPAL_INVAL_PARAM;
}
ret = generic_lr_enable_disable(dev, lr_buffer, 1, 1,
read_locked, write_locked);
if (ret < 0) {
- pr_err("Error building SET command.\n");
+ pr_debug("Error building SET command.\n");
return ret;
}
return finalize_and_send(dev, parse_and_check_status);
@@ -1811,7 +1808,7 @@ static int activate_lsp(struct opal_dev *dev, void *data)
}
if (err) {
- pr_err("Error building Activate LockingSP command.\n");
+ pr_debug("Error building Activate LockingSP command.\n");
return err;
}
@@ -1831,7 +1828,7 @@ static int get_lsp_lifecycle_cont(struct opal_dev *dev)
/* 0x08 is Manufacured Inactive */
/* 0x09 is Manufactured */
if (lc_status != OPAL_MANUFACTURED_INACTIVE) {
- pr_err("Couldn't determine the status of the Lifcycle state\n");
+ pr_debug("Couldn't determine the status of the Lifecycle state\n");
return -ENODEV;
}
@@ -1868,7 +1865,7 @@ static int get_lsp_lifecycle(struct opal_dev *dev, void *data)
add_token_u8(&err, dev, OPAL_ENDLIST);
if (err) {
- pr_err("Error Building GET Lifecycle Status command\n");
+ pr_debug("Error Building GET Lifecycle Status command\n");
return err;
}
@@ -1887,7 +1884,7 @@ static int get_msid_cpin_pin_cont(struct opal_dev *dev)
strlen = response_get_string(&dev->parsed, 4, &msid_pin);
if (!msid_pin) {
- pr_err("%s: Couldn't extract PIN from response\n", __func__);
+ pr_debug("%s: Couldn't extract PIN from response\n", __func__);
return OPAL_INVAL_PARAM;
}
@@ -1929,7 +1926,7 @@ static int get_msid_cpin_pin(struct opal_dev *dev, void *data)
add_token_u8(&err, dev, OPAL_ENDLIST);
if (err) {
- pr_err("Error building Get MSID CPIN PIN command.\n");
+ pr_debug("Error building Get MSID CPIN PIN command.\n");
return err;
}
@@ -2124,18 +2121,18 @@ static int opal_add_user_to_lr(struct opal_dev *dev,
if (lk_unlk->l_state != OPAL_RO &&
lk_unlk->l_state != OPAL_RW) {
- pr_err("Locking state was not RO or RW\n");
+ pr_debug("Locking state was not RO or RW\n");
return -EINVAL;
}
if (lk_unlk->session.who < OPAL_USER1 ||
lk_unlk->session.who > OPAL_USER9) {
- pr_err("Authority was not within the range of users: %d\n",
- lk_unlk->session.who);
+ pr_debug("Authority was not within the range of users: %d\n",
+ lk_unlk->session.who);
return -EINVAL;
}
if (lk_unlk->session.sum) {
- pr_err("%s not supported in sum. Use setup locking range\n",
- __func__);
+ pr_debug("%s not supported in sum. Use setup locking range\n",
+ __func__);
return -EINVAL;
}
@@ -2312,7 +2309,7 @@ static int opal_activate_user(struct opal_dev *dev,
/* We can't activate Admin1 it's active as manufactured */
if (opal_session->who < OPAL_USER1 ||
opal_session->who > OPAL_USER9) {
- pr_err("Who was not a valid user: %d\n", opal_session->who);
+ pr_debug("Who was not a valid user: %d\n", opal_session->who);
return -EINVAL;
}
@@ -2343,9 +2340,9 @@ bool opal_unlock_from_suspend(struct opal_dev *dev)
ret = __opal_lock_unlock(dev, &suspend->unlk);
if (ret) {
- pr_warn("Failed to unlock LR %hhu with sum %d\n",
- suspend->unlk.session.opal_key.lr,
- suspend->unlk.session.sum);
+ pr_debug("Failed to unlock LR %hhu with sum %d\n",
+ suspend->unlk.session.opal_key.lr,
+ suspend->unlk.session.sum);
was_failure = true;
}
}
@@ -2363,10 +2360,8 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg)
return -EACCES;
if (!dev)
return -ENOTSUPP;
- if (!dev->supported) {
- pr_err("Not supported\n");
+ if (!dev->supported)
return -ENOTSUPP;
- }
p = memdup_user(arg, _IOC_SIZE(cmd));
if (IS_ERR(p))
@@ -2410,7 +2405,7 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg)
ret = opal_secure_erase_locking_range(dev, p);
break;
default:
- pr_warn("No such Opal Ioctl %u\n", cmd);
+ break;
}
kfree(p);
diff --git a/block/t10-pi.c b/block/t10-pi.c
index 2c97912335a9..680c6d636298 100644
--- a/block/t10-pi.c
+++ b/block/t10-pi.c
@@ -160,28 +160,28 @@ static int t10_pi_type3_verify_ip(struct blk_integrity_iter *iter)
return t10_pi_verify(iter, t10_pi_ip_fn, 3);
}
-struct blk_integrity_profile t10_pi_type1_crc = {
+const struct blk_integrity_profile t10_pi_type1_crc = {
.name = "T10-DIF-TYPE1-CRC",
.generate_fn = t10_pi_type1_generate_crc,
.verify_fn = t10_pi_type1_verify_crc,
};
EXPORT_SYMBOL(t10_pi_type1_crc);
-struct blk_integrity_profile t10_pi_type1_ip = {
+const struct blk_integrity_profile t10_pi_type1_ip = {
.name = "T10-DIF-TYPE1-IP",
.generate_fn = t10_pi_type1_generate_ip,
.verify_fn = t10_pi_type1_verify_ip,
};
EXPORT_SYMBOL(t10_pi_type1_ip);
-struct blk_integrity_profile t10_pi_type3_crc = {
+const struct blk_integrity_profile t10_pi_type3_crc = {
.name = "T10-DIF-TYPE3-CRC",
.generate_fn = t10_pi_type3_generate_crc,
.verify_fn = t10_pi_type3_verify_crc,
};
EXPORT_SYMBOL(t10_pi_type3_crc);
-struct blk_integrity_profile t10_pi_type3_ip = {
+const struct blk_integrity_profile t10_pi_type3_ip = {
.name = "T10-DIF-TYPE3-IP",
.generate_fn = t10_pi_type3_generate_ip,
.verify_fn = t10_pi_type3_verify_ip,
diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig
index 83e5f7e1a20d..c6db511b5f73 100644
--- a/drivers/acpi/Kconfig
+++ b/drivers/acpi/Kconfig
@@ -256,7 +256,7 @@ config ACPI_PROCESSOR
config ACPI_IPMI
tristate "IPMI"
- depends on IPMI_SI
+ depends on IPMI_HANDLER
default n
help
This driver enables the ACPI to access the BMC controller. And it
@@ -469,9 +469,8 @@ config ACPI_WATCHDOG
config ACPI_EXTLOG
tristate "Extended Error Log support"
- depends on X86_MCE && X86_LOCAL_APIC
+ depends on X86_MCE && X86_LOCAL_APIC && EDAC
select UEFI_CPER
- select RAS
default n
help
Certain usages such as Predictive Failure Analysis (PFA) require
diff --git a/drivers/acpi/acpi_extlog.c b/drivers/acpi/acpi_extlog.c
index a15270a806fc..502ea4dc2080 100644
--- a/drivers/acpi/acpi_extlog.c
+++ b/drivers/acpi/acpi_extlog.c
@@ -229,7 +229,7 @@ static int __init extlog_init(void)
if (!(cap & MCG_ELOG_P) || !extlog_get_l1addr())
return -ENODEV;
- if (get_edac_report_status() == EDAC_REPORTING_FORCE) {
+ if (edac_get_report_status() == EDAC_REPORTING_FORCE) {
pr_warn("Not loading eMCA, error reporting force-enabled through EDAC.\n");
return -EPERM;
}
@@ -285,8 +285,8 @@ static int __init extlog_init(void)
* eMCA event report method has higher priority than EDAC method,
* unless EDAC event report method is mandatory.
*/
- old_edac_report_status = get_edac_report_status();
- set_edac_report_status(EDAC_REPORTING_DISABLED);
+ old_edac_report_status = edac_get_report_status();
+ edac_set_report_status(EDAC_REPORTING_DISABLED);
mce_register_decode_chain(&extlog_mce_dec);
/* enable OS to be involved to take over management from BIOS */
((struct extlog_l1_head *)extlog_l1_addr)->flags |= FLAG_OS_OPTIN;
@@ -308,7 +308,7 @@ err:
static void __exit extlog_exit(void)
{
- set_edac_report_status(old_edac_report_status);
+ edac_set_report_status(old_edac_report_status);
mce_unregister_decode_chain(&extlog_mce_dec);
((struct extlog_l1_head *)extlog_l1_addr)->flags &= ~FLAG_OS_OPTIN;
if (extlog_l1_addr)
diff --git a/drivers/acpi/acpi_ipmi.c b/drivers/acpi/acpi_ipmi.c
index 747c2ba98534..1b64419e2fec 100644
--- a/drivers/acpi/acpi_ipmi.c
+++ b/drivers/acpi/acpi_ipmi.c
@@ -429,8 +429,7 @@ static void ipmi_msg_handler(struct ipmi_recv_msg *msg, void *user_msg_data)
if (msg->recv_type == IPMI_RESPONSE_RECV_TYPE &&
msg->msg.data_len == 1) {
if (msg->msg.data[0] == IPMI_TIMEOUT_COMPLETION_CODE) {
- dev_WARN_ONCE(dev, true,
- "Unexpected response (timeout).\n");
+ dev_dbg_once(dev, "Unexpected response (timeout).\n");
tx_msg->msg_done = ACPI_IPMI_TIMEOUT;
}
goto out_comp;
diff --git a/drivers/ata/Kconfig b/drivers/ata/Kconfig
index 70b57d2229d6..ff6cb9e4c381 100644
--- a/drivers/ata/Kconfig
+++ b/drivers/ata/Kconfig
@@ -14,7 +14,6 @@ menuconfig ATA
tristate "Serial ATA and Parallel ATA drivers (libata)"
depends on HAS_IOMEM
depends on BLOCK
- depends on !(M32R || S390) || BROKEN
select SCSI
select GLOB
---help---
@@ -118,6 +117,15 @@ config AHCI_DA850
If unsure, say N.
+config AHCI_DM816
+ tristate "DaVinci DM816 AHCI SATA support"
+ depends on ARCH_OMAP2PLUS
+ help
+ This option enables support for the DaVinci DM816 SoC's
+ onboard AHCI SATA controller.
+
+ If unsure, say N.
+
config AHCI_ST
tristate "ST AHCI SATA support"
depends on ARCH_STI
@@ -885,14 +893,6 @@ config PATA_AT32
If unsure, say N.
-config PATA_AT91
- tristate "PATA support for AT91SAM9260"
- depends on ARM && SOC_AT91SAM9
- help
- This option enables support for IDE devices on the Atmel AT91SAM9260 SoC.
-
- If unsure, say N.
-
config PATA_CMD640_PCI
tristate "CMD640 PCI PATA support (Experimental)"
depends on PCI
diff --git a/drivers/ata/Makefile b/drivers/ata/Makefile
index 89a0a1915d36..3048cc100a46 100644
--- a/drivers/ata/Makefile
+++ b/drivers/ata/Makefile
@@ -14,6 +14,7 @@ obj-$(CONFIG_SATA_HIGHBANK) += sata_highbank.o libahci.o
obj-$(CONFIG_AHCI_BRCM) += ahci_brcm.o libahci.o libahci_platform.o
obj-$(CONFIG_AHCI_CEVA) += ahci_ceva.o libahci.o libahci_platform.o
obj-$(CONFIG_AHCI_DA850) += ahci_da850.o libahci.o libahci_platform.o
+obj-$(CONFIG_AHCI_DM816) += ahci_dm816.o libahci.o libahci_platform.o
obj-$(CONFIG_AHCI_IMX) += ahci_imx.o libahci.o libahci_platform.o
obj-$(CONFIG_AHCI_MVEBU) += ahci_mvebu.o libahci.o libahci_platform.o
obj-$(CONFIG_AHCI_OCTEON) += ahci_octeon.o
@@ -91,7 +92,6 @@ obj-$(CONFIG_PATA_WINBOND) += pata_sl82c105.o
# SFF PIO only
obj-$(CONFIG_PATA_AT32) += pata_at32.o
-obj-$(CONFIG_PATA_AT91) += pata_at91.o
obj-$(CONFIG_PATA_CMD640_PCI) += pata_cmd640.o
obj-$(CONFIG_PATA_FALCON) += pata_falcon.o
obj-$(CONFIG_PATA_ISAPNP) += pata_isapnp.o
diff --git a/drivers/ata/ahci_dm816.c b/drivers/ata/ahci_dm816.c
new file mode 100644
index 000000000000..fbd827c3a75c
--- /dev/null
+++ b/drivers/ata/ahci_dm816.c
@@ -0,0 +1,200 @@
+/*
+ * DaVinci DM816 AHCI SATA platform driver
+ *
+ * Copyright (C) 2017 BayLibre SAS
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/pm.h>
+#include <linux/platform_device.h>
+#include <linux/libata.h>
+#include <linux/ahci_platform.h>
+
+#include "ahci.h"
+
+#define AHCI_DM816_DRV_NAME "ahci-dm816"
+
+#define AHCI_DM816_PHY_ENPLL(x) ((x) << 0)
+#define AHCI_DM816_PHY_MPY(x) ((x) << 1)
+#define AHCI_DM816_PHY_LOS(x) ((x) << 12)
+#define AHCI_DM816_PHY_RXCDR(x) ((x) << 13)
+#define AHCI_DM816_PHY_RXEQ(x) ((x) << 16)
+#define AHCI_DM816_PHY_TXSWING(x) ((x) << 23)
+
+#define AHCI_DM816_P0PHYCR_REG 0x178
+#define AHCI_DM816_P1PHYCR_REG 0x1f8
+
+#define AHCI_DM816_PLL_OUT 1500000000LU
+
+static const unsigned long pll_mpy_table[] = {
+ 400, 500, 600, 800, 825, 1000, 1200,
+ 1250, 1500, 1600, 1650, 2000, 2200, 2500
+};
+
+static int ahci_dm816_get_mpy_bits(unsigned long refclk_rate)
+{
+ unsigned long pll_multiplier;
+ int i;
+
+ /*
+ * We need to determine the value of the multiplier (MPY) bits.
+ * In order to include the 8.25 multiplier we need to first divide
+ * the refclk rate by 100.
+ */
+ pll_multiplier = AHCI_DM816_PLL_OUT / (refclk_rate / 100);
+
+ for (i = 0; i < ARRAY_SIZE(pll_mpy_table); i++) {
+ if (pll_mpy_table[i] == pll_multiplier)
+ return i;
+ }
+
+ /*
+ * We should have divided evenly - if not, return an invalid
+ * value.
+ */
+ return -1;
+}
+
+static int ahci_dm816_phy_init(struct ahci_host_priv *hpriv, struct device *dev)
+{
+ unsigned long refclk_rate;
+ int mpy;
+ u32 val;
+
+ /*
+ * We should have been supplied two clocks: the functional and
+ * keep-alive clock and the external reference clock. We need the
+ * rate of the latter to calculate the correct value of MPY bits.
+ */
+ if (!hpriv->clks[1]) {
+ dev_err(dev, "reference clock not supplied\n");
+ return -EINVAL;
+ }
+
+ refclk_rate = clk_get_rate(hpriv->clks[1]);
+ if ((refclk_rate % 100) != 0) {
+ dev_err(dev, "reference clock rate must be divisible by 100\n");
+ return -EINVAL;
+ }
+
+ mpy = ahci_dm816_get_mpy_bits(refclk_rate);
+ if (mpy < 0) {
+ dev_err(dev, "can't calculate the MPY bits value\n");
+ return -EINVAL;
+ }
+
+ /* Enable the PHY and configure the first HBA port. */
+ val = AHCI_DM816_PHY_MPY(mpy) | AHCI_DM816_PHY_LOS(1) |
+ AHCI_DM816_PHY_RXCDR(4) | AHCI_DM816_PHY_RXEQ(1) |
+ AHCI_DM816_PHY_TXSWING(3) | AHCI_DM816_PHY_ENPLL(1);
+ writel(val, hpriv->mmio + AHCI_DM816_P0PHYCR_REG);
+
+ /* Configure the second HBA port. */
+ val = AHCI_DM816_PHY_LOS(1) | AHCI_DM816_PHY_RXCDR(4) |
+ AHCI_DM816_PHY_RXEQ(1) | AHCI_DM816_PHY_TXSWING(3);
+ writel(val, hpriv->mmio + AHCI_DM816_P1PHYCR_REG);
+
+ return 0;
+}
+
+static int ahci_dm816_softreset(struct ata_link *link,
+ unsigned int *class, unsigned long deadline)
+{
+ int pmp, ret;
+
+ pmp = sata_srst_pmp(link);
+
+ /*
+ * There's an issue with the SATA controller on DM816 SoC: if we
+ * enable Port Multiplier support, but the drive is connected directly
+ * to the board, it can't be detected. As a workaround: if PMP is
+ * enabled, we first call ahci_do_softreset() and pass it the result of
+ * sata_srst_pmp(). If this call fails, we retry with pmp = 0.
+ */
+ ret = ahci_do_softreset(link, class, pmp, deadline, ahci_check_ready);
+ if (pmp && ret == -EBUSY)
+ return ahci_do_softreset(link, class, 0,
+ deadline, ahci_check_ready);
+
+ return ret;
+}
+
+static struct ata_port_operations ahci_dm816_port_ops = {
+ .inherits = &ahci_platform_ops,
+ .softreset = ahci_dm816_softreset,
+};
+
+static const struct ata_port_info ahci_dm816_port_info = {
+ .flags = AHCI_FLAG_COMMON,
+ .pio_mask = ATA_PIO4,
+ .udma_mask = ATA_UDMA6,
+ .port_ops = &ahci_dm816_port_ops,
+};
+
+static struct scsi_host_template ahci_dm816_platform_sht = {
+ AHCI_SHT(AHCI_DM816_DRV_NAME),
+};
+
+static int ahci_dm816_probe(struct platform_device *pdev)
+{
+ struct device *dev = &pdev->dev;
+ struct ahci_host_priv *hpriv;
+ int rc;
+
+ hpriv = ahci_platform_get_resources(pdev);
+ if (IS_ERR(hpriv))
+ return PTR_ERR(hpriv);
+
+ rc = ahci_platform_enable_resources(hpriv);
+ if (rc)
+ return rc;
+
+ rc = ahci_dm816_phy_init(hpriv, dev);
+ if (rc)
+ goto disable_resources;
+
+ rc = ahci_platform_init_host(pdev, hpriv,
+ &ahci_dm816_port_info,
+ &ahci_dm816_platform_sht);
+ if (rc)
+ goto disable_resources;
+
+ return 0;
+
+disable_resources:
+ ahci_platform_disable_resources(hpriv);
+
+ return rc;
+}
+
+static SIMPLE_DEV_PM_OPS(ahci_dm816_pm_ops,
+ ahci_platform_suspend,
+ ahci_platform_resume);
+
+static const struct of_device_id ahci_dm816_of_match[] = {
+ { .compatible = "ti,dm816-ahci", },
+ { },
+};
+MODULE_DEVICE_TABLE(of, ahci_dm816_of_match);
+
+static struct platform_driver ahci_dm816_driver = {
+ .probe = ahci_dm816_probe,
+ .remove = ata_platform_remove_one,
+ .driver = {
+ .name = AHCI_DM816_DRV_NAME,
+ .of_match_table = ahci_dm816_of_match,
+ .pm = &ahci_dm816_pm_ops,
+ },
+};
+module_platform_driver(ahci_dm816_driver);
+
+MODULE_DESCRIPTION("DaVinci DM816 AHCI SATA platform driver");
+MODULE_AUTHOR("Bartosz Golaszewski <bgolaszewski@baylibre.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/ata/ahci_octeon.c b/drivers/ata/ahci_octeon.c
index ea865fe953e1..5a44e089c6bb 100644
--- a/drivers/ata/ahci_octeon.c
+++ b/drivers/ata/ahci_octeon.c
@@ -38,11 +38,6 @@ static int ahci_octeon_probe(struct platform_device *pdev)
int ret;
res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
- if (!res) {
- dev_err(&pdev->dev, "Platform resource[0] is missing\n");
- return -ENODEV;
- }
-
base = devm_ioremap_resource(&pdev->dev, res);
if (IS_ERR(base))
return PTR_ERR(base);
diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index ca75823697dd..2d83b8c75965 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -4910,7 +4910,7 @@ void ata_sg_init(struct ata_queued_cmd *qc, struct scatterlist *sg,
* LOCKING:
* spin_lock_irqsave(host lock)
*/
-void ata_sg_clean(struct ata_queued_cmd *qc)
+static void ata_sg_clean(struct ata_queued_cmd *qc)
{
struct ata_port *ap = qc->ap;
struct scatterlist *sg = qc->sg;
@@ -5902,9 +5902,9 @@ struct ata_port *ata_port_alloc(struct ata_host *host)
INIT_LIST_HEAD(&ap->eh_done_q);
init_waitqueue_head(&ap->eh_wait_q);
init_completion(&ap->park_req_pending);
- init_timer_deferrable(&ap->fastdrain_timer);
- ap->fastdrain_timer.function = ata_eh_fastdrain_timerfn;
- ap->fastdrain_timer.data = (unsigned long)ap;
+ setup_deferrable_timer(&ap->fastdrain_timer,
+ ata_eh_fastdrain_timerfn,
+ (unsigned long)ap);
ap->cbl = ATA_CBL_NONE;
diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
index 1ac70744ae7b..49ba9834c715 100644
--- a/drivers/ata/libata-scsi.c
+++ b/drivers/ata/libata-scsi.c
@@ -3393,46 +3393,6 @@ static size_t ata_format_dsm_trim_descr(struct scsi_cmnd *cmd, u32 trmax,
}
/**
- * ata_format_dsm_trim_descr() - SATL Write Same to ATA SCT Write Same
- * @cmd: SCSI command being translated
- * @lba: Starting sector
- * @num: Number of sectors to be zero'd.
- *
- * Rewrite the WRITE SAME payload to be an SCT Write Same formatted
- * descriptor.
- * NOTE: Writes a pattern (0's) in the foreground.
- *
- * Return: Number of bytes copied into sglist.
- */
-static size_t ata_format_sct_write_same(struct scsi_cmnd *cmd, u64 lba, u64 num)
-{
- struct scsi_device *sdp = cmd->device;
- size_t len = sdp->sector_size;
- size_t r;
- u16 *buf;
- unsigned long flags;
-
- spin_lock_irqsave(&ata_scsi_rbuf_lock, flags);
- buf = ((void *)ata_scsi_rbuf);
-
- put_unaligned_le16(0x0002, &buf[0]); /* SCT_ACT_WRITE_SAME */
- put_unaligned_le16(0x0101, &buf[1]); /* WRITE PTRN FG */
- put_unaligned_le64(lba, &buf[2]);
- put_unaligned_le64(num, &buf[6]);
- put_unaligned_le32(0u, &buf[10]); /* pattern */
-
- WARN_ON(len > ATA_SCSI_RBUF_SIZE);
-
- if (len > ATA_SCSI_RBUF_SIZE)
- len = ATA_SCSI_RBUF_SIZE;
-
- r = sg_copy_from_buffer(scsi_sglist(cmd), scsi_sg_count(cmd), buf, len);
- spin_unlock_irqrestore(&ata_scsi_rbuf_lock, flags);
-
- return r;
-}
-
-/**
* ata_scsi_write_same_xlat() - SATL Write Same to ATA SCT Write Same
* @qc: Command to be translated
*
@@ -3462,32 +3422,31 @@ static unsigned int ata_scsi_write_same_xlat(struct ata_queued_cmd *qc)
if (unlikely(!dev->dma_mode))
goto invalid_opcode;
+ /*
+ * We only allow sending this command through the block layer,
+ * as it modifies the DATA OUT buffer, which would corrupt user
+ * memory for SG_IO commands.
+ */
+ if (unlikely(blk_rq_is_passthrough(scmd->request)))
+ goto invalid_opcode;
+
if (unlikely(scmd->cmd_len < 16)) {
fp = 15;
goto invalid_fld;
}
scsi_16_lba_len(cdb, &block, &n_block);
- if (unmap) {
- /* If trim is not enabled the cmd is invalid. */
- if ((dev->horkage & ATA_HORKAGE_NOTRIM) ||
- !ata_id_has_trim(dev->id)) {
- fp = 1;
- bp = 3;
- goto invalid_fld;
- }
- /* If the request is too large the cmd is invalid */
- if (n_block > 0xffff * trmax) {
- fp = 2;
- goto invalid_fld;
- }
- } else {
- /* If write same is not available the cmd is invalid */
- if (!ata_id_sct_write_same(dev->id)) {
- fp = 1;
- bp = 3;
- goto invalid_fld;
- }
+ if (!unmap ||
+ (dev->horkage & ATA_HORKAGE_NOTRIM) ||
+ !ata_id_has_trim(dev->id)) {
+ fp = 1;
+ bp = 3;
+ goto invalid_fld;
+ }
+ /* If the request is too large the cmd is invalid */
+ if (n_block > 0xffff * trmax) {
+ fp = 2;
+ goto invalid_fld;
}
/*
@@ -3502,49 +3461,28 @@ static unsigned int ata_scsi_write_same_xlat(struct ata_queued_cmd *qc)
* For DATA SET MANAGEMENT TRIM in ACS-2 nsect (aka count)
* is defined as number of 512 byte blocks to be transferred.
*/
- if (unmap) {
- size = ata_format_dsm_trim_descr(scmd, trmax, block, n_block);
- if (size != len)
- goto invalid_param_len;
- if (ata_ncq_enabled(dev) && ata_fpdma_dsm_supported(dev)) {
- /* Newer devices support queued TRIM commands */
- tf->protocol = ATA_PROT_NCQ;
- tf->command = ATA_CMD_FPDMA_SEND;
- tf->hob_nsect = ATA_SUBCMD_FPDMA_SEND_DSM & 0x1f;
- tf->nsect = qc->tag << 3;
- tf->hob_feature = (size / 512) >> 8;
- tf->feature = size / 512;
+ size = ata_format_dsm_trim_descr(scmd, trmax, block, n_block);
+ if (size != len)
+ goto invalid_param_len;
- tf->auxiliary = 1;
- } else {
- tf->protocol = ATA_PROT_DMA;
- tf->hob_feature = 0;
- tf->feature = ATA_DSM_TRIM;
- tf->hob_nsect = (size / 512) >> 8;
- tf->nsect = size / 512;
- tf->command = ATA_CMD_DSM;
- }
- } else {
- size = ata_format_sct_write_same(scmd, block, n_block);
- if (size != len)
- goto invalid_param_len;
+ if (ata_ncq_enabled(dev) && ata_fpdma_dsm_supported(dev)) {
+ /* Newer devices support queued TRIM commands */
+ tf->protocol = ATA_PROT_NCQ;
+ tf->command = ATA_CMD_FPDMA_SEND;
+ tf->hob_nsect = ATA_SUBCMD_FPDMA_SEND_DSM & 0x1f;
+ tf->nsect = qc->tag << 3;
+ tf->hob_feature = (size / 512) >> 8;
+ tf->feature = size / 512;
- tf->hob_feature = 0;
- tf->feature = 0;
- tf->hob_nsect = 0;
- tf->nsect = 1;
- tf->lbah = 0;
- tf->lbam = 0;
- tf->lbal = ATA_CMD_STANDBYNOW1;
- tf->hob_lbah = 0;
- tf->hob_lbam = 0;
- tf->hob_lbal = 0;
- tf->device = ATA_CMD_STANDBYNOW1;
+ tf->auxiliary = 1;
+ } else {
tf->protocol = ATA_PROT_DMA;
- tf->command = ATA_CMD_WRITE_LOG_DMA_EXT;
- if (unlikely(dev->flags & ATA_DFLAG_PIO))
- tf->command = ATA_CMD_WRITE_LOG_EXT;
+ tf->hob_feature = 0;
+ tf->feature = ATA_DSM_TRIM;
+ tf->hob_nsect = (size / 512) >> 8;
+ tf->nsect = size / 512;
+ tf->command = ATA_CMD_DSM;
}
tf->flags |= ATA_TFLAG_ISADDR | ATA_TFLAG_DEVICE | ATA_TFLAG_LBA48 |
@@ -3619,10 +3557,6 @@ static unsigned int ata_scsiop_maint_in(struct ata_scsi_args *args, u8 *rbuf)
case START_STOP:
supported = 3;
break;
- case WRITE_SAME_16:
- if (!ata_id_sct_write_same(dev->id))
- break;
- /* fallthrough: if SCT ... only enable for ZBC */
case ZBC_IN:
case ZBC_OUT:
if (ata_id_zoned_cap(dev->id) ||
diff --git a/drivers/ata/pata_at91.c b/drivers/ata/pata_at91.c
deleted file mode 100644
index fd5b34f0d007..000000000000
--- a/drivers/ata/pata_at91.c
+++ /dev/null
@@ -1,503 +0,0 @@
-/*
- * PATA driver for AT91SAM9260 Static Memory Controller
- * with CompactFlash interface in True IDE mode
- *
- * Copyright (C) 2009 Matyukevich Sergey
- * 2011 Igor Plyatov
- *
- * Based on:
- * * generic platform driver by Paul Mundt: drivers/ata/pata_platform.c
- * * pata_at32 driver by Kristoffer Nyborg Gregertsen
- * * at91_ide driver by Stanislaw Gruszka
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2
- * as published by the Free Software Foundation.
- *
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/blkdev.h>
-#include <linux/gfp.h>
-#include <scsi/scsi_host.h>
-#include <linux/ata.h>
-#include <linux/clk.h>
-#include <linux/libata.h>
-#include <linux/mfd/syscon.h>
-#include <linux/mfd/syscon/atmel-smc.h>
-#include <linux/platform_device.h>
-#include <linux/ata_platform.h>
-#include <linux/platform_data/atmel.h>
-#include <linux/regmap.h>
-#include <linux/gpio.h>
-
-#define DRV_NAME "pata_at91"
-#define DRV_VERSION "0.3"
-
-#define CF_IDE_OFFSET 0x00c00000
-#define CF_ALT_IDE_OFFSET 0x00e00000
-#define CF_IDE_RES_SIZE 0x08
-#define CS_PULSE_MAXIMUM 319
-#define ER_SMC_CALC 1
-#define ER_SMC_RECALC 2
-
-struct at91_ide_info {
- unsigned long mode;
- unsigned int cs;
- struct clk *mck;
- void __iomem *ide_addr;
- void __iomem *alt_addr;
-};
-
-/**
- * struct smc_range - range of valid values for SMC register.
- */
-struct smc_range {
- int min;
- int max;
-};
-
-struct regmap *smc;
-
-struct at91sam9_smc_generic_fields {
- struct regmap_field *setup;
- struct regmap_field *pulse;
- struct regmap_field *cycle;
- struct regmap_field *mode;
-} fields;
-
-/**
- * adjust_smc_value - adjust value for one of SMC registers.
- * @value: adjusted value
- * @range: array of SMC ranges with valid values
- * @size: SMC ranges array size
- *
- * This returns the difference between input and output value or negative
- * in case of invalid input value.
- * If negative returned, then output value = maximal possible from ranges.
- */
-static int adjust_smc_value(int *value, struct smc_range *range, int size)
-{
- int maximum = (range + size - 1)->max;
- int remainder;
-
- do {
- if (*value < range->min) {
- remainder = range->min - *value;
- *value = range->min; /* nearest valid value */
- return remainder;
- } else if ((range->min <= *value) && (*value <= range->max))
- return 0;
-
- range++;
- } while (--size);
- *value = maximum;
-
- return -1; /* invalid value */
-}
-
-/**
- * calc_smc_vals - calculate SMC register values
- * @dev: ATA device
- * @setup: SMC_SETUP register value
- * @pulse: SMC_PULSE register value
- * @cycle: SMC_CYCLE register value
- *
- * This returns negative in case of invalid values for SMC registers:
- * -ER_SMC_RECALC - recalculation required for SMC values,
- * -ER_SMC_CALC - calculation failed (invalid input values).
- *
- * SMC use special coding scheme, see "Coding and Range of Timing
- * Parameters" table from AT91SAM9 datasheets.
- *
- * SMC_SETUP = 128*setup[5] + setup[4:0]
- * SMC_PULSE = 256*pulse[6] + pulse[5:0]
- * SMC_CYCLE = 256*cycle[8:7] + cycle[6:0]
- */
-static int calc_smc_vals(struct device *dev,
- int *setup, int *pulse, int *cycle, int *cs_pulse)
-{
- int ret_val;
- int err = 0;
- struct smc_range range_setup[] = { /* SMC_SETUP valid values */
- {.min = 0, .max = 31}, /* first range */
- {.min = 128, .max = 159} /* second range */
- };
- struct smc_range range_pulse[] = { /* SMC_PULSE valid values */
- {.min = 0, .max = 63}, /* first range */
- {.min = 256, .max = 319} /* second range */
- };
- struct smc_range range_cycle[] = { /* SMC_CYCLE valid values */
- {.min = 0, .max = 127}, /* first range */
- {.min = 256, .max = 383}, /* second range */
- {.min = 512, .max = 639}, /* third range */
- {.min = 768, .max = 895} /* fourth range */
- };
-
- ret_val = adjust_smc_value(setup, range_setup, ARRAY_SIZE(range_setup));
- if (ret_val < 0)
- dev_warn(dev, "maximal SMC Setup value\n");
- else
- *cycle += ret_val;
-
- ret_val = adjust_smc_value(pulse, range_pulse, ARRAY_SIZE(range_pulse));
- if (ret_val < 0)
- dev_warn(dev, "maximal SMC Pulse value\n");
- else
- *cycle += ret_val;
-
- ret_val = adjust_smc_value(cycle, range_cycle, ARRAY_SIZE(range_cycle));
- if (ret_val < 0)
- dev_warn(dev, "maximal SMC Cycle value\n");
-
- *cs_pulse = *cycle;
- if (*cs_pulse > CS_PULSE_MAXIMUM) {
- dev_err(dev, "unable to calculate valid SMC settings\n");
- return -ER_SMC_CALC;
- }
-
- ret_val = adjust_smc_value(cs_pulse, range_pulse,
- ARRAY_SIZE(range_pulse));
- if (ret_val < 0) {
- dev_warn(dev, "maximal SMC CS Pulse value\n");
- } else if (ret_val != 0) {
- *cycle = *cs_pulse;
- dev_warn(dev, "SMC Cycle extended\n");
- err = -ER_SMC_RECALC;
- }
-
- return err;
-}
-
-/**
- * to_smc_format - convert values into SMC format
- * @setup: SETUP value of SMC Setup Register
- * @pulse: PULSE value of SMC Pulse Register
- * @cycle: CYCLE value of SMC Cycle Register
- * @cs_pulse: NCS_PULSE value of SMC Pulse Register
- */
-static void to_smc_format(int *setup, int *pulse, int *cycle, int *cs_pulse)
-{
- *setup = (*setup & 0x1f) | ((*setup & 0x80) >> 2);
- *pulse = (*pulse & 0x3f) | ((*pulse & 0x100) >> 2);
- *cycle = (*cycle & 0x7f) | ((*cycle & 0x300) >> 1);
- *cs_pulse = (*cs_pulse & 0x3f) | ((*cs_pulse & 0x100) >> 2);
-}
-
-static unsigned long calc_mck_cycles(unsigned long ns, unsigned long mck_hz)
-{
- unsigned long mul;
-
- /*
- * cycles = x [nsec] * f [Hz] / 10^9 [ns in sec] =
- * x * (f / 1_000_000_000) =
- * x * ((f * 65536) / 1_000_000_000) / 65536 =
- * x * (((f / 10_000) * 65536) / 100_000) / 65536 =
- */
-
- mul = (mck_hz / 10000) << 16;
- mul /= 100000;
-
- return (ns * mul + 65536) >> 16; /* rounding */
-}
-
-/**
- * set_smc_timing - SMC timings setup.
- * @dev: device
- * @info: AT91 IDE info
- * @ata: ATA timings
- *
- * Its assumed that write timings are same as read timings,
- * cs_setup = 0 and cs_pulse = cycle.
- */
-static void set_smc_timing(struct device *dev, struct ata_device *adev,
- struct at91_ide_info *info, const struct ata_timing *ata)
-{
- int ret = 0;
- int use_iordy;
- unsigned int t6z; /* data tristate time in ns */
- unsigned int cycle; /* SMC Cycle width in MCK ticks */
- unsigned int setup; /* SMC Setup width in MCK ticks */
- unsigned int pulse; /* CFIOR and CFIOW pulse width in MCK ticks */
- unsigned int cs_pulse; /* CS4 or CS5 pulse width in MCK ticks*/
- unsigned int tdf_cycles; /* SMC TDF MCK ticks */
- unsigned long mck_hz; /* MCK frequency in Hz */
-
- t6z = (ata->mode < XFER_PIO_5) ? 30 : 20;
- mck_hz = clk_get_rate(info->mck);
- cycle = calc_mck_cycles(ata->cyc8b, mck_hz);
- setup = calc_mck_cycles(ata->setup, mck_hz);
- pulse = calc_mck_cycles(ata->act8b, mck_hz);
- tdf_cycles = calc_mck_cycles(t6z, mck_hz);
-
- do {
- ret = calc_smc_vals(dev, &setup, &pulse, &cycle, &cs_pulse);
- } while (ret == -ER_SMC_RECALC);
-
- if (ret == -ER_SMC_CALC)
- dev_err(dev, "Interface may not operate correctly\n");
-
- dev_dbg(dev, "SMC Setup=%u, Pulse=%u, Cycle=%u, CS Pulse=%u\n",
- setup, pulse, cycle, cs_pulse);
- to_smc_format(&setup, &pulse, &cycle, &cs_pulse);
- /* disable or enable waiting for IORDY signal */
- use_iordy = ata_pio_need_iordy(adev);
- if (use_iordy)
- info->mode |= AT91_SMC_EXNWMODE_READY;
-
- if (tdf_cycles > 15) {
- tdf_cycles = 15;
- dev_warn(dev, "maximal SMC TDF Cycles value\n");
- }
-
- dev_dbg(dev, "Use IORDY=%u, TDF Cycles=%u\n", use_iordy, tdf_cycles);
-
- regmap_fields_write(fields.setup, info->cs,
- AT91SAM9_SMC_NRDSETUP(setup) |
- AT91SAM9_SMC_NWESETUP(setup) |
- AT91SAM9_SMC_NCS_NRDSETUP(0) |
- AT91SAM9_SMC_NCS_WRSETUP(0));
- regmap_fields_write(fields.pulse, info->cs,
- AT91SAM9_SMC_NRDPULSE(pulse) |
- AT91SAM9_SMC_NWEPULSE(pulse) |
- AT91SAM9_SMC_NCS_NRDPULSE(cs_pulse) |
- AT91SAM9_SMC_NCS_WRPULSE(cs_pulse));
- regmap_fields_write(fields.cycle, info->cs,
- AT91SAM9_SMC_NRDCYCLE(cycle) |
- AT91SAM9_SMC_NWECYCLE(cycle));
- regmap_fields_write(fields.mode, info->cs, info->mode |
- AT91_SMC_TDF_(tdf_cycles));
-}
-
-static void pata_at91_set_piomode(struct ata_port *ap, struct ata_device *adev)
-{
- struct at91_ide_info *info = ap->host->private_data;
- struct ata_timing timing;
- int ret;
-
- /* Compute ATA timing and set it to SMC */
- ret = ata_timing_compute(adev, adev->pio_mode, &timing, 1000, 0);
- if (ret) {
- dev_warn(ap->dev, "Failed to compute ATA timing %d, "
- "set PIO_0 timing\n", ret);
- timing = *ata_timing_find_mode(XFER_PIO_0);
- }
- set_smc_timing(ap->dev, adev, info, &timing);
-}
-
-static unsigned int pata_at91_data_xfer_noirq(struct ata_queued_cmd *qc,
- unsigned char *buf, unsigned int buflen, int rw)
-{
- struct at91_ide_info *info = qc->dev->link->ap->host->private_data;
- unsigned int consumed;
- unsigned int mode;
- unsigned long flags;
-
- local_irq_save(flags);
- regmap_fields_read(fields.mode, info->cs, &mode);
-
- /* set 16bit mode before writing data */
- regmap_fields_write(fields.mode, info->cs, (mode & ~AT91_SMC_DBW) |
- AT91_SMC_DBW_16);
-
- consumed = ata_sff_data_xfer(qc, buf, buflen, rw);
-
- /* restore 8bit mode after data is written */
- regmap_fields_write(fields.mode, info->cs, (mode & ~AT91_SMC_DBW) |
- AT91_SMC_DBW_8);
-
- local_irq_restore(flags);
- return consumed;
-}
-
-static struct scsi_host_template pata_at91_sht = {
- ATA_PIO_SHT(DRV_NAME),
-};
-
-static struct ata_port_operations pata_at91_port_ops = {
- .inherits = &ata_sff_port_ops,
-
- .sff_data_xfer = pata_at91_data_xfer_noirq,
- .set_piomode = pata_at91_set_piomode,
- .cable_detect = ata_cable_40wire,
-};
-
-static int at91sam9_smc_fields_init(struct device *dev)
-{
- struct reg_field field = REG_FIELD(0, 0, 31);
-
- field.id_size = 8;
- field.id_offset = AT91SAM9_SMC_GENERIC_BLK_SZ;
-
- field.reg = AT91SAM9_SMC_SETUP(AT91SAM9_SMC_GENERIC);
- fields.setup = devm_regmap_field_alloc(dev, smc, field);
- if (IS_ERR(fields.setup))
- return PTR_ERR(fields.setup);
-
- field.reg = AT91SAM9_SMC_PULSE(AT91SAM9_SMC_GENERIC);
- fields.pulse = devm_regmap_field_alloc(dev, smc, field);
- if (IS_ERR(fields.pulse))
- return PTR_ERR(fields.pulse);
-
- field.reg = AT91SAM9_SMC_CYCLE(AT91SAM9_SMC_GENERIC);
- fields.cycle = devm_regmap_field_alloc(dev, smc, field);
- if (IS_ERR(fields.cycle))
- return PTR_ERR(fields.cycle);
-
- field.reg = AT91SAM9_SMC_MODE(AT91SAM9_SMC_GENERIC);
- fields.mode = devm_regmap_field_alloc(dev, smc, field);
-
- return PTR_ERR_OR_ZERO(fields.mode);
-}
-
-static int pata_at91_probe(struct platform_device *pdev)
-{
- struct at91_cf_data *board = dev_get_platdata(&pdev->dev);
- struct device *dev = &pdev->dev;
- struct at91_ide_info *info;
- struct resource *mem_res;
- struct ata_host *host;
- struct ata_port *ap;
-
- int irq_flags = 0;
- int irq = 0;
- int ret;
-
- /* get platform resources: IO/CTL memories and irq/rst pins */
-
- if (pdev->num_resources != 1) {
- dev_err(&pdev->dev, "invalid number of resources\n");
- return -EINVAL;
- }
-
- mem_res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-
- if (!mem_res) {
- dev_err(dev, "failed to get mem resource\n");
- return -EINVAL;
- }
-
- irq = board->irq_pin;
-
- smc = syscon_regmap_lookup_by_phandle(pdev->dev.of_node, "atmel,smc");
- if (IS_ERR(smc))
- return PTR_ERR(smc);
-
- ret = at91sam9_smc_fields_init(dev);
- if (ret < 0)
- return ret;
-
- /* init ata host */
-
- host = ata_host_alloc(dev, 1);
-
- if (!host)
- return -ENOMEM;
-
- ap = host->ports[0];
- ap->ops = &pata_at91_port_ops;
- ap->flags |= ATA_FLAG_SLAVE_POSS;
- ap->pio_mask = ATA_PIO4;
-
- if (!gpio_is_valid(irq)) {
- ap->flags |= ATA_FLAG_PIO_POLLING;
- ata_port_desc(ap, "no IRQ, using PIO polling");
- }
-
- info = devm_kzalloc(dev, sizeof(*info), GFP_KERNEL);
-
- if (!info) {
- dev_err(dev, "failed to allocate memory for private data\n");
- return -ENOMEM;
- }
-
- info->mck = clk_get(NULL, "mck");
-
- if (IS_ERR(info->mck)) {
- dev_err(dev, "failed to get access to mck clock\n");
- return -ENODEV;
- }
-
- info->cs = board->chipselect;
- info->mode = AT91_SMC_READMODE | AT91_SMC_WRITEMODE |
- AT91_SMC_EXNWMODE_READY | AT91_SMC_BAT_SELECT |
- AT91_SMC_DBW_8 | AT91_SMC_TDF_(0);
-
- info->ide_addr = devm_ioremap(dev,
- mem_res->start + CF_IDE_OFFSET, CF_IDE_RES_SIZE);
-
- if (!info->ide_addr) {
- dev_err(dev, "failed to map IO base\n");
- ret = -ENOMEM;
- goto err_put;
- }
-
- info->alt_addr = devm_ioremap(dev,
- mem_res->start + CF_ALT_IDE_OFFSET, CF_IDE_RES_SIZE);
-
- if (!info->alt_addr) {
- dev_err(dev, "failed to map CTL base\n");
- ret = -ENOMEM;
- goto err_put;
- }
-
- ap->ioaddr.cmd_addr = info->ide_addr;
- ap->ioaddr.ctl_addr = info->alt_addr + 0x06;
- ap->ioaddr.altstatus_addr = ap->ioaddr.ctl_addr;
-
- ata_sff_std_ports(&ap->ioaddr);
-
- ata_port_desc(ap, "mmio cmd 0x%llx ctl 0x%llx",
- (unsigned long long)mem_res->start + CF_IDE_OFFSET,
- (unsigned long long)mem_res->start + CF_ALT_IDE_OFFSET);
-
- host->private_data = info;
-
- ret = ata_host_activate(host, gpio_is_valid(irq) ? gpio_to_irq(irq) : 0,
- gpio_is_valid(irq) ? ata_sff_interrupt : NULL,
- irq_flags, &pata_at91_sht);
- if (ret)
- goto err_put;
-
- return 0;
-
-err_put:
- clk_put(info->mck);
- return ret;
-}
-
-static int pata_at91_remove(struct platform_device *pdev)
-{
- struct ata_host *host = platform_get_drvdata(pdev);
- struct at91_ide_info *info;
-
- if (!host)
- return 0;
- info = host->private_data;
-
- ata_host_detach(host);
-
- if (!info)
- return 0;
-
- clk_put(info->mck);
-
- return 0;
-}
-
-static struct platform_driver pata_at91_driver = {
- .probe = pata_at91_probe,
- .remove = pata_at91_remove,
- .driver = {
- .name = DRV_NAME,
- },
-};
-
-module_platform_driver(pata_at91_driver);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("Driver for CF in True IDE mode on AT91SAM9260 SoC");
-MODULE_AUTHOR("Matyukevich Sergey");
-MODULE_VERSION(DRV_VERSION);
-
diff --git a/drivers/ata/pata_macio.c b/drivers/ata/pata_macio.c
index e347e7acd8ed..0adcb40d2794 100644
--- a/drivers/ata/pata_macio.c
+++ b/drivers/ata/pata_macio.c
@@ -1328,7 +1328,7 @@ static int pata_macio_pci_resume(struct pci_dev *pdev)
}
#endif /* CONFIG_PM_SLEEP */
-static struct of_device_id pata_macio_match[] =
+static const struct of_device_id pata_macio_match[] =
{
{
.name = "IDE",
diff --git a/drivers/ata/pata_mpc52xx.c b/drivers/ata/pata_mpc52xx.c
index 252ba27fa63b..9730125530f6 100644
--- a/drivers/ata/pata_mpc52xx.c
+++ b/drivers/ata/pata_mpc52xx.c
@@ -847,7 +847,7 @@ mpc52xx_ata_resume(struct platform_device *op)
}
#endif
-static struct of_device_id mpc52xx_ata_of_match[] = {
+static const struct of_device_id mpc52xx_ata_of_match[] = {
{ .compatible = "fsl,mpc5200-ata", },
{ .compatible = "mpc5200-ata", },
{},
diff --git a/drivers/ata/pata_of_platform.c b/drivers/ata/pata_of_platform.c
index 201a32d0627f..01161c1aef4d 100644
--- a/drivers/ata/pata_of_platform.c
+++ b/drivers/ata/pata_of_platform.c
@@ -67,7 +67,7 @@ static int pata_of_platform_probe(struct platform_device *ofdev)
reg_shift, pio_mask, &pata_platform_sht);
}
-static struct of_device_id pata_of_platform_match[] = {
+static const struct of_device_id pata_of_platform_match[] = {
{ .compatible = "ata-generic", },
{ },
};
diff --git a/drivers/ata/sata_fsl.c b/drivers/ata/sata_fsl.c
index a723ae929783..01734d54c69c 100644
--- a/drivers/ata/sata_fsl.c
+++ b/drivers/ata/sata_fsl.c
@@ -1612,7 +1612,7 @@ static int sata_fsl_resume(struct platform_device *op)
}
#endif
-static struct of_device_id fsl_sata_match[] = {
+static const struct of_device_id fsl_sata_match[] = {
{
.compatible = "fsl,pq-sata",
},
diff --git a/drivers/ata/sata_mv.c b/drivers/ata/sata_mv.c
index 00ce26d0c047..b66bcda88320 100644
--- a/drivers/ata/sata_mv.c
+++ b/drivers/ata/sata_mv.c
@@ -4286,7 +4286,7 @@ static int mv_platform_resume(struct platform_device *pdev)
#endif
#ifdef CONFIG_OF
-static struct of_device_id mv_sata_dt_ids[] = {
+static const struct of_device_id mv_sata_dt_ids[] = {
{ .compatible = "marvell,armada-370-sata", },
{ .compatible = "marvell,orion-sata", },
{},
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index f744de7a0f9b..19df4918e37e 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -312,22 +312,6 @@ config BLK_DEV_SKD
Use device /dev/skd$N amd /dev/skd$Np$M.
-config BLK_DEV_OSD
- tristate "OSD object-as-blkdev support"
- depends on SCSI_OSD_ULD
- ---help---
- Saying Y or M here will allow the exporting of a single SCSI
- OSD (object-based storage) object as a Linux block device.
-
- For example, if you create a 2G object on an OSD device,
- you can then use this module to present that 2G object as
- a Linux block device.
-
- To compile this driver as a module, choose M here: the
- module will be called osdblk.
-
- If unsure, say N.
-
config BLK_DEV_SX8
tristate "Promise SATA SX8 support"
depends on PCI
@@ -434,23 +418,6 @@ config ATA_OVER_ETH
This driver provides Support for ATA over Ethernet block
devices like the Coraid EtherDrive (R) Storage Blade.
-config MG_DISK
- tristate "mGine mflash, gflash support"
- depends on ARM && GPIOLIB
- help
- mGine mFlash(gFlash) block device driver
-
-config MG_DISK_RES
- int "Size of reserved area before MBR"
- depends on MG_DISK
- default 0
- help
- Define size of reserved area that usually used for boot. Unit is KB.
- All of the block device operation will be taken this value as start
- offset
- Examples:
- 1024 => 1 MB
-
config SUNVDC
tristate "Sun Virtual Disk Client support"
depends on SUN_LDOMS
@@ -512,19 +479,7 @@ config VIRTIO_BLK_SCSI
Enable support for SCSI passthrough (e.g. the SG_IO ioctl) on
virtio-blk devices. This is only supported for the legacy
virtio protocol and not enabled by default by any hypervisor.
- Your probably want to virtio-scsi instead.
-
-config BLK_DEV_HD
- bool "Very old hard disk (MFM/RLL/IDE) driver"
- depends on HAVE_IDE
- depends on !ARM || ARCH_RPC || BROKEN
- help
- This is a very old hard disk driver that lacks the enhanced
- functionality of the newer ones.
-
- It is required for systems with ancient MFM/RLL/ESDI drives.
-
- If unsure, say N.
+ You probably want to use virtio-scsi instead.
config BLK_DEV_RBD
tristate "Rados block device (RBD)"
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index 1e9661e26f29..ec8c36897b75 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -19,10 +19,8 @@ obj-$(CONFIG_BLK_CPQ_CISS_DA) += cciss.o
obj-$(CONFIG_BLK_DEV_DAC960) += DAC960.o
obj-$(CONFIG_XILINX_SYSACE) += xsysace.o
obj-$(CONFIG_CDROM_PKTCDVD) += pktcdvd.o
-obj-$(CONFIG_MG_DISK) += mg_disk.o
obj-$(CONFIG_SUNVDC) += sunvdc.o
obj-$(CONFIG_BLK_DEV_SKD) += skd.o
-obj-$(CONFIG_BLK_DEV_OSD) += osdblk.o
obj-$(CONFIG_BLK_DEV_UMEM) += umem.o
obj-$(CONFIG_BLK_DEV_NBD) += nbd.o
@@ -30,7 +28,6 @@ obj-$(CONFIG_BLK_DEV_CRYPTOLOOP) += cryptoloop.o
obj-$(CONFIG_VIRTIO_BLK) += virtio_blk.o
obj-$(CONFIG_BLK_DEV_SX8) += sx8.o
-obj-$(CONFIG_BLK_DEV_HD) += hd.o
obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o
obj-$(CONFIG_XEN_BLKDEV_BACKEND) += xen-blkback/
diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c
index 2104b1b4ccda..fa69ecd52cb5 100644
--- a/drivers/block/ataflop.c
+++ b/drivers/block/ataflop.c
@@ -617,12 +617,12 @@ static void fd_error( void )
if (!fd_request)
return;
- fd_request->errors++;
- if (fd_request->errors >= MAX_ERRORS) {
+ fd_request->error_count++;
+ if (fd_request->error_count >= MAX_ERRORS) {
printk(KERN_ERR "fd%d: too many errors.\n", SelectedDrive );
fd_end_request_cur(-EIO);
}
- else if (fd_request->errors == RECALIBRATE_ERRORS) {
+ else if (fd_request->error_count == RECALIBRATE_ERRORS) {
printk(KERN_WARNING "fd%d: recalibrating\n", SelectedDrive );
if (SelectedDrive != -1)
SUD.track = -1;
@@ -1386,7 +1386,7 @@ static void setup_req_params( int drive )
ReqData = ReqBuffer + 512 * ReqCnt;
if (UseTrackbuffer)
- read_track = (ReqCmd == READ && fd_request->errors == 0);
+ read_track = (ReqCmd == READ && fd_request->error_count == 0);
else
read_track = 0;
@@ -1409,8 +1409,10 @@ static struct request *set_next_request(void)
fdc_queue = 0;
if (q) {
rq = blk_fetch_request(q);
- if (rq)
+ if (rq) {
+ rq->error_count = 0;
break;
+ }
}
} while (fdc_queue != old_pos);
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 3adc32a3153b..4ec84d504780 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -134,28 +134,6 @@ static struct page *brd_insert_page(struct brd_device *brd, sector_t sector)
return page;
}
-static void brd_free_page(struct brd_device *brd, sector_t sector)
-{
- struct page *page;
- pgoff_t idx;
-
- spin_lock(&brd->brd_lock);
- idx = sector >> PAGE_SECTORS_SHIFT;
- page = radix_tree_delete(&brd->brd_pages, idx);
- spin_unlock(&brd->brd_lock);
- if (page)
- __free_page(page);
-}
-
-static void brd_zero_page(struct brd_device *brd, sector_t sector)
-{
- struct page *page;
-
- page = brd_lookup_page(brd, sector);
- if (page)
- clear_highpage(page);
-}
-
/*
* Free all backing store pages and radix tree. This must only be called when
* there are no other users of the device.
@@ -212,24 +190,6 @@ static int copy_to_brd_setup(struct brd_device *brd, sector_t sector, size_t n)
return 0;
}
-static void discard_from_brd(struct brd_device *brd,
- sector_t sector, size_t n)
-{
- while (n >= PAGE_SIZE) {
- /*
- * Don't want to actually discard pages here because
- * re-allocating the pages can result in writeback
- * deadlocks under heavy load.
- */
- if (0)
- brd_free_page(brd, sector);
- else
- brd_zero_page(brd, sector);
- sector += PAGE_SIZE >> SECTOR_SHIFT;
- n -= PAGE_SIZE;
- }
-}
-
/*
* Copy n bytes from src to the brd starting at sector. Does not sleep.
*/
@@ -338,14 +298,6 @@ static blk_qc_t brd_make_request(struct request_queue *q, struct bio *bio)
if (bio_end_sector(bio) > get_capacity(bdev->bd_disk))
goto io_error;
- if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) {
- if (sector & ((PAGE_SIZE >> SECTOR_SHIFT) - 1) ||
- bio->bi_iter.bi_size & ~PAGE_MASK)
- goto io_error;
- discard_from_brd(brd, sector, bio->bi_iter.bi_size);
- goto out;
- }
-
bio_for_each_segment(bvec, bio, iter) {
unsigned int len = bvec.bv_len;
int err;
@@ -357,7 +309,6 @@ static blk_qc_t brd_make_request(struct request_queue *q, struct bio *bio)
sector += len >> SECTOR_SHIFT;
}
-out:
bio_endio(bio);
return BLK_QC_T_NONE;
io_error:
@@ -464,11 +415,6 @@ static struct brd_device *brd_alloc(int i)
* is harmless)
*/
blk_queue_physical_block_size(brd->brd_queue, PAGE_SIZE);
-
- brd->brd_queue->limits.discard_granularity = PAGE_SIZE;
- blk_queue_max_discard_sectors(brd->brd_queue, UINT_MAX);
- brd->brd_queue->limits.discard_zeroes_data = 1;
- queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, brd->brd_queue);
#ifdef CONFIG_BLK_DEV_RAM_DAX
queue_flag_set_unlocked(QUEUE_FLAG_DAX, brd->brd_queue);
#endif
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index 8e1a4554951c..cd375503f7b0 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -1864,8 +1864,7 @@ static void cciss_softirq_done(struct request *rq)
/* set the residual count for pc requests */
if (blk_rq_is_passthrough(rq))
scsi_req(rq)->resid_len = c->err_info->ResidualCnt;
-
- blk_end_request_all(rq, (rq->errors == 0) ? 0 : -EIO);
+ blk_end_request_all(rq, scsi_req(rq)->result ? -EIO : 0);
spin_lock_irqsave(&h->lock, flags);
cmd_free(h, c);
@@ -3140,18 +3139,19 @@ static inline void complete_command(ctlr_info_t *h, CommandList_struct *cmd,
{
int retry_cmd = 0;
struct request *rq = cmd->rq;
+ struct scsi_request *sreq = scsi_req(rq);
- rq->errors = 0;
+ sreq->result = 0;
if (timeout)
- rq->errors = make_status_bytes(0, 0, 0, DRIVER_TIMEOUT);
+ sreq->result = make_status_bytes(0, 0, 0, DRIVER_TIMEOUT);
if (cmd->err_info->CommandStatus == 0) /* no error has occurred */
goto after_error_processing;
switch (cmd->err_info->CommandStatus) {
case CMD_TARGET_STATUS:
- rq->errors = evaluate_target_status(h, cmd, &retry_cmd);
+ sreq->result = evaluate_target_status(h, cmd, &retry_cmd);
break;
case CMD_DATA_UNDERRUN:
if (!blk_rq_is_passthrough(cmd->rq)) {
@@ -3169,7 +3169,7 @@ static inline void complete_command(ctlr_info_t *h, CommandList_struct *cmd,
case CMD_INVALID:
dev_warn(&h->pdev->dev, "cciss: cmd %p is "
"reported invalid\n", cmd);
- rq->errors = make_status_bytes(SAM_STAT_GOOD,
+ sreq->result = make_status_bytes(SAM_STAT_GOOD,
cmd->err_info->CommandStatus, DRIVER_OK,
blk_rq_is_passthrough(cmd->rq) ?
DID_PASSTHROUGH : DID_ERROR);
@@ -3177,7 +3177,7 @@ static inline void complete_command(ctlr_info_t *h, CommandList_struct *cmd,
case CMD_PROTOCOL_ERR:
dev_warn(&h->pdev->dev, "cciss: cmd %p has "
"protocol error\n", cmd);
- rq->errors = make_status_bytes(SAM_STAT_GOOD,
+ sreq->result = make_status_bytes(SAM_STAT_GOOD,
cmd->err_info->CommandStatus, DRIVER_OK,
blk_rq_is_passthrough(cmd->rq) ?
DID_PASSTHROUGH : DID_ERROR);
@@ -3185,7 +3185,7 @@ static inline void complete_command(ctlr_info_t *h, CommandList_struct *cmd,
case CMD_HARDWARE_ERR:
dev_warn(&h->pdev->dev, "cciss: cmd %p had "
" hardware error\n", cmd);
- rq->errors = make_status_bytes(SAM_STAT_GOOD,
+ sreq->result = make_status_bytes(SAM_STAT_GOOD,
cmd->err_info->CommandStatus, DRIVER_OK,
blk_rq_is_passthrough(cmd->rq) ?
DID_PASSTHROUGH : DID_ERROR);
@@ -3193,7 +3193,7 @@ static inline void complete_command(ctlr_info_t *h, CommandList_struct *cmd,
case CMD_CONNECTION_LOST:
dev_warn(&h->pdev->dev, "cciss: cmd %p had "
"connection lost\n", cmd);
- rq->errors = make_status_bytes(SAM_STAT_GOOD,
+ sreq->result = make_status_bytes(SAM_STAT_GOOD,
cmd->err_info->CommandStatus, DRIVER_OK,
blk_rq_is_passthrough(cmd->rq) ?
DID_PASSTHROUGH : DID_ERROR);
@@ -3201,7 +3201,7 @@ static inline void complete_command(ctlr_info_t *h, CommandList_struct *cmd,
case CMD_ABORTED:
dev_warn(&h->pdev->dev, "cciss: cmd %p was "
"aborted\n", cmd);
- rq->errors = make_status_bytes(SAM_STAT_GOOD,
+ sreq->result = make_status_bytes(SAM_STAT_GOOD,
cmd->err_info->CommandStatus, DRIVER_OK,
blk_rq_is_passthrough(cmd->rq) ?
DID_PASSTHROUGH : DID_ABORT);
@@ -3209,7 +3209,7 @@ static inline void complete_command(ctlr_info_t *h, CommandList_struct *cmd,
case CMD_ABORT_FAILED:
dev_warn(&h->pdev->dev, "cciss: cmd %p reports "
"abort failed\n", cmd);
- rq->errors = make_status_bytes(SAM_STAT_GOOD,
+ sreq->result = make_status_bytes(SAM_STAT_GOOD,
cmd->err_info->CommandStatus, DRIVER_OK,
blk_rq_is_passthrough(cmd->rq) ?
DID_PASSTHROUGH : DID_ERROR);
@@ -3224,21 +3224,21 @@ static inline void complete_command(ctlr_info_t *h, CommandList_struct *cmd,
} else
dev_warn(&h->pdev->dev,
"%p retried too many times\n", cmd);
- rq->errors = make_status_bytes(SAM_STAT_GOOD,
+ sreq->result = make_status_bytes(SAM_STAT_GOOD,
cmd->err_info->CommandStatus, DRIVER_OK,
blk_rq_is_passthrough(cmd->rq) ?
DID_PASSTHROUGH : DID_ABORT);
break;
case CMD_TIMEOUT:
dev_warn(&h->pdev->dev, "cmd %p timedout\n", cmd);
- rq->errors = make_status_bytes(SAM_STAT_GOOD,
+ sreq->result = make_status_bytes(SAM_STAT_GOOD,
cmd->err_info->CommandStatus, DRIVER_OK,
blk_rq_is_passthrough(cmd->rq) ?
DID_PASSTHROUGH : DID_ERROR);
break;
case CMD_UNABORTABLE:
dev_warn(&h->pdev->dev, "cmd %p unabortable\n", cmd);
- rq->errors = make_status_bytes(SAM_STAT_GOOD,
+ sreq->result = make_status_bytes(SAM_STAT_GOOD,
cmd->err_info->CommandStatus, DRIVER_OK,
blk_rq_is_passthrough(cmd->rq) ?
DID_PASSTHROUGH : DID_ERROR);
@@ -3247,7 +3247,7 @@ static inline void complete_command(ctlr_info_t *h, CommandList_struct *cmd,
dev_warn(&h->pdev->dev, "cmd %p returned "
"unknown status %x\n", cmd,
cmd->err_info->CommandStatus);
- rq->errors = make_status_bytes(SAM_STAT_GOOD,
+ sreq->result = make_status_bytes(SAM_STAT_GOOD,
cmd->err_info->CommandStatus, DRIVER_OK,
blk_rq_is_passthrough(cmd->rq) ?
DID_PASSTHROUGH : DID_ERROR);
@@ -3380,9 +3380,9 @@ static void do_cciss_request(struct request_queue *q)
if (dma_mapping_error(&h->pdev->dev, temp64.val)) {
dev_warn(&h->pdev->dev,
"%s: error mapping page for DMA\n", __func__);
- creq->errors = make_status_bytes(SAM_STAT_GOOD,
- 0, DRIVER_OK,
- DID_SOFT_ERROR);
+ scsi_req(creq)->result =
+ make_status_bytes(SAM_STAT_GOOD, 0, DRIVER_OK,
+ DID_SOFT_ERROR);
cmd_free(h, c);
return;
}
@@ -3395,9 +3395,9 @@ static void do_cciss_request(struct request_queue *q)
if (cciss_map_sg_chain_block(h, c, h->cmd_sg_list[c->cmdindex],
(seg - (h->max_cmd_sgentries - 1)) *
sizeof(SGDescriptor_struct))) {
- creq->errors = make_status_bytes(SAM_STAT_GOOD,
- 0, DRIVER_OK,
- DID_SOFT_ERROR);
+ scsi_req(creq)->result =
+ make_status_bytes(SAM_STAT_GOOD, 0, DRIVER_OK,
+ DID_SOFT_ERROR);
cmd_free(h, c);
return;
}
diff --git a/drivers/block/drbd/drbd_debugfs.c b/drivers/block/drbd/drbd_debugfs.c
index de5c3ee8a790..494837e59f23 100644
--- a/drivers/block/drbd/drbd_debugfs.c
+++ b/drivers/block/drbd/drbd_debugfs.c
@@ -236,9 +236,6 @@ static void seq_print_peer_request_flags(struct seq_file *m, struct drbd_peer_re
seq_print_rq_state_bit(m, f & EE_CALL_AL_COMPLETE_IO, &sep, "in-AL");
seq_print_rq_state_bit(m, f & EE_SEND_WRITE_ACK, &sep, "C");
seq_print_rq_state_bit(m, f & EE_MAY_SET_IN_SYNC, &sep, "set-in-sync");
-
- if (f & EE_IS_TRIM)
- __seq_print_rq_state_bit(m, f & EE_IS_TRIM_USE_ZEROOUT, &sep, "zero-out", "trim");
seq_print_rq_state_bit(m, f & EE_WRITE_SAME, &sep, "write-same");
seq_putc(m, '\n');
}
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 724d1c50fc52..d5da45bb03a6 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -437,9 +437,6 @@ enum {
/* is this a TRIM aka REQ_DISCARD? */
__EE_IS_TRIM,
- /* our lower level cannot handle trim,
- * and we want to fall back to zeroout instead */
- __EE_IS_TRIM_USE_ZEROOUT,
/* In case a barrier failed,
* we need to resubmit without the barrier flag. */
@@ -482,7 +479,6 @@ enum {
#define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO)
#define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC)
#define EE_IS_TRIM (1<<__EE_IS_TRIM)
-#define EE_IS_TRIM_USE_ZEROOUT (1<<__EE_IS_TRIM_USE_ZEROOUT)
#define EE_RESUBMITTED (1<<__EE_RESUBMITTED)
#define EE_WAS_ERROR (1<<__EE_WAS_ERROR)
#define EE_HAS_DIGEST (1<<__EE_HAS_DIGEST)
@@ -1561,8 +1557,6 @@ extern void start_resync_timer_fn(unsigned long data);
extern void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req);
/* drbd_receiver.c */
-extern int drbd_issue_discard_or_zero_out(struct drbd_device *device,
- sector_t start, unsigned int nr_sectors, bool discard);
extern int drbd_receiver(struct drbd_thread *thi);
extern int drbd_ack_receiver(struct drbd_thread *thi);
extern void drbd_send_ping_wf(struct work_struct *ws);
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 92c60cbd04ee..84455c365f57 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -931,7 +931,6 @@ void assign_p_sizes_qlim(struct drbd_device *device, struct p_sizes *p, struct r
p->qlim->io_min = cpu_to_be32(queue_io_min(q));
p->qlim->io_opt = cpu_to_be32(queue_io_opt(q));
p->qlim->discard_enabled = blk_queue_discard(q);
- p->qlim->discard_zeroes_data = queue_discard_zeroes_data(q);
p->qlim->write_same_capable = !!q->limits.max_write_same_sectors;
} else {
q = device->rq_queue;
@@ -941,7 +940,6 @@ void assign_p_sizes_qlim(struct drbd_device *device, struct p_sizes *p, struct r
p->qlim->io_min = cpu_to_be32(queue_io_min(q));
p->qlim->io_opt = cpu_to_be32(queue_io_opt(q));
p->qlim->discard_enabled = 0;
- p->qlim->discard_zeroes_data = 0;
p->qlim->write_same_capable = 0;
}
}
@@ -1668,7 +1666,8 @@ static u32 bio_flags_to_wire(struct drbd_connection *connection,
(bio->bi_opf & REQ_FUA ? DP_FUA : 0) |
(bio->bi_opf & REQ_PREFLUSH ? DP_FLUSH : 0) |
(bio_op(bio) == REQ_OP_WRITE_SAME ? DP_WSAME : 0) |
- (bio_op(bio) == REQ_OP_DISCARD ? DP_DISCARD : 0);
+ (bio_op(bio) == REQ_OP_DISCARD ? DP_DISCARD : 0) |
+ (bio_op(bio) == REQ_OP_WRITE_ZEROES ? DP_DISCARD : 0);
else
return bio->bi_opf & REQ_SYNC ? DP_RW_SYNC : 0;
}
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 908c704e20aa..02255a0d68b9 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -1199,10 +1199,6 @@ static void decide_on_discard_support(struct drbd_device *device,
struct drbd_connection *connection = first_peer_device(device)->connection;
bool can_do = b ? blk_queue_discard(b) : true;
- if (can_do && b && !b->limits.discard_zeroes_data && !discard_zeroes_if_aligned) {
- can_do = false;
- drbd_info(device, "discard_zeroes_data=0 and discard_zeroes_if_aligned=no: disabling discards\n");
- }
if (can_do && connection->cstate >= C_CONNECTED && !(connection->agreed_features & DRBD_FF_TRIM)) {
can_do = false;
drbd_info(connection, "peer DRBD too old, does not support TRIM: disabling discards\n");
@@ -1217,10 +1213,12 @@ static void decide_on_discard_support(struct drbd_device *device,
blk_queue_discard_granularity(q, 512);
q->limits.max_discard_sectors = drbd_max_discard_sectors(connection);
queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
+ q->limits.max_write_zeroes_sectors = drbd_max_discard_sectors(connection);
} else {
queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q);
blk_queue_discard_granularity(q, 0);
q->limits.max_discard_sectors = 0;
+ q->limits.max_write_zeroes_sectors = 0;
}
}
@@ -1482,8 +1480,7 @@ static void sanitize_disk_conf(struct drbd_device *device, struct disk_conf *dis
if (disk_conf->al_extents > drbd_al_extents_max(nbc))
disk_conf->al_extents = drbd_al_extents_max(nbc);
- if (!blk_queue_discard(q)
- || (!q->limits.discard_zeroes_data && !disk_conf->discard_zeroes_if_aligned)) {
+ if (!blk_queue_discard(q)) {
if (disk_conf->rs_discard_granularity) {
disk_conf->rs_discard_granularity = 0; /* disable feature */
drbd_info(device, "rs_discard_granularity feature disabled\n");
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index aa6bf9692eff..1b0a2be24f39 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -1448,105 +1448,14 @@ void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backin
drbd_info(resource, "Method to ensure write ordering: %s\n", write_ordering_str[resource->write_ordering]);
}
-/*
- * We *may* ignore the discard-zeroes-data setting, if so configured.
- *
- * Assumption is that it "discard_zeroes_data=0" is only because the backend
- * may ignore partial unaligned discards.
- *
- * LVM/DM thin as of at least
- * LVM version: 2.02.115(2)-RHEL7 (2015-01-28)
- * Library version: 1.02.93-RHEL7 (2015-01-28)
- * Driver version: 4.29.0
- * still behaves this way.
- *
- * For unaligned (wrt. alignment and granularity) or too small discards,
- * we zero-out the initial (and/or) trailing unaligned partial chunks,
- * but discard all the aligned full chunks.
- *
- * At least for LVM/DM thin, the result is effectively "discard_zeroes_data=1".
- */
-int drbd_issue_discard_or_zero_out(struct drbd_device *device, sector_t start, unsigned int nr_sectors, bool discard)
-{
- struct block_device *bdev = device->ldev->backing_bdev;
- struct request_queue *q = bdev_get_queue(bdev);
- sector_t tmp, nr;
- unsigned int max_discard_sectors, granularity;
- int alignment;
- int err = 0;
-
- if (!discard)
- goto zero_out;
-
- /* Zero-sector (unknown) and one-sector granularities are the same. */
- granularity = max(q->limits.discard_granularity >> 9, 1U);
- alignment = (bdev_discard_alignment(bdev) >> 9) % granularity;
-
- max_discard_sectors = min(q->limits.max_discard_sectors, (1U << 22));
- max_discard_sectors -= max_discard_sectors % granularity;
- if (unlikely(!max_discard_sectors))
- goto zero_out;
-
- if (nr_sectors < granularity)
- goto zero_out;
-
- tmp = start;
- if (sector_div(tmp, granularity) != alignment) {
- if (nr_sectors < 2*granularity)
- goto zero_out;
- /* start + gran - (start + gran - align) % gran */
- tmp = start + granularity - alignment;
- tmp = start + granularity - sector_div(tmp, granularity);
-
- nr = tmp - start;
- err |= blkdev_issue_zeroout(bdev, start, nr, GFP_NOIO, 0);
- nr_sectors -= nr;
- start = tmp;
- }
- while (nr_sectors >= granularity) {
- nr = min_t(sector_t, nr_sectors, max_discard_sectors);
- err |= blkdev_issue_discard(bdev, start, nr, GFP_NOIO, 0);
- nr_sectors -= nr;
- start += nr;
- }
- zero_out:
- if (nr_sectors) {
- err |= blkdev_issue_zeroout(bdev, start, nr_sectors, GFP_NOIO, 0);
- }
- return err != 0;
-}
-
-static bool can_do_reliable_discards(struct drbd_device *device)
-{
- struct request_queue *q = bdev_get_queue(device->ldev->backing_bdev);
- struct disk_conf *dc;
- bool can_do;
-
- if (!blk_queue_discard(q))
- return false;
-
- if (q->limits.discard_zeroes_data)
- return true;
-
- rcu_read_lock();
- dc = rcu_dereference(device->ldev->disk_conf);
- can_do = dc->discard_zeroes_if_aligned;
- rcu_read_unlock();
- return can_do;
-}
-
static void drbd_issue_peer_discard(struct drbd_device *device, struct drbd_peer_request *peer_req)
{
- /* If the backend cannot discard, or does not guarantee
- * read-back zeroes in discarded ranges, we fall back to
- * zero-out. Unless configuration specifically requested
- * otherwise. */
- if (!can_do_reliable_discards(device))
- peer_req->flags |= EE_IS_TRIM_USE_ZEROOUT;
+ struct block_device *bdev = device->ldev->backing_bdev;
- if (drbd_issue_discard_or_zero_out(device, peer_req->i.sector,
- peer_req->i.size >> 9, !(peer_req->flags & EE_IS_TRIM_USE_ZEROOUT)))
+ if (blkdev_issue_zeroout(bdev, peer_req->i.sector, peer_req->i.size >> 9,
+ GFP_NOIO, 0))
peer_req->flags |= EE_WAS_ERROR;
+
drbd_endio_write_sec_final(peer_req);
}
@@ -2376,7 +2285,7 @@ static unsigned long wire_flags_to_bio_flags(u32 dpf)
static unsigned long wire_flags_to_bio_op(u32 dpf)
{
if (dpf & DP_DISCARD)
- return REQ_OP_DISCARD;
+ return REQ_OP_WRITE_ZEROES;
else
return REQ_OP_WRITE;
}
@@ -2567,7 +2476,7 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
op_flags = wire_flags_to_bio_flags(dp_flags);
if (pi->cmd == P_TRIM) {
D_ASSERT(peer_device, peer_req->i.size > 0);
- D_ASSERT(peer_device, op == REQ_OP_DISCARD);
+ D_ASSERT(peer_device, op == REQ_OP_WRITE_ZEROES);
D_ASSERT(peer_device, peer_req->pages == NULL);
} else if (peer_req->pages == NULL) {
D_ASSERT(device, peer_req->i.size == 0);
@@ -4880,7 +4789,7 @@ static int receive_rs_deallocated(struct drbd_connection *connection, struct pac
if (get_ldev(device)) {
struct drbd_peer_request *peer_req;
- const int op = REQ_OP_DISCARD;
+ const int op = REQ_OP_WRITE_ZEROES;
peer_req = drbd_alloc_peer_req(peer_device, ID_SYNCER, sector,
size, 0, GFP_NOIO);
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 652114ae1a8a..b5730e17b455 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -59,6 +59,7 @@ static struct drbd_request *drbd_req_new(struct drbd_device *device, struct bio
drbd_req_make_private_bio(req, bio_src);
req->rq_state = (bio_data_dir(bio_src) == WRITE ? RQ_WRITE : 0)
| (bio_op(bio_src) == REQ_OP_WRITE_SAME ? RQ_WSAME : 0)
+ | (bio_op(bio_src) == REQ_OP_WRITE_ZEROES ? RQ_UNMAP : 0)
| (bio_op(bio_src) == REQ_OP_DISCARD ? RQ_UNMAP : 0);
req->device = device;
req->master_bio = bio_src;
@@ -1148,10 +1149,10 @@ static int drbd_process_write_request(struct drbd_request *req)
static void drbd_process_discard_req(struct drbd_request *req)
{
- int err = drbd_issue_discard_or_zero_out(req->device,
- req->i.sector, req->i.size >> 9, true);
+ struct block_device *bdev = req->device->ldev->backing_bdev;
- if (err)
+ if (blkdev_issue_zeroout(bdev, req->i.sector, req->i.size >> 9,
+ GFP_NOIO, 0))
req->private_bio->bi_error = -EIO;
bio_endio(req->private_bio);
}
@@ -1180,7 +1181,8 @@ drbd_submit_req_private_bio(struct drbd_request *req)
if (get_ldev(device)) {
if (drbd_insert_fault(device, type))
bio_io_error(bio);
- else if (bio_op(bio) == REQ_OP_DISCARD)
+ else if (bio_op(bio) == REQ_OP_WRITE_ZEROES ||
+ bio_op(bio) == REQ_OP_DISCARD)
drbd_process_discard_req(req);
else
generic_make_request(bio);
@@ -1234,7 +1236,8 @@ drbd_request_prepare(struct drbd_device *device, struct bio *bio, unsigned long
_drbd_start_io_acct(device, req);
/* process discards always from our submitter thread */
- if (bio_op(bio) & REQ_OP_DISCARD)
+ if ((bio_op(bio) & REQ_OP_WRITE_ZEROES) ||
+ (bio_op(bio) & REQ_OP_DISCARD))
goto queue_for_submitter_thread;
if (rw == WRITE && req->private_bio && req->i.size
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 3bff33f21435..1afcb4e02d8d 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -174,7 +174,8 @@ void drbd_peer_request_endio(struct bio *bio)
struct drbd_peer_request *peer_req = bio->bi_private;
struct drbd_device *device = peer_req->peer_device->device;
bool is_write = bio_data_dir(bio) == WRITE;
- bool is_discard = !!(bio_op(bio) == REQ_OP_DISCARD);
+ bool is_discard = bio_op(bio) == REQ_OP_WRITE_ZEROES ||
+ bio_op(bio) == REQ_OP_DISCARD;
if (bio->bi_error && __ratelimit(&drbd_ratelimit_state))
drbd_warn(device, "%s: error=%d s=%llus\n",
@@ -249,6 +250,7 @@ void drbd_request_endio(struct bio *bio)
/* to avoid recursion in __req_mod */
if (unlikely(bio->bi_error)) {
switch (bio_op(bio)) {
+ case REQ_OP_WRITE_ZEROES:
case REQ_OP_DISCARD:
if (bio->bi_error == -EOPNOTSUPP)
what = DISCARD_COMPLETED_NOTSUPP;
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index 45b4384f650c..60d4c7653178 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -2805,8 +2805,10 @@ static int set_next_request(void)
fdc_queue = 0;
if (q) {
current_req = blk_fetch_request(q);
- if (current_req)
+ if (current_req) {
+ current_req->error_count = 0;
break;
+ }
}
} while (fdc_queue != old_pos);
@@ -2866,7 +2868,7 @@ do_request:
_floppy = floppy_type + DP->autodetect[DRS->probed_format];
} else
probing = 0;
- errors = &(current_req->errors);
+ errors = &(current_req->error_count);
tmp = make_raw_rw_request();
if (tmp < 2) {
request_done(tmp);
@@ -4207,9 +4209,7 @@ static int __init do_floppy_init(void)
disks[drive]->fops = &floppy_fops;
sprintf(disks[drive]->disk_name, "fd%d", drive);
- init_timer(&motor_off_timer[drive]);
- motor_off_timer[drive].data = drive;
- motor_off_timer[drive].function = motor_off_callback;
+ setup_timer(&motor_off_timer[drive], motor_off_callback, drive);
}
err = register_blkdev(FLOPPY_MAJOR, "fd");
diff --git a/drivers/block/hd.c b/drivers/block/hd.c
deleted file mode 100644
index 6043648da1e8..000000000000
--- a/drivers/block/hd.c
+++ /dev/null
@@ -1,803 +0,0 @@
-/*
- * Copyright (C) 1991, 1992 Linus Torvalds
- *
- * This is the low-level hd interrupt support. It traverses the
- * request-list, using interrupts to jump between functions. As
- * all the functions are called within interrupts, we may not
- * sleep. Special care is recommended.
- *
- * modified by Drew Eckhardt to check nr of hd's from the CMOS.
- *
- * Thanks to Branko Lankester, lankeste@fwi.uva.nl, who found a bug
- * in the early extended-partition checks and added DM partitions
- *
- * IRQ-unmask, drive-id, multiple-mode, support for ">16 heads",
- * and general streamlining by Mark Lord.
- *
- * Removed 99% of above. Use Mark's ide driver for those options.
- * This is now a lightweight ST-506 driver. (Paul Gortmaker)
- *
- * Modified 1995 Russell King for ARM processor.
- *
- * Bugfix: max_sectors must be <= 255 or the wheels tend to come
- * off in a hurry once you queue things up - Paul G. 02/2001
- */
-
-/* Uncomment the following if you want verbose error reports. */
-/* #define VERBOSE_ERRORS */
-
-#include <linux/blkdev.h>
-#include <linux/errno.h>
-#include <linux/signal.h>
-#include <linux/interrupt.h>
-#include <linux/timer.h>
-#include <linux/fs.h>
-#include <linux/kernel.h>
-#include <linux/genhd.h>
-#include <linux/string.h>
-#include <linux/ioport.h>
-#include <linux/init.h>
-#include <linux/blkpg.h>
-#include <linux/ata.h>
-#include <linux/hdreg.h>
-
-#define HD_IRQ 14
-
-#define REALLY_SLOW_IO
-#include <asm/io.h>
-#include <linux/uaccess.h>
-
-#ifdef __arm__
-#undef HD_IRQ
-#endif
-#include <asm/irq.h>
-#ifdef __arm__
-#define HD_IRQ IRQ_HARDDISK
-#endif
-
-/* Hd controller regster ports */
-
-#define HD_DATA 0x1f0 /* _CTL when writing */
-#define HD_ERROR 0x1f1 /* see err-bits */
-#define HD_NSECTOR 0x1f2 /* nr of sectors to read/write */
-#define HD_SECTOR 0x1f3 /* starting sector */
-#define HD_LCYL 0x1f4 /* starting cylinder */
-#define HD_HCYL 0x1f5 /* high byte of starting cyl */
-#define HD_CURRENT 0x1f6 /* 101dhhhh , d=drive, hhhh=head */
-#define HD_STATUS 0x1f7 /* see status-bits */
-#define HD_FEATURE HD_ERROR /* same io address, read=error, write=feature */
-#define HD_PRECOMP HD_FEATURE /* obsolete use of this port - predates IDE */
-#define HD_COMMAND HD_STATUS /* same io address, read=status, write=cmd */
-
-#define HD_CMD 0x3f6 /* used for resets */
-#define HD_ALTSTATUS 0x3f6 /* same as HD_STATUS but doesn't clear irq */
-
-/* Bits of HD_STATUS */
-#define ERR_STAT 0x01
-#define INDEX_STAT 0x02
-#define ECC_STAT 0x04 /* Corrected error */
-#define DRQ_STAT 0x08
-#define SEEK_STAT 0x10
-#define SERVICE_STAT SEEK_STAT
-#define WRERR_STAT 0x20
-#define READY_STAT 0x40
-#define BUSY_STAT 0x80
-
-/* Bits for HD_ERROR */
-#define MARK_ERR 0x01 /* Bad address mark */
-#define TRK0_ERR 0x02 /* couldn't find track 0 */
-#define ABRT_ERR 0x04 /* Command aborted */
-#define MCR_ERR 0x08 /* media change request */
-#define ID_ERR 0x10 /* ID field not found */
-#define MC_ERR 0x20 /* media changed */
-#define ECC_ERR 0x40 /* Uncorrectable ECC error */
-#define BBD_ERR 0x80 /* pre-EIDE meaning: block marked bad */
-#define ICRC_ERR 0x80 /* new meaning: CRC error during transfer */
-
-static DEFINE_SPINLOCK(hd_lock);
-static struct request_queue *hd_queue;
-static struct request *hd_req;
-
-#define TIMEOUT_VALUE (6*HZ)
-#define HD_DELAY 0
-
-#define MAX_ERRORS 16 /* Max read/write errors/sector */
-#define RESET_FREQ 8 /* Reset controller every 8th retry */
-#define RECAL_FREQ 4 /* Recalibrate every 4th retry */
-#define MAX_HD 2
-
-#define STAT_OK (READY_STAT|SEEK_STAT)
-#define OK_STATUS(s) (((s)&(STAT_OK|(BUSY_STAT|WRERR_STAT|ERR_STAT)))==STAT_OK)
-
-static void recal_intr(void);
-static void bad_rw_intr(void);
-
-static int reset;
-static int hd_error;
-
-/*
- * This struct defines the HD's and their types.
- */
-struct hd_i_struct {
- unsigned int head, sect, cyl, wpcom, lzone, ctl;
- int unit;
- int recalibrate;
- int special_op;
-};
-
-#ifdef HD_TYPE
-static struct hd_i_struct hd_info[] = { HD_TYPE };
-static int NR_HD = ARRAY_SIZE(hd_info);
-#else
-static struct hd_i_struct hd_info[MAX_HD];
-static int NR_HD;
-#endif
-
-static struct gendisk *hd_gendisk[MAX_HD];
-
-static struct timer_list device_timer;
-
-#define TIMEOUT_VALUE (6*HZ)
-
-#define SET_TIMER \
- do { \
- mod_timer(&device_timer, jiffies + TIMEOUT_VALUE); \
- } while (0)
-
-static void (*do_hd)(void) = NULL;
-#define SET_HANDLER(x) \
-if ((do_hd = (x)) != NULL) \
- SET_TIMER; \
-else \
- del_timer(&device_timer);
-
-
-#if (HD_DELAY > 0)
-
-#include <linux/i8253.h>
-
-unsigned long last_req;
-
-unsigned long read_timer(void)
-{
- unsigned long t, flags;
- int i;
-
- raw_spin_lock_irqsave(&i8253_lock, flags);
- t = jiffies * 11932;
- outb_p(0, 0x43);
- i = inb_p(0x40);
- i |= inb(0x40) << 8;
- raw_spin_unlock_irqrestore(&i8253_lock, flags);
- return(t - i);
-}
-#endif
-
-static void __init hd_setup(char *str, int *ints)
-{
- int hdind = 0;
-
- if (ints[0] != 3)
- return;
- if (hd_info[0].head != 0)
- hdind = 1;
- hd_info[hdind].head = ints[2];
- hd_info[hdind].sect = ints[3];
- hd_info[hdind].cyl = ints[1];
- hd_info[hdind].wpcom = 0;
- hd_info[hdind].lzone = ints[1];
- hd_info[hdind].ctl = (ints[2] > 8 ? 8 : 0);
- NR_HD = hdind+1;
-}
-
-static bool hd_end_request(int err, unsigned int bytes)
-{
- if (__blk_end_request(hd_req, err, bytes))
- return true;
- hd_req = NULL;
- return false;
-}
-
-static bool hd_end_request_cur(int err)
-{
- return hd_end_request(err, blk_rq_cur_bytes(hd_req));
-}
-
-static void dump_status(const char *msg, unsigned int stat)
-{
- char *name = "hd?";
- if (hd_req)
- name = hd_req->rq_disk->disk_name;
-
-#ifdef VERBOSE_ERRORS
- printk("%s: %s: status=0x%02x { ", name, msg, stat & 0xff);
- if (stat & BUSY_STAT) printk("Busy ");
- if (stat & READY_STAT) printk("DriveReady ");
- if (stat & WRERR_STAT) printk("WriteFault ");
- if (stat & SEEK_STAT) printk("SeekComplete ");
- if (stat & DRQ_STAT) printk("DataRequest ");
- if (stat & ECC_STAT) printk("CorrectedError ");
- if (stat & INDEX_STAT) printk("Index ");
- if (stat & ERR_STAT) printk("Error ");
- printk("}\n");
- if ((stat & ERR_STAT) == 0) {
- hd_error = 0;
- } else {
- hd_error = inb(HD_ERROR);
- printk("%s: %s: error=0x%02x { ", name, msg, hd_error & 0xff);
- if (hd_error & BBD_ERR) printk("BadSector ");
- if (hd_error & ECC_ERR) printk("UncorrectableError ");
- if (hd_error & ID_ERR) printk("SectorIdNotFound ");
- if (hd_error & ABRT_ERR) printk("DriveStatusError ");
- if (hd_error & TRK0_ERR) printk("TrackZeroNotFound ");
- if (hd_error & MARK_ERR) printk("AddrMarkNotFound ");
- printk("}");
- if (hd_error & (BBD_ERR|ECC_ERR|ID_ERR|MARK_ERR)) {
- printk(", CHS=%d/%d/%d", (inb(HD_HCYL)<<8) + inb(HD_LCYL),
- inb(HD_CURRENT) & 0xf, inb(HD_SECTOR));
- if (hd_req)
- printk(", sector=%ld", blk_rq_pos(hd_req));
- }
- printk("\n");
- }
-#else
- printk("%s: %s: status=0x%02x.\n", name, msg, stat & 0xff);
- if ((stat & ERR_STAT) == 0) {
- hd_error = 0;
- } else {
- hd_error = inb(HD_ERROR);
- printk("%s: %s: error=0x%02x.\n", name, msg, hd_error & 0xff);
- }
-#endif
-}
-
-static void check_status(void)
-{
- int i = inb_p(HD_STATUS);
-
- if (!OK_STATUS(i)) {
- dump_status("check_status", i);
- bad_rw_intr();
- }
-}
-
-static int controller_busy(void)
-{
- int retries = 100000;
- unsigned char status;
-
- do {
- status = inb_p(HD_STATUS);
- } while ((status & BUSY_STAT) && --retries);
- return status;
-}
-
-static int status_ok(void)
-{
- unsigned char status = inb_p(HD_STATUS);
-
- if (status & BUSY_STAT)
- return 1; /* Ancient, but does it make sense??? */
- if (status & WRERR_STAT)
- return 0;
- if (!(status & READY_STAT))
- return 0;
- if (!(status & SEEK_STAT))
- return 0;
- return 1;
-}
-
-static int controller_ready(unsigned int drive, unsigned int head)
-{
- int retry = 100;
-
- do {
- if (controller_busy() & BUSY_STAT)
- return 0;
- outb_p(0xA0 | (drive<<4) | head, HD_CURRENT);
- if (status_ok())
- return 1;
- } while (--retry);
- return 0;
-}
-
-static void hd_out(struct hd_i_struct *disk,
- unsigned int nsect,
- unsigned int sect,
- unsigned int head,
- unsigned int cyl,
- unsigned int cmd,
- void (*intr_addr)(void))
-{
- unsigned short port;
-
-#if (HD_DELAY > 0)
- while (read_timer() - last_req < HD_DELAY)
- /* nothing */;
-#endif
- if (reset)
- return;
- if (!controller_ready(disk->unit, head)) {
- reset = 1;
- return;
- }
- SET_HANDLER(intr_addr);
- outb_p(disk->ctl, HD_CMD);
- port = HD_DATA;
- outb_p(disk->wpcom >> 2, ++port);
- outb_p(nsect, ++port);
- outb_p(sect, ++port);
- outb_p(cyl, ++port);
- outb_p(cyl >> 8, ++port);
- outb_p(0xA0 | (disk->unit << 4) | head, ++port);
- outb_p(cmd, ++port);
-}
-
-static void hd_request (void);
-
-static int drive_busy(void)
-{
- unsigned int i;
- unsigned char c;
-
- for (i = 0; i < 500000 ; i++) {
- c = inb_p(HD_STATUS);
- if ((c & (BUSY_STAT | READY_STAT | SEEK_STAT)) == STAT_OK)
- return 0;
- }
- dump_status("reset timed out", c);
- return 1;
-}
-
-static void reset_controller(void)
-{
- int i;
-
- outb_p(4, HD_CMD);
- for (i = 0; i < 1000; i++) barrier();
- outb_p(hd_info[0].ctl & 0x0f, HD_CMD);
- for (i = 0; i < 1000; i++) barrier();
- if (drive_busy())
- printk("hd: controller still busy\n");
- else if ((hd_error = inb(HD_ERROR)) != 1)
- printk("hd: controller reset failed: %02x\n", hd_error);
-}
-
-static void reset_hd(void)
-{
- static int i;
-
-repeat:
- if (reset) {
- reset = 0;
- i = -1;
- reset_controller();
- } else {
- check_status();
- if (reset)
- goto repeat;
- }
- if (++i < NR_HD) {
- struct hd_i_struct *disk = &hd_info[i];
- disk->special_op = disk->recalibrate = 1;
- hd_out(disk, disk->sect, disk->sect, disk->head-1,
- disk->cyl, ATA_CMD_INIT_DEV_PARAMS, &reset_hd);
- if (reset)
- goto repeat;
- } else
- hd_request();
-}
-
-/*
- * Ok, don't know what to do with the unexpected interrupts: on some machines
- * doing a reset and a retry seems to result in an eternal loop. Right now I
- * ignore it, and just set the timeout.
- *
- * On laptops (and "green" PCs), an unexpected interrupt occurs whenever the
- * drive enters "idle", "standby", or "sleep" mode, so if the status looks
- * "good", we just ignore the interrupt completely.
- */
-static void unexpected_hd_interrupt(void)
-{
- unsigned int stat = inb_p(HD_STATUS);
-
- if (stat & (BUSY_STAT|DRQ_STAT|ECC_STAT|ERR_STAT)) {
- dump_status("unexpected interrupt", stat);
- SET_TIMER;
- }
-}
-
-/*
- * bad_rw_intr() now tries to be a bit smarter and does things
- * according to the error returned by the controller.
- * -Mika Liljeberg (liljeber@cs.Helsinki.FI)
- */
-static void bad_rw_intr(void)
-{
- struct request *req = hd_req;
-
- if (req != NULL) {
- struct hd_i_struct *disk = req->rq_disk->private_data;
- if (++req->errors >= MAX_ERRORS || (hd_error & BBD_ERR)) {
- hd_end_request_cur(-EIO);
- disk->special_op = disk->recalibrate = 1;
- } else if (req->errors % RESET_FREQ == 0)
- reset = 1;
- else if ((hd_error & TRK0_ERR) || req->errors % RECAL_FREQ == 0)
- disk->special_op = disk->recalibrate = 1;
- /* Otherwise just retry */
- }
-}
-
-static inline int wait_DRQ(void)
-{
- int retries;
- int stat;
-
- for (retries = 0; retries < 100000; retries++) {
- stat = inb_p(HD_STATUS);
- if (stat & DRQ_STAT)
- return 0;
- }
- dump_status("wait_DRQ", stat);
- return -1;
-}
-
-static void read_intr(void)
-{
- struct request *req;
- int i, retries = 100000;
-
- do {
- i = (unsigned) inb_p(HD_STATUS);
- if (i & BUSY_STAT)
- continue;
- if (!OK_STATUS(i))
- break;
- if (i & DRQ_STAT)
- goto ok_to_read;
- } while (--retries > 0);
- dump_status("read_intr", i);
- bad_rw_intr();
- hd_request();
- return;
-
-ok_to_read:
- req = hd_req;
- insw(HD_DATA, bio_data(req->bio), 256);
-#ifdef DEBUG
- printk("%s: read: sector %ld, remaining = %u, buffer=%p\n",
- req->rq_disk->disk_name, blk_rq_pos(req) + 1,
- blk_rq_sectors(req) - 1, bio_data(req->bio)+512);
-#endif
- if (hd_end_request(0, 512)) {
- SET_HANDLER(&read_intr);
- return;
- }
-
- (void) inb_p(HD_STATUS);
-#if (HD_DELAY > 0)
- last_req = read_timer();
-#endif
- hd_request();
-}
-
-static void write_intr(void)
-{
- struct request *req = hd_req;
- int i;
- int retries = 100000;
-
- do {
- i = (unsigned) inb_p(HD_STATUS);
- if (i & BUSY_STAT)
- continue;
- if (!OK_STATUS(i))
- break;
- if ((blk_rq_sectors(req) <= 1) || (i & DRQ_STAT))
- goto ok_to_write;
- } while (--retries > 0);
- dump_status("write_intr", i);
- bad_rw_intr();
- hd_request();
- return;
-
-ok_to_write:
- if (hd_end_request(0, 512)) {
- SET_HANDLER(&write_intr);
- outsw(HD_DATA, bio_data(req->bio), 256);
- return;
- }
-
-#if (HD_DELAY > 0)
- last_req = read_timer();
-#endif
- hd_request();
-}
-
-static void recal_intr(void)
-{
- check_status();
-#if (HD_DELAY > 0)
- last_req = read_timer();
-#endif
- hd_request();
-}
-
-/*
- * This is another of the error-routines I don't know what to do with. The
- * best idea seems to just set reset, and start all over again.
- */
-static void hd_times_out(unsigned long dummy)
-{
- char *name;
-
- do_hd = NULL;
-
- if (!hd_req)
- return;
-
- spin_lock_irq(hd_queue->queue_lock);
- reset = 1;
- name = hd_req->rq_disk->disk_name;
- printk("%s: timeout\n", name);
- if (++hd_req->errors >= MAX_ERRORS) {
-#ifdef DEBUG
- printk("%s: too many errors\n", name);
-#endif
- hd_end_request_cur(-EIO);
- }
- hd_request();
- spin_unlock_irq(hd_queue->queue_lock);
-}
-
-static int do_special_op(struct hd_i_struct *disk, struct request *req)
-{
- if (disk->recalibrate) {
- disk->recalibrate = 0;
- hd_out(disk, disk->sect, 0, 0, 0, ATA_CMD_RESTORE, &recal_intr);
- return reset;
- }
- if (disk->head > 16) {
- printk("%s: cannot handle device with more than 16 heads - giving up\n", req->rq_disk->disk_name);
- hd_end_request_cur(-EIO);
- }
- disk->special_op = 0;
- return 1;
-}
-
-/*
- * The driver enables interrupts as much as possible. In order to do this,
- * (a) the device-interrupt is disabled before entering hd_request(),
- * and (b) the timeout-interrupt is disabled before the sti().
- *
- * Interrupts are still masked (by default) whenever we are exchanging
- * data/cmds with a drive, because some drives seem to have very poor
- * tolerance for latency during I/O. The IDE driver has support to unmask
- * interrupts for non-broken hardware, so use that driver if required.
- */
-static void hd_request(void)
-{
- unsigned int block, nsect, sec, track, head, cyl;
- struct hd_i_struct *disk;
- struct request *req;
-
- if (do_hd)
- return;
-repeat:
- del_timer(&device_timer);
-
- if (!hd_req) {
- hd_req = blk_fetch_request(hd_queue);
- if (!hd_req) {
- do_hd = NULL;
- return;
- }
- }
- req = hd_req;
-
- if (reset) {
- reset_hd();
- return;
- }
- disk = req->rq_disk->private_data;
- block = blk_rq_pos(req);
- nsect = blk_rq_sectors(req);
- if (block >= get_capacity(req->rq_disk) ||
- ((block+nsect) > get_capacity(req->rq_disk))) {
- printk("%s: bad access: block=%d, count=%d\n",
- req->rq_disk->disk_name, block, nsect);
- hd_end_request_cur(-EIO);
- goto repeat;
- }
-
- if (disk->special_op) {
- if (do_special_op(disk, req))
- goto repeat;
- return;
- }
- sec = block % disk->sect + 1;
- track = block / disk->sect;
- head = track % disk->head;
- cyl = track / disk->head;
-#ifdef DEBUG
- printk("%s: %sing: CHS=%d/%d/%d, sectors=%d, buffer=%p\n",
- req->rq_disk->disk_name,
- req_data_dir(req) == READ ? "read" : "writ",
- cyl, head, sec, nsect, bio_data(req->bio));
-#endif
-
- switch (req_op(req)) {
- case REQ_OP_READ:
- hd_out(disk, nsect, sec, head, cyl, ATA_CMD_PIO_READ,
- &read_intr);
- if (reset)
- goto repeat;
- break;
- case REQ_OP_WRITE:
- hd_out(disk, nsect, sec, head, cyl, ATA_CMD_PIO_WRITE,
- &write_intr);
- if (reset)
- goto repeat;
- if (wait_DRQ()) {
- bad_rw_intr();
- goto repeat;
- }
- outsw(HD_DATA, bio_data(req->bio), 256);
- break;
- default:
- printk("unknown hd-command\n");
- hd_end_request_cur(-EIO);
- break;
- }
-}
-
-static void do_hd_request(struct request_queue *q)
-{
- hd_request();
-}
-
-static int hd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
-{
- struct hd_i_struct *disk = bdev->bd_disk->private_data;
-
- geo->heads = disk->head;
- geo->sectors = disk->sect;
- geo->cylinders = disk->cyl;
- return 0;
-}
-
-/*
- * Releasing a block device means we sync() it, so that it can safely
- * be forgotten about...
- */
-
-static irqreturn_t hd_interrupt(int irq, void *dev_id)
-{
- void (*handler)(void) = do_hd;
-
- spin_lock(hd_queue->queue_lock);
-
- do_hd = NULL;
- del_timer(&device_timer);
- if (!handler)
- handler = unexpected_hd_interrupt;
- handler();
-
- spin_unlock(hd_queue->queue_lock);
-
- return IRQ_HANDLED;
-}
-
-static const struct block_device_operations hd_fops = {
- .getgeo = hd_getgeo,
-};
-
-static int __init hd_init(void)
-{
- int drive;
-
- if (register_blkdev(HD_MAJOR, "hd"))
- return -1;
-
- hd_queue = blk_init_queue(do_hd_request, &hd_lock);
- if (!hd_queue) {
- unregister_blkdev(HD_MAJOR, "hd");
- return -ENOMEM;
- }
-
- blk_queue_max_hw_sectors(hd_queue, 255);
- init_timer(&device_timer);
- device_timer.function = hd_times_out;
- blk_queue_logical_block_size(hd_queue, 512);
-
- if (!NR_HD) {
- /*
- * We don't know anything about the drive. This means
- * that you *MUST* specify the drive parameters to the
- * kernel yourself.
- *
- * If we were on an i386, we used to read this info from
- * the BIOS or CMOS. This doesn't work all that well,
- * since this assumes that this is a primary or secondary
- * drive, and if we're using this legacy driver, it's
- * probably an auxiliary controller added to recover
- * legacy data off an ST-506 drive. Either way, it's
- * definitely safest to have the user explicitly specify
- * the information.
- */
- printk("hd: no drives specified - use hd=cyl,head,sectors"
- " on kernel command line\n");
- goto out;
- }
-
- for (drive = 0 ; drive < NR_HD ; drive++) {
- struct gendisk *disk = alloc_disk(64);
- struct hd_i_struct *p = &hd_info[drive];
- if (!disk)
- goto Enomem;
- disk->major = HD_MAJOR;
- disk->first_minor = drive << 6;
- disk->fops = &hd_fops;
- sprintf(disk->disk_name, "hd%c", 'a'+drive);
- disk->private_data = p;
- set_capacity(disk, p->head * p->sect * p->cyl);
- disk->queue = hd_queue;
- p->unit = drive;
- hd_gendisk[drive] = disk;
- printk("%s: %luMB, CHS=%d/%d/%d\n",
- disk->disk_name, (unsigned long)get_capacity(disk)/2048,
- p->cyl, p->head, p->sect);
- }
-
- if (request_irq(HD_IRQ, hd_interrupt, 0, "hd", NULL)) {
- printk("hd: unable to get IRQ%d for the hard disk driver\n",
- HD_IRQ);
- goto out1;
- }
- if (!request_region(HD_DATA, 8, "hd")) {
- printk(KERN_WARNING "hd: port 0x%x busy\n", HD_DATA);
- goto out2;
- }
- if (!request_region(HD_CMD, 1, "hd(cmd)")) {
- printk(KERN_WARNING "hd: port 0x%x busy\n", HD_CMD);
- goto out3;
- }
-
- /* Let them fly */
- for (drive = 0; drive < NR_HD; drive++)
- add_disk(hd_gendisk[drive]);
-
- return 0;
-
-out3:
- release_region(HD_DATA, 8);
-out2:
- free_irq(HD_IRQ, NULL);
-out1:
- for (drive = 0; drive < NR_HD; drive++)
- put_disk(hd_gendisk[drive]);
- NR_HD = 0;
-out:
- del_timer(&device_timer);
- unregister_blkdev(HD_MAJOR, "hd");
- blk_cleanup_queue(hd_queue);
- return -1;
-Enomem:
- while (drive--)
- put_disk(hd_gendisk[drive]);
- goto out;
-}
-
-static int __init parse_hd_setup(char *line)
-{
- int ints[6];
-
- (void) get_options(line, ARRAY_SIZE(ints), ints);
- hd_setup(NULL, ints);
-
- return 1;
-}
-__setup("hd=", parse_hd_setup);
-
-late_initcall(hd_init);
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 0ecb6461ed81..994403efee19 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -445,32 +445,27 @@ static int lo_req_flush(struct loop_device *lo, struct request *rq)
return ret;
}
-static inline void handle_partial_read(struct loop_cmd *cmd, long bytes)
+static void lo_complete_rq(struct request *rq)
{
- if (bytes < 0 || op_is_write(req_op(cmd->rq)))
- return;
+ struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq);
- if (unlikely(bytes < blk_rq_bytes(cmd->rq))) {
+ if (unlikely(req_op(cmd->rq) == REQ_OP_READ && cmd->use_aio &&
+ cmd->ret >= 0 && cmd->ret < blk_rq_bytes(cmd->rq))) {
struct bio *bio = cmd->rq->bio;
- bio_advance(bio, bytes);
+ bio_advance(bio, cmd->ret);
zero_fill_bio(bio);
}
+
+ blk_mq_end_request(rq, cmd->ret < 0 ? -EIO : 0);
}
static void lo_rw_aio_complete(struct kiocb *iocb, long ret, long ret2)
{
struct loop_cmd *cmd = container_of(iocb, struct loop_cmd, iocb);
- struct request *rq = cmd->rq;
-
- handle_partial_read(cmd, ret);
- if (ret > 0)
- ret = 0;
- else if (ret < 0)
- ret = -EIO;
-
- blk_mq_complete_request(rq, ret);
+ cmd->ret = ret;
+ blk_mq_complete_request(cmd->rq);
}
static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
@@ -528,6 +523,7 @@ static int do_req_filebacked(struct loop_device *lo, struct request *rq)
case REQ_OP_FLUSH:
return lo_req_flush(lo, rq);
case REQ_OP_DISCARD:
+ case REQ_OP_WRITE_ZEROES:
return lo_discard(lo, rq, pos);
case REQ_OP_WRITE:
if (lo->transfer)
@@ -826,7 +822,7 @@ static void loop_config_discard(struct loop_device *lo)
q->limits.discard_granularity = 0;
q->limits.discard_alignment = 0;
blk_queue_max_discard_sectors(q, 0);
- q->limits.discard_zeroes_data = 0;
+ blk_queue_max_write_zeroes_sectors(q, 0);
queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q);
return;
}
@@ -834,7 +830,7 @@ static void loop_config_discard(struct loop_device *lo)
q->limits.discard_granularity = inode->i_sb->s_blocksize;
q->limits.discard_alignment = 0;
blk_queue_max_discard_sectors(q, UINT_MAX >> 9);
- q->limits.discard_zeroes_data = 1;
+ blk_queue_max_write_zeroes_sectors(q, UINT_MAX >> 9);
queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
}
@@ -1660,6 +1656,7 @@ static int loop_queue_rq(struct blk_mq_hw_ctx *hctx,
switch (req_op(cmd->rq)) {
case REQ_OP_FLUSH:
case REQ_OP_DISCARD:
+ case REQ_OP_WRITE_ZEROES:
cmd->use_aio = false;
break;
default:
@@ -1686,8 +1683,10 @@ static void loop_handle_cmd(struct loop_cmd *cmd)
ret = do_req_filebacked(lo, cmd->rq);
failed:
/* complete non-aio request */
- if (!cmd->use_aio || ret)
- blk_mq_complete_request(cmd->rq, ret ? -EIO : 0);
+ if (!cmd->use_aio || ret) {
+ cmd->ret = ret ? -EIO : 0;
+ blk_mq_complete_request(cmd->rq);
+ }
}
static void loop_queue_work(struct kthread_work *work)
@@ -1710,9 +1709,10 @@ static int loop_init_request(void *data, struct request *rq,
return 0;
}
-static struct blk_mq_ops loop_mq_ops = {
+static const struct blk_mq_ops loop_mq_ops = {
.queue_rq = loop_queue_rq,
.init_request = loop_init_request,
+ .complete = lo_complete_rq,
};
static int loop_add(struct loop_device **l, int i)
diff --git a/drivers/block/loop.h b/drivers/block/loop.h
index fb2237c73e61..fecd3f97ef8c 100644
--- a/drivers/block/loop.h
+++ b/drivers/block/loop.h
@@ -70,6 +70,7 @@ struct loop_cmd {
struct request *rq;
struct list_head list;
bool use_aio; /* use AIO interface to handle I/O */
+ long ret;
struct kiocb iocb;
};
diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c
deleted file mode 100644
index 286f276f586e..000000000000
--- a/drivers/block/mg_disk.c
+++ /dev/null
@@ -1,1112 +0,0 @@
-/*
- * drivers/block/mg_disk.c
- *
- * Support for the mGine m[g]flash IO mode.
- * Based on legacy hd.c
- *
- * (c) 2008 mGine Co.,LTD
- * (c) 2008 unsik Kim <donari75@gmail.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/fs.h>
-#include <linux/blkdev.h>
-#include <linux/hdreg.h>
-#include <linux/ata.h>
-#include <linux/interrupt.h>
-#include <linux/delay.h>
-#include <linux/platform_device.h>
-#include <linux/gpio.h>
-#include <linux/mg_disk.h>
-#include <linux/slab.h>
-
-#define MG_RES_SEC (CONFIG_MG_DISK_RES << 1)
-
-/* name for block device */
-#define MG_DISK_NAME "mgd"
-
-#define MG_DISK_MAJ 0
-#define MG_DISK_MAX_PART 16
-#define MG_SECTOR_SIZE 512
-#define MG_MAX_SECTS 256
-
-/* Register offsets */
-#define MG_BUFF_OFFSET 0x8000
-#define MG_REG_OFFSET 0xC000
-#define MG_REG_FEATURE (MG_REG_OFFSET + 2) /* write case */
-#define MG_REG_ERROR (MG_REG_OFFSET + 2) /* read case */
-#define MG_REG_SECT_CNT (MG_REG_OFFSET + 4)
-#define MG_REG_SECT_NUM (MG_REG_OFFSET + 6)
-#define MG_REG_CYL_LOW (MG_REG_OFFSET + 8)
-#define MG_REG_CYL_HIGH (MG_REG_OFFSET + 0xA)
-#define MG_REG_DRV_HEAD (MG_REG_OFFSET + 0xC)
-#define MG_REG_COMMAND (MG_REG_OFFSET + 0xE) /* write case */
-#define MG_REG_STATUS (MG_REG_OFFSET + 0xE) /* read case */
-#define MG_REG_DRV_CTRL (MG_REG_OFFSET + 0x10)
-#define MG_REG_BURST_CTRL (MG_REG_OFFSET + 0x12)
-
-/* handy status */
-#define MG_STAT_READY (ATA_DRDY | ATA_DSC)
-#define MG_READY_OK(s) (((s) & (MG_STAT_READY | (ATA_BUSY | ATA_DF | \
- ATA_ERR))) == MG_STAT_READY)
-
-/* error code for others */
-#define MG_ERR_NONE 0
-#define MG_ERR_TIMEOUT 0x100
-#define MG_ERR_INIT_STAT 0x101
-#define MG_ERR_TRANSLATION 0x102
-#define MG_ERR_CTRL_RST 0x103
-#define MG_ERR_INV_STAT 0x104
-#define MG_ERR_RSTOUT 0x105
-
-#define MG_MAX_ERRORS 6 /* Max read/write errors */
-
-/* command */
-#define MG_CMD_RD 0x20
-#define MG_CMD_WR 0x30
-#define MG_CMD_SLEEP 0x99
-#define MG_CMD_WAKEUP 0xC3
-#define MG_CMD_ID 0xEC
-#define MG_CMD_WR_CONF 0x3C
-#define MG_CMD_RD_CONF 0x40
-
-/* operation mode */
-#define MG_OP_CASCADE (1 << 0)
-#define MG_OP_CASCADE_SYNC_RD (1 << 1)
-#define MG_OP_CASCADE_SYNC_WR (1 << 2)
-#define MG_OP_INTERLEAVE (1 << 3)
-
-/* synchronous */
-#define MG_BURST_LAT_4 (3 << 4)
-#define MG_BURST_LAT_5 (4 << 4)
-#define MG_BURST_LAT_6 (5 << 4)
-#define MG_BURST_LAT_7 (6 << 4)
-#define MG_BURST_LAT_8 (7 << 4)
-#define MG_BURST_LEN_4 (1 << 1)
-#define MG_BURST_LEN_8 (2 << 1)
-#define MG_BURST_LEN_16 (3 << 1)
-#define MG_BURST_LEN_32 (4 << 1)
-#define MG_BURST_LEN_CONT (0 << 1)
-
-/* timeout value (unit: ms) */
-#define MG_TMAX_CONF_TO_CMD 1
-#define MG_TMAX_WAIT_RD_DRQ 10
-#define MG_TMAX_WAIT_WR_DRQ 500
-#define MG_TMAX_RST_TO_BUSY 10
-#define MG_TMAX_HDRST_TO_RDY 500
-#define MG_TMAX_SWRST_TO_RDY 500
-#define MG_TMAX_RSTOUT 3000
-
-#define MG_DEV_MASK (MG_BOOT_DEV | MG_STORAGE_DEV | MG_STORAGE_DEV_SKIP_RST)
-
-/* main structure for mflash driver */
-struct mg_host {
- struct device *dev;
-
- struct request_queue *breq;
- struct request *req;
- spinlock_t lock;
- struct gendisk *gd;
-
- struct timer_list timer;
- void (*mg_do_intr) (struct mg_host *);
-
- u16 id[ATA_ID_WORDS];
-
- u16 cyls;
- u16 heads;
- u16 sectors;
- u32 n_sectors;
- u32 nres_sectors;
-
- void __iomem *dev_base;
- unsigned int irq;
- unsigned int rst;
- unsigned int rstout;
-
- u32 major;
- u32 error;
-};
-
-/*
- * Debugging macro and defines
- */
-#undef DO_MG_DEBUG
-#ifdef DO_MG_DEBUG
-# define MG_DBG(fmt, args...) \
- printk(KERN_DEBUG "%s:%d "fmt, __func__, __LINE__, ##args)
-#else /* CONFIG_MG_DEBUG */
-# define MG_DBG(fmt, args...) do { } while (0)
-#endif /* CONFIG_MG_DEBUG */
-
-static void mg_request(struct request_queue *);
-
-static bool mg_end_request(struct mg_host *host, int err, unsigned int nr_bytes)
-{
- if (__blk_end_request(host->req, err, nr_bytes))
- return true;
-
- host->req = NULL;
- return false;
-}
-
-static bool mg_end_request_cur(struct mg_host *host, int err)
-{
- return mg_end_request(host, err, blk_rq_cur_bytes(host->req));
-}
-
-static void mg_dump_status(const char *msg, unsigned int stat,
- struct mg_host *host)
-{
- char *name = MG_DISK_NAME;
-
- if (host->req)
- name = host->req->rq_disk->disk_name;
-
- printk(KERN_ERR "%s: %s: status=0x%02x { ", name, msg, stat & 0xff);
- if (stat & ATA_BUSY)
- printk("Busy ");
- if (stat & ATA_DRDY)
- printk("DriveReady ");
- if (stat & ATA_DF)
- printk("WriteFault ");
- if (stat & ATA_DSC)
- printk("SeekComplete ");
- if (stat & ATA_DRQ)
- printk("DataRequest ");
- if (stat & ATA_CORR)
- printk("CorrectedError ");
- if (stat & ATA_ERR)
- printk("Error ");
- printk("}\n");
- if ((stat & ATA_ERR) == 0) {
- host->error = 0;
- } else {
- host->error = inb((unsigned long)host->dev_base + MG_REG_ERROR);
- printk(KERN_ERR "%s: %s: error=0x%02x { ", name, msg,
- host->error & 0xff);
- if (host->error & ATA_BBK)
- printk("BadSector ");
- if (host->error & ATA_UNC)
- printk("UncorrectableError ");
- if (host->error & ATA_IDNF)
- printk("SectorIdNotFound ");
- if (host->error & ATA_ABORTED)
- printk("DriveStatusError ");
- if (host->error & ATA_AMNF)
- printk("AddrMarkNotFound ");
- printk("}");
- if (host->error & (ATA_BBK | ATA_UNC | ATA_IDNF | ATA_AMNF)) {
- if (host->req)
- printk(", sector=%u",
- (unsigned int)blk_rq_pos(host->req));
- }
- printk("\n");
- }
-}
-
-static unsigned int mg_wait(struct mg_host *host, u32 expect, u32 msec)
-{
- u8 status;
- unsigned long expire, cur_jiffies;
- struct mg_drv_data *prv_data = host->dev->platform_data;
-
- host->error = MG_ERR_NONE;
- expire = jiffies + msecs_to_jiffies(msec);
-
- /* These 2 times dummy status read prevents reading invalid
- * status. A very little time (3 times of mflash operating clk)
- * is required for busy bit is set. Use dummy read instead of
- * busy wait, because mflash's PLL is machine dependent.
- */
- if (prv_data->use_polling) {
- status = inb((unsigned long)host->dev_base + MG_REG_STATUS);
- status = inb((unsigned long)host->dev_base + MG_REG_STATUS);
- }
-
- status = inb((unsigned long)host->dev_base + MG_REG_STATUS);
-
- do {
- cur_jiffies = jiffies;
- if (status & ATA_BUSY) {
- if (expect == ATA_BUSY)
- break;
- } else {
- /* Check the error condition! */
- if (status & ATA_ERR) {
- mg_dump_status("mg_wait", status, host);
- break;
- }
-
- if (expect == MG_STAT_READY)
- if (MG_READY_OK(status))
- break;
-
- if (expect == ATA_DRQ)
- if (status & ATA_DRQ)
- break;
- }
- if (!msec) {
- mg_dump_status("not ready", status, host);
- return MG_ERR_INV_STAT;
- }
-
- status = inb((unsigned long)host->dev_base + MG_REG_STATUS);
- } while (time_before(cur_jiffies, expire));
-
- if (time_after_eq(cur_jiffies, expire) && msec)
- host->error = MG_ERR_TIMEOUT;
-
- return host->error;
-}
-
-static unsigned int mg_wait_rstout(u32 rstout, u32 msec)
-{
- unsigned long expire;
-
- expire = jiffies + msecs_to_jiffies(msec);
- while (time_before(jiffies, expire)) {
- if (gpio_get_value(rstout) == 1)
- return MG_ERR_NONE;
- msleep(10);
- }
-
- return MG_ERR_RSTOUT;
-}
-
-static void mg_unexpected_intr(struct mg_host *host)
-{
- u32 status = inb((unsigned long)host->dev_base + MG_REG_STATUS);
-
- mg_dump_status("mg_unexpected_intr", status, host);
-}
-
-static irqreturn_t mg_irq(int irq, void *dev_id)
-{
- struct mg_host *host = dev_id;
- void (*handler)(struct mg_host *) = host->mg_do_intr;
-
- spin_lock(&host->lock);
-
- host->mg_do_intr = NULL;
- del_timer(&host->timer);
- if (!handler)
- handler = mg_unexpected_intr;
- handler(host);
-
- spin_unlock(&host->lock);
-
- return IRQ_HANDLED;
-}
-
-/* local copy of ata_id_string() */
-static void mg_id_string(const u16 *id, unsigned char *s,
- unsigned int ofs, unsigned int len)
-{
- unsigned int c;
-
- BUG_ON(len & 1);
-
- while (len > 0) {
- c = id[ofs] >> 8;
- *s = c;
- s++;
-
- c = id[ofs] & 0xff;
- *s = c;
- s++;
-
- ofs++;
- len -= 2;
- }
-}
-
-/* local copy of ata_id_c_string() */
-static void mg_id_c_string(const u16 *id, unsigned char *s,
- unsigned int ofs, unsigned int len)
-{
- unsigned char *p;
-
- mg_id_string(id, s, ofs, len - 1);
-
- p = s + strnlen(s, len - 1);
- while (p > s && p[-1] == ' ')
- p--;
- *p = '\0';
-}
-
-static int mg_get_disk_id(struct mg_host *host)
-{
- u32 i;
- s32 err;
- const u16 *id = host->id;
- struct mg_drv_data *prv_data = host->dev->platform_data;
- char fwrev[ATA_ID_FW_REV_LEN + 1];
- char model[ATA_ID_PROD_LEN + 1];
- char serial[ATA_ID_SERNO_LEN + 1];
-
- if (!prv_data->use_polling)
- outb(ATA_NIEN, (unsigned long)host->dev_base + MG_REG_DRV_CTRL);
-
- outb(MG_CMD_ID, (unsigned long)host->dev_base + MG_REG_COMMAND);
- err = mg_wait(host, ATA_DRQ, MG_TMAX_WAIT_RD_DRQ);
- if (err)
- return err;
-
- for (i = 0; i < (MG_SECTOR_SIZE >> 1); i++)
- host->id[i] = le16_to_cpu(inw((unsigned long)host->dev_base +
- MG_BUFF_OFFSET + i * 2));
-
- outb(MG_CMD_RD_CONF, (unsigned long)host->dev_base + MG_REG_COMMAND);
- err = mg_wait(host, MG_STAT_READY, MG_TMAX_CONF_TO_CMD);
- if (err)
- return err;
-
- if ((id[ATA_ID_FIELD_VALID] & 1) == 0)
- return MG_ERR_TRANSLATION;
-
- host->n_sectors = ata_id_u32(id, ATA_ID_LBA_CAPACITY);
- host->cyls = id[ATA_ID_CYLS];
- host->heads = id[ATA_ID_HEADS];
- host->sectors = id[ATA_ID_SECTORS];
-
- if (MG_RES_SEC && host->heads && host->sectors) {
- /* modify cyls, n_sectors */
- host->cyls = (host->n_sectors - MG_RES_SEC) /
- host->heads / host->sectors;
- host->nres_sectors = host->n_sectors - host->cyls *
- host->heads * host->sectors;
- host->n_sectors -= host->nres_sectors;
- }
-
- mg_id_c_string(id, fwrev, ATA_ID_FW_REV, sizeof(fwrev));
- mg_id_c_string(id, model, ATA_ID_PROD, sizeof(model));
- mg_id_c_string(id, serial, ATA_ID_SERNO, sizeof(serial));
- printk(KERN_INFO "mg_disk: model: %s\n", model);
- printk(KERN_INFO "mg_disk: firm: %.8s\n", fwrev);
- printk(KERN_INFO "mg_disk: serial: %s\n", serial);
- printk(KERN_INFO "mg_disk: %d + reserved %d sectors\n",
- host->n_sectors, host->nres_sectors);
-
- if (!prv_data->use_polling)
- outb(0, (unsigned long)host->dev_base + MG_REG_DRV_CTRL);
-
- return err;
-}
-
-
-static int mg_disk_init(struct mg_host *host)
-{
- struct mg_drv_data *prv_data = host->dev->platform_data;
- s32 err;
- u8 init_status;
-
- /* hdd rst low */
- gpio_set_value(host->rst, 0);
- err = mg_wait(host, ATA_BUSY, MG_TMAX_RST_TO_BUSY);
- if (err)
- return err;
-
- /* hdd rst high */
- gpio_set_value(host->rst, 1);
- err = mg_wait(host, MG_STAT_READY, MG_TMAX_HDRST_TO_RDY);
- if (err)
- return err;
-
- /* soft reset on */
- outb(ATA_SRST | (prv_data->use_polling ? ATA_NIEN : 0),
- (unsigned long)host->dev_base + MG_REG_DRV_CTRL);
- err = mg_wait(host, ATA_BUSY, MG_TMAX_RST_TO_BUSY);
- if (err)
- return err;
-
- /* soft reset off */
- outb(prv_data->use_polling ? ATA_NIEN : 0,
- (unsigned long)host->dev_base + MG_REG_DRV_CTRL);
- err = mg_wait(host, MG_STAT_READY, MG_TMAX_SWRST_TO_RDY);
- if (err)
- return err;
-
- init_status = inb((unsigned long)host->dev_base + MG_REG_STATUS) & 0xf;
-
- if (init_status == 0xf)
- return MG_ERR_INIT_STAT;
-
- return err;
-}
-
-static void mg_bad_rw_intr(struct mg_host *host)
-{
- if (host->req)
- if (++host->req->errors >= MG_MAX_ERRORS ||
- host->error == MG_ERR_TIMEOUT)
- mg_end_request_cur(host, -EIO);
-}
-
-static unsigned int mg_out(struct mg_host *host,
- unsigned int sect_num,
- unsigned int sect_cnt,
- unsigned int cmd,
- void (*intr_addr)(struct mg_host *))
-{
- struct mg_drv_data *prv_data = host->dev->platform_data;
-
- if (mg_wait(host, MG_STAT_READY, MG_TMAX_CONF_TO_CMD))
- return host->error;
-
- if (!prv_data->use_polling) {
- host->mg_do_intr = intr_addr;
- mod_timer(&host->timer, jiffies + 3 * HZ);
- }
- if (MG_RES_SEC)
- sect_num += MG_RES_SEC;
- outb((u8)sect_cnt, (unsigned long)host->dev_base + MG_REG_SECT_CNT);
- outb((u8)sect_num, (unsigned long)host->dev_base + MG_REG_SECT_NUM);
- outb((u8)(sect_num >> 8), (unsigned long)host->dev_base +
- MG_REG_CYL_LOW);
- outb((u8)(sect_num >> 16), (unsigned long)host->dev_base +
- MG_REG_CYL_HIGH);
- outb((u8)((sect_num >> 24) | ATA_LBA | ATA_DEVICE_OBS),
- (unsigned long)host->dev_base + MG_REG_DRV_HEAD);
- outb(cmd, (unsigned long)host->dev_base + MG_REG_COMMAND);
- return MG_ERR_NONE;
-}
-
-static void mg_read_one(struct mg_host *host, struct request *req)
-{
- u16 *buff = (u16 *)bio_data(req->bio);
- u32 i;
-
- for (i = 0; i < MG_SECTOR_SIZE >> 1; i++)
- *buff++ = inw((unsigned long)host->dev_base + MG_BUFF_OFFSET +
- (i << 1));
-}
-
-static void mg_read(struct request *req)
-{
- struct mg_host *host = req->rq_disk->private_data;
-
- if (mg_out(host, blk_rq_pos(req), blk_rq_sectors(req),
- MG_CMD_RD, NULL) != MG_ERR_NONE)
- mg_bad_rw_intr(host);
-
- MG_DBG("requested %d sects (from %ld), buffer=0x%p\n",
- blk_rq_sectors(req), blk_rq_pos(req), bio_data(req->bio));
-
- do {
- if (mg_wait(host, ATA_DRQ,
- MG_TMAX_WAIT_RD_DRQ) != MG_ERR_NONE) {
- mg_bad_rw_intr(host);
- return;
- }
-
- mg_read_one(host, req);
-
- outb(MG_CMD_RD_CONF, (unsigned long)host->dev_base +
- MG_REG_COMMAND);
- } while (mg_end_request(host, 0, MG_SECTOR_SIZE));
-}
-
-static void mg_write_one(struct mg_host *host, struct request *req)
-{
- u16 *buff = (u16 *)bio_data(req->bio);
- u32 i;
-
- for (i = 0; i < MG_SECTOR_SIZE >> 1; i++)
- outw(*buff++, (unsigned long)host->dev_base + MG_BUFF_OFFSET +
- (i << 1));
-}
-
-static void mg_write(struct request *req)
-{
- struct mg_host *host = req->rq_disk->private_data;
- unsigned int rem = blk_rq_sectors(req);
-
- if (mg_out(host, blk_rq_pos(req), rem,
- MG_CMD_WR, NULL) != MG_ERR_NONE) {
- mg_bad_rw_intr(host);
- return;
- }
-
- MG_DBG("requested %d sects (from %ld), buffer=0x%p\n",
- rem, blk_rq_pos(req), bio_data(req->bio));
-
- if (mg_wait(host, ATA_DRQ,
- MG_TMAX_WAIT_WR_DRQ) != MG_ERR_NONE) {
- mg_bad_rw_intr(host);
- return;
- }
-
- do {
- mg_write_one(host, req);
-
- outb(MG_CMD_WR_CONF, (unsigned long)host->dev_base +
- MG_REG_COMMAND);
-
- rem--;
- if (rem > 1 && mg_wait(host, ATA_DRQ,
- MG_TMAX_WAIT_WR_DRQ) != MG_ERR_NONE) {
- mg_bad_rw_intr(host);
- return;
- } else if (mg_wait(host, MG_STAT_READY,
- MG_TMAX_WAIT_WR_DRQ) != MG_ERR_NONE) {
- mg_bad_rw_intr(host);
- return;
- }
- } while (mg_end_request(host, 0, MG_SECTOR_SIZE));
-}
-
-static void mg_read_intr(struct mg_host *host)
-{
- struct request *req = host->req;
- u32 i;
-
- /* check status */
- do {
- i = inb((unsigned long)host->dev_base + MG_REG_STATUS);
- if (i & ATA_BUSY)
- break;
- if (!MG_READY_OK(i))
- break;
- if (i & ATA_DRQ)
- goto ok_to_read;
- } while (0);
- mg_dump_status("mg_read_intr", i, host);
- mg_bad_rw_intr(host);
- mg_request(host->breq);
- return;
-
-ok_to_read:
- mg_read_one(host, req);
-
- MG_DBG("sector %ld, remaining=%ld, buffer=0x%p\n",
- blk_rq_pos(req), blk_rq_sectors(req) - 1, bio_data(req->bio));
-
- /* send read confirm */
- outb(MG_CMD_RD_CONF, (unsigned long)host->dev_base + MG_REG_COMMAND);
-
- if (mg_end_request(host, 0, MG_SECTOR_SIZE)) {
- /* set handler if read remains */
- host->mg_do_intr = mg_read_intr;
- mod_timer(&host->timer, jiffies + 3 * HZ);
- } else /* goto next request */
- mg_request(host->breq);
-}
-
-static void mg_write_intr(struct mg_host *host)
-{
- struct request *req = host->req;
- u32 i;
- bool rem;
-
- /* check status */
- do {
- i = inb((unsigned long)host->dev_base + MG_REG_STATUS);
- if (i & ATA_BUSY)
- break;
- if (!MG_READY_OK(i))
- break;
- if ((blk_rq_sectors(req) <= 1) || (i & ATA_DRQ))
- goto ok_to_write;
- } while (0);
- mg_dump_status("mg_write_intr", i, host);
- mg_bad_rw_intr(host);
- mg_request(host->breq);
- return;
-
-ok_to_write:
- if ((rem = mg_end_request(host, 0, MG_SECTOR_SIZE))) {
- /* write 1 sector and set handler if remains */
- mg_write_one(host, req);
- MG_DBG("sector %ld, remaining=%ld, buffer=0x%p\n",
- blk_rq_pos(req), blk_rq_sectors(req), bio_data(req->bio));
- host->mg_do_intr = mg_write_intr;
- mod_timer(&host->timer, jiffies + 3 * HZ);
- }
-
- /* send write confirm */
- outb(MG_CMD_WR_CONF, (unsigned long)host->dev_base + MG_REG_COMMAND);
-
- if (!rem)
- mg_request(host->breq);
-}
-
-static void mg_times_out(unsigned long data)
-{
- struct mg_host *host = (struct mg_host *)data;
- char *name;
-
- spin_lock_irq(&host->lock);
-
- if (!host->req)
- goto out_unlock;
-
- host->mg_do_intr = NULL;
-
- name = host->req->rq_disk->disk_name;
- printk(KERN_DEBUG "%s: timeout\n", name);
-
- host->error = MG_ERR_TIMEOUT;
- mg_bad_rw_intr(host);
-
-out_unlock:
- mg_request(host->breq);
- spin_unlock_irq(&host->lock);
-}
-
-static void mg_request_poll(struct request_queue *q)
-{
- struct mg_host *host = q->queuedata;
-
- while (1) {
- if (!host->req) {
- host->req = blk_fetch_request(q);
- if (!host->req)
- break;
- }
-
- switch (req_op(host->req)) {
- case REQ_OP_READ:
- mg_read(host->req);
- break;
- case REQ_OP_WRITE:
- mg_write(host->req);
- break;
- default:
- mg_end_request_cur(host, -EIO);
- break;
- }
- }
-}
-
-static unsigned int mg_issue_req(struct request *req,
- struct mg_host *host,
- unsigned int sect_num,
- unsigned int sect_cnt)
-{
- switch (req_op(host->req)) {
- case REQ_OP_READ:
- if (mg_out(host, sect_num, sect_cnt, MG_CMD_RD, &mg_read_intr)
- != MG_ERR_NONE) {
- mg_bad_rw_intr(host);
- return host->error;
- }
- break;
- case REQ_OP_WRITE:
- /* TODO : handler */
- outb(ATA_NIEN, (unsigned long)host->dev_base + MG_REG_DRV_CTRL);
- if (mg_out(host, sect_num, sect_cnt, MG_CMD_WR, &mg_write_intr)
- != MG_ERR_NONE) {
- mg_bad_rw_intr(host);
- return host->error;
- }
- del_timer(&host->timer);
- mg_wait(host, ATA_DRQ, MG_TMAX_WAIT_WR_DRQ);
- outb(0, (unsigned long)host->dev_base + MG_REG_DRV_CTRL);
- if (host->error) {
- mg_bad_rw_intr(host);
- return host->error;
- }
- mg_write_one(host, req);
- mod_timer(&host->timer, jiffies + 3 * HZ);
- outb(MG_CMD_WR_CONF, (unsigned long)host->dev_base +
- MG_REG_COMMAND);
- break;
- default:
- mg_end_request_cur(host, -EIO);
- break;
- }
- return MG_ERR_NONE;
-}
-
-/* This function also called from IRQ context */
-static void mg_request(struct request_queue *q)
-{
- struct mg_host *host = q->queuedata;
- struct request *req;
- u32 sect_num, sect_cnt;
-
- while (1) {
- if (!host->req) {
- host->req = blk_fetch_request(q);
- if (!host->req)
- break;
- }
- req = host->req;
-
- /* check unwanted request call */
- if (host->mg_do_intr)
- return;
-
- del_timer(&host->timer);
-
- sect_num = blk_rq_pos(req);
- /* deal whole segments */
- sect_cnt = blk_rq_sectors(req);
-
- /* sanity check */
- if (sect_num >= get_capacity(req->rq_disk) ||
- ((sect_num + sect_cnt) >
- get_capacity(req->rq_disk))) {
- printk(KERN_WARNING
- "%s: bad access: sector=%d, count=%d\n",
- req->rq_disk->disk_name,
- sect_num, sect_cnt);
- mg_end_request_cur(host, -EIO);
- continue;
- }
-
- if (!mg_issue_req(req, host, sect_num, sect_cnt))
- return;
- }
-}
-
-static int mg_getgeo(struct block_device *bdev, struct hd_geometry *geo)
-{
- struct mg_host *host = bdev->bd_disk->private_data;
-
- geo->cylinders = (unsigned short)host->cyls;
- geo->heads = (unsigned char)host->heads;
- geo->sectors = (unsigned char)host->sectors;
- return 0;
-}
-
-static const struct block_device_operations mg_disk_ops = {
- .getgeo = mg_getgeo
-};
-
-#ifdef CONFIG_PM_SLEEP
-static int mg_suspend(struct device *dev)
-{
- struct mg_drv_data *prv_data = dev->platform_data;
- struct mg_host *host = prv_data->host;
-
- if (mg_wait(host, MG_STAT_READY, MG_TMAX_CONF_TO_CMD))
- return -EIO;
-
- if (!prv_data->use_polling)
- outb(ATA_NIEN, (unsigned long)host->dev_base + MG_REG_DRV_CTRL);
-
- outb(MG_CMD_SLEEP, (unsigned long)host->dev_base + MG_REG_COMMAND);
- /* wait until mflash deep sleep */
- msleep(1);
-
- if (mg_wait(host, MG_STAT_READY, MG_TMAX_CONF_TO_CMD)) {
- if (!prv_data->use_polling)
- outb(0, (unsigned long)host->dev_base + MG_REG_DRV_CTRL);
- return -EIO;
- }
-
- return 0;
-}
-
-static int mg_resume(struct device *dev)
-{
- struct mg_drv_data *prv_data = dev->platform_data;
- struct mg_host *host = prv_data->host;
-
- if (mg_wait(host, MG_STAT_READY, MG_TMAX_CONF_TO_CMD))
- return -EIO;
-
- outb(MG_CMD_WAKEUP, (unsigned long)host->dev_base + MG_REG_COMMAND);
- /* wait until mflash wakeup */
- msleep(1);
-
- if (mg_wait(host, MG_STAT_READY, MG_TMAX_CONF_TO_CMD))
- return -EIO;
-
- if (!prv_data->use_polling)
- outb(0, (unsigned long)host->dev_base + MG_REG_DRV_CTRL);
-
- return 0;
-}
-#endif
-
-static SIMPLE_DEV_PM_OPS(mg_pm, mg_suspend, mg_resume);
-
-static int mg_probe(struct platform_device *plat_dev)
-{
- struct mg_host *host;
- struct resource *rsc;
- struct mg_drv_data *prv_data = plat_dev->dev.platform_data;
- int err = 0;
-
- if (!prv_data) {
- printk(KERN_ERR "%s:%d fail (no driver_data)\n",
- __func__, __LINE__);
- err = -EINVAL;
- goto probe_err;
- }
-
- /* alloc mg_host */
- host = kzalloc(sizeof(struct mg_host), GFP_KERNEL);
- if (!host) {
- printk(KERN_ERR "%s:%d fail (no memory for mg_host)\n",
- __func__, __LINE__);
- err = -ENOMEM;
- goto probe_err;
- }
- host->major = MG_DISK_MAJ;
-
- /* link each other */
- prv_data->host = host;
- host->dev = &plat_dev->dev;
-
- /* io remap */
- rsc = platform_get_resource(plat_dev, IORESOURCE_MEM, 0);
- if (!rsc) {
- printk(KERN_ERR "%s:%d platform_get_resource fail\n",
- __func__, __LINE__);
- err = -EINVAL;
- goto probe_err_2;
- }
- host->dev_base = ioremap(rsc->start, resource_size(rsc));
- if (!host->dev_base) {
- printk(KERN_ERR "%s:%d ioremap fail\n",
- __func__, __LINE__);
- err = -EIO;
- goto probe_err_2;
- }
- MG_DBG("dev_base = 0x%x\n", (u32)host->dev_base);
-
- /* get reset pin */
- rsc = platform_get_resource_byname(plat_dev, IORESOURCE_IO,
- MG_RST_PIN);
- if (!rsc) {
- printk(KERN_ERR "%s:%d get reset pin fail\n",
- __func__, __LINE__);
- err = -EIO;
- goto probe_err_3;
- }
- host->rst = rsc->start;
-
- /* init rst pin */
- err = gpio_request(host->rst, MG_RST_PIN);
- if (err)
- goto probe_err_3;
- gpio_direction_output(host->rst, 1);
-
- /* reset out pin */
- if (!(prv_data->dev_attr & MG_DEV_MASK)) {
- err = -EINVAL;
- goto probe_err_3a;
- }
-
- if (prv_data->dev_attr != MG_BOOT_DEV) {
- rsc = platform_get_resource_byname(plat_dev, IORESOURCE_IO,
- MG_RSTOUT_PIN);
- if (!rsc) {
- printk(KERN_ERR "%s:%d get reset-out pin fail\n",
- __func__, __LINE__);
- err = -EIO;
- goto probe_err_3a;
- }
- host->rstout = rsc->start;
- err = gpio_request(host->rstout, MG_RSTOUT_PIN);
- if (err)
- goto probe_err_3a;
- gpio_direction_input(host->rstout);
- }
-
- /* disk reset */
- if (prv_data->dev_attr == MG_STORAGE_DEV) {
- /* If POR seq. not yet finished, wait */
- err = mg_wait_rstout(host->rstout, MG_TMAX_RSTOUT);
- if (err)
- goto probe_err_3b;
- err = mg_disk_init(host);
- if (err) {
- printk(KERN_ERR "%s:%d fail (err code : %d)\n",
- __func__, __LINE__, err);
- err = -EIO;
- goto probe_err_3b;
- }
- }
-
- /* get irq resource */
- if (!prv_data->use_polling) {
- host->irq = platform_get_irq(plat_dev, 0);
- if (host->irq == -ENXIO) {
- err = host->irq;
- goto probe_err_3b;
- }
- err = request_irq(host->irq, mg_irq,
- IRQF_TRIGGER_RISING,
- MG_DEV_NAME, host);
- if (err) {
- printk(KERN_ERR "%s:%d fail (request_irq err=%d)\n",
- __func__, __LINE__, err);
- goto probe_err_3b;
- }
-
- }
-
- /* get disk id */
- err = mg_get_disk_id(host);
- if (err) {
- printk(KERN_ERR "%s:%d fail (err code : %d)\n",
- __func__, __LINE__, err);
- err = -EIO;
- goto probe_err_4;
- }
-
- err = register_blkdev(host->major, MG_DISK_NAME);
- if (err < 0) {
- printk(KERN_ERR "%s:%d register_blkdev fail (err code : %d)\n",
- __func__, __LINE__, err);
- goto probe_err_4;
- }
- if (!host->major)
- host->major = err;
-
- spin_lock_init(&host->lock);
-
- if (prv_data->use_polling)
- host->breq = blk_init_queue(mg_request_poll, &host->lock);
- else
- host->breq = blk_init_queue(mg_request, &host->lock);
-
- if (!host->breq) {
- err = -ENOMEM;
- printk(KERN_ERR "%s:%d (blk_init_queue) fail\n",
- __func__, __LINE__);
- goto probe_err_5;
- }
- host->breq->queuedata = host;
-
- /* mflash is random device, thanx for the noop */
- err = elevator_change(host->breq, "noop");
- if (err) {
- printk(KERN_ERR "%s:%d (elevator_init) fail\n",
- __func__, __LINE__);
- goto probe_err_6;
- }
- blk_queue_max_hw_sectors(host->breq, MG_MAX_SECTS);
- blk_queue_logical_block_size(host->breq, MG_SECTOR_SIZE);
-
- init_timer(&host->timer);
- host->timer.function = mg_times_out;
- host->timer.data = (unsigned long)host;
-
- host->gd = alloc_disk(MG_DISK_MAX_PART);
- if (!host->gd) {
- printk(KERN_ERR "%s:%d (alloc_disk) fail\n",
- __func__, __LINE__);
- err = -ENOMEM;
- goto probe_err_7;
- }
- host->gd->major = host->major;
- host->gd->first_minor = 0;
- host->gd->fops = &mg_disk_ops;
- host->gd->queue = host->breq;
- host->gd->private_data = host;
- sprintf(host->gd->disk_name, MG_DISK_NAME"a");
-
- set_capacity(host->gd, host->n_sectors);
-
- add_disk(host->gd);
-
- return err;
-
-probe_err_7:
- del_timer_sync(&host->timer);
-probe_err_6:
- blk_cleanup_queue(host->breq);
-probe_err_5:
- unregister_blkdev(host->major, MG_DISK_NAME);
-probe_err_4:
- if (!prv_data->use_polling)
- free_irq(host->irq, host);
-probe_err_3b:
- gpio_free(host->rstout);
-probe_err_3a:
- gpio_free(host->rst);
-probe_err_3:
- iounmap(host->dev_base);
-probe_err_2:
- kfree(host);
-probe_err:
- return err;
-}
-
-static int mg_remove(struct platform_device *plat_dev)
-{
- struct mg_drv_data *prv_data = plat_dev->dev.platform_data;
- struct mg_host *host = prv_data->host;
- int err = 0;
-
- /* delete timer */
- del_timer_sync(&host->timer);
-
- /* remove disk */
- if (host->gd) {
- del_gendisk(host->gd);
- put_disk(host->gd);
- }
- /* remove queue */
- if (host->breq)
- blk_cleanup_queue(host->breq);
-
- /* unregister blk device */
- unregister_blkdev(host->major, MG_DISK_NAME);
-
- /* free irq */
- if (!prv_data->use_polling)
- free_irq(host->irq, host);
-
- /* free reset-out pin */
- if (prv_data->dev_attr != MG_BOOT_DEV)
- gpio_free(host->rstout);
-
- /* free rst pin */
- if (host->rst)
- gpio_free(host->rst);
-
- /* unmap io */
- if (host->dev_base)
- iounmap(host->dev_base);
-
- /* free mg_host */
- kfree(host);
-
- return err;
-}
-
-static struct platform_driver mg_disk_driver = {
- .probe = mg_probe,
- .remove = mg_remove,
- .driver = {
- .name = MG_DEV_NAME,
- .pm = &mg_pm,
- }
-};
-
-/****************************************************************************
- *
- * Module stuff
- *
- ****************************************************************************/
-
-static int __init mg_init(void)
-{
- printk(KERN_INFO "mGine mflash driver, (c) 2008 mGine Co.\n");
- return platform_driver_register(&mg_disk_driver);
-}
-
-static void __exit mg_exit(void)
-{
- printk(KERN_INFO "mflash driver : bye bye\n");
- platform_driver_unregister(&mg_disk_driver);
-}
-
-module_init(mg_init);
-module_exit(mg_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("unsik Kim <donari75@gmail.com>");
-MODULE_DESCRIPTION("mGine m[g]flash device driver");
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index 1d1dc11aa5fa..02804cc79d82 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -169,6 +169,25 @@ static bool mtip_check_surprise_removal(struct pci_dev *pdev)
return false; /* device present */
}
+/* we have to use runtime tag to setup command header */
+static void mtip_init_cmd_header(struct request *rq)
+{
+ struct driver_data *dd = rq->q->queuedata;
+ struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq);
+ u32 host_cap_64 = readl(dd->mmio + HOST_CAP) & HOST_CAP_64;
+
+ /* Point the command headers at the command tables. */
+ cmd->command_header = dd->port->command_list +
+ (sizeof(struct mtip_cmd_hdr) * rq->tag);
+ cmd->command_header_dma = dd->port->command_list_dma +
+ (sizeof(struct mtip_cmd_hdr) * rq->tag);
+
+ if (host_cap_64)
+ cmd->command_header->ctbau = __force_bit2int cpu_to_le32((cmd->command_dma >> 16) >> 16);
+
+ cmd->command_header->ctba = __force_bit2int cpu_to_le32(cmd->command_dma & 0xFFFFFFFF);
+}
+
static struct mtip_cmd *mtip_get_int_command(struct driver_data *dd)
{
struct request *rq;
@@ -180,6 +199,9 @@ static struct mtip_cmd *mtip_get_int_command(struct driver_data *dd)
if (IS_ERR(rq))
return NULL;
+ /* Internal cmd isn't submitted via .queue_rq */
+ mtip_init_cmd_header(rq);
+
return blk_mq_rq_to_pdu(rq);
}
@@ -241,7 +263,8 @@ static void mtip_async_complete(struct mtip_port *port,
rq = mtip_rq_from_tag(dd, tag);
- blk_mq_complete_request(rq, status);
+ cmd->status = status;
+ blk_mq_complete_request(rq);
}
/*
@@ -2910,18 +2933,19 @@ static void mtip_softirq_done_fn(struct request *rq)
if (unlikely(cmd->unaligned))
up(&dd->port->cmd_slot_unal);
- blk_mq_end_request(rq, rq->errors);
+ blk_mq_end_request(rq, cmd->status);
}
static void mtip_abort_cmd(struct request *req, void *data,
bool reserved)
{
+ struct mtip_cmd *cmd = blk_mq_rq_to_pdu(req);
struct driver_data *dd = data;
dbg_printk(MTIP_DRV_NAME " Aborting request, tag = %d\n", req->tag);
clear_bit(req->tag, dd->port->cmds_to_issue);
- req->errors = -EIO;
+ cmd->status = -EIO;
mtip_softirq_done_fn(req);
}
@@ -3807,6 +3831,8 @@ static int mtip_queue_rq(struct blk_mq_hw_ctx *hctx,
struct request *rq = bd->rq;
int ret;
+ mtip_init_cmd_header(rq);
+
if (unlikely(mtip_check_unal_depth(hctx, rq)))
return BLK_MQ_RQ_QUEUE_BUSY;
@@ -3816,7 +3842,6 @@ static int mtip_queue_rq(struct blk_mq_hw_ctx *hctx,
if (likely(!ret))
return BLK_MQ_RQ_QUEUE_OK;
- rq->errors = ret;
return BLK_MQ_RQ_QUEUE_ERROR;
}
@@ -3838,7 +3863,6 @@ static int mtip_init_cmd(void *data, struct request *rq, unsigned int hctx_idx,
{
struct driver_data *dd = data;
struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq);
- u32 host_cap_64 = readl(dd->mmio + HOST_CAP) & HOST_CAP_64;
/*
* For flush requests, request_idx starts at the end of the
@@ -3855,17 +3879,6 @@ static int mtip_init_cmd(void *data, struct request *rq, unsigned int hctx_idx,
memset(cmd->command, 0, CMD_DMA_ALLOC_SZ);
- /* Point the command headers at the command tables. */
- cmd->command_header = dd->port->command_list +
- (sizeof(struct mtip_cmd_hdr) * request_idx);
- cmd->command_header_dma = dd->port->command_list_dma +
- (sizeof(struct mtip_cmd_hdr) * request_idx);
-
- if (host_cap_64)
- cmd->command_header->ctbau = __force_bit2int cpu_to_le32((cmd->command_dma >> 16) >> 16);
-
- cmd->command_header->ctba = __force_bit2int cpu_to_le32(cmd->command_dma & 0xFFFFFFFF);
-
sg_init_table(cmd->sg, MTIP_MAX_SG);
return 0;
}
@@ -3889,7 +3902,7 @@ exit_handler:
return BLK_EH_RESET_TIMER;
}
-static struct blk_mq_ops mtip_mq_ops = {
+static const struct blk_mq_ops mtip_mq_ops = {
.queue_rq = mtip_queue_rq,
.init_request = mtip_init_cmd,
.exit_request = mtip_free_cmd,
@@ -4025,7 +4038,6 @@ skip_create_disk:
dd->queue->limits.discard_granularity = 4096;
blk_queue_max_discard_sectors(dd->queue,
MTIP_MAX_TRIM_ENTRY_LEN * MTIP_MAX_TRIM_ENTRIES);
- dd->queue->limits.discard_zeroes_data = 0;
}
/* Set the capacity of the device in 512 byte sectors. */
@@ -4107,9 +4119,11 @@ static void mtip_no_dev_cleanup(struct request *rq, void *data, bool reserv)
struct driver_data *dd = (struct driver_data *)data;
struct mtip_cmd *cmd;
- if (likely(!reserv))
- blk_mq_complete_request(rq, -ENODEV);
- else if (test_bit(MTIP_PF_IC_ACTIVE_BIT, &dd->port->flags)) {
+ if (likely(!reserv)) {
+ cmd = blk_mq_rq_to_pdu(rq);
+ cmd->status = -ENODEV;
+ blk_mq_complete_request(rq);
+ } else if (test_bit(MTIP_PF_IC_ACTIVE_BIT, &dd->port->flags)) {
cmd = mtip_cmd_from_tag(dd, MTIP_TAG_INTERNAL);
if (cmd->comp_func)
@@ -4162,7 +4176,7 @@ static int mtip_block_remove(struct driver_data *dd)
dev_info(&dd->pdev->dev, "device %s surprise removal\n",
dd->disk->disk_name);
- blk_mq_freeze_queue_start(dd->queue);
+ blk_freeze_queue_start(dd->queue);
blk_mq_stop_hw_queues(dd->queue);
blk_mq_tagset_busy_iter(&dd->tags, mtip_no_dev_cleanup, dd);
diff --git a/drivers/block/mtip32xx/mtip32xx.h b/drivers/block/mtip32xx/mtip32xx.h
index 7617888f7944..57b41528a824 100644
--- a/drivers/block/mtip32xx/mtip32xx.h
+++ b/drivers/block/mtip32xx/mtip32xx.h
@@ -352,6 +352,7 @@ struct mtip_cmd {
int retries; /* The number of retries left for this command. */
int direction; /* Data transfer direction */
+ int status;
};
/* Structure used to describe a port. */
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index d8a23561b4cb..ac376b9b852d 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -40,49 +40,82 @@
#include <asm/types.h>
#include <linux/nbd.h>
+#include <linux/nbd-netlink.h>
+#include <net/genetlink.h>
static DEFINE_IDR(nbd_index_idr);
static DEFINE_MUTEX(nbd_index_mutex);
+static int nbd_total_devices = 0;
struct nbd_sock {
struct socket *sock;
struct mutex tx_lock;
struct request *pending;
int sent;
+ bool dead;
+ int fallback_index;
+ int cookie;
+};
+
+struct recv_thread_args {
+ struct work_struct work;
+ struct nbd_device *nbd;
+ int index;
+};
+
+struct link_dead_args {
+ struct work_struct work;
+ int index;
};
#define NBD_TIMEDOUT 0
#define NBD_DISCONNECT_REQUESTED 1
#define NBD_DISCONNECTED 2
-#define NBD_RUNNING 3
+#define NBD_HAS_PID_FILE 3
+#define NBD_HAS_CONFIG_REF 4
+#define NBD_BOUND 5
+#define NBD_DESTROY_ON_DISCONNECT 6
-struct nbd_device {
+struct nbd_config {
u32 flags;
unsigned long runtime_flags;
- struct nbd_sock **socks;
- int magic;
+ u64 dead_conn_timeout;
- struct blk_mq_tag_set tag_set;
-
- struct mutex config_lock;
- struct gendisk *disk;
+ struct nbd_sock **socks;
int num_connections;
+ atomic_t live_connections;
+ wait_queue_head_t conn_wait;
+
atomic_t recv_threads;
wait_queue_head_t recv_wq;
loff_t blksize;
loff_t bytesize;
-
- struct task_struct *task_recv;
- struct task_struct *task_setup;
-
#if IS_ENABLED(CONFIG_DEBUG_FS)
struct dentry *dbg_dir;
#endif
};
+struct nbd_device {
+ struct blk_mq_tag_set tag_set;
+
+ int index;
+ refcount_t config_refs;
+ refcount_t refs;
+ struct nbd_config *config;
+ struct mutex config_lock;
+ struct gendisk *disk;
+
+ struct list_head list;
+ struct task_struct *task_recv;
+ struct task_struct *task_setup;
+};
+
struct nbd_cmd {
struct nbd_device *nbd;
+ int index;
+ int cookie;
struct completion send_complete;
+ int status;
};
#if IS_ENABLED(CONFIG_DEBUG_FS)
@@ -100,18 +133,16 @@ static int part_shift;
static int nbd_dev_dbg_init(struct nbd_device *nbd);
static void nbd_dev_dbg_close(struct nbd_device *nbd);
-
+static void nbd_config_put(struct nbd_device *nbd);
+static void nbd_connect_reply(struct genl_info *info, int index);
+static int nbd_genl_status(struct sk_buff *skb, struct genl_info *info);
+static void nbd_dead_link_work(struct work_struct *work);
static inline struct device *nbd_to_dev(struct nbd_device *nbd)
{
return disk_to_dev(nbd->disk);
}
-static bool nbd_is_connected(struct nbd_device *nbd)
-{
- return !!nbd->task_recv;
-}
-
static const char *nbdcmd_to_ascii(int cmd)
{
switch (cmd) {
@@ -124,44 +155,104 @@ static const char *nbdcmd_to_ascii(int cmd)
return "invalid";
}
-static int nbd_size_clear(struct nbd_device *nbd, struct block_device *bdev)
+static ssize_t pid_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
- if (bdev->bd_openers <= 1)
- bd_set_size(bdev, 0);
- set_capacity(nbd->disk, 0);
- kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE);
+ struct gendisk *disk = dev_to_disk(dev);
+ struct nbd_device *nbd = (struct nbd_device *)disk->private_data;
- return 0;
+ return sprintf(buf, "%d\n", task_pid_nr(nbd->task_recv));
+}
+
+static struct device_attribute pid_attr = {
+ .attr = { .name = "pid", .mode = S_IRUGO},
+ .show = pid_show,
+};
+
+static void nbd_dev_remove(struct nbd_device *nbd)
+{
+ struct gendisk *disk = nbd->disk;
+ if (disk) {
+ del_gendisk(disk);
+ blk_cleanup_queue(disk->queue);
+ blk_mq_free_tag_set(&nbd->tag_set);
+ disk->private_data = NULL;
+ put_disk(disk);
+ }
+ kfree(nbd);
+}
+
+static void nbd_put(struct nbd_device *nbd)
+{
+ if (refcount_dec_and_mutex_lock(&nbd->refs,
+ &nbd_index_mutex)) {
+ idr_remove(&nbd_index_idr, nbd->index);
+ mutex_unlock(&nbd_index_mutex);
+ nbd_dev_remove(nbd);
+ }
+}
+
+static int nbd_disconnected(struct nbd_config *config)
+{
+ return test_bit(NBD_DISCONNECTED, &config->runtime_flags) ||
+ test_bit(NBD_DISCONNECT_REQUESTED, &config->runtime_flags);
+}
+
+static void nbd_mark_nsock_dead(struct nbd_device *nbd, struct nbd_sock *nsock,
+ int notify)
+{
+ if (!nsock->dead && notify && !nbd_disconnected(nbd->config)) {
+ struct link_dead_args *args;
+ args = kmalloc(sizeof(struct link_dead_args), GFP_NOIO);
+ if (args) {
+ INIT_WORK(&args->work, nbd_dead_link_work);
+ args->index = nbd->index;
+ queue_work(system_wq, &args->work);
+ }
+ }
+ if (!nsock->dead) {
+ kernel_sock_shutdown(nsock->sock, SHUT_RDWR);
+ atomic_dec(&nbd->config->live_connections);
+ }
+ nsock->dead = true;
+ nsock->pending = NULL;
+ nsock->sent = 0;
+}
+
+static void nbd_size_clear(struct nbd_device *nbd)
+{
+ if (nbd->config->bytesize) {
+ set_capacity(nbd->disk, 0);
+ kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE);
+ }
}
-static void nbd_size_update(struct nbd_device *nbd, struct block_device *bdev)
+static void nbd_size_update(struct nbd_device *nbd)
{
- blk_queue_logical_block_size(nbd->disk->queue, nbd->blksize);
- blk_queue_physical_block_size(nbd->disk->queue, nbd->blksize);
- bd_set_size(bdev, nbd->bytesize);
- set_capacity(nbd->disk, nbd->bytesize >> 9);
+ struct nbd_config *config = nbd->config;
+ blk_queue_logical_block_size(nbd->disk->queue, config->blksize);
+ blk_queue_physical_block_size(nbd->disk->queue, config->blksize);
+ set_capacity(nbd->disk, config->bytesize >> 9);
kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE);
}
-static void nbd_size_set(struct nbd_device *nbd, struct block_device *bdev,
- loff_t blocksize, loff_t nr_blocks)
+static void nbd_size_set(struct nbd_device *nbd, loff_t blocksize,
+ loff_t nr_blocks)
{
- nbd->blksize = blocksize;
- nbd->bytesize = blocksize * nr_blocks;
- if (nbd_is_connected(nbd))
- nbd_size_update(nbd, bdev);
+ struct nbd_config *config = nbd->config;
+ config->blksize = blocksize;
+ config->bytesize = blocksize * nr_blocks;
+ nbd_size_update(nbd);
}
-static void nbd_end_request(struct nbd_cmd *cmd)
+static void nbd_complete_rq(struct request *req)
{
- struct nbd_device *nbd = cmd->nbd;
- struct request *req = blk_mq_rq_from_pdu(cmd);
- int error = req->errors ? -EIO : 0;
+ struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
- dev_dbg(nbd_to_dev(nbd), "request %p: %s\n", cmd,
- error ? "failed" : "done");
+ dev_dbg(nbd_to_dev(cmd->nbd), "request %p: %s\n", cmd,
+ cmd->status ? "failed" : "done");
- blk_mq_complete_request(req, error);
+ blk_mq_end_request(req, cmd->status);
}
/*
@@ -169,17 +260,18 @@ static void nbd_end_request(struct nbd_cmd *cmd)
*/
static void sock_shutdown(struct nbd_device *nbd)
{
+ struct nbd_config *config = nbd->config;
int i;
- if (nbd->num_connections == 0)
+ if (config->num_connections == 0)
return;
- if (test_and_set_bit(NBD_DISCONNECTED, &nbd->runtime_flags))
+ if (test_and_set_bit(NBD_DISCONNECTED, &config->runtime_flags))
return;
- for (i = 0; i < nbd->num_connections; i++) {
- struct nbd_sock *nsock = nbd->socks[i];
+ for (i = 0; i < config->num_connections; i++) {
+ struct nbd_sock *nsock = config->socks[i];
mutex_lock(&nsock->tx_lock);
- kernel_sock_shutdown(nsock->sock, SHUT_RDWR);
+ nbd_mark_nsock_dead(nbd, nsock, 0);
mutex_unlock(&nsock->tx_lock);
}
dev_warn(disk_to_dev(nbd->disk), "shutting down sockets\n");
@@ -190,14 +282,58 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req,
{
struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
struct nbd_device *nbd = cmd->nbd;
+ struct nbd_config *config;
- dev_err(nbd_to_dev(nbd), "Connection timed out, shutting down connection\n");
- set_bit(NBD_TIMEDOUT, &nbd->runtime_flags);
- req->errors = -EIO;
+ if (!refcount_inc_not_zero(&nbd->config_refs)) {
+ cmd->status = -EIO;
+ return BLK_EH_HANDLED;
+ }
- mutex_lock(&nbd->config_lock);
+ /* If we are waiting on our dead timer then we could get timeout
+ * callbacks for our request. For this we just want to reset the timer
+ * and let the queue side take care of everything.
+ */
+ if (!completion_done(&cmd->send_complete)) {
+ nbd_config_put(nbd);
+ return BLK_EH_RESET_TIMER;
+ }
+ config = nbd->config;
+
+ if (config->num_connections > 1) {
+ dev_err_ratelimited(nbd_to_dev(nbd),
+ "Connection timed out, retrying\n");
+ /*
+ * Hooray we have more connections, requeue this IO, the submit
+ * path will put it on a real connection.
+ */
+ if (config->socks && config->num_connections > 1) {
+ if (cmd->index < config->num_connections) {
+ struct nbd_sock *nsock =
+ config->socks[cmd->index];
+ mutex_lock(&nsock->tx_lock);
+ /* We can have multiple outstanding requests, so
+ * we don't want to mark the nsock dead if we've
+ * already reconnected with a new socket, so
+ * only mark it dead if its the same socket we
+ * were sent out on.
+ */
+ if (cmd->cookie == nsock->cookie)
+ nbd_mark_nsock_dead(nbd, nsock, 1);
+ mutex_unlock(&nsock->tx_lock);
+ }
+ blk_mq_requeue_request(req, true);
+ nbd_config_put(nbd);
+ return BLK_EH_NOT_HANDLED;
+ }
+ } else {
+ dev_err_ratelimited(nbd_to_dev(nbd),
+ "Connection timed out\n");
+ }
+ set_bit(NBD_TIMEDOUT, &config->runtime_flags);
+ cmd->status = -EIO;
sock_shutdown(nbd);
- mutex_unlock(&nbd->config_lock);
+ nbd_config_put(nbd);
+
return BLK_EH_HANDLED;
}
@@ -207,7 +343,8 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req,
static int sock_xmit(struct nbd_device *nbd, int index, int send,
struct iov_iter *iter, int msg_flags, int *sent)
{
- struct socket *sock = nbd->socks[index]->sock;
+ struct nbd_config *config = nbd->config;
+ struct socket *sock = config->socks[index]->sock;
int result;
struct msghdr msg;
unsigned long pflags = current->flags;
@@ -253,7 +390,8 @@ static int sock_xmit(struct nbd_device *nbd, int index, int send,
static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index)
{
struct request *req = blk_mq_rq_from_pdu(cmd);
- struct nbd_sock *nsock = nbd->socks[index];
+ struct nbd_config *config = nbd->config;
+ struct nbd_sock *nsock = config->socks[index];
int result;
struct nbd_request request = {.magic = htonl(NBD_REQUEST_MAGIC)};
struct kvec iov = {.iov_base = &request, .iov_len = sizeof(request)};
@@ -284,7 +422,7 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index)
}
if (rq_data_dir(req) == WRITE &&
- (nbd->flags & NBD_FLAG_READ_ONLY)) {
+ (config->flags & NBD_FLAG_READ_ONLY)) {
dev_err_ratelimited(disk_to_dev(nbd->disk),
"Write on read-only\n");
return -EIO;
@@ -301,6 +439,8 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index)
}
iov_iter_advance(&from, sent);
}
+ cmd->index = index;
+ cmd->cookie = nsock->cookie;
request.type = htonl(type);
if (type != NBD_CMD_FLUSH) {
request.from = cpu_to_be64((u64)blk_rq_pos(req) << 9);
@@ -328,7 +468,7 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index)
}
dev_err_ratelimited(disk_to_dev(nbd->disk),
"Send control failed (result %d)\n", result);
- return -EIO;
+ return -EAGAIN;
}
send_pages:
if (type != NBD_CMD_WRITE)
@@ -370,7 +510,7 @@ send_pages:
dev_err(disk_to_dev(nbd->disk),
"Send data failed (result %d)\n",
result);
- return -EIO;
+ return -EAGAIN;
}
/*
* The completion might already have come in,
@@ -392,6 +532,7 @@ out:
/* NULL returned = something went wrong, inform userspace */
static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index)
{
+ struct nbd_config *config = nbd->config;
int result;
struct nbd_reply reply;
struct nbd_cmd *cmd;
@@ -405,8 +546,7 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index)
iov_iter_kvec(&to, READ | ITER_KVEC, &iov, 1, sizeof(reply));
result = sock_xmit(nbd, index, 0, &to, MSG_WAITALL, NULL);
if (result <= 0) {
- if (!test_bit(NBD_DISCONNECTED, &nbd->runtime_flags) &&
- !test_bit(NBD_DISCONNECT_REQUESTED, &nbd->runtime_flags))
+ if (!nbd_disconnected(config))
dev_err(disk_to_dev(nbd->disk),
"Receive control failed (result %d)\n", result);
return ERR_PTR(result);
@@ -433,7 +573,7 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index)
if (ntohl(reply.error)) {
dev_err(disk_to_dev(nbd->disk), "Other side returned error (%d)\n",
ntohl(reply.error));
- req->errors = -EIO;
+ cmd->status = -EIO;
return cmd;
}
@@ -449,8 +589,19 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index)
if (result <= 0) {
dev_err(disk_to_dev(nbd->disk), "Receive data failed (result %d)\n",
result);
- req->errors = -EIO;
- return cmd;
+ /*
+ * If we've disconnected or we only have 1
+ * connection then we need to make sure we
+ * complete this request, otherwise error out
+ * and let the timeout stuff handle resubmitting
+ * this request onto another connection.
+ */
+ if (nbd_disconnected(config) ||
+ config->num_connections <= 1) {
+ cmd->status = -EIO;
+ return cmd;
+ }
+ return ERR_PTR(-EIO);
}
dev_dbg(nbd_to_dev(nbd), "request %p: got %d bytes data\n",
cmd, bvec.bv_len);
@@ -462,54 +613,34 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index)
return cmd;
}
-static ssize_t pid_show(struct device *dev,
- struct device_attribute *attr, char *buf)
-{
- struct gendisk *disk = dev_to_disk(dev);
- struct nbd_device *nbd = (struct nbd_device *)disk->private_data;
-
- return sprintf(buf, "%d\n", task_pid_nr(nbd->task_recv));
-}
-
-static struct device_attribute pid_attr = {
- .attr = { .name = "pid", .mode = S_IRUGO},
- .show = pid_show,
-};
-
-struct recv_thread_args {
- struct work_struct work;
- struct nbd_device *nbd;
- int index;
-};
-
static void recv_work(struct work_struct *work)
{
struct recv_thread_args *args = container_of(work,
struct recv_thread_args,
work);
struct nbd_device *nbd = args->nbd;
+ struct nbd_config *config = nbd->config;
struct nbd_cmd *cmd;
int ret = 0;
- BUG_ON(nbd->magic != NBD_MAGIC);
while (1) {
cmd = nbd_read_stat(nbd, args->index);
if (IS_ERR(cmd)) {
+ struct nbd_sock *nsock = config->socks[args->index];
+
+ mutex_lock(&nsock->tx_lock);
+ nbd_mark_nsock_dead(nbd, nsock, 1);
+ mutex_unlock(&nsock->tx_lock);
ret = PTR_ERR(cmd);
break;
}
- nbd_end_request(cmd);
+ blk_mq_complete_request(blk_mq_rq_from_pdu(cmd));
}
-
- /*
- * We got an error, shut everybody down if this wasn't the result of a
- * disconnect request.
- */
- if (ret && !test_bit(NBD_DISCONNECT_REQUESTED, &nbd->runtime_flags))
- sock_shutdown(nbd);
- atomic_dec(&nbd->recv_threads);
- wake_up(&nbd->recv_wq);
+ atomic_dec(&config->recv_threads);
+ wake_up(&config->recv_wq);
+ nbd_config_put(nbd);
+ kfree(args);
}
static void nbd_clear_req(struct request *req, void *data, bool reserved)
@@ -519,47 +650,119 @@ static void nbd_clear_req(struct request *req, void *data, bool reserved)
if (!blk_mq_request_started(req))
return;
cmd = blk_mq_rq_to_pdu(req);
- req->errors = -EIO;
- nbd_end_request(cmd);
+ cmd->status = -EIO;
+ blk_mq_complete_request(req);
}
static void nbd_clear_que(struct nbd_device *nbd)
{
- BUG_ON(nbd->magic != NBD_MAGIC);
-
+ blk_mq_stop_hw_queues(nbd->disk->queue);
blk_mq_tagset_busy_iter(&nbd->tag_set, nbd_clear_req, NULL);
+ blk_mq_start_hw_queues(nbd->disk->queue);
dev_dbg(disk_to_dev(nbd->disk), "queue cleared\n");
}
+static int find_fallback(struct nbd_device *nbd, int index)
+{
+ struct nbd_config *config = nbd->config;
+ int new_index = -1;
+ struct nbd_sock *nsock = config->socks[index];
+ int fallback = nsock->fallback_index;
+
+ if (test_bit(NBD_DISCONNECTED, &config->runtime_flags))
+ return new_index;
+
+ if (config->num_connections <= 1) {
+ dev_err_ratelimited(disk_to_dev(nbd->disk),
+ "Attempted send on invalid socket\n");
+ return new_index;
+ }
+
+ if (fallback >= 0 && fallback < config->num_connections &&
+ !config->socks[fallback]->dead)
+ return fallback;
+
+ if (nsock->fallback_index < 0 ||
+ nsock->fallback_index >= config->num_connections ||
+ config->socks[nsock->fallback_index]->dead) {
+ int i;
+ for (i = 0; i < config->num_connections; i++) {
+ if (i == index)
+ continue;
+ if (!config->socks[i]->dead) {
+ new_index = i;
+ break;
+ }
+ }
+ nsock->fallback_index = new_index;
+ if (new_index < 0) {
+ dev_err_ratelimited(disk_to_dev(nbd->disk),
+ "Dead connection, failed to find a fallback\n");
+ return new_index;
+ }
+ }
+ new_index = nsock->fallback_index;
+ return new_index;
+}
+
+static int wait_for_reconnect(struct nbd_device *nbd)
+{
+ struct nbd_config *config = nbd->config;
+ if (!config->dead_conn_timeout)
+ return 0;
+ if (test_bit(NBD_DISCONNECTED, &config->runtime_flags))
+ return 0;
+ wait_event_interruptible_timeout(config->conn_wait,
+ atomic_read(&config->live_connections),
+ config->dead_conn_timeout);
+ return atomic_read(&config->live_connections);
+}
static int nbd_handle_cmd(struct nbd_cmd *cmd, int index)
{
struct request *req = blk_mq_rq_from_pdu(cmd);
struct nbd_device *nbd = cmd->nbd;
+ struct nbd_config *config;
struct nbd_sock *nsock;
int ret;
- if (index >= nbd->num_connections) {
+ if (!refcount_inc_not_zero(&nbd->config_refs)) {
dev_err_ratelimited(disk_to_dev(nbd->disk),
- "Attempted send on invalid socket\n");
+ "Socks array is empty\n");
return -EINVAL;
}
+ config = nbd->config;
- if (test_bit(NBD_DISCONNECTED, &nbd->runtime_flags)) {
+ if (index >= config->num_connections) {
dev_err_ratelimited(disk_to_dev(nbd->disk),
- "Attempted send on closed socket\n");
+ "Attempted send on invalid socket\n");
+ nbd_config_put(nbd);
return -EINVAL;
}
-
- req->errors = 0;
-
- nsock = nbd->socks[index];
+ cmd->status = 0;
+again:
+ nsock = config->socks[index];
mutex_lock(&nsock->tx_lock);
- if (unlikely(!nsock->sock)) {
+ if (nsock->dead) {
+ int old_index = index;
+ index = find_fallback(nbd, index);
mutex_unlock(&nsock->tx_lock);
- dev_err_ratelimited(disk_to_dev(nbd->disk),
- "Attempted send on closed socket\n");
- return -EINVAL;
+ if (index < 0) {
+ if (wait_for_reconnect(nbd)) {
+ index = old_index;
+ goto again;
+ }
+ /* All the sockets should already be down at this point,
+ * we just want to make sure that DISCONNECTED is set so
+ * any requests that come in that were queue'ed waiting
+ * for the reconnect timer don't trigger the timer again
+ * and instead just error out.
+ */
+ sock_shutdown(nbd);
+ nbd_config_put(nbd);
+ return -EIO;
+ }
+ goto again;
}
/* Handle the case that we have a pending request that was partially
@@ -572,9 +775,21 @@ static int nbd_handle_cmd(struct nbd_cmd *cmd, int index)
ret = 0;
goto out;
}
+ /*
+ * Some failures are related to the link going down, so anything that
+ * returns EAGAIN can be retried on a different socket.
+ */
ret = nbd_send_cmd(nbd, cmd, index);
+ if (ret == -EAGAIN) {
+ dev_err_ratelimited(disk_to_dev(nbd->disk),
+ "Request send failed trying another connection\n");
+ nbd_mark_nsock_dead(nbd, nsock, 1);
+ mutex_unlock(&nsock->tx_lock);
+ goto again;
+ }
out:
mutex_unlock(&nsock->tx_lock);
+ nbd_config_put(nbd);
return ret;
}
@@ -611,9 +826,10 @@ static int nbd_queue_rq(struct blk_mq_hw_ctx *hctx,
return ret;
}
-static int nbd_add_socket(struct nbd_device *nbd, struct block_device *bdev,
- unsigned long arg)
+static int nbd_add_socket(struct nbd_device *nbd, unsigned long arg,
+ bool netlink)
{
+ struct nbd_config *config = nbd->config;
struct socket *sock;
struct nbd_sock **socks;
struct nbd_sock *nsock;
@@ -623,43 +839,107 @@ static int nbd_add_socket(struct nbd_device *nbd, struct block_device *bdev,
if (!sock)
return err;
- if (!nbd->task_setup)
+ if (!netlink && !nbd->task_setup &&
+ !test_bit(NBD_BOUND, &config->runtime_flags))
nbd->task_setup = current;
- if (nbd->task_setup != current) {
+
+ if (!netlink &&
+ (nbd->task_setup != current ||
+ test_bit(NBD_BOUND, &config->runtime_flags))) {
dev_err(disk_to_dev(nbd->disk),
"Device being setup by another task");
- return -EINVAL;
+ sockfd_put(sock);
+ return -EBUSY;
}
- socks = krealloc(nbd->socks, (nbd->num_connections + 1) *
+ socks = krealloc(config->socks, (config->num_connections + 1) *
sizeof(struct nbd_sock *), GFP_KERNEL);
- if (!socks)
+ if (!socks) {
+ sockfd_put(sock);
return -ENOMEM;
+ }
nsock = kzalloc(sizeof(struct nbd_sock), GFP_KERNEL);
- if (!nsock)
+ if (!nsock) {
+ sockfd_put(sock);
return -ENOMEM;
+ }
- nbd->socks = socks;
+ config->socks = socks;
+ nsock->fallback_index = -1;
+ nsock->dead = false;
mutex_init(&nsock->tx_lock);
nsock->sock = sock;
nsock->pending = NULL;
nsock->sent = 0;
- socks[nbd->num_connections++] = nsock;
+ nsock->cookie = 0;
+ socks[config->num_connections++] = nsock;
+ atomic_inc(&config->live_connections);
- if (max_part)
- bdev->bd_invalidated = 1;
return 0;
}
+static int nbd_reconnect_socket(struct nbd_device *nbd, unsigned long arg)
+{
+ struct nbd_config *config = nbd->config;
+ struct socket *sock, *old;
+ struct recv_thread_args *args;
+ int i;
+ int err;
+
+ sock = sockfd_lookup(arg, &err);
+ if (!sock)
+ return err;
+
+ args = kzalloc(sizeof(*args), GFP_KERNEL);
+ if (!args) {
+ sockfd_put(sock);
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < config->num_connections; i++) {
+ struct nbd_sock *nsock = config->socks[i];
+
+ if (!nsock->dead)
+ continue;
+
+ mutex_lock(&nsock->tx_lock);
+ if (!nsock->dead) {
+ mutex_unlock(&nsock->tx_lock);
+ continue;
+ }
+ sk_set_memalloc(sock->sk);
+ atomic_inc(&config->recv_threads);
+ refcount_inc(&nbd->config_refs);
+ old = nsock->sock;
+ nsock->fallback_index = -1;
+ nsock->sock = sock;
+ nsock->dead = false;
+ INIT_WORK(&args->work, recv_work);
+ args->index = i;
+ args->nbd = nbd;
+ nsock->cookie++;
+ mutex_unlock(&nsock->tx_lock);
+ sockfd_put(old);
+
+ /* We take the tx_mutex in an error path in the recv_work, so we
+ * need to queue_work outside of the tx_mutex.
+ */
+ queue_work(recv_workqueue, &args->work);
+
+ atomic_inc(&config->live_connections);
+ wake_up(&config->conn_wait);
+ return 0;
+ }
+ sockfd_put(sock);
+ kfree(args);
+ return -ENOSPC;
+}
+
/* Reset all properties of an NBD device */
static void nbd_reset(struct nbd_device *nbd)
{
- nbd->runtime_flags = 0;
- nbd->blksize = 1024;
- nbd->bytesize = 0;
- set_capacity(nbd->disk, 0);
- nbd->flags = 0;
+ nbd->config = NULL;
nbd->tag_set.timeout = 0;
queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, nbd->disk->queue);
}
@@ -668,21 +948,23 @@ static void nbd_bdev_reset(struct block_device *bdev)
{
if (bdev->bd_openers > 1)
return;
- set_device_ro(bdev, false);
- bdev->bd_inode->i_size = 0;
+ bd_set_size(bdev, 0);
if (max_part > 0) {
blkdev_reread_part(bdev);
bdev->bd_invalidated = 1;
}
}
-static void nbd_parse_flags(struct nbd_device *nbd, struct block_device *bdev)
+static void nbd_parse_flags(struct nbd_device *nbd)
{
- if (nbd->flags & NBD_FLAG_READ_ONLY)
- set_device_ro(bdev, true);
- if (nbd->flags & NBD_FLAG_SEND_TRIM)
+ struct nbd_config *config = nbd->config;
+ if (config->flags & NBD_FLAG_READ_ONLY)
+ set_disk_ro(nbd->disk, true);
+ else
+ set_disk_ro(nbd->disk, false);
+ if (config->flags & NBD_FLAG_SEND_TRIM)
queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, nbd->disk->queue);
- if (nbd->flags & NBD_FLAG_SEND_FLUSH)
+ if (config->flags & NBD_FLAG_SEND_FLUSH)
blk_queue_write_cache(nbd->disk->queue, true, false);
else
blk_queue_write_cache(nbd->disk->queue, false, false);
@@ -690,6 +972,7 @@ static void nbd_parse_flags(struct nbd_device *nbd, struct block_device *bdev)
static void send_disconnects(struct nbd_device *nbd)
{
+ struct nbd_config *config = nbd->config;
struct nbd_request request = {
.magic = htonl(NBD_REQUEST_MAGIC),
.type = htonl(NBD_CMD_DISC),
@@ -698,7 +981,7 @@ static void send_disconnects(struct nbd_device *nbd)
struct iov_iter from;
int i, ret;
- for (i = 0; i < nbd->num_connections; i++) {
+ for (i = 0; i < config->num_connections; i++) {
iov_iter_kvec(&from, WRITE | ITER_KVEC, &iov, 1, sizeof(request));
ret = sock_xmit(nbd, i, 1, &from, 0, NULL);
if (ret <= 0)
@@ -707,145 +990,162 @@ static void send_disconnects(struct nbd_device *nbd)
}
}
-static int nbd_disconnect(struct nbd_device *nbd, struct block_device *bdev)
+static int nbd_disconnect(struct nbd_device *nbd)
{
- dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n");
- if (!nbd->socks)
- return -EINVAL;
-
- mutex_unlock(&nbd->config_lock);
- fsync_bdev(bdev);
- mutex_lock(&nbd->config_lock);
-
- /* Check again after getting mutex back. */
- if (!nbd->socks)
- return -EINVAL;
+ struct nbd_config *config = nbd->config;
+ dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n");
if (!test_and_set_bit(NBD_DISCONNECT_REQUESTED,
- &nbd->runtime_flags))
+ &config->runtime_flags))
send_disconnects(nbd);
return 0;
}
-static int nbd_clear_sock(struct nbd_device *nbd, struct block_device *bdev)
+static void nbd_clear_sock(struct nbd_device *nbd)
{
sock_shutdown(nbd);
nbd_clear_que(nbd);
+ nbd->task_setup = NULL;
+}
- __invalidate_device(bdev, true);
- nbd_bdev_reset(bdev);
- /*
- * We want to give the run thread a chance to wait for everybody
- * to clean up and then do it's own cleanup.
- */
- if (!test_bit(NBD_RUNNING, &nbd->runtime_flags) &&
- nbd->num_connections) {
- int i;
-
- for (i = 0; i < nbd->num_connections; i++) {
- sockfd_put(nbd->socks[i]->sock);
- kfree(nbd->socks[i]);
+static void nbd_config_put(struct nbd_device *nbd)
+{
+ if (refcount_dec_and_mutex_lock(&nbd->config_refs,
+ &nbd->config_lock)) {
+ struct nbd_config *config = nbd->config;
+ nbd_dev_dbg_close(nbd);
+ nbd_size_clear(nbd);
+ if (test_and_clear_bit(NBD_HAS_PID_FILE,
+ &config->runtime_flags))
+ device_remove_file(disk_to_dev(nbd->disk), &pid_attr);
+ nbd->task_recv = NULL;
+ nbd_clear_sock(nbd);
+ if (config->num_connections) {
+ int i;
+ for (i = 0; i < config->num_connections; i++) {
+ sockfd_put(config->socks[i]->sock);
+ kfree(config->socks[i]);
+ }
+ kfree(config->socks);
}
- kfree(nbd->socks);
- nbd->socks = NULL;
- nbd->num_connections = 0;
- }
- nbd->task_setup = NULL;
+ nbd_reset(nbd);
- return 0;
+ mutex_unlock(&nbd->config_lock);
+ nbd_put(nbd);
+ module_put(THIS_MODULE);
+ }
}
-static int nbd_start_device(struct nbd_device *nbd, struct block_device *bdev)
+static int nbd_start_device(struct nbd_device *nbd)
{
- struct recv_thread_args *args;
- int num_connections = nbd->num_connections;
+ struct nbd_config *config = nbd->config;
+ int num_connections = config->num_connections;
int error = 0, i;
if (nbd->task_recv)
return -EBUSY;
- if (!nbd->socks)
+ if (!config->socks)
return -EINVAL;
if (num_connections > 1 &&
- !(nbd->flags & NBD_FLAG_CAN_MULTI_CONN)) {
+ !(config->flags & NBD_FLAG_CAN_MULTI_CONN)) {
dev_err(disk_to_dev(nbd->disk), "server does not support multiple connections per device.\n");
- error = -EINVAL;
- goto out_err;
+ return -EINVAL;
}
- set_bit(NBD_RUNNING, &nbd->runtime_flags);
- blk_mq_update_nr_hw_queues(&nbd->tag_set, nbd->num_connections);
- args = kcalloc(num_connections, sizeof(*args), GFP_KERNEL);
- if (!args) {
- error = -ENOMEM;
- goto out_err;
- }
+ blk_mq_update_nr_hw_queues(&nbd->tag_set, config->num_connections);
nbd->task_recv = current;
- mutex_unlock(&nbd->config_lock);
- nbd_parse_flags(nbd, bdev);
+ nbd_parse_flags(nbd);
error = device_create_file(disk_to_dev(nbd->disk), &pid_attr);
if (error) {
dev_err(disk_to_dev(nbd->disk), "device_create_file failed!\n");
- goto out_recv;
+ return error;
}
-
- nbd_size_update(nbd, bdev);
+ set_bit(NBD_HAS_PID_FILE, &config->runtime_flags);
nbd_dev_dbg_init(nbd);
for (i = 0; i < num_connections; i++) {
- sk_set_memalloc(nbd->socks[i]->sock->sk);
- atomic_inc(&nbd->recv_threads);
- INIT_WORK(&args[i].work, recv_work);
- args[i].nbd = nbd;
- args[i].index = i;
- queue_work(recv_workqueue, &args[i].work);
- }
- wait_event_interruptible(nbd->recv_wq,
- atomic_read(&nbd->recv_threads) == 0);
- for (i = 0; i < num_connections; i++)
- flush_work(&args[i].work);
- nbd_dev_dbg_close(nbd);
- nbd_size_clear(nbd, bdev);
- device_remove_file(disk_to_dev(nbd->disk), &pid_attr);
-out_recv:
- mutex_lock(&nbd->config_lock);
- nbd->task_recv = NULL;
-out_err:
- clear_bit(NBD_RUNNING, &nbd->runtime_flags);
- nbd_clear_sock(nbd, bdev);
+ struct recv_thread_args *args;
+ args = kzalloc(sizeof(*args), GFP_KERNEL);
+ if (!args) {
+ sock_shutdown(nbd);
+ return -ENOMEM;
+ }
+ sk_set_memalloc(config->socks[i]->sock->sk);
+ atomic_inc(&config->recv_threads);
+ refcount_inc(&nbd->config_refs);
+ INIT_WORK(&args->work, recv_work);
+ args->nbd = nbd;
+ args->index = i;
+ queue_work(recv_workqueue, &args->work);
+ }
+ return error;
+}
+
+static int nbd_start_device_ioctl(struct nbd_device *nbd, struct block_device *bdev)
+{
+ struct nbd_config *config = nbd->config;
+ int ret;
+
+ ret = nbd_start_device(nbd);
+ if (ret)
+ return ret;
+
+ bd_set_size(bdev, config->bytesize);
+ if (max_part)
+ bdev->bd_invalidated = 1;
+ mutex_unlock(&nbd->config_lock);
+ ret = wait_event_interruptible(config->recv_wq,
+ atomic_read(&config->recv_threads) == 0);
+ if (ret)
+ sock_shutdown(nbd);
+ mutex_lock(&nbd->config_lock);
+ bd_set_size(bdev, 0);
/* user requested, ignore socket errors */
- if (test_bit(NBD_DISCONNECT_REQUESTED, &nbd->runtime_flags))
- error = 0;
- if (test_bit(NBD_TIMEDOUT, &nbd->runtime_flags))
- error = -ETIMEDOUT;
+ if (test_bit(NBD_DISCONNECT_REQUESTED, &config->runtime_flags))
+ ret = 0;
+ if (test_bit(NBD_TIMEDOUT, &config->runtime_flags))
+ ret = -ETIMEDOUT;
+ return ret;
+}
- nbd_reset(nbd);
- return error;
+static void nbd_clear_sock_ioctl(struct nbd_device *nbd,
+ struct block_device *bdev)
+{
+ sock_shutdown(nbd);
+ kill_bdev(bdev);
+ nbd_bdev_reset(bdev);
+ if (test_and_clear_bit(NBD_HAS_CONFIG_REF,
+ &nbd->config->runtime_flags))
+ nbd_config_put(nbd);
}
/* Must be called with config_lock held */
static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
unsigned int cmd, unsigned long arg)
{
+ struct nbd_config *config = nbd->config;
+
switch (cmd) {
case NBD_DISCONNECT:
- return nbd_disconnect(nbd, bdev);
+ return nbd_disconnect(nbd);
case NBD_CLEAR_SOCK:
- return nbd_clear_sock(nbd, bdev);
+ nbd_clear_sock_ioctl(nbd, bdev);
+ return 0;
case NBD_SET_SOCK:
- return nbd_add_socket(nbd, bdev, arg);
+ return nbd_add_socket(nbd, arg, false);
case NBD_SET_BLKSIZE:
- nbd_size_set(nbd, bdev, arg,
- div_s64(nbd->bytesize, arg));
+ nbd_size_set(nbd, arg,
+ div_s64(config->bytesize, arg));
return 0;
case NBD_SET_SIZE:
- nbd_size_set(nbd, bdev, nbd->blksize,
- div_s64(arg, nbd->blksize));
+ nbd_size_set(nbd, config->blksize,
+ div_s64(arg, config->blksize));
return 0;
case NBD_SET_SIZE_BLOCKS:
- nbd_size_set(nbd, bdev, nbd->blksize, arg);
+ nbd_size_set(nbd, config->blksize, arg);
return 0;
case NBD_SET_TIMEOUT:
if (arg) {
@@ -855,10 +1155,10 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
return 0;
case NBD_SET_FLAGS:
- nbd->flags = arg;
+ config->flags = arg;
return 0;
case NBD_DO_IT:
- return nbd_start_device(nbd, bdev);
+ return nbd_start_device_ioctl(nbd, bdev);
case NBD_CLEAR_QUE:
/*
* This is for compatibility only. The queue is always cleared
@@ -879,23 +1179,92 @@ static int nbd_ioctl(struct block_device *bdev, fmode_t mode,
unsigned int cmd, unsigned long arg)
{
struct nbd_device *nbd = bdev->bd_disk->private_data;
- int error;
+ struct nbd_config *config = nbd->config;
+ int error = -EINVAL;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- BUG_ON(nbd->magic != NBD_MAGIC);
-
mutex_lock(&nbd->config_lock);
- error = __nbd_ioctl(bdev, nbd, cmd, arg);
- mutex_unlock(&nbd->config_lock);
+ /* Don't allow ioctl operations on a nbd device that was created with
+ * netlink, unless it's DISCONNECT or CLEAR_SOCK, which are fine.
+ */
+ if (!test_bit(NBD_BOUND, &config->runtime_flags) ||
+ (cmd == NBD_DISCONNECT || cmd == NBD_CLEAR_SOCK))
+ error = __nbd_ioctl(bdev, nbd, cmd, arg);
+ else
+ dev_err(nbd_to_dev(nbd), "Cannot use ioctl interface on a netlink controlled device.\n");
+ mutex_unlock(&nbd->config_lock);
return error;
}
+static struct nbd_config *nbd_alloc_config(void)
+{
+ struct nbd_config *config;
+
+ config = kzalloc(sizeof(struct nbd_config), GFP_NOFS);
+ if (!config)
+ return NULL;
+ atomic_set(&config->recv_threads, 0);
+ init_waitqueue_head(&config->recv_wq);
+ init_waitqueue_head(&config->conn_wait);
+ config->blksize = 1024;
+ atomic_set(&config->live_connections, 0);
+ try_module_get(THIS_MODULE);
+ return config;
+}
+
+static int nbd_open(struct block_device *bdev, fmode_t mode)
+{
+ struct nbd_device *nbd;
+ int ret = 0;
+
+ mutex_lock(&nbd_index_mutex);
+ nbd = bdev->bd_disk->private_data;
+ if (!nbd) {
+ ret = -ENXIO;
+ goto out;
+ }
+ if (!refcount_inc_not_zero(&nbd->refs)) {
+ ret = -ENXIO;
+ goto out;
+ }
+ if (!refcount_inc_not_zero(&nbd->config_refs)) {
+ struct nbd_config *config;
+
+ mutex_lock(&nbd->config_lock);
+ if (refcount_inc_not_zero(&nbd->config_refs)) {
+ mutex_unlock(&nbd->config_lock);
+ goto out;
+ }
+ config = nbd->config = nbd_alloc_config();
+ if (!config) {
+ ret = -ENOMEM;
+ mutex_unlock(&nbd->config_lock);
+ goto out;
+ }
+ refcount_set(&nbd->config_refs, 1);
+ refcount_inc(&nbd->refs);
+ mutex_unlock(&nbd->config_lock);
+ }
+out:
+ mutex_unlock(&nbd_index_mutex);
+ return ret;
+}
+
+static void nbd_release(struct gendisk *disk, fmode_t mode)
+{
+ struct nbd_device *nbd = disk->private_data;
+ nbd_config_put(nbd);
+ nbd_put(nbd);
+}
+
static const struct block_device_operations nbd_fops =
{
.owner = THIS_MODULE,
+ .open = nbd_open,
+ .release = nbd_release,
.ioctl = nbd_ioctl,
.compat_ioctl = nbd_ioctl,
};
@@ -927,7 +1296,7 @@ static const struct file_operations nbd_dbg_tasks_ops = {
static int nbd_dbg_flags_show(struct seq_file *s, void *unused)
{
struct nbd_device *nbd = s->private;
- u32 flags = nbd->flags;
+ u32 flags = nbd->config->flags;
seq_printf(s, "Hex: 0x%08x\n\n", flags);
@@ -960,6 +1329,7 @@ static const struct file_operations nbd_dbg_flags_ops = {
static int nbd_dev_dbg_init(struct nbd_device *nbd)
{
struct dentry *dir;
+ struct nbd_config *config = nbd->config;
if (!nbd_dbg_dir)
return -EIO;
@@ -970,12 +1340,12 @@ static int nbd_dev_dbg_init(struct nbd_device *nbd)
nbd_name(nbd));
return -EIO;
}
- nbd->dbg_dir = dir;
+ config->dbg_dir = dir;
debugfs_create_file("tasks", 0444, dir, nbd, &nbd_dbg_tasks_ops);
- debugfs_create_u64("size_bytes", 0444, dir, &nbd->bytesize);
+ debugfs_create_u64("size_bytes", 0444, dir, &config->bytesize);
debugfs_create_u32("timeout", 0444, dir, &nbd->tag_set.timeout);
- debugfs_create_u64("blocksize", 0444, dir, &nbd->blksize);
+ debugfs_create_u64("blocksize", 0444, dir, &config->blksize);
debugfs_create_file("flags", 0444, dir, nbd, &nbd_dbg_flags_ops);
return 0;
@@ -983,7 +1353,7 @@ static int nbd_dev_dbg_init(struct nbd_device *nbd)
static void nbd_dev_dbg_close(struct nbd_device *nbd)
{
- debugfs_remove_recursive(nbd->dbg_dir);
+ debugfs_remove_recursive(nbd->config->dbg_dir);
}
static int nbd_dbg_init(void)
@@ -1035,25 +1405,13 @@ static int nbd_init_request(void *data, struct request *rq,
return 0;
}
-static struct blk_mq_ops nbd_mq_ops = {
+static const struct blk_mq_ops nbd_mq_ops = {
.queue_rq = nbd_queue_rq,
+ .complete = nbd_complete_rq,
.init_request = nbd_init_request,
.timeout = nbd_xmit_timeout,
};
-static void nbd_dev_remove(struct nbd_device *nbd)
-{
- struct gendisk *disk = nbd->disk;
- nbd->magic = 0;
- if (disk) {
- del_gendisk(disk);
- blk_cleanup_queue(disk->queue);
- blk_mq_free_tag_set(&nbd->tag_set);
- put_disk(disk);
- }
- kfree(nbd);
-}
-
static int nbd_dev_add(int index)
{
struct nbd_device *nbd;
@@ -1082,6 +1440,7 @@ static int nbd_dev_add(int index)
if (err < 0)
goto out_free_disk;
+ nbd->index = index;
nbd->disk = disk;
nbd->tag_set.ops = &nbd_mq_ops;
nbd->tag_set.nr_hw_queues = 1;
@@ -1110,20 +1469,23 @@ static int nbd_dev_add(int index)
queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, disk->queue);
disk->queue->limits.discard_granularity = 512;
blk_queue_max_discard_sectors(disk->queue, UINT_MAX);
- disk->queue->limits.discard_zeroes_data = 0;
+ blk_queue_max_segment_size(disk->queue, UINT_MAX);
+ blk_queue_max_segments(disk->queue, USHRT_MAX);
blk_queue_max_hw_sectors(disk->queue, 65536);
disk->queue->limits.max_sectors = 256;
- nbd->magic = NBD_MAGIC;
mutex_init(&nbd->config_lock);
+ refcount_set(&nbd->config_refs, 0);
+ refcount_set(&nbd->refs, 1);
+ INIT_LIST_HEAD(&nbd->list);
disk->major = NBD_MAJOR;
disk->first_minor = index << part_shift;
disk->fops = &nbd_fops;
disk->private_data = nbd;
sprintf(disk->disk_name, "nbd%d", index);
- init_waitqueue_head(&nbd->recv_wq);
nbd_reset(nbd);
add_disk(disk);
+ nbd_total_devices++;
return index;
out_free_tags:
@@ -1138,10 +1500,535 @@ out:
return err;
}
-/*
- * And here should be modules and kernel interface
- * (Just smiley confuses emacs :-)
+static int find_free_cb(int id, void *ptr, void *data)
+{
+ struct nbd_device *nbd = ptr;
+ struct nbd_device **found = data;
+
+ if (!refcount_read(&nbd->config_refs)) {
+ *found = nbd;
+ return 1;
+ }
+ return 0;
+}
+
+/* Netlink interface. */
+static struct nla_policy nbd_attr_policy[NBD_ATTR_MAX + 1] = {
+ [NBD_ATTR_INDEX] = { .type = NLA_U32 },
+ [NBD_ATTR_SIZE_BYTES] = { .type = NLA_U64 },
+ [NBD_ATTR_BLOCK_SIZE_BYTES] = { .type = NLA_U64 },
+ [NBD_ATTR_TIMEOUT] = { .type = NLA_U64 },
+ [NBD_ATTR_SERVER_FLAGS] = { .type = NLA_U64 },
+ [NBD_ATTR_CLIENT_FLAGS] = { .type = NLA_U64 },
+ [NBD_ATTR_SOCKETS] = { .type = NLA_NESTED},
+ [NBD_ATTR_DEAD_CONN_TIMEOUT] = { .type = NLA_U64 },
+ [NBD_ATTR_DEVICE_LIST] = { .type = NLA_NESTED},
+};
+
+static struct nla_policy nbd_sock_policy[NBD_SOCK_MAX + 1] = {
+ [NBD_SOCK_FD] = { .type = NLA_U32 },
+};
+
+/* We don't use this right now since we don't parse the incoming list, but we
+ * still want it here so userspace knows what to expect.
*/
+static struct nla_policy __attribute__((unused))
+nbd_device_policy[NBD_DEVICE_ATTR_MAX + 1] = {
+ [NBD_DEVICE_INDEX] = { .type = NLA_U32 },
+ [NBD_DEVICE_CONNECTED] = { .type = NLA_U8 },
+};
+
+static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nbd_device *nbd = NULL;
+ struct nbd_config *config;
+ int index = -1;
+ int ret;
+ bool put_dev = false;
+
+ if (!netlink_capable(skb, CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (info->attrs[NBD_ATTR_INDEX])
+ index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]);
+ if (!info->attrs[NBD_ATTR_SOCKETS]) {
+ printk(KERN_ERR "nbd: must specify at least one socket\n");
+ return -EINVAL;
+ }
+ if (!info->attrs[NBD_ATTR_SIZE_BYTES]) {
+ printk(KERN_ERR "nbd: must specify a size in bytes for the device\n");
+ return -EINVAL;
+ }
+again:
+ mutex_lock(&nbd_index_mutex);
+ if (index == -1) {
+ ret = idr_for_each(&nbd_index_idr, &find_free_cb, &nbd);
+ if (ret == 0) {
+ int new_index;
+ new_index = nbd_dev_add(-1);
+ if (new_index < 0) {
+ mutex_unlock(&nbd_index_mutex);
+ printk(KERN_ERR "nbd: failed to add new device\n");
+ return ret;
+ }
+ nbd = idr_find(&nbd_index_idr, new_index);
+ }
+ } else {
+ nbd = idr_find(&nbd_index_idr, index);
+ }
+ if (!nbd) {
+ printk(KERN_ERR "nbd: couldn't find device at index %d\n",
+ index);
+ mutex_unlock(&nbd_index_mutex);
+ return -EINVAL;
+ }
+ if (!refcount_inc_not_zero(&nbd->refs)) {
+ mutex_unlock(&nbd_index_mutex);
+ if (index == -1)
+ goto again;
+ printk(KERN_ERR "nbd: device at index %d is going down\n",
+ index);
+ return -EINVAL;
+ }
+ mutex_unlock(&nbd_index_mutex);
+
+ mutex_lock(&nbd->config_lock);
+ if (refcount_read(&nbd->config_refs)) {
+ mutex_unlock(&nbd->config_lock);
+ nbd_put(nbd);
+ if (index == -1)
+ goto again;
+ printk(KERN_ERR "nbd: nbd%d already in use\n", index);
+ return -EBUSY;
+ }
+ if (WARN_ON(nbd->config)) {
+ mutex_unlock(&nbd->config_lock);
+ nbd_put(nbd);
+ return -EINVAL;
+ }
+ config = nbd->config = nbd_alloc_config();
+ if (!nbd->config) {
+ mutex_unlock(&nbd->config_lock);
+ nbd_put(nbd);
+ printk(KERN_ERR "nbd: couldn't allocate config\n");
+ return -ENOMEM;
+ }
+ refcount_set(&nbd->config_refs, 1);
+ set_bit(NBD_BOUND, &config->runtime_flags);
+
+ if (info->attrs[NBD_ATTR_SIZE_BYTES]) {
+ u64 bytes = nla_get_u64(info->attrs[NBD_ATTR_SIZE_BYTES]);
+ nbd_size_set(nbd, config->blksize,
+ div64_u64(bytes, config->blksize));
+ }
+ if (info->attrs[NBD_ATTR_BLOCK_SIZE_BYTES]) {
+ u64 bsize =
+ nla_get_u64(info->attrs[NBD_ATTR_BLOCK_SIZE_BYTES]);
+ nbd_size_set(nbd, bsize, div64_u64(config->bytesize, bsize));
+ }
+ if (info->attrs[NBD_ATTR_TIMEOUT]) {
+ u64 timeout = nla_get_u64(info->attrs[NBD_ATTR_TIMEOUT]);
+ nbd->tag_set.timeout = timeout * HZ;
+ blk_queue_rq_timeout(nbd->disk->queue, timeout * HZ);
+ }
+ if (info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]) {
+ config->dead_conn_timeout =
+ nla_get_u64(info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]);
+ config->dead_conn_timeout *= HZ;
+ }
+ if (info->attrs[NBD_ATTR_SERVER_FLAGS])
+ config->flags =
+ nla_get_u64(info->attrs[NBD_ATTR_SERVER_FLAGS]);
+ if (info->attrs[NBD_ATTR_CLIENT_FLAGS]) {
+ u64 flags = nla_get_u64(info->attrs[NBD_ATTR_CLIENT_FLAGS]);
+ if (flags & NBD_CFLAG_DESTROY_ON_DISCONNECT) {
+ set_bit(NBD_DESTROY_ON_DISCONNECT,
+ &config->runtime_flags);
+ put_dev = true;
+ }
+ }
+
+ if (info->attrs[NBD_ATTR_SOCKETS]) {
+ struct nlattr *attr;
+ int rem, fd;
+
+ nla_for_each_nested(attr, info->attrs[NBD_ATTR_SOCKETS],
+ rem) {
+ struct nlattr *socks[NBD_SOCK_MAX+1];
+
+ if (nla_type(attr) != NBD_SOCK_ITEM) {
+ printk(KERN_ERR "nbd: socks must be embedded in a SOCK_ITEM attr\n");
+ ret = -EINVAL;
+ goto out;
+ }
+ ret = nla_parse_nested(socks, NBD_SOCK_MAX, attr,
+ nbd_sock_policy);
+ if (ret != 0) {
+ printk(KERN_ERR "nbd: error processing sock list\n");
+ ret = -EINVAL;
+ goto out;
+ }
+ if (!socks[NBD_SOCK_FD])
+ continue;
+ fd = (int)nla_get_u32(socks[NBD_SOCK_FD]);
+ ret = nbd_add_socket(nbd, fd, true);
+ if (ret)
+ goto out;
+ }
+ }
+ ret = nbd_start_device(nbd);
+out:
+ mutex_unlock(&nbd->config_lock);
+ if (!ret) {
+ set_bit(NBD_HAS_CONFIG_REF, &config->runtime_flags);
+ refcount_inc(&nbd->config_refs);
+ nbd_connect_reply(info, nbd->index);
+ }
+ nbd_config_put(nbd);
+ if (put_dev)
+ nbd_put(nbd);
+ return ret;
+}
+
+static int nbd_genl_disconnect(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nbd_device *nbd;
+ int index;
+
+ if (!netlink_capable(skb, CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (!info->attrs[NBD_ATTR_INDEX]) {
+ printk(KERN_ERR "nbd: must specify an index to disconnect\n");
+ return -EINVAL;
+ }
+ index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]);
+ mutex_lock(&nbd_index_mutex);
+ nbd = idr_find(&nbd_index_idr, index);
+ if (!nbd) {
+ mutex_unlock(&nbd_index_mutex);
+ printk(KERN_ERR "nbd: couldn't find device at index %d\n",
+ index);
+ return -EINVAL;
+ }
+ if (!refcount_inc_not_zero(&nbd->refs)) {
+ mutex_unlock(&nbd_index_mutex);
+ printk(KERN_ERR "nbd: device at index %d is going down\n",
+ index);
+ return -EINVAL;
+ }
+ mutex_unlock(&nbd_index_mutex);
+ if (!refcount_inc_not_zero(&nbd->config_refs)) {
+ nbd_put(nbd);
+ return 0;
+ }
+ mutex_lock(&nbd->config_lock);
+ nbd_disconnect(nbd);
+ mutex_unlock(&nbd->config_lock);
+ if (test_and_clear_bit(NBD_HAS_CONFIG_REF,
+ &nbd->config->runtime_flags))
+ nbd_config_put(nbd);
+ nbd_config_put(nbd);
+ nbd_put(nbd);
+ return 0;
+}
+
+static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nbd_device *nbd = NULL;
+ struct nbd_config *config;
+ int index;
+ int ret = -EINVAL;
+ bool put_dev = false;
+
+ if (!netlink_capable(skb, CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (!info->attrs[NBD_ATTR_INDEX]) {
+ printk(KERN_ERR "nbd: must specify a device to reconfigure\n");
+ return -EINVAL;
+ }
+ index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]);
+ mutex_lock(&nbd_index_mutex);
+ nbd = idr_find(&nbd_index_idr, index);
+ if (!nbd) {
+ mutex_unlock(&nbd_index_mutex);
+ printk(KERN_ERR "nbd: couldn't find a device at index %d\n",
+ index);
+ return -EINVAL;
+ }
+ if (!refcount_inc_not_zero(&nbd->refs)) {
+ mutex_unlock(&nbd_index_mutex);
+ printk(KERN_ERR "nbd: device at index %d is going down\n",
+ index);
+ return -EINVAL;
+ }
+ mutex_unlock(&nbd_index_mutex);
+
+ if (!refcount_inc_not_zero(&nbd->config_refs)) {
+ dev_err(nbd_to_dev(nbd),
+ "not configured, cannot reconfigure\n");
+ nbd_put(nbd);
+ return -EINVAL;
+ }
+
+ mutex_lock(&nbd->config_lock);
+ config = nbd->config;
+ if (!test_bit(NBD_BOUND, &config->runtime_flags) ||
+ !nbd->task_recv) {
+ dev_err(nbd_to_dev(nbd),
+ "not configured, cannot reconfigure\n");
+ goto out;
+ }
+
+ if (info->attrs[NBD_ATTR_TIMEOUT]) {
+ u64 timeout = nla_get_u64(info->attrs[NBD_ATTR_TIMEOUT]);
+ nbd->tag_set.timeout = timeout * HZ;
+ blk_queue_rq_timeout(nbd->disk->queue, timeout * HZ);
+ }
+ if (info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]) {
+ config->dead_conn_timeout =
+ nla_get_u64(info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]);
+ config->dead_conn_timeout *= HZ;
+ }
+ if (info->attrs[NBD_ATTR_CLIENT_FLAGS]) {
+ u64 flags = nla_get_u64(info->attrs[NBD_ATTR_CLIENT_FLAGS]);
+ if (flags & NBD_CFLAG_DESTROY_ON_DISCONNECT) {
+ if (!test_and_set_bit(NBD_DESTROY_ON_DISCONNECT,
+ &config->runtime_flags))
+ put_dev = true;
+ } else {
+ if (test_and_clear_bit(NBD_DESTROY_ON_DISCONNECT,
+ &config->runtime_flags))
+ refcount_inc(&nbd->refs);
+ }
+ }
+
+ if (info->attrs[NBD_ATTR_SOCKETS]) {
+ struct nlattr *attr;
+ int rem, fd;
+
+ nla_for_each_nested(attr, info->attrs[NBD_ATTR_SOCKETS],
+ rem) {
+ struct nlattr *socks[NBD_SOCK_MAX+1];
+
+ if (nla_type(attr) != NBD_SOCK_ITEM) {
+ printk(KERN_ERR "nbd: socks must be embedded in a SOCK_ITEM attr\n");
+ ret = -EINVAL;
+ goto out;
+ }
+ ret = nla_parse_nested(socks, NBD_SOCK_MAX, attr,
+ nbd_sock_policy);
+ if (ret != 0) {
+ printk(KERN_ERR "nbd: error processing sock list\n");
+ ret = -EINVAL;
+ goto out;
+ }
+ if (!socks[NBD_SOCK_FD])
+ continue;
+ fd = (int)nla_get_u32(socks[NBD_SOCK_FD]);
+ ret = nbd_reconnect_socket(nbd, fd);
+ if (ret) {
+ if (ret == -ENOSPC)
+ ret = 0;
+ goto out;
+ }
+ dev_info(nbd_to_dev(nbd), "reconnected socket\n");
+ }
+ }
+out:
+ mutex_unlock(&nbd->config_lock);
+ nbd_config_put(nbd);
+ nbd_put(nbd);
+ if (put_dev)
+ nbd_put(nbd);
+ return ret;
+}
+
+static const struct genl_ops nbd_connect_genl_ops[] = {
+ {
+ .cmd = NBD_CMD_CONNECT,
+ .policy = nbd_attr_policy,
+ .doit = nbd_genl_connect,
+ },
+ {
+ .cmd = NBD_CMD_DISCONNECT,
+ .policy = nbd_attr_policy,
+ .doit = nbd_genl_disconnect,
+ },
+ {
+ .cmd = NBD_CMD_RECONFIGURE,
+ .policy = nbd_attr_policy,
+ .doit = nbd_genl_reconfigure,
+ },
+ {
+ .cmd = NBD_CMD_STATUS,
+ .policy = nbd_attr_policy,
+ .doit = nbd_genl_status,
+ },
+};
+
+static const struct genl_multicast_group nbd_mcast_grps[] = {
+ { .name = NBD_GENL_MCAST_GROUP_NAME, },
+};
+
+static struct genl_family nbd_genl_family __ro_after_init = {
+ .hdrsize = 0,
+ .name = NBD_GENL_FAMILY_NAME,
+ .version = NBD_GENL_VERSION,
+ .module = THIS_MODULE,
+ .ops = nbd_connect_genl_ops,
+ .n_ops = ARRAY_SIZE(nbd_connect_genl_ops),
+ .maxattr = NBD_ATTR_MAX,
+ .mcgrps = nbd_mcast_grps,
+ .n_mcgrps = ARRAY_SIZE(nbd_mcast_grps),
+};
+
+static int populate_nbd_status(struct nbd_device *nbd, struct sk_buff *reply)
+{
+ struct nlattr *dev_opt;
+ u8 connected = 0;
+ int ret;
+
+ /* This is a little racey, but for status it's ok. The
+ * reason we don't take a ref here is because we can't
+ * take a ref in the index == -1 case as we would need
+ * to put under the nbd_index_mutex, which could
+ * deadlock if we are configured to remove ourselves
+ * once we're disconnected.
+ */
+ if (refcount_read(&nbd->config_refs))
+ connected = 1;
+ dev_opt = nla_nest_start(reply, NBD_DEVICE_ITEM);
+ if (!dev_opt)
+ return -EMSGSIZE;
+ ret = nla_put_u32(reply, NBD_DEVICE_INDEX, nbd->index);
+ if (ret)
+ return -EMSGSIZE;
+ ret = nla_put_u8(reply, NBD_DEVICE_CONNECTED,
+ connected);
+ if (ret)
+ return -EMSGSIZE;
+ nla_nest_end(reply, dev_opt);
+ return 0;
+}
+
+static int status_cb(int id, void *ptr, void *data)
+{
+ struct nbd_device *nbd = ptr;
+ return populate_nbd_status(nbd, (struct sk_buff *)data);
+}
+
+static int nbd_genl_status(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nlattr *dev_list;
+ struct sk_buff *reply;
+ void *reply_head;
+ size_t msg_size;
+ int index = -1;
+ int ret = -ENOMEM;
+
+ if (info->attrs[NBD_ATTR_INDEX])
+ index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]);
+
+ mutex_lock(&nbd_index_mutex);
+
+ msg_size = nla_total_size(nla_attr_size(sizeof(u32)) +
+ nla_attr_size(sizeof(u8)));
+ msg_size *= (index == -1) ? nbd_total_devices : 1;
+
+ reply = genlmsg_new(msg_size, GFP_KERNEL);
+ if (!reply)
+ goto out;
+ reply_head = genlmsg_put_reply(reply, info, &nbd_genl_family, 0,
+ NBD_CMD_STATUS);
+ if (!reply_head) {
+ nlmsg_free(reply);
+ goto out;
+ }
+
+ dev_list = nla_nest_start(reply, NBD_ATTR_DEVICE_LIST);
+ if (index == -1) {
+ ret = idr_for_each(&nbd_index_idr, &status_cb, reply);
+ if (ret) {
+ nlmsg_free(reply);
+ goto out;
+ }
+ } else {
+ struct nbd_device *nbd;
+ nbd = idr_find(&nbd_index_idr, index);
+ if (nbd) {
+ ret = populate_nbd_status(nbd, reply);
+ if (ret) {
+ nlmsg_free(reply);
+ goto out;
+ }
+ }
+ }
+ nla_nest_end(reply, dev_list);
+ genlmsg_end(reply, reply_head);
+ genlmsg_reply(reply, info);
+ ret = 0;
+out:
+ mutex_unlock(&nbd_index_mutex);
+ return ret;
+}
+
+static void nbd_connect_reply(struct genl_info *info, int index)
+{
+ struct sk_buff *skb;
+ void *msg_head;
+ int ret;
+
+ skb = genlmsg_new(nla_total_size(sizeof(u32)), GFP_KERNEL);
+ if (!skb)
+ return;
+ msg_head = genlmsg_put_reply(skb, info, &nbd_genl_family, 0,
+ NBD_CMD_CONNECT);
+ if (!msg_head) {
+ nlmsg_free(skb);
+ return;
+ }
+ ret = nla_put_u32(skb, NBD_ATTR_INDEX, index);
+ if (ret) {
+ nlmsg_free(skb);
+ return;
+ }
+ genlmsg_end(skb, msg_head);
+ genlmsg_reply(skb, info);
+}
+
+static void nbd_mcast_index(int index)
+{
+ struct sk_buff *skb;
+ void *msg_head;
+ int ret;
+
+ skb = genlmsg_new(nla_total_size(sizeof(u32)), GFP_KERNEL);
+ if (!skb)
+ return;
+ msg_head = genlmsg_put(skb, 0, 0, &nbd_genl_family, 0,
+ NBD_CMD_LINK_DEAD);
+ if (!msg_head) {
+ nlmsg_free(skb);
+ return;
+ }
+ ret = nla_put_u32(skb, NBD_ATTR_INDEX, index);
+ if (ret) {
+ nlmsg_free(skb);
+ return;
+ }
+ genlmsg_end(skb, msg_head);
+ genlmsg_multicast(&nbd_genl_family, skb, 0, 0, GFP_KERNEL);
+}
+
+static void nbd_dead_link_work(struct work_struct *work)
+{
+ struct link_dead_args *args = container_of(work, struct link_dead_args,
+ work);
+ nbd_mcast_index(args->index);
+ kfree(args);
+}
static int __init nbd_init(void)
{
@@ -1184,6 +2071,11 @@ static int __init nbd_init(void)
return -EIO;
}
+ if (genl_register_family(&nbd_genl_family)) {
+ unregister_blkdev(NBD_MAJOR, "nbd");
+ destroy_workqueue(recv_workqueue);
+ return -EINVAL;
+ }
nbd_dbg_init();
mutex_lock(&nbd_index_mutex);
@@ -1195,17 +2087,34 @@ static int __init nbd_init(void)
static int nbd_exit_cb(int id, void *ptr, void *data)
{
+ struct list_head *list = (struct list_head *)data;
struct nbd_device *nbd = ptr;
- nbd_dev_remove(nbd);
+
+ list_add_tail(&nbd->list, list);
return 0;
}
static void __exit nbd_cleanup(void)
{
+ struct nbd_device *nbd;
+ LIST_HEAD(del_list);
+
nbd_dbg_close();
- idr_for_each(&nbd_index_idr, &nbd_exit_cb, NULL);
+ mutex_lock(&nbd_index_mutex);
+ idr_for_each(&nbd_index_idr, &nbd_exit_cb, &del_list);
+ mutex_unlock(&nbd_index_mutex);
+
+ while (!list_empty(&del_list)) {
+ nbd = list_first_entry(&del_list, struct nbd_device, list);
+ list_del_init(&nbd->list);
+ if (refcount_read(&nbd->refs) != 1)
+ printk(KERN_ERR "nbd: possibly leaking a device\n");
+ nbd_put(nbd);
+ }
+
idr_destroy(&nbd_index_idr);
+ genl_unregister_family(&nbd_genl_family);
destroy_workqueue(recv_workqueue);
unregister_blkdev(NBD_MAJOR, "nbd");
}
diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index 6f2e565bccc5..d946e1eeac8e 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -117,6 +117,10 @@ static bool use_lightnvm;
module_param(use_lightnvm, bool, S_IRUGO);
MODULE_PARM_DESC(use_lightnvm, "Register as a LightNVM device");
+static bool blocking;
+module_param(blocking, bool, S_IRUGO);
+MODULE_PARM_DESC(blocking, "Register as a blocking blk-mq driver device");
+
static int irqmode = NULL_IRQ_SOFTIRQ;
static int null_set_irqmode(const char *str, const struct kernel_param *kp)
@@ -277,7 +281,7 @@ static inline void null_handle_cmd(struct nullb_cmd *cmd)
case NULL_IRQ_SOFTIRQ:
switch (queue_mode) {
case NULL_Q_MQ:
- blk_mq_complete_request(cmd->rq, cmd->rq->errors);
+ blk_mq_complete_request(cmd->rq);
break;
case NULL_Q_RQ:
blk_complete_request(cmd->rq);
@@ -357,6 +361,8 @@ static int null_queue_rq(struct blk_mq_hw_ctx *hctx,
{
struct nullb_cmd *cmd = blk_mq_rq_to_pdu(bd->rq);
+ might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);
+
if (irqmode == NULL_IRQ_TIMER) {
hrtimer_init(&cmd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
cmd->timer.function = null_cmd_timer_expired;
@@ -392,7 +398,7 @@ static int null_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
return 0;
}
-static struct blk_mq_ops null_mq_ops = {
+static const struct blk_mq_ops null_mq_ops = {
.queue_rq = null_queue_rq,
.init_hctx = null_init_hctx,
.complete = null_softirq_done_fn,
@@ -437,14 +443,7 @@ static int null_lnvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd)
if (IS_ERR(rq))
return -ENOMEM;
- rq->__sector = bio->bi_iter.bi_sector;
- rq->ioprio = bio_prio(bio);
-
- if (bio_has_data(bio))
- rq->nr_phys_segments = bio_phys_segments(q, bio);
-
- rq->__data_len = bio->bi_iter.bi_size;
- rq->bio = rq->biotail = bio;
+ blk_init_request_from_bio(rq, bio);
rq->end_io_data = rqd;
@@ -724,6 +723,9 @@ static int null_add_dev(void)
nullb->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
nullb->tag_set.driver_data = nullb;
+ if (blocking)
+ nullb->tag_set.flags |= BLK_MQ_F_BLOCKING;
+
rv = blk_mq_alloc_tag_set(&nullb->tag_set);
if (rv)
goto out_cleanup_queues;
diff --git a/drivers/block/osdblk.c b/drivers/block/osdblk.c
deleted file mode 100644
index 8127b8201a01..000000000000
--- a/drivers/block/osdblk.c
+++ /dev/null
@@ -1,693 +0,0 @@
-
-/*
- osdblk.c -- Export a single SCSI OSD object as a Linux block device
-
-
- Copyright 2009 Red Hat, Inc.
-
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program; see the file COPYING. If not, write to
- the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
-
-
- Instructions for use
- --------------------
-
- 1) Map a Linux block device to an existing OSD object.
-
- In this example, we will use partition id 1234, object id 5678,
- OSD device /dev/osd1.
-
- $ echo "1234 5678 /dev/osd1" > /sys/class/osdblk/add
-
-
- 2) List all active blkdev<->object mappings.
-
- In this example, we have performed step #1 twice, creating two blkdevs,
- mapped to two separate OSD objects.
-
- $ cat /sys/class/osdblk/list
- 0 174 1234 5678 /dev/osd1
- 1 179 1994 897123 /dev/osd0
-
- The columns, in order, are:
- - blkdev unique id
- - blkdev assigned major
- - OSD object partition id
- - OSD object id
- - OSD device
-
-
- 3) Remove an active blkdev<->object mapping.
-
- In this example, we remove the mapping with blkdev unique id 1.
-
- $ echo 1 > /sys/class/osdblk/remove
-
-
- NOTE: The actual creation and deletion of OSD objects is outside the scope
- of this driver.
-
- */
-
-#include <linux/kernel.h>
-#include <linux/device.h>
-#include <linux/module.h>
-#include <linux/fs.h>
-#include <linux/slab.h>
-#include <scsi/osd_initiator.h>
-#include <scsi/osd_attributes.h>
-#include <scsi/osd_sec.h>
-#include <scsi/scsi_device.h>
-
-#define DRV_NAME "osdblk"
-#define PFX DRV_NAME ": "
-
-/* #define _OSDBLK_DEBUG */
-#ifdef _OSDBLK_DEBUG
-#define OSDBLK_DEBUG(fmt, a...) \
- printk(KERN_NOTICE "osdblk @%s:%d: " fmt, __func__, __LINE__, ##a)
-#else
-#define OSDBLK_DEBUG(fmt, a...) \
- do { if (0) printk(fmt, ##a); } while (0)
-#endif
-
-MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
-MODULE_DESCRIPTION("block device inside an OSD object osdblk.ko");
-MODULE_LICENSE("GPL");
-
-struct osdblk_device;
-
-enum {
- OSDBLK_MINORS_PER_MAJOR = 256, /* max minors per blkdev */
- OSDBLK_MAX_REQ = 32, /* max parallel requests */
- OSDBLK_OP_TIMEOUT = 4 * 60, /* sync OSD req timeout */
-};
-
-struct osdblk_request {
- struct request *rq; /* blk layer request */
- struct bio *bio; /* cloned bio */
- struct osdblk_device *osdev; /* associated blkdev */
-};
-
-struct osdblk_device {
- int id; /* blkdev unique id */
-
- int major; /* blkdev assigned major */
- struct gendisk *disk; /* blkdev's gendisk and rq */
- struct request_queue *q;
-
- struct osd_dev *osd; /* associated OSD */
-
- char name[32]; /* blkdev name, e.g. osdblk34 */
-
- spinlock_t lock; /* queue lock */
-
- struct osd_obj_id obj; /* OSD partition, obj id */
- uint8_t obj_cred[OSD_CAP_LEN]; /* OSD cred */
-
- struct osdblk_request req[OSDBLK_MAX_REQ]; /* request table */
-
- struct list_head node;
-
- char osd_path[0]; /* OSD device path */
-};
-
-static struct class *class_osdblk; /* /sys/class/osdblk */
-static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
-static LIST_HEAD(osdblkdev_list);
-
-static const struct block_device_operations osdblk_bd_ops = {
- .owner = THIS_MODULE,
-};
-
-static const struct osd_attr g_attr_logical_length = ATTR_DEF(
- OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8);
-
-static void osdblk_make_credential(u8 cred_a[OSD_CAP_LEN],
- const struct osd_obj_id *obj)
-{
- osd_sec_init_nosec_doall_caps(cred_a, obj, false, true);
-}
-
-/* copied from exofs; move to libosd? */
-/*
- * Perform a synchronous OSD operation. copied from exofs; move to libosd?
- */
-static int osd_sync_op(struct osd_request *or, int timeout, uint8_t *credential)
-{
- int ret;
-
- or->timeout = timeout;
- ret = osd_finalize_request(or, 0, credential, NULL);
- if (ret)
- return ret;
-
- ret = osd_execute_request(or);
-
- /* osd_req_decode_sense(or, ret); */
- return ret;
-}
-
-/*
- * Perform an asynchronous OSD operation. copied from exofs; move to libosd?
- */
-static int osd_async_op(struct osd_request *or, osd_req_done_fn *async_done,
- void *caller_context, u8 *cred)
-{
- int ret;
-
- ret = osd_finalize_request(or, 0, cred, NULL);
- if (ret)
- return ret;
-
- ret = osd_execute_request_async(or, async_done, caller_context);
-
- return ret;
-}
-
-/* copied from exofs; move to libosd? */
-static int extract_attr_from_req(struct osd_request *or, struct osd_attr *attr)
-{
- struct osd_attr cur_attr = {.attr_page = 0}; /* start with zeros */
- void *iter = NULL;
- int nelem;
-
- do {
- nelem = 1;
- osd_req_decode_get_attr_list(or, &cur_attr, &nelem, &iter);
- if ((cur_attr.attr_page == attr->attr_page) &&
- (cur_attr.attr_id == attr->attr_id)) {
- attr->len = cur_attr.len;
- attr->val_ptr = cur_attr.val_ptr;
- return 0;
- }
- } while (iter);
-
- return -EIO;
-}
-
-static int osdblk_get_obj_size(struct osdblk_device *osdev, u64 *size_out)
-{
- struct osd_request *or;
- struct osd_attr attr;
- int ret;
-
- /* start request */
- or = osd_start_request(osdev->osd, GFP_KERNEL);
- if (!or)
- return -ENOMEM;
-
- /* create a get-attributes(length) request */
- osd_req_get_attributes(or, &osdev->obj);
-
- osd_req_add_get_attr_list(or, &g_attr_logical_length, 1);
-
- /* execute op synchronously */
- ret = osd_sync_op(or, OSDBLK_OP_TIMEOUT, osdev->obj_cred);
- if (ret)
- goto out;
-
- /* extract length from returned attribute info */
- attr = g_attr_logical_length;
- ret = extract_attr_from_req(or, &attr);
- if (ret)
- goto out;
-
- *size_out = get_unaligned_be64(attr.val_ptr);
-
-out:
- osd_end_request(or);
- return ret;
-
-}
-
-static void osdblk_osd_complete(struct osd_request *or, void *private)
-{
- struct osdblk_request *orq = private;
- struct osd_sense_info osi;
- int ret = osd_req_decode_sense(or, &osi);
-
- if (ret) {
- ret = -EIO;
- OSDBLK_DEBUG("osdblk_osd_complete with err=%d\n", ret);
- }
-
- /* complete OSD request */
- osd_end_request(or);
-
- /* complete request passed to osdblk by block layer */
- __blk_end_request_all(orq->rq, ret);
-}
-
-static void bio_chain_put(struct bio *chain)
-{
- struct bio *tmp;
-
- while (chain) {
- tmp = chain;
- chain = chain->bi_next;
-
- bio_put(tmp);
- }
-}
-
-static struct bio *bio_chain_clone(struct bio *old_chain, gfp_t gfpmask)
-{
- struct bio *tmp, *new_chain = NULL, *tail = NULL;
-
- while (old_chain) {
- tmp = bio_clone_kmalloc(old_chain, gfpmask);
- if (!tmp)
- goto err_out;
-
- tmp->bi_bdev = NULL;
- gfpmask &= ~__GFP_DIRECT_RECLAIM;
- tmp->bi_next = NULL;
-
- if (!new_chain)
- new_chain = tail = tmp;
- else {
- tail->bi_next = tmp;
- tail = tmp;
- }
-
- old_chain = old_chain->bi_next;
- }
-
- return new_chain;
-
-err_out:
- OSDBLK_DEBUG("bio_chain_clone with err\n");
- bio_chain_put(new_chain);
- return NULL;
-}
-
-static void osdblk_rq_fn(struct request_queue *q)
-{
- struct osdblk_device *osdev = q->queuedata;
-
- while (1) {
- struct request *rq;
- struct osdblk_request *orq;
- struct osd_request *or;
- struct bio *bio;
- bool do_write, do_flush;
-
- /* peek at request from block layer */
- rq = blk_fetch_request(q);
- if (!rq)
- break;
-
- /* deduce our operation (read, write, flush) */
- /* I wish the block layer simplified cmd_type/cmd_flags/cmd[]
- * into a clearly defined set of RPC commands:
- * read, write, flush, scsi command, power mgmt req,
- * driver-specific, etc.
- */
-
- do_flush = (req_op(rq) == REQ_OP_FLUSH);
- do_write = (rq_data_dir(rq) == WRITE);
-
- if (!do_flush) { /* osd_flush does not use a bio */
- /* a bio clone to be passed down to OSD request */
- bio = bio_chain_clone(rq->bio, GFP_ATOMIC);
- if (!bio)
- break;
- } else
- bio = NULL;
-
- /* alloc internal OSD request, for OSD command execution */
- or = osd_start_request(osdev->osd, GFP_ATOMIC);
- if (!or) {
- bio_chain_put(bio);
- OSDBLK_DEBUG("osd_start_request with err\n");
- break;
- }
-
- orq = &osdev->req[rq->tag];
- orq->rq = rq;
- orq->bio = bio;
- orq->osdev = osdev;
-
- /* init OSD command: flush, write or read */
- if (do_flush)
- osd_req_flush_object(or, &osdev->obj,
- OSD_CDB_FLUSH_ALL, 0, 0);
- else if (do_write)
- osd_req_write(or, &osdev->obj, blk_rq_pos(rq) * 512ULL,
- bio, blk_rq_bytes(rq));
- else
- osd_req_read(or, &osdev->obj, blk_rq_pos(rq) * 512ULL,
- bio, blk_rq_bytes(rq));
-
- OSDBLK_DEBUG("%s 0x%x bytes at 0x%llx\n",
- do_flush ? "flush" : do_write ?
- "write" : "read", blk_rq_bytes(rq),
- blk_rq_pos(rq) * 512ULL);
-
- /* begin OSD command execution */
- if (osd_async_op(or, osdblk_osd_complete, orq,
- osdev->obj_cred)) {
- osd_end_request(or);
- blk_requeue_request(q, rq);
- bio_chain_put(bio);
- OSDBLK_DEBUG("osd_execute_request_async with err\n");
- break;
- }
-
- /* remove the special 'flush' marker, now that the command
- * is executing
- */
- rq->special = NULL;
- }
-}
-
-static void osdblk_free_disk(struct osdblk_device *osdev)
-{
- struct gendisk *disk = osdev->disk;
-
- if (!disk)
- return;
-
- if (disk->flags & GENHD_FL_UP)
- del_gendisk(disk);
- if (disk->queue)
- blk_cleanup_queue(disk->queue);
- put_disk(disk);
-}
-
-static int osdblk_init_disk(struct osdblk_device *osdev)
-{
- struct gendisk *disk;
- struct request_queue *q;
- int rc;
- u64 obj_size = 0;
-
- /* contact OSD, request size info about the object being mapped */
- rc = osdblk_get_obj_size(osdev, &obj_size);
- if (rc)
- return rc;
-
- /* create gendisk info */
- disk = alloc_disk(OSDBLK_MINORS_PER_MAJOR);
- if (!disk)
- return -ENOMEM;
-
- sprintf(disk->disk_name, DRV_NAME "%d", osdev->id);
- disk->major = osdev->major;
- disk->first_minor = 0;
- disk->fops = &osdblk_bd_ops;
- disk->private_data = osdev;
-
- /* init rq */
- q = blk_init_queue(osdblk_rq_fn, &osdev->lock);
- if (!q) {
- put_disk(disk);
- return -ENOMEM;
- }
-
- /* switch queue to TCQ mode; allocate tag map */
- rc = blk_queue_init_tags(q, OSDBLK_MAX_REQ, NULL, BLK_TAG_ALLOC_FIFO);
- if (rc) {
- blk_cleanup_queue(q);
- put_disk(disk);
- return rc;
- }
-
- /* Set our limits to the lower device limits, because osdblk cannot
- * sleep when allocating a lower-request and therefore cannot be
- * bouncing.
- */
- blk_queue_stack_limits(q, osd_request_queue(osdev->osd));
-
- blk_queue_prep_rq(q, blk_queue_start_tag);
- blk_queue_write_cache(q, true, false);
-
- disk->queue = q;
-
- q->queuedata = osdev;
-
- osdev->disk = disk;
- osdev->q = q;
-
- /* finally, announce the disk to the world */
- set_capacity(disk, obj_size / 512ULL);
- add_disk(disk);
-
- printk(KERN_INFO "%s: Added of size 0x%llx\n",
- disk->disk_name, (unsigned long long)obj_size);
-
- return 0;
-}
-
-/********************************************************************
- * /sys/class/osdblk/
- * add map OSD object to blkdev
- * remove unmap OSD object
- * list show mappings
- *******************************************************************/
-
-static void class_osdblk_release(struct class *cls)
-{
- kfree(cls);
-}
-
-static ssize_t class_osdblk_list(struct class *c,
- struct class_attribute *attr,
- char *data)
-{
- int n = 0;
- struct list_head *tmp;
-
- mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
-
- list_for_each(tmp, &osdblkdev_list) {
- struct osdblk_device *osdev;
-
- osdev = list_entry(tmp, struct osdblk_device, node);
-
- n += sprintf(data+n, "%d %d %llu %llu %s\n",
- osdev->id,
- osdev->major,
- osdev->obj.partition,
- osdev->obj.id,
- osdev->osd_path);
- }
-
- mutex_unlock(&ctl_mutex);
- return n;
-}
-
-static ssize_t class_osdblk_add(struct class *c,
- struct class_attribute *attr,
- const char *buf, size_t count)
-{
- struct osdblk_device *osdev;
- ssize_t rc;
- int irc, new_id = 0;
- struct list_head *tmp;
-
- if (!try_module_get(THIS_MODULE))
- return -ENODEV;
-
- /* new osdblk_device object */
- osdev = kzalloc(sizeof(*osdev) + strlen(buf) + 1, GFP_KERNEL);
- if (!osdev) {
- rc = -ENOMEM;
- goto err_out_mod;
- }
-
- /* static osdblk_device initialization */
- spin_lock_init(&osdev->lock);
- INIT_LIST_HEAD(&osdev->node);
-
- /* generate unique id: find highest unique id, add one */
-
- mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
-
- list_for_each(tmp, &osdblkdev_list) {
- struct osdblk_device *osdev;
-
- osdev = list_entry(tmp, struct osdblk_device, node);
- if (osdev->id > new_id)
- new_id = osdev->id + 1;
- }
-
- osdev->id = new_id;
-
- /* add to global list */
- list_add_tail(&osdev->node, &osdblkdev_list);
-
- mutex_unlock(&ctl_mutex);
-
- /* parse add command */
- if (sscanf(buf, "%llu %llu %s", &osdev->obj.partition, &osdev->obj.id,
- osdev->osd_path) != 3) {
- rc = -EINVAL;
- goto err_out_slot;
- }
-
- /* initialize rest of new object */
- sprintf(osdev->name, DRV_NAME "%d", osdev->id);
-
- /* contact requested OSD */
- osdev->osd = osduld_path_lookup(osdev->osd_path);
- if (IS_ERR(osdev->osd)) {
- rc = PTR_ERR(osdev->osd);
- goto err_out_slot;
- }
-
- /* build OSD credential */
- osdblk_make_credential(osdev->obj_cred, &osdev->obj);
-
- /* register our block device */
- irc = register_blkdev(0, osdev->name);
- if (irc < 0) {
- rc = irc;
- goto err_out_osd;
- }
-
- osdev->major = irc;
-
- /* set up and announce blkdev mapping */
- rc = osdblk_init_disk(osdev);
- if (rc)
- goto err_out_blkdev;
-
- return count;
-
-err_out_blkdev:
- unregister_blkdev(osdev->major, osdev->name);
-err_out_osd:
- osduld_put_device(osdev->osd);
-err_out_slot:
- mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
- list_del_init(&osdev->node);
- mutex_unlock(&ctl_mutex);
-
- kfree(osdev);
-err_out_mod:
- OSDBLK_DEBUG("Error adding device %s\n", buf);
- module_put(THIS_MODULE);
- return rc;
-}
-
-static ssize_t class_osdblk_remove(struct class *c,
- struct class_attribute *attr,
- const char *buf,
- size_t count)
-{
- struct osdblk_device *osdev = NULL;
- int target_id, rc;
- unsigned long ul;
- struct list_head *tmp;
-
- rc = kstrtoul(buf, 10, &ul);
- if (rc)
- return rc;
-
- /* convert to int; abort if we lost anything in the conversion */
- target_id = (int) ul;
- if (target_id != ul)
- return -EINVAL;
-
- /* remove object from list immediately */
- mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
-
- list_for_each(tmp, &osdblkdev_list) {
- osdev = list_entry(tmp, struct osdblk_device, node);
- if (osdev->id == target_id) {
- list_del_init(&osdev->node);
- break;
- }
- osdev = NULL;
- }
-
- mutex_unlock(&ctl_mutex);
-
- if (!osdev)
- return -ENOENT;
-
- /* clean up and free blkdev and associated OSD connection */
- osdblk_free_disk(osdev);
- unregister_blkdev(osdev->major, osdev->name);
- osduld_put_device(osdev->osd);
- kfree(osdev);
-
- /* release module ref */
- module_put(THIS_MODULE);
-
- return count;
-}
-
-static struct class_attribute class_osdblk_attrs[] = {
- __ATTR(add, 0200, NULL, class_osdblk_add),
- __ATTR(remove, 0200, NULL, class_osdblk_remove),
- __ATTR(list, 0444, class_osdblk_list, NULL),
- __ATTR_NULL
-};
-
-static int osdblk_sysfs_init(void)
-{
- int ret = 0;
-
- /*
- * create control files in sysfs
- * /sys/class/osdblk/...
- */
- class_osdblk = kzalloc(sizeof(*class_osdblk), GFP_KERNEL);
- if (!class_osdblk)
- return -ENOMEM;
-
- class_osdblk->name = DRV_NAME;
- class_osdblk->owner = THIS_MODULE;
- class_osdblk->class_release = class_osdblk_release;
- class_osdblk->class_attrs = class_osdblk_attrs;
-
- ret = class_register(class_osdblk);
- if (ret) {
- kfree(class_osdblk);
- class_osdblk = NULL;
- printk(PFX "failed to create class osdblk\n");
- return ret;
- }
-
- return 0;
-}
-
-static void osdblk_sysfs_cleanup(void)
-{
- if (class_osdblk)
- class_destroy(class_osdblk);
- class_osdblk = NULL;
-}
-
-static int __init osdblk_init(void)
-{
- int rc;
-
- rc = osdblk_sysfs_init();
- if (rc)
- return rc;
-
- return 0;
-}
-
-static void __exit osdblk_exit(void)
-{
- osdblk_sysfs_cleanup();
-}
-
-module_init(osdblk_init);
-module_exit(osdblk_exit);
-
diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c
index 939641d6e262..b1267ef34d5a 100644
--- a/drivers/block/paride/pcd.c
+++ b/drivers/block/paride/pcd.c
@@ -300,6 +300,11 @@ static void pcd_init_units(void)
struct gendisk *disk = alloc_disk(1);
if (!disk)
continue;
+ disk->queue = blk_init_queue(do_pcd_request, &pcd_lock);
+ if (!disk->queue) {
+ put_disk(disk);
+ continue;
+ }
cd->disk = disk;
cd->pi = &cd->pia;
cd->present = 0;
@@ -735,18 +740,36 @@ static int pcd_detect(void)
}
/* I/O request processing */
-static struct request_queue *pcd_queue;
+static int pcd_queue;
+
+static int set_next_request(void)
+{
+ struct pcd_unit *cd;
+ struct request_queue *q;
+ int old_pos = pcd_queue;
+
+ do {
+ cd = &pcd[pcd_queue];
+ q = cd->present ? cd->disk->queue : NULL;
+ if (++pcd_queue == PCD_UNITS)
+ pcd_queue = 0;
+ if (q) {
+ pcd_req = blk_fetch_request(q);
+ if (pcd_req)
+ break;
+ }
+ } while (pcd_queue != old_pos);
+
+ return pcd_req != NULL;
+}
-static void do_pcd_request(struct request_queue * q)
+static void pcd_request(void)
{
if (pcd_busy)
return;
while (1) {
- if (!pcd_req) {
- pcd_req = blk_fetch_request(q);
- if (!pcd_req)
- return;
- }
+ if (!pcd_req && !set_next_request())
+ return;
if (rq_data_dir(pcd_req) == READ) {
struct pcd_unit *cd = pcd_req->rq_disk->private_data;
@@ -766,6 +789,11 @@ static void do_pcd_request(struct request_queue * q)
}
}
+static void do_pcd_request(struct request_queue *q)
+{
+ pcd_request();
+}
+
static inline void next_request(int err)
{
unsigned long saved_flags;
@@ -774,7 +802,7 @@ static inline void next_request(int err)
if (!__blk_end_request_cur(pcd_req, err))
pcd_req = NULL;
pcd_busy = 0;
- do_pcd_request(pcd_queue);
+ pcd_request();
spin_unlock_irqrestore(&pcd_lock, saved_flags);
}
@@ -849,7 +877,7 @@ static void do_pcd_read_drq(void)
do_pcd_read();
spin_lock_irqsave(&pcd_lock, saved_flags);
- do_pcd_request(pcd_queue);
+ pcd_request();
spin_unlock_irqrestore(&pcd_lock, saved_flags);
}
@@ -957,19 +985,10 @@ static int __init pcd_init(void)
return -EBUSY;
}
- pcd_queue = blk_init_queue(do_pcd_request, &pcd_lock);
- if (!pcd_queue) {
- unregister_blkdev(major, name);
- for (unit = 0, cd = pcd; unit < PCD_UNITS; unit++, cd++)
- put_disk(cd->disk);
- return -ENOMEM;
- }
-
for (unit = 0, cd = pcd; unit < PCD_UNITS; unit++, cd++) {
if (cd->present) {
register_cdrom(&cd->info);
cd->disk->private_data = cd;
- cd->disk->queue = pcd_queue;
add_disk(cd->disk);
}
}
@@ -988,9 +1007,9 @@ static void __exit pcd_exit(void)
pi_release(cd->pi);
unregister_cdrom(&cd->info);
}
+ blk_cleanup_queue(cd->disk->queue);
put_disk(cd->disk);
}
- blk_cleanup_queue(pcd_queue);
unregister_blkdev(major, name);
pi_unregister_driver(par_drv);
}
diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c
index 9cfd2e06a649..7d2402f90978 100644
--- a/drivers/block/paride/pd.c
+++ b/drivers/block/paride/pd.c
@@ -381,12 +381,33 @@ static enum action do_pd_write_start(void);
static enum action do_pd_read_drq(void);
static enum action do_pd_write_done(void);
-static struct request_queue *pd_queue;
+static int pd_queue;
static int pd_claimed;
static struct pd_unit *pd_current; /* current request's drive */
static PIA *pi_current; /* current request's PIA */
+static int set_next_request(void)
+{
+ struct gendisk *disk;
+ struct request_queue *q;
+ int old_pos = pd_queue;
+
+ do {
+ disk = pd[pd_queue].gd;
+ q = disk ? disk->queue : NULL;
+ if (++pd_queue == PD_UNITS)
+ pd_queue = 0;
+ if (q) {
+ pd_req = blk_fetch_request(q);
+ if (pd_req)
+ break;
+ }
+ } while (pd_queue != old_pos);
+
+ return pd_req != NULL;
+}
+
static void run_fsm(void)
{
while (1) {
@@ -418,8 +439,7 @@ static void run_fsm(void)
spin_lock_irqsave(&pd_lock, saved_flags);
if (!__blk_end_request_cur(pd_req,
res == Ok ? 0 : -EIO)) {
- pd_req = blk_fetch_request(pd_queue);
- if (!pd_req)
+ if (!set_next_request())
stop = 1;
}
spin_unlock_irqrestore(&pd_lock, saved_flags);
@@ -719,18 +739,15 @@ static int pd_special_command(struct pd_unit *disk,
enum action (*func)(struct pd_unit *disk))
{
struct request *rq;
- int err = 0;
rq = blk_get_request(disk->gd->queue, REQ_OP_DRV_IN, __GFP_RECLAIM);
if (IS_ERR(rq))
return PTR_ERR(rq);
rq->special = func;
-
- err = blk_execute_rq(disk->gd->queue, disk->gd, rq, 0);
-
+ blk_execute_rq(disk->gd->queue, disk->gd, rq, 0);
blk_put_request(rq);
- return err;
+ return 0;
}
/* kernel glue structures */
@@ -839,7 +856,13 @@ static void pd_probe_drive(struct pd_unit *disk)
p->first_minor = (disk - pd) << PD_BITS;
disk->gd = p;
p->private_data = disk;
- p->queue = pd_queue;
+ p->queue = blk_init_queue(do_pd_request, &pd_lock);
+ if (!p->queue) {
+ disk->gd = NULL;
+ put_disk(p);
+ return;
+ }
+ blk_queue_max_hw_sectors(p->queue, cluster);
if (disk->drive == -1) {
for (disk->drive = 0; disk->drive <= 1; disk->drive++)
@@ -919,26 +942,18 @@ static int __init pd_init(void)
if (disable)
goto out1;
- pd_queue = blk_init_queue(do_pd_request, &pd_lock);
- if (!pd_queue)
- goto out1;
-
- blk_queue_max_hw_sectors(pd_queue, cluster);
-
if (register_blkdev(major, name))
- goto out2;
+ goto out1;
printk("%s: %s version %s, major %d, cluster %d, nice %d\n",
name, name, PD_VERSION, major, cluster, nice);
if (!pd_detect())
- goto out3;
+ goto out2;
return 0;
-out3:
- unregister_blkdev(major, name);
out2:
- blk_cleanup_queue(pd_queue);
+ unregister_blkdev(major, name);
out1:
return -ENODEV;
}
@@ -953,11 +968,11 @@ static void __exit pd_exit(void)
if (p) {
disk->gd = NULL;
del_gendisk(p);
+ blk_cleanup_queue(p->queue);
put_disk(p);
pi_release(disk->pi);
}
}
- blk_cleanup_queue(pd_queue);
}
MODULE_LICENSE("GPL");
diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c
index 14c5d32f5d8b..f24ca7315ddc 100644
--- a/drivers/block/paride/pf.c
+++ b/drivers/block/paride/pf.c
@@ -287,6 +287,12 @@ static void __init pf_init_units(void)
struct gendisk *disk = alloc_disk(1);
if (!disk)
continue;
+ disk->queue = blk_init_queue(do_pf_request, &pf_spin_lock);
+ if (!disk->queue) {
+ put_disk(disk);
+ return;
+ }
+ blk_queue_max_segments(disk->queue, cluster);
pf->disk = disk;
pf->pi = &pf->pia;
pf->media_status = PF_NM;
@@ -772,7 +778,28 @@ static int pf_ready(void)
return (((status_reg(pf_current) & (STAT_BUSY | pf_mask)) == pf_mask));
}
-static struct request_queue *pf_queue;
+static int pf_queue;
+
+static int set_next_request(void)
+{
+ struct pf_unit *pf;
+ struct request_queue *q;
+ int old_pos = pf_queue;
+
+ do {
+ pf = &units[pf_queue];
+ q = pf->present ? pf->disk->queue : NULL;
+ if (++pf_queue == PF_UNITS)
+ pf_queue = 0;
+ if (q) {
+ pf_req = blk_fetch_request(q);
+ if (pf_req)
+ break;
+ }
+ } while (pf_queue != old_pos);
+
+ return pf_req != NULL;
+}
static void pf_end_request(int err)
{
@@ -780,16 +807,13 @@ static void pf_end_request(int err)
pf_req = NULL;
}
-static void do_pf_request(struct request_queue * q)
+static void pf_request(void)
{
if (pf_busy)
return;
repeat:
- if (!pf_req) {
- pf_req = blk_fetch_request(q);
- if (!pf_req)
- return;
- }
+ if (!pf_req && !set_next_request())
+ return;
pf_current = pf_req->rq_disk->private_data;
pf_block = blk_rq_pos(pf_req);
@@ -817,6 +841,11 @@ repeat:
}
}
+static void do_pf_request(struct request_queue *q)
+{
+ pf_request();
+}
+
static int pf_next_buf(void)
{
unsigned long saved_flags;
@@ -846,7 +875,7 @@ static inline void next_request(int err)
spin_lock_irqsave(&pf_spin_lock, saved_flags);
pf_end_request(err);
pf_busy = 0;
- do_pf_request(pf_queue);
+ pf_request();
spin_unlock_irqrestore(&pf_spin_lock, saved_flags);
}
@@ -972,15 +1001,6 @@ static int __init pf_init(void)
put_disk(pf->disk);
return -EBUSY;
}
- pf_queue = blk_init_queue(do_pf_request, &pf_spin_lock);
- if (!pf_queue) {
- unregister_blkdev(major, name);
- for (pf = units, unit = 0; unit < PF_UNITS; pf++, unit++)
- put_disk(pf->disk);
- return -ENOMEM;
- }
-
- blk_queue_max_segments(pf_queue, cluster);
for (pf = units, unit = 0; unit < PF_UNITS; pf++, unit++) {
struct gendisk *disk = pf->disk;
@@ -988,7 +1008,6 @@ static int __init pf_init(void)
if (!pf->present)
continue;
disk->private_data = pf;
- disk->queue = pf_queue;
add_disk(disk);
}
return 0;
@@ -1003,10 +1022,10 @@ static void __exit pf_exit(void)
if (!pf->present)
continue;
del_gendisk(pf->disk);
+ blk_cleanup_queue(pf->disk->queue);
put_disk(pf->disk);
pi_release(pf->pi);
}
- blk_cleanup_queue(pf_queue);
}
MODULE_LICENSE("GPL");
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 66d846ba85a9..205b865ebeb9 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -724,7 +724,7 @@ static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command *
rq->rq_flags |= RQF_QUIET;
blk_execute_rq(rq->q, pd->bdev->bd_disk, rq, 0);
- if (rq->errors)
+ if (scsi_req(rq)->result)
ret = -EIO;
out:
blk_put_request(rq);
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 517838b65964..089ac4179919 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -4317,7 +4317,7 @@ static int rbd_init_request(void *data, struct request *rq,
return 0;
}
-static struct blk_mq_ops rbd_mq_ops = {
+static const struct blk_mq_ops rbd_mq_ops = {
.queue_rq = rbd_queue_rq,
.init_request = rbd_init_request,
};
@@ -4380,7 +4380,6 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
q->limits.discard_granularity = segment_size;
q->limits.discard_alignment = segment_size;
blk_queue_max_discard_sectors(q, segment_size / SECTOR_SIZE);
- q->limits.discard_zeroes_data = 1;
if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC))
q->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES;
diff --git a/drivers/block/rsxx/dev.c b/drivers/block/rsxx/dev.c
index f81d70b39d10..9c566364ac9c 100644
--- a/drivers/block/rsxx/dev.c
+++ b/drivers/block/rsxx/dev.c
@@ -300,7 +300,6 @@ int rsxx_setup_dev(struct rsxx_cardinfo *card)
RSXX_HW_BLK_SIZE >> 9);
card->queue->limits.discard_granularity = RSXX_HW_BLK_SIZE;
card->queue->limits.discard_alignment = RSXX_HW_BLK_SIZE;
- card->queue->limits.discard_zeroes_data = 1;
}
card->queue->queuedata = card;
diff --git a/drivers/block/swim.c b/drivers/block/swim.c
index b5afd495d482..3064be6cf375 100644
--- a/drivers/block/swim.c
+++ b/drivers/block/swim.c
@@ -211,7 +211,7 @@ enum head {
struct swim_priv {
struct swim __iomem *base;
spinlock_t lock;
- struct request_queue *queue;
+ int fdc_queue;
int floppy_count;
struct floppy_state unit[FD_MAX_UNIT];
};
@@ -525,12 +525,33 @@ static int floppy_read_sectors(struct floppy_state *fs,
return 0;
}
-static void redo_fd_request(struct request_queue *q)
+static struct request *swim_next_request(struct swim_priv *swd)
{
+ struct request_queue *q;
+ struct request *rq;
+ int old_pos = swd->fdc_queue;
+
+ do {
+ q = swd->unit[swd->fdc_queue].disk->queue;
+ if (++swd->fdc_queue == swd->floppy_count)
+ swd->fdc_queue = 0;
+ if (q) {
+ rq = blk_fetch_request(q);
+ if (rq)
+ return rq;
+ }
+ } while (swd->fdc_queue != old_pos);
+
+ return NULL;
+}
+
+static void do_fd_request(struct request_queue *q)
+{
+ struct swim_priv *swd = q->queuedata;
struct request *req;
struct floppy_state *fs;
- req = blk_fetch_request(q);
+ req = swim_next_request(swd);
while (req) {
int err = -EIO;
@@ -554,15 +575,10 @@ static void redo_fd_request(struct request_queue *q)
}
done:
if (!__blk_end_request_cur(req, err))
- req = blk_fetch_request(q);
+ req = swim_next_request(swd);
}
}
-static void do_fd_request(struct request_queue *q)
-{
- redo_fd_request(q);
-}
-
static struct floppy_struct floppy_type[4] = {
{ 0, 0, 0, 0, 0, 0x00, 0x00, 0x00, 0x00, NULL }, /* no testing */
{ 720, 9, 1, 80, 0, 0x2A, 0x02, 0xDF, 0x50, NULL }, /* 360KB SS 3.5"*/
@@ -833,22 +849,25 @@ static int swim_floppy_init(struct swim_priv *swd)
return -EBUSY;
}
+ spin_lock_init(&swd->lock);
+
for (drive = 0; drive < swd->floppy_count; drive++) {
swd->unit[drive].disk = alloc_disk(1);
if (swd->unit[drive].disk == NULL) {
err = -ENOMEM;
goto exit_put_disks;
}
+ swd->unit[drive].disk->queue = blk_init_queue(do_fd_request,
+ &swd->lock);
+ if (!swd->unit[drive].disk->queue) {
+ err = -ENOMEM;
+ put_disk(swd->unit[drive].disk);
+ goto exit_put_disks;
+ }
+ swd->unit[drive].disk->queue->queuedata = swd;
swd->unit[drive].swd = swd;
}
- spin_lock_init(&swd->lock);
- swd->queue = blk_init_queue(do_fd_request, &swd->lock);
- if (!swd->queue) {
- err = -ENOMEM;
- goto exit_put_disks;
- }
-
for (drive = 0; drive < swd->floppy_count; drive++) {
swd->unit[drive].disk->flags = GENHD_FL_REMOVABLE;
swd->unit[drive].disk->major = FLOPPY_MAJOR;
@@ -856,7 +875,6 @@ static int swim_floppy_init(struct swim_priv *swd)
sprintf(swd->unit[drive].disk->disk_name, "fd%d", drive);
swd->unit[drive].disk->fops = &floppy_fops;
swd->unit[drive].disk->private_data = &swd->unit[drive];
- swd->unit[drive].disk->queue = swd->queue;
set_capacity(swd->unit[drive].disk, 2880);
add_disk(swd->unit[drive].disk);
}
@@ -943,13 +961,12 @@ static int swim_remove(struct platform_device *dev)
for (drive = 0; drive < swd->floppy_count; drive++) {
del_gendisk(swd->unit[drive].disk);
+ blk_cleanup_queue(swd->unit[drive].disk->queue);
put_disk(swd->unit[drive].disk);
}
unregister_blkdev(FLOPPY_MAJOR, "fd");
- blk_cleanup_queue(swd->queue);
-
/* eject floppies */
for (drive = 0; drive < swd->floppy_count; drive++)
diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c
index 61b3ffa4f458..ba4809c9bdba 100644
--- a/drivers/block/swim3.c
+++ b/drivers/block/swim3.c
@@ -343,8 +343,8 @@ static void start_request(struct floppy_state *fs)
req->rq_disk->disk_name, req->cmd,
(long)blk_rq_pos(req), blk_rq_sectors(req),
bio_data(req->bio));
- swim3_dbg(" errors=%d current_nr_sectors=%u\n",
- req->errors, blk_rq_cur_sectors(req));
+ swim3_dbg(" current_nr_sectors=%u\n",
+ blk_rq_cur_sectors(req));
#endif
if (blk_rq_pos(req) >= fs->total_secs) {
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 1d4c9f8bc1e1..f94614257462 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -111,7 +111,7 @@ static int virtblk_add_req_scsi(struct virtqueue *vq, struct virtblk_req *vbr,
return virtqueue_add_sgs(vq, sgs, num_out, num_in, vbr, GFP_ATOMIC);
}
-static inline void virtblk_scsi_reques_done(struct request *req)
+static inline void virtblk_scsi_request_done(struct request *req)
{
struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
struct virtio_blk *vblk = req->q->queuedata;
@@ -119,7 +119,7 @@ static inline void virtblk_scsi_reques_done(struct request *req)
sreq->resid_len = virtio32_to_cpu(vblk->vdev, vbr->in_hdr.residual);
sreq->sense_len = virtio32_to_cpu(vblk->vdev, vbr->in_hdr.sense_len);
- req->errors = virtio32_to_cpu(vblk->vdev, vbr->in_hdr.errors);
+ sreq->result = virtio32_to_cpu(vblk->vdev, vbr->in_hdr.errors);
}
static int virtblk_ioctl(struct block_device *bdev, fmode_t mode,
@@ -144,7 +144,7 @@ static inline int virtblk_add_req_scsi(struct virtqueue *vq,
{
return -EIO;
}
-static inline void virtblk_scsi_reques_done(struct request *req)
+static inline void virtblk_scsi_request_done(struct request *req)
{
}
#define virtblk_ioctl NULL
@@ -175,19 +175,15 @@ static int virtblk_add_req(struct virtqueue *vq, struct virtblk_req *vbr,
static inline void virtblk_request_done(struct request *req)
{
struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
- int error = virtblk_result(vbr);
switch (req_op(req)) {
case REQ_OP_SCSI_IN:
case REQ_OP_SCSI_OUT:
- virtblk_scsi_reques_done(req);
- break;
- case REQ_OP_DRV_IN:
- req->errors = (error != 0);
+ virtblk_scsi_request_done(req);
break;
}
- blk_mq_end_request(req, error);
+ blk_mq_end_request(req, virtblk_result(vbr));
}
static void virtblk_done(struct virtqueue *vq)
@@ -205,7 +201,7 @@ static void virtblk_done(struct virtqueue *vq)
while ((vbr = virtqueue_get_buf(vblk->vqs[qid].vq, &len)) != NULL) {
struct request *req = blk_mq_rq_from_pdu(vbr);
- blk_mq_complete_request(req, req->errors);
+ blk_mq_complete_request(req);
req_done = true;
}
if (unlikely(virtqueue_is_broken(vq)))
@@ -310,7 +306,8 @@ static int virtblk_get_id(struct gendisk *disk, char *id_str)
if (err)
goto out;
- err = blk_execute_rq(vblk->disk->queue, vblk->disk, req, false);
+ blk_execute_rq(vblk->disk->queue, vblk->disk, req, false);
+ err = virtblk_result(blk_mq_rq_to_pdu(req));
out:
blk_put_request(req);
return err;
@@ -597,7 +594,7 @@ static int virtblk_map_queues(struct blk_mq_tag_set *set)
return blk_mq_virtio_map_queues(set, vblk->vdev, 0);
}
-static struct blk_mq_ops virtio_mq_ops = {
+static const struct blk_mq_ops virtio_mq_ops = {
.queue_rq = virtio_queue_rq,
.complete = virtblk_request_done,
.init_request = virtblk_init_request,
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 5067a0a952cb..39459631667c 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -115,6 +115,15 @@ struct split_bio {
atomic_t pending;
};
+struct blkif_req {
+ int error;
+};
+
+static inline struct blkif_req *blkif_req(struct request *rq)
+{
+ return blk_mq_rq_to_pdu(rq);
+}
+
static DEFINE_MUTEX(blkfront_mutex);
static const struct block_device_operations xlvbd_block_fops;
@@ -907,8 +916,14 @@ out_busy:
return BLK_MQ_RQ_QUEUE_BUSY;
}
-static struct blk_mq_ops blkfront_mq_ops = {
+static void blkif_complete_rq(struct request *rq)
+{
+ blk_mq_end_request(rq, blkif_req(rq)->error);
+}
+
+static const struct blk_mq_ops blkfront_mq_ops = {
.queue_rq = blkif_queue_rq,
+ .complete = blkif_complete_rq,
};
static void blkif_set_queue_limits(struct blkfront_info *info)
@@ -969,7 +984,7 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size,
info->tag_set.queue_depth = BLK_RING_SIZE(info);
info->tag_set.numa_node = NUMA_NO_NODE;
info->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
- info->tag_set.cmd_size = 0;
+ info->tag_set.cmd_size = sizeof(struct blkif_req);
info->tag_set.driver_data = info;
if (blk_mq_alloc_tag_set(&info->tag_set))
@@ -1543,7 +1558,6 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
unsigned long flags;
struct blkfront_ring_info *rinfo = (struct blkfront_ring_info *)dev_id;
struct blkfront_info *info = rinfo->dev_info;
- int error;
if (unlikely(info->connected != BLKIF_STATE_CONNECTED))
return IRQ_HANDLED;
@@ -1587,37 +1601,36 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
continue;
}
- error = (bret->status == BLKIF_RSP_OKAY) ? 0 : -EIO;
+ blkif_req(req)->error = (bret->status == BLKIF_RSP_OKAY) ? 0 : -EIO;
switch (bret->operation) {
case BLKIF_OP_DISCARD:
if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
struct request_queue *rq = info->rq;
printk(KERN_WARNING "blkfront: %s: %s op failed\n",
info->gd->disk_name, op_name(bret->operation));
- error = -EOPNOTSUPP;
+ blkif_req(req)->error = -EOPNOTSUPP;
info->feature_discard = 0;
info->feature_secdiscard = 0;
queue_flag_clear(QUEUE_FLAG_DISCARD, rq);
queue_flag_clear(QUEUE_FLAG_SECERASE, rq);
}
- blk_mq_complete_request(req, error);
break;
case BLKIF_OP_FLUSH_DISKCACHE:
case BLKIF_OP_WRITE_BARRIER:
if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
printk(KERN_WARNING "blkfront: %s: %s op failed\n",
info->gd->disk_name, op_name(bret->operation));
- error = -EOPNOTSUPP;
+ blkif_req(req)->error = -EOPNOTSUPP;
}
if (unlikely(bret->status == BLKIF_RSP_ERROR &&
rinfo->shadow[id].req.u.rw.nr_segments == 0)) {
printk(KERN_WARNING "blkfront: %s: empty %s op failed\n",
info->gd->disk_name, op_name(bret->operation));
- error = -EOPNOTSUPP;
+ blkif_req(req)->error = -EOPNOTSUPP;
}
- if (unlikely(error)) {
- if (error == -EOPNOTSUPP)
- error = 0;
+ if (unlikely(blkif_req(req)->error)) {
+ if (blkif_req(req)->error == -EOPNOTSUPP)
+ blkif_req(req)->error = 0;
info->feature_fua = 0;
info->feature_flush = 0;
xlvbd_flush(info);
@@ -1629,11 +1642,12 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
dev_dbg(&info->xbdev->dev, "Bad return from blkdev data "
"request: %x\n", bret->status);
- blk_mq_complete_request(req, error);
break;
default:
BUG();
}
+
+ blk_mq_complete_request(req);
}
rinfo->ring.rsp_cons = i;
@@ -2345,6 +2359,7 @@ static void blkfront_connect(struct blkfront_info *info)
unsigned long sector_size;
unsigned int physical_sector_size;
unsigned int binfo;
+ char *envp[] = { "RESIZE=1", NULL };
int err, i;
switch (info->connected) {
@@ -2361,6 +2376,8 @@ static void blkfront_connect(struct blkfront_info *info)
sectors);
set_capacity(info->gd, sectors);
revalidate_disk(info->gd);
+ kobject_uevent_env(&disk_to_dev(info->gd)->kobj,
+ KOBJ_CHANGE, envp);
return;
case BLKIF_STATE_SUSPENDED:
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 0c09d4256108..6fac5fedd610 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -829,10 +829,14 @@ static void __zram_make_request(struct zram *zram, struct bio *bio)
offset = (bio->bi_iter.bi_sector &
(SECTORS_PER_PAGE - 1)) << SECTOR_SHIFT;
- if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) {
+ switch (bio_op(bio)) {
+ case REQ_OP_DISCARD:
+ case REQ_OP_WRITE_ZEROES:
zram_bio_discard(zram, index, offset, bio);
bio_endio(bio);
return;
+ default:
+ break;
}
bio_for_each_segment(bvec, bio, iter) {
@@ -1192,6 +1196,8 @@ static int zram_add(void)
zram->disk->queue->limits.max_sectors = SECTORS_PER_PAGE;
zram->disk->queue->limits.chunk_sectors = 0;
blk_queue_max_discard_sectors(zram->disk->queue, UINT_MAX);
+ queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, zram->disk->queue);
+
/*
* zram_bio_discard() will clear all logical blocks if logical block
* size is identical with physical block size(PAGE_SIZE). But if it is
@@ -1201,10 +1207,7 @@ static int zram_add(void)
* zeroed.
*/
if (ZRAM_LOGICAL_BLOCK_SIZE == PAGE_SIZE)
- zram->disk->queue->limits.discard_zeroes_data = 1;
- else
- zram->disk->queue->limits.discard_zeroes_data = 0;
- queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, zram->disk->queue);
+ blk_queue_max_write_zeroes_sectors(zram->disk->queue, UINT_MAX);
add_disk(zram->disk);
diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c
index 87739649eac2..76c952fd9ab9 100644
--- a/drivers/cdrom/cdrom.c
+++ b/drivers/cdrom/cdrom.c
@@ -2218,7 +2218,8 @@ static int cdrom_read_cdda_bpc(struct cdrom_device_info *cdi, __u8 __user *ubuf,
rq->timeout = 60 * HZ;
bio = rq->bio;
- if (blk_execute_rq(q, cdi->disk, rq, 0)) {
+ blk_execute_rq(q, cdi->disk, rq, 0);
+ if (scsi_req(rq)->result) {
struct request_sense *s = req->sense;
ret = -EIO;
cdi->last_sense = s->sense_key;
diff --git a/drivers/char/ipmi/bt-bmc.c b/drivers/char/ipmi/bt-bmc.c
index d6f5d9eb102d..70d434bc1cbf 100644
--- a/drivers/char/ipmi/bt-bmc.c
+++ b/drivers/char/ipmi/bt-bmc.c
@@ -523,6 +523,7 @@ static int bt_bmc_remove(struct platform_device *pdev)
static const struct of_device_id bt_bmc_match[] = {
{ .compatible = "aspeed,ast2400-ibt-bmc" },
+ { .compatible = "aspeed,ast2500-ibt-bmc" },
{ },
};
diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c
index 2a7c425ddfa7..b2b618f066e0 100644
--- a/drivers/char/ipmi/ipmi_si_intf.c
+++ b/drivers/char/ipmi/ipmi_si_intf.c
@@ -1954,7 +1954,9 @@ static int hotmod_handler(const char *val, struct kernel_param *kp)
kfree(info);
goto out;
}
+ mutex_lock(&smi_infos_lock);
rv = try_smi_init(info);
+ mutex_unlock(&smi_infos_lock);
if (rv) {
cleanup_one_si(info);
goto out;
@@ -2042,8 +2044,10 @@ static int hardcode_find_bmc(void)
info->slave_addr = slave_addrs[i];
if (!add_smi(info)) {
+ mutex_lock(&smi_infos_lock);
if (try_smi_init(info))
cleanup_one_si(info);
+ mutex_unlock(&smi_infos_lock);
ret = 0;
} else {
kfree(info);
@@ -3492,6 +3496,11 @@ out_err:
return rv;
}
+/*
+ * Try to start up an interface. Must be called with smi_infos_lock
+ * held, primarily to keep smi_num consistent, we only one to do these
+ * one at a time.
+ */
static int try_smi_init(struct smi_info *new_smi)
{
int rv = 0;
@@ -3524,9 +3533,12 @@ static int try_smi_init(struct smi_info *new_smi)
goto out_err;
}
+ new_smi->intf_num = smi_num;
+
/* Do this early so it's available for logs. */
if (!new_smi->dev) {
- init_name = kasprintf(GFP_KERNEL, "ipmi_si.%d", 0);
+ init_name = kasprintf(GFP_KERNEL, "ipmi_si.%d",
+ new_smi->intf_num);
/*
* If we don't already have a device from something
@@ -3593,8 +3605,6 @@ static int try_smi_init(struct smi_info *new_smi)
new_smi->interrupt_disabled = true;
atomic_set(&new_smi->need_watch, 0);
- new_smi->intf_num = smi_num;
- smi_num++;
rv = try_enable_event_buffer(new_smi);
if (rv == 0)
@@ -3661,6 +3671,9 @@ static int try_smi_init(struct smi_info *new_smi)
goto out_err_stop_timer;
}
+ /* Don't increment till we know we have succeeded. */
+ smi_num++;
+
dev_info(new_smi->dev, "IPMI %s interface initialized\n",
si_to_str[new_smi->si_type]);
diff --git a/drivers/char/ipmi/ipmi_ssif.c b/drivers/char/ipmi/ipmi_ssif.c
index cca6e5bc1cea..0b22a9be5029 100644
--- a/drivers/char/ipmi/ipmi_ssif.c
+++ b/drivers/char/ipmi/ipmi_ssif.c
@@ -891,6 +891,7 @@ static void msg_written_handler(struct ssif_info *ssif_info, int result,
* for details on the intricacies of this.
*/
int left;
+ unsigned char *data_to_send;
ssif_inc_stat(ssif_info, sent_messages_parts);
@@ -899,6 +900,7 @@ static void msg_written_handler(struct ssif_info *ssif_info, int result,
left = 32;
/* Length byte. */
ssif_info->multi_data[ssif_info->multi_pos] = left;
+ data_to_send = ssif_info->multi_data + ssif_info->multi_pos;
ssif_info->multi_pos += left;
if (left < 32)
/*
@@ -912,7 +914,7 @@ static void msg_written_handler(struct ssif_info *ssif_info, int result,
rv = ssif_i2c_send(ssif_info, msg_written_handler,
I2C_SMBUS_WRITE,
SSIF_IPMI_MULTI_PART_REQUEST_MIDDLE,
- ssif_info->multi_data + ssif_info->multi_pos,
+ data_to_send,
I2C_SMBUS_BLOCK_DATA);
if (rv < 0) {
/* request failed, just return the error. */
@@ -1642,9 +1644,8 @@ static int ssif_probe(struct i2c_client *client, const struct i2c_device_id *id)
spin_lock_init(&ssif_info->lock);
ssif_info->ssif_state = SSIF_NORMAL;
- init_timer(&ssif_info->retry_timer);
- ssif_info->retry_timer.data = (unsigned long) ssif_info;
- ssif_info->retry_timer.function = retry_timeout;
+ setup_timer(&ssif_info->retry_timer, retry_timeout,
+ (unsigned long)ssif_info);
for (i = 0; i < SSIF_NUM_STATS; i++)
atomic_set(&ssif_info->stats[i], 0);
diff --git a/drivers/char/ipmi/ipmi_watchdog.c b/drivers/char/ipmi/ipmi_watchdog.c
index 5ca24d9b101b..d165af8abe36 100644
--- a/drivers/char/ipmi/ipmi_watchdog.c
+++ b/drivers/char/ipmi/ipmi_watchdog.c
@@ -516,7 +516,7 @@ static void panic_halt_ipmi_heartbeat(void)
msg.cmd = IPMI_WDOG_RESET_TIMER;
msg.data = NULL;
msg.data_len = 0;
- atomic_add(2, &panic_done_count);
+ atomic_add(1, &panic_done_count);
rv = ipmi_request_supply_msgs(watchdog_user,
(struct ipmi_addr *) &addr,
0,
@@ -526,7 +526,7 @@ static void panic_halt_ipmi_heartbeat(void)
&panic_halt_heartbeat_recv_msg,
1);
if (rv)
- atomic_sub(2, &panic_done_count);
+ atomic_sub(1, &panic_done_count);
}
static struct ipmi_smi_msg panic_halt_smi_msg = {
@@ -550,12 +550,12 @@ static void panic_halt_ipmi_set_timeout(void)
/* Wait for the messages to be free. */
while (atomic_read(&panic_done_count) != 0)
ipmi_poll_interface(watchdog_user);
- atomic_add(2, &panic_done_count);
+ atomic_add(1, &panic_done_count);
rv = i_ipmi_set_timeout(&panic_halt_smi_msg,
&panic_halt_recv_msg,
&send_heartbeat_now);
if (rv) {
- atomic_sub(2, &panic_done_count);
+ atomic_sub(1, &panic_done_count);
printk(KERN_WARNING PFX
"Unable to extend the watchdog timeout.");
} else {
diff --git a/drivers/clk/sunxi-ng/Kconfig b/drivers/clk/sunxi-ng/Kconfig
index 1c2357301017..a077ab6edffa 100644
--- a/drivers/clk/sunxi-ng/Kconfig
+++ b/drivers/clk/sunxi-ng/Kconfig
@@ -16,7 +16,7 @@ config SUNXI_CCU_FRAC
bool
config SUNXI_CCU_GATE
- bool
+ def_bool y
config SUNXI_CCU_MUX
bool
diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig
index 4773f2867234..96afb2aeed18 100644
--- a/drivers/edac/Kconfig
+++ b/drivers/edac/Kconfig
@@ -10,26 +10,16 @@ config EDAC_SUPPORT
bool
menuconfig EDAC
- bool "EDAC (Error Detection And Correction) reporting"
- depends on HAS_IOMEM && EDAC_SUPPORT
+ tristate "EDAC (Error Detection And Correction) reporting"
+ depends on HAS_IOMEM && EDAC_SUPPORT && RAS
help
- EDAC is designed to report errors in the core system.
- These are low-level errors that are reported in the CPU or
- supporting chipset or other subsystems:
+ EDAC is a subsystem along with hardware-specific drivers designed to
+ report hardware errors. These are low-level errors that are reported
+ in the CPU or supporting chipset or other subsystems:
memory errors, cache errors, PCI errors, thermal throttling, etc..
If unsure, select 'Y'.
- If this code is reporting problems on your system, please
- see the EDAC project web pages for more information at:
-
- <http://bluesmoke.sourceforge.net/>
-
- and:
-
- <http://buttersideup.com/edacwiki>
-
- There is also a mailing list for the EDAC project, which can
- be found via the sourceforge page.
+ The mailing list for the EDAC project is linux-edac@vger.kernel.org.
if EDAC
@@ -62,21 +52,9 @@ config EDAC_DECODE_MCE
which occur really early upon boot, before the module infrastructure
has been initialized.
-config EDAC_MM_EDAC
- tristate "Main Memory EDAC (Error Detection And Correction) reporting"
- select RAS
- help
- Some systems are able to detect and correct errors in main
- memory. EDAC can report statistics on memory error
- detection and correction (EDAC - or commonly referred to ECC
- errors). EDAC will also try to decode where these errors
- occurred so that a particular failing memory module can be
- replaced. If unsure, select 'Y'.
-
config EDAC_GHES
bool "Output ACPI APEI/GHES BIOS detected errors via EDAC"
- depends on ACPI_APEI_GHES && (EDAC_MM_EDAC=y)
- default y
+ depends on ACPI_APEI_GHES && (EDAC=y)
help
Not all machines support hardware-driven error report. Some of those
provide a BIOS-driven error report mechanism via ACPI, using the
@@ -98,7 +76,7 @@ config EDAC_GHES
config EDAC_AMD64
tristate "AMD64 (Opteron, Athlon64)"
- depends on EDAC_MM_EDAC && AMD_NB && EDAC_DECODE_MCE
+ depends on AMD_NB && EDAC_DECODE_MCE
help
Support for error detection and correction of DRAM ECC errors on
the AMD64 families (>= K8) of memory controllers.
@@ -124,28 +102,28 @@ config EDAC_AMD64_ERROR_INJECTION
config EDAC_AMD76X
tristate "AMD 76x (760, 762, 768)"
- depends on EDAC_MM_EDAC && PCI && X86_32
+ depends on PCI && X86_32
help
Support for error detection and correction on the AMD 76x
series of chipsets used with the Athlon processor.
config EDAC_E7XXX
tristate "Intel e7xxx (e7205, e7500, e7501, e7505)"
- depends on EDAC_MM_EDAC && PCI && X86_32
+ depends on PCI && X86_32
help
Support for error detection and correction on the Intel
E7205, E7500, E7501 and E7505 server chipsets.
config EDAC_E752X
tristate "Intel e752x (e7520, e7525, e7320) and 3100"
- depends on EDAC_MM_EDAC && PCI && X86
+ depends on PCI && X86
help
Support for error detection and correction on the Intel
E7520, E7525, E7320 server chipsets.
config EDAC_I82443BXGX
tristate "Intel 82443BX/GX (440BX/GX)"
- depends on EDAC_MM_EDAC && PCI && X86_32
+ depends on PCI && X86_32
depends on BROKEN
help
Support for error detection and correction on the Intel
@@ -153,56 +131,56 @@ config EDAC_I82443BXGX
config EDAC_I82875P
tristate "Intel 82875p (D82875P, E7210)"
- depends on EDAC_MM_EDAC && PCI && X86_32
+ depends on PCI && X86_32
help
Support for error detection and correction on the Intel
DP82785P and E7210 server chipsets.
config EDAC_I82975X
tristate "Intel 82975x (D82975x)"
- depends on EDAC_MM_EDAC && PCI && X86
+ depends on PCI && X86
help
Support for error detection and correction on the Intel
DP82975x server chipsets.
config EDAC_I3000
tristate "Intel 3000/3010"
- depends on EDAC_MM_EDAC && PCI && X86
+ depends on PCI && X86
help
Support for error detection and correction on the Intel
3000 and 3010 server chipsets.
config EDAC_I3200
tristate "Intel 3200"
- depends on EDAC_MM_EDAC && PCI && X86
+ depends on PCI && X86
help
Support for error detection and correction on the Intel
3200 and 3210 server chipsets.
config EDAC_IE31200
tristate "Intel e312xx"
- depends on EDAC_MM_EDAC && PCI && X86
+ depends on PCI && X86
help
Support for error detection and correction on the Intel
E3-1200 based DRAM controllers.
config EDAC_X38
tristate "Intel X38"
- depends on EDAC_MM_EDAC && PCI && X86
+ depends on PCI && X86
help
Support for error detection and correction on the Intel
X38 server chipsets.
config EDAC_I5400
tristate "Intel 5400 (Seaburg) chipsets"
- depends on EDAC_MM_EDAC && PCI && X86
+ depends on PCI && X86
help
Support for error detection and correction the Intel
i5400 MCH chipset (Seaburg).
config EDAC_I7CORE
tristate "Intel i7 Core (Nehalem) processors"
- depends on EDAC_MM_EDAC && PCI && X86 && X86_MCE_INTEL
+ depends on PCI && X86 && X86_MCE_INTEL
help
Support for error detection and correction the Intel
i7 Core (Nehalem) Integrated Memory Controller that exists on
@@ -211,58 +189,56 @@ config EDAC_I7CORE
config EDAC_I82860
tristate "Intel 82860"
- depends on EDAC_MM_EDAC && PCI && X86_32
+ depends on PCI && X86_32
help
Support for error detection and correction on the Intel
82860 chipset.
config EDAC_R82600
tristate "Radisys 82600 embedded chipset"
- depends on EDAC_MM_EDAC && PCI && X86_32
+ depends on PCI && X86_32
help
Support for error detection and correction on the Radisys
82600 embedded chipset.
config EDAC_I5000
tristate "Intel Greencreek/Blackford chipset"
- depends on EDAC_MM_EDAC && X86 && PCI
+ depends on X86 && PCI
help
Support for error detection and correction the Intel
Greekcreek/Blackford chipsets.
config EDAC_I5100
tristate "Intel San Clemente MCH"
- depends on EDAC_MM_EDAC && X86 && PCI
+ depends on X86 && PCI
help
Support for error detection and correction the Intel
San Clemente MCH.
config EDAC_I7300
tristate "Intel Clarksboro MCH"
- depends on EDAC_MM_EDAC && X86 && PCI
+ depends on X86 && PCI
help
Support for error detection and correction the Intel
Clarksboro MCH (Intel 7300 chipset).
config EDAC_SBRIDGE
tristate "Intel Sandy-Bridge/Ivy-Bridge/Haswell Integrated MC"
- depends on EDAC_MM_EDAC && PCI && X86_64 && X86_MCE_INTEL
- depends on PCI_MMCONFIG
+ depends on PCI && X86_64 && X86_MCE_INTEL && PCI_MMCONFIG
help
Support for error detection and correction the Intel
Sandy Bridge, Ivy Bridge and Haswell Integrated Memory Controllers.
config EDAC_SKX
tristate "Intel Skylake server Integrated MC"
- depends on EDAC_MM_EDAC && PCI && X86_64 && X86_MCE_INTEL
- depends on PCI_MMCONFIG
+ depends on PCI && X86_64 && X86_MCE_INTEL && PCI_MMCONFIG
help
Support for error detection and correction the Intel
Skylake server Integrated Memory Controllers.
config EDAC_PND2
tristate "Intel Pondicherry2"
- depends on EDAC_MM_EDAC && PCI && X86_64 && X86_MCE_INTEL
+ depends on PCI && X86_64 && X86_MCE_INTEL
help
Support for error detection and correction on the Intel
Pondicherry2 Integrated Memory Controller. This SoC IP is
@@ -271,36 +247,35 @@ config EDAC_PND2
config EDAC_MPC85XX
tristate "Freescale MPC83xx / MPC85xx"
- depends on EDAC_MM_EDAC && FSL_SOC
+ depends on FSL_SOC
help
Support for error detection and correction on the Freescale
MPC8349, MPC8560, MPC8540, MPC8548, T4240
config EDAC_LAYERSCAPE
tristate "Freescale Layerscape DDR"
- depends on EDAC_MM_EDAC && ARCH_LAYERSCAPE
+ depends on ARCH_LAYERSCAPE
help
Support for error detection and correction on Freescale memory
controllers on Layerscape SoCs.
config EDAC_MV64X60
tristate "Marvell MV64x60"
- depends on EDAC_MM_EDAC && MV64X60
+ depends on MV64X60
help
Support for error detection and correction on the Marvell
MV64360 and MV64460 chipsets.
config EDAC_PASEMI
tristate "PA Semi PWRficient"
- depends on EDAC_MM_EDAC && PCI
- depends on PPC_PASEMI
+ depends on PPC_PASEMI && PCI
help
Support for error detection and correction on PA Semi
PWRficient.
config EDAC_CELL
tristate "Cell Broadband Engine memory controller"
- depends on EDAC_MM_EDAC && PPC_CELL_COMMON
+ depends on PPC_CELL_COMMON
help
Support for error detection and correction on the
Cell Broadband Engine internal memory controller
@@ -308,7 +283,7 @@ config EDAC_CELL
config EDAC_PPC4XX
tristate "PPC4xx IBM DDR2 Memory Controller"
- depends on EDAC_MM_EDAC && 4xx
+ depends on 4xx
help
This enables support for EDAC on the ECC memory used
with the IBM DDR2 memory controller found in various
@@ -317,7 +292,7 @@ config EDAC_PPC4XX
config EDAC_AMD8131
tristate "AMD8131 HyperTransport PCI-X Tunnel"
- depends on EDAC_MM_EDAC && PCI && PPC_MAPLE
+ depends on PCI && PPC_MAPLE
help
Support for error detection and correction on the
AMD8131 HyperTransport PCI-X Tunnel chip.
@@ -326,7 +301,7 @@ config EDAC_AMD8131
config EDAC_AMD8111
tristate "AMD8111 HyperTransport I/O Hub"
- depends on EDAC_MM_EDAC && PCI && PPC_MAPLE
+ depends on PCI && PPC_MAPLE
help
Support for error detection and correction on the
AMD8111 HyperTransport I/O Hub chip.
@@ -335,7 +310,7 @@ config EDAC_AMD8111
config EDAC_CPC925
tristate "IBM CPC925 Memory Controller (PPC970FX)"
- depends on EDAC_MM_EDAC && PPC64
+ depends on PPC64
help
Support for error detection and correction on the
IBM CPC925 Bridge and Memory Controller, which is
@@ -344,7 +319,7 @@ config EDAC_CPC925
config EDAC_TILE
tristate "Tilera Memory Controller"
- depends on EDAC_MM_EDAC && TILE
+ depends on TILE
default y
help
Support for error detection and correction on the
@@ -352,49 +327,59 @@ config EDAC_TILE
config EDAC_HIGHBANK_MC
tristate "Highbank Memory Controller"
- depends on EDAC_MM_EDAC && ARCH_HIGHBANK
+ depends on ARCH_HIGHBANK
help
Support for error detection and correction on the
Calxeda Highbank memory controller.
config EDAC_HIGHBANK_L2
tristate "Highbank L2 Cache"
- depends on EDAC_MM_EDAC && ARCH_HIGHBANK
+ depends on ARCH_HIGHBANK
help
Support for error detection and correction on the
Calxeda Highbank memory controller.
config EDAC_OCTEON_PC
tristate "Cavium Octeon Primary Caches"
- depends on EDAC_MM_EDAC && CPU_CAVIUM_OCTEON
+ depends on CPU_CAVIUM_OCTEON
help
Support for error detection and correction on the primary caches of
the cnMIPS cores of Cavium Octeon family SOCs.
config EDAC_OCTEON_L2C
tristate "Cavium Octeon Secondary Caches (L2C)"
- depends on EDAC_MM_EDAC && CAVIUM_OCTEON_SOC
+ depends on CAVIUM_OCTEON_SOC
help
Support for error detection and correction on the
Cavium Octeon family of SOCs.
config EDAC_OCTEON_LMC
tristate "Cavium Octeon DRAM Memory Controller (LMC)"
- depends on EDAC_MM_EDAC && CAVIUM_OCTEON_SOC
+ depends on CAVIUM_OCTEON_SOC
help
Support for error detection and correction on the
Cavium Octeon family of SOCs.
config EDAC_OCTEON_PCI
tristate "Cavium Octeon PCI Controller"
- depends on EDAC_MM_EDAC && PCI && CAVIUM_OCTEON_SOC
+ depends on PCI && CAVIUM_OCTEON_SOC
help
Support for error detection and correction on the
Cavium Octeon family of SOCs.
+config EDAC_THUNDERX
+ tristate "Cavium ThunderX EDAC"
+ depends on ARM64
+ depends on PCI
+ help
+ Support for error detection and correction on the
+ Cavium ThunderX memory controllers (LMC), Cache
+ Coherent Processor Interconnect (CCPI) and L2 cache
+ blocks (TAD, CBC, MCI).
+
config EDAC_ALTERA
bool "Altera SOCFPGA ECC"
- depends on EDAC_MM_EDAC=y && ARCH_SOCFPGA
+ depends on EDAC=y && ARCH_SOCFPGA
help
Support for error detection and correction on the
Altera SOCs. This must be selected for SDRAM ECC.
@@ -460,14 +445,14 @@ config EDAC_ALTERA_SDMMC
config EDAC_SYNOPSYS
tristate "Synopsys DDR Memory Controller"
- depends on EDAC_MM_EDAC && ARCH_ZYNQ
+ depends on ARCH_ZYNQ
help
Support for error detection and correction on the Synopsys DDR
memory controller.
config EDAC_XGENE
tristate "APM X-Gene SoC"
- depends on EDAC_MM_EDAC && (ARM64 || COMPILE_TEST)
+ depends on (ARM64 || COMPILE_TEST)
help
Support for error detection and correction on the
APM X-Gene family of SOCs.
diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile
index 587107e90996..0fd9ffa63299 100644
--- a/drivers/edac/Makefile
+++ b/drivers/edac/Makefile
@@ -6,8 +6,7 @@
# GNU General Public License.
#
-obj-$(CONFIG_EDAC) := edac_stub.o
-obj-$(CONFIG_EDAC_MM_EDAC) += edac_core.o
+obj-$(CONFIG_EDAC) := edac_core.o
edac_core-y := edac_mc.o edac_device.o edac_mc_sysfs.o
edac_core-y += edac_module.o edac_device_sysfs.o wq.o
@@ -67,13 +66,14 @@ obj-$(CONFIG_EDAC_AMD8131) += amd8131_edac.o
obj-$(CONFIG_EDAC_TILE) += tile_edac.o
-obj-$(CONFIG_EDAC_HIGHBANK_MC) += highbank_mc_edac.o
-obj-$(CONFIG_EDAC_HIGHBANK_L2) += highbank_l2_edac.o
+obj-$(CONFIG_EDAC_HIGHBANK_MC) += highbank_mc_edac.o
+obj-$(CONFIG_EDAC_HIGHBANK_L2) += highbank_l2_edac.o
obj-$(CONFIG_EDAC_OCTEON_PC) += octeon_edac-pc.o
obj-$(CONFIG_EDAC_OCTEON_L2C) += octeon_edac-l2c.o
obj-$(CONFIG_EDAC_OCTEON_LMC) += octeon_edac-lmc.o
obj-$(CONFIG_EDAC_OCTEON_PCI) += octeon_edac-pci.o
+obj-$(CONFIG_EDAC_THUNDERX) += thunderx_edac.o
obj-$(CONFIG_EDAC_ALTERA) += altera_edac.o
obj-$(CONFIG_EDAC_SYNOPSYS) += synopsys_edac.o
diff --git a/drivers/edac/altera_edac.c b/drivers/edac/altera_edac.c
index c5a5b91f37f0..7717b094fabb 100644
--- a/drivers/edac/altera_edac.c
+++ b/drivers/edac/altera_edac.c
@@ -1023,13 +1023,23 @@ out:
return ret;
}
+static int socfpga_is_a10(void)
+{
+ return of_machine_is_compatible("altr,socfpga-arria10");
+}
+
static int validate_parent_available(struct device_node *np);
static const struct of_device_id altr_edac_a10_device_of_match[];
static int __init __maybe_unused altr_init_a10_ecc_device_type(char *compat)
{
int irq;
- struct device_node *child, *np = of_find_compatible_node(NULL, NULL,
- "altr,socfpga-a10-ecc-manager");
+ struct device_node *child, *np;
+
+ if (!socfpga_is_a10())
+ return -ENODEV;
+
+ np = of_find_compatible_node(NULL, NULL,
+ "altr,socfpga-a10-ecc-manager");
if (!np) {
edac_printk(KERN_ERR, EDAC_DEVICE, "ECC Manager not found\n");
return -ENODEV;
@@ -1545,8 +1555,12 @@ static const struct edac_device_prv_data a10_sdmmceccb_data = {
static int __init socfpga_init_sdmmc_ecc(void)
{
int rc = -ENODEV;
- struct device_node *child = of_find_compatible_node(NULL, NULL,
- "altr,socfpga-sdmmc-ecc");
+ struct device_node *child;
+
+ if (!socfpga_is_a10())
+ return -ENODEV;
+
+ child = of_find_compatible_node(NULL, NULL, "altr,socfpga-sdmmc-ecc");
if (!child) {
edac_printk(KERN_WARNING, EDAC_DEVICE, "SDMMC node not found\n");
return -ENODEV;
diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
index e5573c56b15e..480072139b7a 100644
--- a/drivers/edac/edac_mc.c
+++ b/drivers/edac/edac_mc.c
@@ -40,6 +40,11 @@
#define edac_atomic_scrub(va, size) do { } while (0)
#endif
+int edac_op_state = EDAC_OPSTATE_INVAL;
+EXPORT_SYMBOL_GPL(edac_op_state);
+
+static int edac_report = EDAC_REPORTING_ENABLED;
+
/* lock to memory controller's control array */
static DEFINE_MUTEX(mem_ctls_mutex);
static LIST_HEAD(mc_devices);
@@ -52,6 +57,65 @@ static void const *edac_mc_owner;
static struct bus_type mc_bus[EDAC_MAX_MCS];
+int edac_get_report_status(void)
+{
+ return edac_report;
+}
+EXPORT_SYMBOL_GPL(edac_get_report_status);
+
+void edac_set_report_status(int new)
+{
+ if (new == EDAC_REPORTING_ENABLED ||
+ new == EDAC_REPORTING_DISABLED ||
+ new == EDAC_REPORTING_FORCE)
+ edac_report = new;
+}
+EXPORT_SYMBOL_GPL(edac_set_report_status);
+
+static int edac_report_set(const char *str, const struct kernel_param *kp)
+{
+ if (!str)
+ return -EINVAL;
+
+ if (!strncmp(str, "on", 2))
+ edac_report = EDAC_REPORTING_ENABLED;
+ else if (!strncmp(str, "off", 3))
+ edac_report = EDAC_REPORTING_DISABLED;
+ else if (!strncmp(str, "force", 5))
+ edac_report = EDAC_REPORTING_FORCE;
+
+ return 0;
+}
+
+static int edac_report_get(char *buffer, const struct kernel_param *kp)
+{
+ int ret = 0;
+
+ switch (edac_report) {
+ case EDAC_REPORTING_ENABLED:
+ ret = sprintf(buffer, "on");
+ break;
+ case EDAC_REPORTING_DISABLED:
+ ret = sprintf(buffer, "off");
+ break;
+ case EDAC_REPORTING_FORCE:
+ ret = sprintf(buffer, "force");
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ return ret;
+}
+
+static const struct kernel_param_ops edac_report_ops = {
+ .set = edac_report_set,
+ .get = edac_report_get,
+};
+
+module_param_cb(edac_report, &edac_report_ops, &edac_report, 0644);
+
unsigned edac_dimm_info_location(struct dimm_info *dimm, char *buf,
unsigned len)
{
@@ -505,22 +569,6 @@ struct mem_ctl_info *find_mci_by_dev(struct device *dev)
EXPORT_SYMBOL_GPL(find_mci_by_dev);
/*
- * handler for EDAC to check if NMI type handler has asserted interrupt
- */
-static int edac_mc_assert_error_check_and_clear(void)
-{
- int old_state;
-
- if (edac_op_state == EDAC_OPSTATE_POLL)
- return 1;
-
- old_state = edac_err_assert;
- edac_err_assert = 0;
-
- return old_state;
-}
-
-/*
* edac_mc_workq_function
* performs the operation scheduled by a workq request
*/
@@ -536,7 +584,7 @@ static void edac_mc_workq_function(struct work_struct *work_req)
return;
}
- if (edac_mc_assert_error_check_and_clear())
+ if (edac_op_state == EDAC_OPSTATE_POLL)
mci->edac_check(mci);
mutex_unlock(&mem_ctls_mutex);
@@ -601,7 +649,6 @@ static int add_mc_to_global_list(struct mem_ctl_info *mci)
}
list_add_tail_rcu(&mci->link, insert_before);
- atomic_inc(&edac_handlers);
return 0;
fail0:
@@ -619,7 +666,6 @@ fail1:
static int del_mc_from_global_list(struct mem_ctl_info *mci)
{
- int handlers = atomic_dec_return(&edac_handlers);
list_del_rcu(&mci->link);
/* these are for safe removal of devices from global list while
@@ -628,7 +674,7 @@ static int del_mc_from_global_list(struct mem_ctl_info *mci)
synchronize_rcu();
INIT_LIST_HEAD(&mci->link);
- return handlers;
+ return list_empty(&mc_devices);
}
struct mem_ctl_info *edac_mc_find(int idx)
@@ -763,7 +809,7 @@ struct mem_ctl_info *edac_mc_del_mc(struct device *dev)
/* mark MCI offline: */
mci->op_state = OP_OFFLINE;
- if (!del_mc_from_global_list(mci))
+ if (del_mc_from_global_list(mci))
edac_mc_owner = NULL;
mutex_unlock(&mem_ctls_mutex);
@@ -1195,10 +1241,13 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
/* Report the error via the trace interface */
grain_bits = fls_long(e->grain) + 1;
- trace_mc_event(type, e->msg, e->label, e->error_count,
- mci->mc_idx, e->top_layer, e->mid_layer, e->low_layer,
- (e->page_frame_number << PAGE_SHIFT) | e->offset_in_page,
- grain_bits, e->syndrome, e->other_detail);
+
+ if (IS_ENABLED(CONFIG_RAS))
+ trace_mc_event(type, e->msg, e->label, e->error_count,
+ mci->mc_idx, e->top_layer, e->mid_layer,
+ e->low_layer,
+ (e->page_frame_number << PAGE_SHIFT) | e->offset_in_page,
+ grain_bits, e->syndrome, e->other_detail);
edac_raw_mc_handle_error(type, mci, e);
}
diff --git a/drivers/edac/edac_stub.c b/drivers/edac/edac_stub.c
deleted file mode 100644
index 952e411f01f2..000000000000
--- a/drivers/edac/edac_stub.c
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * common EDAC components that must be in kernel
- *
- * Author: Dave Jiang <djiang@mvista.com>
- *
- * 2007 (c) MontaVista Software, Inc.
- * 2010 (c) Advanced Micro Devices Inc.
- * Borislav Petkov <bp@alien8.de>
- *
- * This file is licensed under the terms of the GNU General Public
- * License version 2. This program is licensed "as is" without any
- * warranty of any kind, whether express or implied.
- *
- */
-#include <linux/module.h>
-#include <linux/edac.h>
-#include <linux/atomic.h>
-#include <linux/device.h>
-
-int edac_op_state = EDAC_OPSTATE_INVAL;
-EXPORT_SYMBOL_GPL(edac_op_state);
-
-atomic_t edac_handlers = ATOMIC_INIT(0);
-EXPORT_SYMBOL_GPL(edac_handlers);
-
-int edac_err_assert = 0;
-EXPORT_SYMBOL_GPL(edac_err_assert);
-
-int edac_report_status = EDAC_REPORTING_ENABLED;
-EXPORT_SYMBOL_GPL(edac_report_status);
-
-static int __init edac_report_setup(char *str)
-{
- if (!str)
- return -EINVAL;
-
- if (!strncmp(str, "on", 2))
- set_edac_report_status(EDAC_REPORTING_ENABLED);
- else if (!strncmp(str, "off", 3))
- set_edac_report_status(EDAC_REPORTING_DISABLED);
- else if (!strncmp(str, "force", 5))
- set_edac_report_status(EDAC_REPORTING_FORCE);
-
- return 0;
-}
-__setup("edac_report=", edac_report_setup);
-
-/*
- * called to determine if there is an EDAC driver interested in
- * knowing an event (such as NMI) occurred
- */
-int edac_handler_set(void)
-{
- if (edac_op_state == EDAC_OPSTATE_POLL)
- return 0;
-
- return atomic_read(&edac_handlers);
-}
-EXPORT_SYMBOL_GPL(edac_handler_set);
-
-/*
- * handler for NMI type of interrupts to assert error
- */
-void edac_atomic_assert_error(void)
-{
- edac_err_assert++;
-}
-EXPORT_SYMBOL_GPL(edac_atomic_assert_error);
diff --git a/drivers/edac/pnd2_edac.c b/drivers/edac/pnd2_edac.c
index 928e0dba41fc..1cad5a9af8d0 100644
--- a/drivers/edac/pnd2_edac.c
+++ b/drivers/edac/pnd2_edac.c
@@ -1349,7 +1349,7 @@ static int pnd2_mce_check_error(struct notifier_block *nb, unsigned long val, vo
struct dram_addr daddr;
char *type;
- if (get_edac_report_status() == EDAC_REPORTING_DISABLED)
+ if (edac_get_report_status() == EDAC_REPORTING_DISABLED)
return NOTIFY_DONE;
mci = pnd2_mci;
diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c
index a65ea44e3b0b..ea21cb651b3c 100644
--- a/drivers/edac/sb_edac.c
+++ b/drivers/edac/sb_edac.c
@@ -3075,7 +3075,7 @@ static int sbridge_mce_check_error(struct notifier_block *nb, unsigned long val,
struct sbridge_pvt *pvt;
char *type;
- if (get_edac_report_status() == EDAC_REPORTING_DISABLED)
+ if (edac_get_report_status() == EDAC_REPORTING_DISABLED)
return NOTIFY_DONE;
mci = get_mci_for_node_id(mce->socketid);
@@ -3441,7 +3441,7 @@ static int __init sbridge_init(void)
if (rc >= 0) {
mce_register_decode_chain(&sbridge_mce_dec);
- if (get_edac_report_status() == EDAC_REPORTING_DISABLED)
+ if (edac_get_report_status() == EDAC_REPORTING_DISABLED)
sbridge_printk(KERN_WARNING, "Loading driver, error reporting disabled.\n");
return 0;
}
diff --git a/drivers/edac/skx_edac.c b/drivers/edac/skx_edac.c
index 1159dba4671f..64bef6c9cfb4 100644
--- a/drivers/edac/skx_edac.c
+++ b/drivers/edac/skx_edac.c
@@ -971,7 +971,7 @@ static int skx_mce_check_error(struct notifier_block *nb, unsigned long val,
struct mem_ctl_info *mci;
char *type;
- if (get_edac_report_status() == EDAC_REPORTING_DISABLED)
+ if (edac_get_report_status() == EDAC_REPORTING_DISABLED)
return NOTIFY_DONE;
/* ignore unless this is memory related with an address */
diff --git a/drivers/edac/thunderx_edac.c b/drivers/edac/thunderx_edac.c
new file mode 100644
index 000000000000..86d585cb6d32
--- /dev/null
+++ b/drivers/edac/thunderx_edac.c
@@ -0,0 +1,2174 @@
+/*
+ * Cavium ThunderX memory controller kernel module
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright Cavium, Inc. (C) 2015-2017. All rights reserved.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/edac.h>
+#include <linux/interrupt.h>
+#include <linux/string.h>
+#include <linux/stop_machine.h>
+#include <linux/delay.h>
+#include <linux/sizes.h>
+#include <linux/atomic.h>
+#include <linux/bitfield.h>
+#include <linux/circ_buf.h>
+
+#include <asm/page.h>
+
+#include "edac_module.h"
+
+#define phys_to_pfn(phys) (PFN_DOWN(phys))
+
+#define THUNDERX_NODE GENMASK(45, 44)
+
+enum {
+ ERR_CORRECTED = 1,
+ ERR_UNCORRECTED = 2,
+ ERR_UNKNOWN = 3,
+};
+
+#define MAX_SYNDROME_REGS 4
+
+struct error_syndrome {
+ u64 reg[MAX_SYNDROME_REGS];
+};
+
+struct error_descr {
+ int type;
+ u64 mask;
+ char *descr;
+};
+
+static void decode_register(char *str, size_t size,
+ const struct error_descr *descr,
+ const uint64_t reg)
+{
+ int ret = 0;
+
+ while (descr->type && descr->mask && descr->descr) {
+ if (reg & descr->mask) {
+ ret = snprintf(str, size, "\n\t%s, %s",
+ descr->type == ERR_CORRECTED ?
+ "Corrected" : "Uncorrected",
+ descr->descr);
+ str += ret;
+ size -= ret;
+ }
+ descr++;
+ }
+}
+
+static unsigned long get_bits(unsigned long data, int pos, int width)
+{
+ return (data >> pos) & ((1 << width) - 1);
+}
+
+#define L2C_CTL 0x87E080800000
+#define L2C_CTL_DISIDXALIAS BIT(0)
+
+#define PCI_DEVICE_ID_THUNDER_LMC 0xa022
+
+#define LMC_FADR 0x20
+#define LMC_FADR_FDIMM(x) ((x >> 37) & 0x1)
+#define LMC_FADR_FBUNK(x) ((x >> 36) & 0x1)
+#define LMC_FADR_FBANK(x) ((x >> 32) & 0xf)
+#define LMC_FADR_FROW(x) ((x >> 14) & 0xffff)
+#define LMC_FADR_FCOL(x) ((x >> 0) & 0x1fff)
+
+#define LMC_NXM_FADR 0x28
+#define LMC_ECC_SYND 0x38
+
+#define LMC_ECC_PARITY_TEST 0x108
+
+#define LMC_INT_W1S 0x150
+
+#define LMC_INT_ENA_W1C 0x158
+#define LMC_INT_ENA_W1S 0x160
+
+#define LMC_CONFIG 0x188
+
+#define LMC_CONFIG_BG2 BIT(62)
+#define LMC_CONFIG_RANK_ENA BIT(42)
+#define LMC_CONFIG_PBANK_LSB(x) (((x) >> 5) & 0xF)
+#define LMC_CONFIG_ROW_LSB(x) (((x) >> 2) & 0x7)
+
+#define LMC_CONTROL 0x190
+#define LMC_CONTROL_XOR_BANK BIT(16)
+
+#define LMC_INT 0x1F0
+
+#define LMC_INT_DDR_ERR BIT(11)
+#define LMC_INT_DED_ERR (0xFUL << 5)
+#define LMC_INT_SEC_ERR (0xFUL << 1)
+#define LMC_INT_NXM_WR_MASK BIT(0)
+
+#define LMC_DDR_PLL_CTL 0x258
+#define LMC_DDR_PLL_CTL_DDR4 BIT(29)
+
+#define LMC_FADR_SCRAMBLED 0x330
+
+#define LMC_INT_UE (LMC_INT_DDR_ERR | LMC_INT_DED_ERR | \
+ LMC_INT_NXM_WR_MASK)
+
+#define LMC_INT_CE (LMC_INT_SEC_ERR)
+
+static const struct error_descr lmc_errors[] = {
+ {
+ .type = ERR_CORRECTED,
+ .mask = LMC_INT_SEC_ERR,
+ .descr = "Single-bit ECC error",
+ },
+ {
+ .type = ERR_UNCORRECTED,
+ .mask = LMC_INT_DDR_ERR,
+ .descr = "DDR chip error",
+ },
+ {
+ .type = ERR_UNCORRECTED,
+ .mask = LMC_INT_DED_ERR,
+ .descr = "Double-bit ECC error",
+ },
+ {
+ .type = ERR_UNCORRECTED,
+ .mask = LMC_INT_NXM_WR_MASK,
+ .descr = "Non-existent memory write",
+ },
+ {0, 0, NULL},
+};
+
+#define LMC_INT_EN_DDR_ERROR_ALERT_ENA BIT(5)
+#define LMC_INT_EN_DLCRAM_DED_ERR BIT(4)
+#define LMC_INT_EN_DLCRAM_SEC_ERR BIT(3)
+#define LMC_INT_INTR_DED_ENA BIT(2)
+#define LMC_INT_INTR_SEC_ENA BIT(1)
+#define LMC_INT_INTR_NXM_WR_ENA BIT(0)
+
+#define LMC_INT_ENA_ALL GENMASK(5, 0)
+
+#define LMC_DDR_PLL_CTL 0x258
+#define LMC_DDR_PLL_CTL_DDR4 BIT(29)
+
+#define LMC_CONTROL 0x190
+#define LMC_CONTROL_RDIMM BIT(0)
+
+#define LMC_SCRAM_FADR 0x330
+
+#define LMC_CHAR_MASK0 0x228
+#define LMC_CHAR_MASK2 0x238
+
+#define RING_ENTRIES 8
+
+struct debugfs_entry {
+ const char *name;
+ umode_t mode;
+ const struct file_operations fops;
+};
+
+struct lmc_err_ctx {
+ u64 reg_int;
+ u64 reg_fadr;
+ u64 reg_nxm_fadr;
+ u64 reg_scram_fadr;
+ u64 reg_ecc_synd;
+};
+
+struct thunderx_lmc {
+ void __iomem *regs;
+ struct pci_dev *pdev;
+ struct msix_entry msix_ent;
+
+ atomic_t ecc_int;
+
+ u64 mask0;
+ u64 mask2;
+ u64 parity_test;
+ u64 node;
+
+ int xbits;
+ int bank_width;
+ int pbank_lsb;
+ int dimm_lsb;
+ int rank_lsb;
+ int bank_lsb;
+ int row_lsb;
+ int col_hi_lsb;
+
+ int xor_bank;
+ int l2c_alias;
+
+ struct page *mem;
+
+ struct lmc_err_ctx err_ctx[RING_ENTRIES];
+ unsigned long ring_head;
+ unsigned long ring_tail;
+};
+
+#define ring_pos(pos, size) ((pos) & (size - 1))
+
+#define DEBUGFS_STRUCT(_name, _mode, _write, _read) \
+static struct debugfs_entry debugfs_##_name = { \
+ .name = __stringify(_name), \
+ .mode = VERIFY_OCTAL_PERMISSIONS(_mode), \
+ .fops = { \
+ .open = simple_open, \
+ .write = _write, \
+ .read = _read, \
+ .llseek = generic_file_llseek, \
+ }, \
+}
+
+#define DEBUGFS_FIELD_ATTR(_type, _field) \
+static ssize_t thunderx_##_type##_##_field##_read(struct file *file, \
+ char __user *data, \
+ size_t count, loff_t *ppos) \
+{ \
+ struct thunderx_##_type *pdata = file->private_data; \
+ char buf[20]; \
+ \
+ snprintf(buf, count, "0x%016llx", pdata->_field); \
+ return simple_read_from_buffer(data, count, ppos, \
+ buf, sizeof(buf)); \
+} \
+ \
+static ssize_t thunderx_##_type##_##_field##_write(struct file *file, \
+ const char __user *data, \
+ size_t count, loff_t *ppos) \
+{ \
+ struct thunderx_##_type *pdata = file->private_data; \
+ int res; \
+ \
+ res = kstrtoull_from_user(data, count, 0, &pdata->_field); \
+ \
+ return res ? res : count; \
+} \
+ \
+DEBUGFS_STRUCT(_field, 0600, \
+ thunderx_##_type##_##_field##_write, \
+ thunderx_##_type##_##_field##_read) \
+
+#define DEBUGFS_REG_ATTR(_type, _name, _reg) \
+static ssize_t thunderx_##_type##_##_name##_read(struct file *file, \
+ char __user *data, \
+ size_t count, loff_t *ppos) \
+{ \
+ struct thunderx_##_type *pdata = file->private_data; \
+ char buf[20]; \
+ \
+ sprintf(buf, "0x%016llx", readq(pdata->regs + _reg)); \
+ return simple_read_from_buffer(data, count, ppos, \
+ buf, sizeof(buf)); \
+} \
+ \
+static ssize_t thunderx_##_type##_##_name##_write(struct file *file, \
+ const char __user *data, \
+ size_t count, loff_t *ppos) \
+{ \
+ struct thunderx_##_type *pdata = file->private_data; \
+ u64 val; \
+ int res; \
+ \
+ res = kstrtoull_from_user(data, count, 0, &val); \
+ \
+ if (!res) { \
+ writeq(val, pdata->regs + _reg); \
+ res = count; \
+ } \
+ \
+ return res; \
+} \
+ \
+DEBUGFS_STRUCT(_name, 0600, \
+ thunderx_##_type##_##_name##_write, \
+ thunderx_##_type##_##_name##_read)
+
+#define LMC_DEBUGFS_ENT(_field) DEBUGFS_FIELD_ATTR(lmc, _field)
+
+/*
+ * To get an ECC error injected, the following steps are needed:
+ * - Setup the ECC injection by writing the appropriate parameters:
+ * echo <bit mask value> > /sys/kernel/debug/<device number>/ecc_mask0
+ * echo <bit mask value> > /sys/kernel/debug/<device number>/ecc_mask2
+ * echo 0x802 > /sys/kernel/debug/<device number>/ecc_parity_test
+ * - Do the actual injection:
+ * echo 1 > /sys/kernel/debug/<device number>/inject_ecc
+ */
+static ssize_t thunderx_lmc_inject_int_write(struct file *file,
+ const char __user *data,
+ size_t count, loff_t *ppos)
+{
+ struct thunderx_lmc *lmc = file->private_data;
+ u64 val;
+ int res;
+
+ res = kstrtoull_from_user(data, count, 0, &val);
+
+ if (!res) {
+ /* Trigger the interrupt */
+ writeq(val, lmc->regs + LMC_INT_W1S);
+ res = count;
+ }
+
+ return res;
+}
+
+static ssize_t thunderx_lmc_int_read(struct file *file,
+ char __user *data,
+ size_t count, loff_t *ppos)
+{
+ struct thunderx_lmc *lmc = file->private_data;
+ char buf[20];
+ u64 lmc_int = readq(lmc->regs + LMC_INT);
+
+ snprintf(buf, sizeof(buf), "0x%016llx", lmc_int);
+ return simple_read_from_buffer(data, count, ppos, buf, sizeof(buf));
+}
+
+#define TEST_PATTERN 0xa5
+
+static int inject_ecc_fn(void *arg)
+{
+ struct thunderx_lmc *lmc = arg;
+ uintptr_t addr, phys;
+ unsigned int cline_size = cache_line_size();
+ const unsigned int lines = PAGE_SIZE / cline_size;
+ unsigned int i, cl_idx;
+
+ addr = (uintptr_t)page_address(lmc->mem);
+ phys = (uintptr_t)page_to_phys(lmc->mem);
+
+ cl_idx = (phys & 0x7f) >> 4;
+ lmc->parity_test &= ~(7ULL << 8);
+ lmc->parity_test |= (cl_idx << 8);
+
+ writeq(lmc->mask0, lmc->regs + LMC_CHAR_MASK0);
+ writeq(lmc->mask2, lmc->regs + LMC_CHAR_MASK2);
+ writeq(lmc->parity_test, lmc->regs + LMC_ECC_PARITY_TEST);
+
+ readq(lmc->regs + LMC_CHAR_MASK0);
+ readq(lmc->regs + LMC_CHAR_MASK2);
+ readq(lmc->regs + LMC_ECC_PARITY_TEST);
+
+ for (i = 0; i < lines; i++) {
+ memset((void *)addr, TEST_PATTERN, cline_size);
+ barrier();
+
+ /*
+ * Flush L1 cachelines to the PoC (L2).
+ * This will cause cacheline eviction to the L2.
+ */
+ asm volatile("dc civac, %0\n"
+ "dsb sy\n"
+ : : "r"(addr + i * cline_size));
+ }
+
+ for (i = 0; i < lines; i++) {
+ /*
+ * Flush L2 cachelines to the DRAM.
+ * This will cause cacheline eviction to the DRAM
+ * and ECC corruption according to the masks set.
+ */
+ __asm__ volatile("sys #0,c11,C1,#2, %0\n"
+ : : "r"(phys + i * cline_size));
+ }
+
+ for (i = 0; i < lines; i++) {
+ /*
+ * Invalidate L2 cachelines.
+ * The subsequent load will cause cacheline fetch
+ * from the DRAM and an error interrupt
+ */
+ __asm__ volatile("sys #0,c11,C1,#1, %0"
+ : : "r"(phys + i * cline_size));
+ }
+
+ for (i = 0; i < lines; i++) {
+ /*
+ * Invalidate L1 cachelines.
+ * The subsequent load will cause cacheline fetch
+ * from the L2 and/or DRAM
+ */
+ asm volatile("dc ivac, %0\n"
+ "dsb sy\n"
+ : : "r"(addr + i * cline_size));
+ }
+
+ return 0;
+}
+
+static ssize_t thunderx_lmc_inject_ecc_write(struct file *file,
+ const char __user *data,
+ size_t count, loff_t *ppos)
+{
+ struct thunderx_lmc *lmc = file->private_data;
+
+ unsigned int cline_size = cache_line_size();
+
+ u8 tmp[cline_size];
+ void __iomem *addr;
+ unsigned int offs, timeout = 100000;
+
+ atomic_set(&lmc->ecc_int, 0);
+
+ lmc->mem = alloc_pages_node(lmc->node, GFP_KERNEL, 0);
+
+ if (!lmc->mem)
+ return -ENOMEM;
+
+ addr = page_address(lmc->mem);
+
+ while (!atomic_read(&lmc->ecc_int) && timeout--) {
+ stop_machine(inject_ecc_fn, lmc, NULL);
+
+ for (offs = 0; offs < PAGE_SIZE; offs += sizeof(tmp)) {
+ /*
+ * Do a load from the previously rigged location
+ * This should generate an error interrupt.
+ */
+ memcpy(tmp, addr + offs, cline_size);
+ asm volatile("dsb ld\n");
+ }
+ }
+
+ __free_pages(lmc->mem, 0);
+
+ return count;
+}
+
+LMC_DEBUGFS_ENT(mask0);
+LMC_DEBUGFS_ENT(mask2);
+LMC_DEBUGFS_ENT(parity_test);
+
+DEBUGFS_STRUCT(inject_int, 0200, thunderx_lmc_inject_int_write, NULL);
+DEBUGFS_STRUCT(inject_ecc, 0200, thunderx_lmc_inject_ecc_write, NULL);
+DEBUGFS_STRUCT(int_w1c, 0400, NULL, thunderx_lmc_int_read);
+
+struct debugfs_entry *lmc_dfs_ents[] = {
+ &debugfs_mask0,
+ &debugfs_mask2,
+ &debugfs_parity_test,
+ &debugfs_inject_ecc,
+ &debugfs_inject_int,
+ &debugfs_int_w1c,
+};
+
+static int thunderx_create_debugfs_nodes(struct dentry *parent,
+ struct debugfs_entry *attrs[],
+ void *data,
+ size_t num)
+{
+ int i;
+ struct dentry *ent;
+
+ if (!IS_ENABLED(CONFIG_EDAC_DEBUG))
+ return 0;
+
+ if (!parent)
+ return -ENOENT;
+
+ for (i = 0; i < num; i++) {
+ ent = edac_debugfs_create_file(attrs[i]->name, attrs[i]->mode,
+ parent, data, &attrs[i]->fops);
+
+ if (!ent)
+ break;
+ }
+
+ return i;
+}
+
+static phys_addr_t thunderx_faddr_to_phys(u64 faddr, struct thunderx_lmc *lmc)
+{
+ phys_addr_t addr = 0;
+ int bank, xbits;
+
+ addr |= lmc->node << 40;
+ addr |= LMC_FADR_FDIMM(faddr) << lmc->dimm_lsb;
+ addr |= LMC_FADR_FBUNK(faddr) << lmc->rank_lsb;
+ addr |= LMC_FADR_FROW(faddr) << lmc->row_lsb;
+ addr |= (LMC_FADR_FCOL(faddr) >> 4) << lmc->col_hi_lsb;
+
+ bank = LMC_FADR_FBANK(faddr) << lmc->bank_lsb;
+
+ if (lmc->xor_bank)
+ bank ^= get_bits(addr, 12 + lmc->xbits, lmc->bank_width);
+
+ addr |= bank << lmc->bank_lsb;
+
+ xbits = PCI_FUNC(lmc->pdev->devfn);
+
+ if (lmc->l2c_alias)
+ xbits ^= get_bits(addr, 20, lmc->xbits) ^
+ get_bits(addr, 12, lmc->xbits);
+
+ addr |= xbits << 7;
+
+ return addr;
+}
+
+static unsigned int thunderx_get_num_lmcs(unsigned int node)
+{
+ unsigned int number = 0;
+ struct pci_dev *pdev = NULL;
+
+ do {
+ pdev = pci_get_device(PCI_VENDOR_ID_CAVIUM,
+ PCI_DEVICE_ID_THUNDER_LMC,
+ pdev);
+ if (pdev) {
+#ifdef CONFIG_NUMA
+ if (pdev->dev.numa_node == node)
+ number++;
+#else
+ number++;
+#endif
+ }
+ } while (pdev);
+
+ return number;
+}
+
+#define LMC_MESSAGE_SIZE 120
+#define LMC_OTHER_SIZE (50 * ARRAY_SIZE(lmc_errors))
+
+static irqreturn_t thunderx_lmc_err_isr(int irq, void *dev_id)
+{
+ struct mem_ctl_info *mci = dev_id;
+ struct thunderx_lmc *lmc = mci->pvt_info;
+
+ unsigned long head = ring_pos(lmc->ring_head, ARRAY_SIZE(lmc->err_ctx));
+ struct lmc_err_ctx *ctx = &lmc->err_ctx[head];
+
+ writeq(0, lmc->regs + LMC_CHAR_MASK0);
+ writeq(0, lmc->regs + LMC_CHAR_MASK2);
+ writeq(0x2, lmc->regs + LMC_ECC_PARITY_TEST);
+
+ ctx->reg_int = readq(lmc->regs + LMC_INT);
+ ctx->reg_fadr = readq(lmc->regs + LMC_FADR);
+ ctx->reg_nxm_fadr = readq(lmc->regs + LMC_NXM_FADR);
+ ctx->reg_scram_fadr = readq(lmc->regs + LMC_SCRAM_FADR);
+ ctx->reg_ecc_synd = readq(lmc->regs + LMC_ECC_SYND);
+
+ lmc->ring_head++;
+
+ atomic_set(&lmc->ecc_int, 1);
+
+ /* Clear the interrupt */
+ writeq(ctx->reg_int, lmc->regs + LMC_INT);
+
+ return IRQ_WAKE_THREAD;
+}
+
+static irqreturn_t thunderx_lmc_threaded_isr(int irq, void *dev_id)
+{
+ struct mem_ctl_info *mci = dev_id;
+ struct thunderx_lmc *lmc = mci->pvt_info;
+ phys_addr_t phys_addr;
+
+ unsigned long tail;
+ struct lmc_err_ctx *ctx;
+
+ irqreturn_t ret = IRQ_NONE;
+
+ char *msg;
+ char *other;
+
+ msg = kmalloc(LMC_MESSAGE_SIZE, GFP_KERNEL);
+ other = kmalloc(LMC_OTHER_SIZE, GFP_KERNEL);
+
+ if (!msg || !other)
+ goto err_free;
+
+ while (CIRC_CNT(lmc->ring_head, lmc->ring_tail,
+ ARRAY_SIZE(lmc->err_ctx))) {
+ tail = ring_pos(lmc->ring_tail, ARRAY_SIZE(lmc->err_ctx));
+
+ ctx = &lmc->err_ctx[tail];
+
+ dev_dbg(&lmc->pdev->dev, "LMC_INT: %016llx\n",
+ ctx->reg_int);
+ dev_dbg(&lmc->pdev->dev, "LMC_FADR: %016llx\n",
+ ctx->reg_fadr);
+ dev_dbg(&lmc->pdev->dev, "LMC_NXM_FADR: %016llx\n",
+ ctx->reg_nxm_fadr);
+ dev_dbg(&lmc->pdev->dev, "LMC_SCRAM_FADR: %016llx\n",
+ ctx->reg_scram_fadr);
+ dev_dbg(&lmc->pdev->dev, "LMC_ECC_SYND: %016llx\n",
+ ctx->reg_ecc_synd);
+
+ snprintf(msg, LMC_MESSAGE_SIZE,
+ "DIMM %lld rank %lld bank %lld row %lld col %lld",
+ LMC_FADR_FDIMM(ctx->reg_scram_fadr),
+ LMC_FADR_FBUNK(ctx->reg_scram_fadr),
+ LMC_FADR_FBANK(ctx->reg_scram_fadr),
+ LMC_FADR_FROW(ctx->reg_scram_fadr),
+ LMC_FADR_FCOL(ctx->reg_scram_fadr));
+
+ decode_register(other, LMC_OTHER_SIZE, lmc_errors,
+ ctx->reg_int);
+
+ phys_addr = thunderx_faddr_to_phys(ctx->reg_fadr, lmc);
+
+ if (ctx->reg_int & LMC_INT_UE)
+ edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, 1,
+ phys_to_pfn(phys_addr),
+ offset_in_page(phys_addr),
+ 0, -1, -1, -1, msg, other);
+ else if (ctx->reg_int & LMC_INT_CE)
+ edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, 1,
+ phys_to_pfn(phys_addr),
+ offset_in_page(phys_addr),
+ 0, -1, -1, -1, msg, other);
+
+ lmc->ring_tail++;
+ }
+
+ ret = IRQ_HANDLED;
+
+err_free:
+ kfree(msg);
+ kfree(other);
+
+ return ret;
+}
+
+#ifdef CONFIG_PM
+static int thunderx_lmc_suspend(struct pci_dev *pdev, pm_message_t state)
+{
+ pci_save_state(pdev);
+ pci_disable_device(pdev);
+
+ pci_set_power_state(pdev, pci_choose_state(pdev, state));
+
+ return 0;
+}
+
+static int thunderx_lmc_resume(struct pci_dev *pdev)
+{
+ pci_set_power_state(pdev, PCI_D0);
+ pci_enable_wake(pdev, PCI_D0, 0);
+ pci_restore_state(pdev);
+
+ return 0;
+}
+#endif
+
+static const struct pci_device_id thunderx_lmc_pci_tbl[] = {
+ { PCI_DEVICE(PCI_VENDOR_ID_CAVIUM, PCI_DEVICE_ID_THUNDER_LMC) },
+ { 0, },
+};
+
+static inline int pci_dev_to_mc_idx(struct pci_dev *pdev)
+{
+ int node = dev_to_node(&pdev->dev);
+ int ret = PCI_FUNC(pdev->devfn);
+
+ ret += max(node, 0) << 3;
+
+ return ret;
+}
+
+static int thunderx_lmc_probe(struct pci_dev *pdev,
+ const struct pci_device_id *id)
+{
+ struct thunderx_lmc *lmc;
+ struct edac_mc_layer layer;
+ struct mem_ctl_info *mci;
+ u64 lmc_control, lmc_ddr_pll_ctl, lmc_config;
+ int ret;
+ u64 lmc_int;
+ void *l2c_ioaddr;
+
+ layer.type = EDAC_MC_LAYER_SLOT;
+ layer.size = 2;
+ layer.is_virt_csrow = false;
+
+ ret = pcim_enable_device(pdev);
+ if (ret) {
+ dev_err(&pdev->dev, "Cannot enable PCI device: %d\n", ret);
+ return ret;
+ }
+
+ ret = pcim_iomap_regions(pdev, BIT(0), "thunderx_lmc");
+ if (ret) {
+ dev_err(&pdev->dev, "Cannot map PCI resources: %d\n", ret);
+ return ret;
+ }
+
+ mci = edac_mc_alloc(pci_dev_to_mc_idx(pdev), 1, &layer,
+ sizeof(struct thunderx_lmc));
+ if (!mci)
+ return -ENOMEM;
+
+ mci->pdev = &pdev->dev;
+ lmc = mci->pvt_info;
+
+ pci_set_drvdata(pdev, mci);
+
+ lmc->regs = pcim_iomap_table(pdev)[0];
+
+ lmc_control = readq(lmc->regs + LMC_CONTROL);
+ lmc_ddr_pll_ctl = readq(lmc->regs + LMC_DDR_PLL_CTL);
+ lmc_config = readq(lmc->regs + LMC_CONFIG);
+
+ if (lmc_control & LMC_CONTROL_RDIMM) {
+ mci->mtype_cap = FIELD_GET(LMC_DDR_PLL_CTL_DDR4,
+ lmc_ddr_pll_ctl) ?
+ MEM_RDDR4 : MEM_RDDR3;
+ } else {
+ mci->mtype_cap = FIELD_GET(LMC_DDR_PLL_CTL_DDR4,
+ lmc_ddr_pll_ctl) ?
+ MEM_DDR4 : MEM_DDR3;
+ }
+
+ mci->edac_ctl_cap = EDAC_FLAG_NONE | EDAC_FLAG_SECDED;
+ mci->edac_cap = EDAC_FLAG_SECDED;
+
+ mci->mod_name = "thunderx-lmc";
+ mci->mod_ver = "1";
+ mci->ctl_name = "thunderx-lmc";
+ mci->dev_name = dev_name(&pdev->dev);
+ mci->scrub_mode = SCRUB_NONE;
+
+ lmc->pdev = pdev;
+ lmc->msix_ent.entry = 0;
+
+ lmc->ring_head = 0;
+ lmc->ring_tail = 0;
+
+ ret = pci_enable_msix_exact(pdev, &lmc->msix_ent, 1);
+ if (ret) {
+ dev_err(&pdev->dev, "Cannot enable interrupt: %d\n", ret);
+ goto err_free;
+ }
+
+ ret = devm_request_threaded_irq(&pdev->dev, lmc->msix_ent.vector,
+ thunderx_lmc_err_isr,
+ thunderx_lmc_threaded_isr, 0,
+ "[EDAC] ThunderX LMC", mci);
+ if (ret) {
+ dev_err(&pdev->dev, "Cannot set ISR: %d\n", ret);
+ goto err_free;
+ }
+
+ lmc->node = FIELD_GET(THUNDERX_NODE, pci_resource_start(pdev, 0));
+
+ lmc->xbits = thunderx_get_num_lmcs(lmc->node) >> 1;
+ lmc->bank_width = (FIELD_GET(LMC_DDR_PLL_CTL_DDR4, lmc_ddr_pll_ctl) &&
+ FIELD_GET(LMC_CONFIG_BG2, lmc_config)) ? 4 : 3;
+
+ lmc->pbank_lsb = (lmc_config >> 5) & 0xf;
+ lmc->dimm_lsb = 28 + lmc->pbank_lsb + lmc->xbits;
+ lmc->rank_lsb = lmc->dimm_lsb;
+ lmc->rank_lsb -= FIELD_GET(LMC_CONFIG_RANK_ENA, lmc_config) ? 1 : 0;
+ lmc->bank_lsb = 7 + lmc->xbits;
+ lmc->row_lsb = 14 + LMC_CONFIG_ROW_LSB(lmc_config) + lmc->xbits;
+
+ lmc->col_hi_lsb = lmc->bank_lsb + lmc->bank_width;
+
+ lmc->xor_bank = lmc_control & LMC_CONTROL_XOR_BANK;
+
+ l2c_ioaddr = ioremap(L2C_CTL | FIELD_PREP(THUNDERX_NODE, lmc->node),
+ PAGE_SIZE);
+
+ if (!l2c_ioaddr) {
+ dev_err(&pdev->dev, "Cannot map L2C_CTL\n");
+ goto err_free;
+ }
+
+ lmc->l2c_alias = !(readq(l2c_ioaddr) & L2C_CTL_DISIDXALIAS);
+
+ iounmap(l2c_ioaddr);
+
+ ret = edac_mc_add_mc(mci);
+ if (ret) {
+ dev_err(&pdev->dev, "Cannot add the MC: %d\n", ret);
+ goto err_free;
+ }
+
+ lmc_int = readq(lmc->regs + LMC_INT);
+ writeq(lmc_int, lmc->regs + LMC_INT);
+
+ writeq(LMC_INT_ENA_ALL, lmc->regs + LMC_INT_ENA_W1S);
+
+ if (IS_ENABLED(CONFIG_EDAC_DEBUG)) {
+ ret = thunderx_create_debugfs_nodes(mci->debugfs,
+ lmc_dfs_ents,
+ lmc,
+ ARRAY_SIZE(lmc_dfs_ents));
+
+ if (ret != ARRAY_SIZE(lmc_dfs_ents)) {
+ dev_warn(&pdev->dev, "Error creating debugfs entries: %d%s\n",
+ ret, ret >= 0 ? " created" : "");
+ }
+ }
+
+ return 0;
+
+err_free:
+ pci_set_drvdata(pdev, NULL);
+ edac_mc_free(mci);
+
+ return ret;
+}
+
+static void thunderx_lmc_remove(struct pci_dev *pdev)
+{
+ struct mem_ctl_info *mci = pci_get_drvdata(pdev);
+ struct thunderx_lmc *lmc = mci->pvt_info;
+
+ writeq(LMC_INT_ENA_ALL, lmc->regs + LMC_INT_ENA_W1C);
+
+ edac_mc_del_mc(&pdev->dev);
+ edac_mc_free(mci);
+}
+
+MODULE_DEVICE_TABLE(pci, thunderx_lmc_pci_tbl);
+
+static struct pci_driver thunderx_lmc_driver = {
+ .name = "thunderx_lmc_edac",
+ .probe = thunderx_lmc_probe,
+ .remove = thunderx_lmc_remove,
+#ifdef CONFIG_PM
+ .suspend = thunderx_lmc_suspend,
+ .resume = thunderx_lmc_resume,
+#endif
+ .id_table = thunderx_lmc_pci_tbl,
+};
+
+/*---------------------- OCX driver ---------------------------------*/
+
+#define PCI_DEVICE_ID_THUNDER_OCX 0xa013
+
+#define OCX_LINK_INTS 3
+#define OCX_INTS (OCX_LINK_INTS + 1)
+#define OCX_RX_LANES 24
+#define OCX_RX_LANE_STATS 15
+
+#define OCX_COM_INT 0x100
+#define OCX_COM_INT_W1S 0x108
+#define OCX_COM_INT_ENA_W1S 0x110
+#define OCX_COM_INT_ENA_W1C 0x118
+
+#define OCX_COM_IO_BADID BIT(54)
+#define OCX_COM_MEM_BADID BIT(53)
+#define OCX_COM_COPR_BADID BIT(52)
+#define OCX_COM_WIN_REQ_BADID BIT(51)
+#define OCX_COM_WIN_REQ_TOUT BIT(50)
+#define OCX_COM_RX_LANE GENMASK(23, 0)
+
+#define OCX_COM_INT_CE (OCX_COM_IO_BADID | \
+ OCX_COM_MEM_BADID | \
+ OCX_COM_COPR_BADID | \
+ OCX_COM_WIN_REQ_BADID | \
+ OCX_COM_WIN_REQ_TOUT)
+
+static const struct error_descr ocx_com_errors[] = {
+ {
+ .type = ERR_CORRECTED,
+ .mask = OCX_COM_IO_BADID,
+ .descr = "Invalid IO transaction node ID",
+ },
+ {
+ .type = ERR_CORRECTED,
+ .mask = OCX_COM_MEM_BADID,
+ .descr = "Invalid memory transaction node ID",
+ },
+ {
+ .type = ERR_CORRECTED,
+ .mask = OCX_COM_COPR_BADID,
+ .descr = "Invalid coprocessor transaction node ID",
+ },
+ {
+ .type = ERR_CORRECTED,
+ .mask = OCX_COM_WIN_REQ_BADID,
+ .descr = "Invalid SLI transaction node ID",
+ },
+ {
+ .type = ERR_CORRECTED,
+ .mask = OCX_COM_WIN_REQ_TOUT,
+ .descr = "Window/core request timeout",
+ },
+ {0, 0, NULL},
+};
+
+#define OCX_COM_LINKX_INT(x) (0x120 + (x) * 8)
+#define OCX_COM_LINKX_INT_W1S(x) (0x140 + (x) * 8)
+#define OCX_COM_LINKX_INT_ENA_W1S(x) (0x160 + (x) * 8)
+#define OCX_COM_LINKX_INT_ENA_W1C(x) (0x180 + (x) * 8)
+
+#define OCX_COM_LINK_BAD_WORD BIT(13)
+#define OCX_COM_LINK_ALIGN_FAIL BIT(12)
+#define OCX_COM_LINK_ALIGN_DONE BIT(11)
+#define OCX_COM_LINK_UP BIT(10)
+#define OCX_COM_LINK_STOP BIT(9)
+#define OCX_COM_LINK_BLK_ERR BIT(8)
+#define OCX_COM_LINK_REINIT BIT(7)
+#define OCX_COM_LINK_LNK_DATA BIT(6)
+#define OCX_COM_LINK_RXFIFO_DBE BIT(5)
+#define OCX_COM_LINK_RXFIFO_SBE BIT(4)
+#define OCX_COM_LINK_TXFIFO_DBE BIT(3)
+#define OCX_COM_LINK_TXFIFO_SBE BIT(2)
+#define OCX_COM_LINK_REPLAY_DBE BIT(1)
+#define OCX_COM_LINK_REPLAY_SBE BIT(0)
+
+static const struct error_descr ocx_com_link_errors[] = {
+ {
+ .type = ERR_CORRECTED,
+ .mask = OCX_COM_LINK_REPLAY_SBE,
+ .descr = "Replay buffer single-bit error",
+ },
+ {
+ .type = ERR_CORRECTED,
+ .mask = OCX_COM_LINK_TXFIFO_SBE,
+ .descr = "TX FIFO single-bit error",
+ },
+ {
+ .type = ERR_CORRECTED,
+ .mask = OCX_COM_LINK_RXFIFO_SBE,
+ .descr = "RX FIFO single-bit error",
+ },
+ {
+ .type = ERR_CORRECTED,
+ .mask = OCX_COM_LINK_BLK_ERR,
+ .descr = "Block code error",
+ },
+ {
+ .type = ERR_CORRECTED,
+ .mask = OCX_COM_LINK_ALIGN_FAIL,
+ .descr = "Link alignment failure",
+ },
+ {
+ .type = ERR_CORRECTED,
+ .mask = OCX_COM_LINK_BAD_WORD,
+ .descr = "Bad code word",
+ },
+ {
+ .type = ERR_UNCORRECTED,
+ .mask = OCX_COM_LINK_REPLAY_DBE,
+ .descr = "Replay buffer double-bit error",
+ },
+ {
+ .type = ERR_UNCORRECTED,
+ .mask = OCX_COM_LINK_TXFIFO_DBE,
+ .descr = "TX FIFO double-bit error",
+ },
+ {
+ .type = ERR_UNCORRECTED,
+ .mask = OCX_COM_LINK_RXFIFO_DBE,
+ .descr = "RX FIFO double-bit error",
+ },
+ {
+ .type = ERR_UNCORRECTED,
+ .mask = OCX_COM_LINK_STOP,
+ .descr = "Link stopped",
+ },
+ {0, 0, NULL},
+};
+
+#define OCX_COM_LINK_INT_UE (OCX_COM_LINK_REPLAY_DBE | \
+ OCX_COM_LINK_TXFIFO_DBE | \
+ OCX_COM_LINK_RXFIFO_DBE | \
+ OCX_COM_LINK_STOP)
+
+#define OCX_COM_LINK_INT_CE (OCX_COM_LINK_REPLAY_SBE | \
+ OCX_COM_LINK_TXFIFO_SBE | \
+ OCX_COM_LINK_RXFIFO_SBE | \
+ OCX_COM_LINK_BLK_ERR | \
+ OCX_COM_LINK_ALIGN_FAIL | \
+ OCX_COM_LINK_BAD_WORD)
+
+#define OCX_LNE_INT(x) (0x8018 + (x) * 0x100)
+#define OCX_LNE_INT_EN(x) (0x8020 + (x) * 0x100)
+#define OCX_LNE_BAD_CNT(x) (0x8028 + (x) * 0x100)
+#define OCX_LNE_CFG(x) (0x8000 + (x) * 0x100)
+#define OCX_LNE_STAT(x, y) (0x8040 + (x) * 0x100 + (y) * 8)
+
+#define OCX_LNE_CFG_RX_BDRY_LOCK_DIS BIT(8)
+#define OCX_LNE_CFG_RX_STAT_WRAP_DIS BIT(2)
+#define OCX_LNE_CFG_RX_STAT_RDCLR BIT(1)
+#define OCX_LNE_CFG_RX_STAT_ENA BIT(0)
+
+
+#define OCX_LANE_BAD_64B67B BIT(8)
+#define OCX_LANE_DSKEW_FIFO_OVFL BIT(5)
+#define OCX_LANE_SCRM_SYNC_LOSS BIT(4)
+#define OCX_LANE_UKWN_CNTL_WORD BIT(3)
+#define OCX_LANE_CRC32_ERR BIT(2)
+#define OCX_LANE_BDRY_SYNC_LOSS BIT(1)
+#define OCX_LANE_SERDES_LOCK_LOSS BIT(0)
+
+#define OCX_COM_LANE_INT_UE (0)
+#define OCX_COM_LANE_INT_CE (OCX_LANE_SERDES_LOCK_LOSS | \
+ OCX_LANE_BDRY_SYNC_LOSS | \
+ OCX_LANE_CRC32_ERR | \
+ OCX_LANE_UKWN_CNTL_WORD | \
+ OCX_LANE_SCRM_SYNC_LOSS | \
+ OCX_LANE_DSKEW_FIFO_OVFL | \
+ OCX_LANE_BAD_64B67B)
+
+static const struct error_descr ocx_lane_errors[] = {
+ {
+ .type = ERR_CORRECTED,
+ .mask = OCX_LANE_SERDES_LOCK_LOSS,
+ .descr = "RX SerDes lock lost",
+ },
+ {
+ .type = ERR_CORRECTED,
+ .mask = OCX_LANE_BDRY_SYNC_LOSS,
+ .descr = "RX word boundary lost",
+ },
+ {
+ .type = ERR_CORRECTED,
+ .mask = OCX_LANE_CRC32_ERR,
+ .descr = "CRC32 error",
+ },
+ {
+ .type = ERR_CORRECTED,
+ .mask = OCX_LANE_UKWN_CNTL_WORD,
+ .descr = "Unknown control word",
+ },
+ {
+ .type = ERR_CORRECTED,
+ .mask = OCX_LANE_SCRM_SYNC_LOSS,
+ .descr = "Scrambler synchronization lost",
+ },
+ {
+ .type = ERR_CORRECTED,
+ .mask = OCX_LANE_DSKEW_FIFO_OVFL,
+ .descr = "RX deskew FIFO overflow",
+ },
+ {
+ .type = ERR_CORRECTED,
+ .mask = OCX_LANE_BAD_64B67B,
+ .descr = "Bad 64B/67B codeword",
+ },
+ {0, 0, NULL},
+};
+
+#define OCX_LNE_INT_ENA_ALL (GENMASK(9, 8) | GENMASK(6, 0))
+#define OCX_COM_INT_ENA_ALL (GENMASK(54, 50) | GENMASK(23, 0))
+#define OCX_COM_LINKX_INT_ENA_ALL (GENMASK(13, 12) | \
+ GENMASK(9, 7) | GENMASK(5, 0))
+
+#define OCX_TLKX_ECC_CTL(x) (0x10018 + (x) * 0x2000)
+#define OCX_RLKX_ECC_CTL(x) (0x18018 + (x) * 0x2000)
+
+struct ocx_com_err_ctx {
+ u64 reg_com_int;
+ u64 reg_lane_int[OCX_RX_LANES];
+ u64 reg_lane_stat11[OCX_RX_LANES];
+};
+
+struct ocx_link_err_ctx {
+ u64 reg_com_link_int;
+ int link;
+};
+
+struct thunderx_ocx {
+ void __iomem *regs;
+ int com_link;
+ struct pci_dev *pdev;
+ struct edac_device_ctl_info *edac_dev;
+
+ struct dentry *debugfs;
+ struct msix_entry msix_ent[OCX_INTS];
+
+ struct ocx_com_err_ctx com_err_ctx[RING_ENTRIES];
+ struct ocx_link_err_ctx link_err_ctx[RING_ENTRIES];
+
+ unsigned long com_ring_head;
+ unsigned long com_ring_tail;
+
+ unsigned long link_ring_head;
+ unsigned long link_ring_tail;
+};
+
+#define OCX_MESSAGE_SIZE SZ_1K
+#define OCX_OTHER_SIZE (50 * ARRAY_SIZE(ocx_com_link_errors))
+
+/* This handler is threaded */
+static irqreturn_t thunderx_ocx_com_isr(int irq, void *irq_id)
+{
+ struct msix_entry *msix = irq_id;
+ struct thunderx_ocx *ocx = container_of(msix, struct thunderx_ocx,
+ msix_ent[msix->entry]);
+
+ int lane;
+ unsigned long head = ring_pos(ocx->com_ring_head,
+ ARRAY_SIZE(ocx->com_err_ctx));
+ struct ocx_com_err_ctx *ctx = &ocx->com_err_ctx[head];
+
+ ctx->reg_com_int = readq(ocx->regs + OCX_COM_INT);
+
+ for (lane = 0; lane < OCX_RX_LANES; lane++) {
+ ctx->reg_lane_int[lane] =
+ readq(ocx->regs + OCX_LNE_INT(lane));
+ ctx->reg_lane_stat11[lane] =
+ readq(ocx->regs + OCX_LNE_STAT(lane, 11));
+
+ writeq(ctx->reg_lane_int[lane], ocx->regs + OCX_LNE_INT(lane));
+ }
+
+ writeq(ctx->reg_com_int, ocx->regs + OCX_COM_INT);
+
+ ocx->com_ring_head++;
+
+ return IRQ_WAKE_THREAD;
+}
+
+static irqreturn_t thunderx_ocx_com_threaded_isr(int irq, void *irq_id)
+{
+ struct msix_entry *msix = irq_id;
+ struct thunderx_ocx *ocx = container_of(msix, struct thunderx_ocx,
+ msix_ent[msix->entry]);
+
+ irqreturn_t ret = IRQ_NONE;
+
+ unsigned long tail;
+ struct ocx_com_err_ctx *ctx;
+ int lane;
+ char *msg;
+ char *other;
+
+ msg = kmalloc(OCX_MESSAGE_SIZE, GFP_KERNEL);
+ other = kmalloc(OCX_OTHER_SIZE, GFP_KERNEL);
+
+ if (!msg || !other)
+ goto err_free;
+
+ while (CIRC_CNT(ocx->com_ring_head, ocx->com_ring_tail,
+ ARRAY_SIZE(ocx->com_err_ctx))) {
+ tail = ring_pos(ocx->com_ring_tail,
+ ARRAY_SIZE(ocx->com_err_ctx));
+ ctx = &ocx->com_err_ctx[tail];
+
+ snprintf(msg, OCX_MESSAGE_SIZE, "%s: OCX_COM_INT: %016llx",
+ ocx->edac_dev->ctl_name, ctx->reg_com_int);
+
+ decode_register(other, OCX_OTHER_SIZE,
+ ocx_com_errors, ctx->reg_com_int);
+
+ strncat(msg, other, OCX_MESSAGE_SIZE);
+
+ for (lane = 0; lane < OCX_RX_LANES; lane++)
+ if (ctx->reg_com_int & BIT(lane)) {
+ snprintf(other, OCX_OTHER_SIZE,
+ "\n\tOCX_LNE_INT[%02d]: %016llx OCX_LNE_STAT11[%02d]: %016llx",
+ lane, ctx->reg_lane_int[lane],
+ lane, ctx->reg_lane_stat11[lane]);
+
+ strncat(msg, other, OCX_MESSAGE_SIZE);
+
+ decode_register(other, OCX_OTHER_SIZE,
+ ocx_lane_errors,
+ ctx->reg_lane_int[lane]);
+ strncat(msg, other, OCX_MESSAGE_SIZE);
+ }
+
+ if (ctx->reg_com_int & OCX_COM_INT_CE)
+ edac_device_handle_ce(ocx->edac_dev, 0, 0, msg);
+
+ ocx->com_ring_tail++;
+ }
+
+ ret = IRQ_HANDLED;
+
+err_free:
+ kfree(other);
+ kfree(msg);
+
+ return ret;
+}
+
+static irqreturn_t thunderx_ocx_lnk_isr(int irq, void *irq_id)
+{
+ struct msix_entry *msix = irq_id;
+ struct thunderx_ocx *ocx = container_of(msix, struct thunderx_ocx,
+ msix_ent[msix->entry]);
+ unsigned long head = ring_pos(ocx->link_ring_head,
+ ARRAY_SIZE(ocx->link_err_ctx));
+ struct ocx_link_err_ctx *ctx = &ocx->link_err_ctx[head];
+
+ ctx->link = msix->entry;
+ ctx->reg_com_link_int = readq(ocx->regs + OCX_COM_LINKX_INT(ctx->link));
+
+ writeq(ctx->reg_com_link_int, ocx->regs + OCX_COM_LINKX_INT(ctx->link));
+
+ ocx->link_ring_head++;
+
+ return IRQ_WAKE_THREAD;
+}
+
+static irqreturn_t thunderx_ocx_lnk_threaded_isr(int irq, void *irq_id)
+{
+ struct msix_entry *msix = irq_id;
+ struct thunderx_ocx *ocx = container_of(msix, struct thunderx_ocx,
+ msix_ent[msix->entry]);
+ irqreturn_t ret = IRQ_NONE;
+ unsigned long tail;
+ struct ocx_link_err_ctx *ctx;
+
+ char *msg;
+ char *other;
+
+ msg = kmalloc(OCX_MESSAGE_SIZE, GFP_KERNEL);
+ other = kmalloc(OCX_OTHER_SIZE, GFP_KERNEL);
+
+ if (!msg || !other)
+ goto err_free;
+
+ while (CIRC_CNT(ocx->link_ring_head, ocx->link_ring_tail,
+ ARRAY_SIZE(ocx->link_err_ctx))) {
+ tail = ring_pos(ocx->link_ring_head,
+ ARRAY_SIZE(ocx->link_err_ctx));
+
+ ctx = &ocx->link_err_ctx[tail];
+
+ snprintf(msg, OCX_MESSAGE_SIZE,
+ "%s: OCX_COM_LINK_INT[%d]: %016llx",
+ ocx->edac_dev->ctl_name,
+ ctx->link, ctx->reg_com_link_int);
+
+ decode_register(other, OCX_OTHER_SIZE,
+ ocx_com_link_errors, ctx->reg_com_link_int);
+
+ strncat(msg, other, OCX_MESSAGE_SIZE);
+
+ if (ctx->reg_com_link_int & OCX_COM_LINK_INT_UE)
+ edac_device_handle_ue(ocx->edac_dev, 0, 0, msg);
+ else if (ctx->reg_com_link_int & OCX_COM_LINK_INT_CE)
+ edac_device_handle_ce(ocx->edac_dev, 0, 0, msg);
+
+ ocx->link_ring_tail++;
+ }
+
+ ret = IRQ_HANDLED;
+err_free:
+ kfree(other);
+ kfree(msg);
+
+ return ret;
+}
+
+#define OCX_DEBUGFS_ATTR(_name, _reg) DEBUGFS_REG_ATTR(ocx, _name, _reg)
+
+OCX_DEBUGFS_ATTR(tlk0_ecc_ctl, OCX_TLKX_ECC_CTL(0));
+OCX_DEBUGFS_ATTR(tlk1_ecc_ctl, OCX_TLKX_ECC_CTL(1));
+OCX_DEBUGFS_ATTR(tlk2_ecc_ctl, OCX_TLKX_ECC_CTL(2));
+
+OCX_DEBUGFS_ATTR(rlk0_ecc_ctl, OCX_RLKX_ECC_CTL(0));
+OCX_DEBUGFS_ATTR(rlk1_ecc_ctl, OCX_RLKX_ECC_CTL(1));
+OCX_DEBUGFS_ATTR(rlk2_ecc_ctl, OCX_RLKX_ECC_CTL(2));
+
+OCX_DEBUGFS_ATTR(com_link0_int, OCX_COM_LINKX_INT_W1S(0));
+OCX_DEBUGFS_ATTR(com_link1_int, OCX_COM_LINKX_INT_W1S(1));
+OCX_DEBUGFS_ATTR(com_link2_int, OCX_COM_LINKX_INT_W1S(2));
+
+OCX_DEBUGFS_ATTR(lne00_badcnt, OCX_LNE_BAD_CNT(0));
+OCX_DEBUGFS_ATTR(lne01_badcnt, OCX_LNE_BAD_CNT(1));
+OCX_DEBUGFS_ATTR(lne02_badcnt, OCX_LNE_BAD_CNT(2));
+OCX_DEBUGFS_ATTR(lne03_badcnt, OCX_LNE_BAD_CNT(3));
+OCX_DEBUGFS_ATTR(lne04_badcnt, OCX_LNE_BAD_CNT(4));
+OCX_DEBUGFS_ATTR(lne05_badcnt, OCX_LNE_BAD_CNT(5));
+OCX_DEBUGFS_ATTR(lne06_badcnt, OCX_LNE_BAD_CNT(6));
+OCX_DEBUGFS_ATTR(lne07_badcnt, OCX_LNE_BAD_CNT(7));
+
+OCX_DEBUGFS_ATTR(lne08_badcnt, OCX_LNE_BAD_CNT(8));
+OCX_DEBUGFS_ATTR(lne09_badcnt, OCX_LNE_BAD_CNT(9));
+OCX_DEBUGFS_ATTR(lne10_badcnt, OCX_LNE_BAD_CNT(10));
+OCX_DEBUGFS_ATTR(lne11_badcnt, OCX_LNE_BAD_CNT(11));
+OCX_DEBUGFS_ATTR(lne12_badcnt, OCX_LNE_BAD_CNT(12));
+OCX_DEBUGFS_ATTR(lne13_badcnt, OCX_LNE_BAD_CNT(13));
+OCX_DEBUGFS_ATTR(lne14_badcnt, OCX_LNE_BAD_CNT(14));
+OCX_DEBUGFS_ATTR(lne15_badcnt, OCX_LNE_BAD_CNT(15));
+
+OCX_DEBUGFS_ATTR(lne16_badcnt, OCX_LNE_BAD_CNT(16));
+OCX_DEBUGFS_ATTR(lne17_badcnt, OCX_LNE_BAD_CNT(17));
+OCX_DEBUGFS_ATTR(lne18_badcnt, OCX_LNE_BAD_CNT(18));
+OCX_DEBUGFS_ATTR(lne19_badcnt, OCX_LNE_BAD_CNT(19));
+OCX_DEBUGFS_ATTR(lne20_badcnt, OCX_LNE_BAD_CNT(20));
+OCX_DEBUGFS_ATTR(lne21_badcnt, OCX_LNE_BAD_CNT(21));
+OCX_DEBUGFS_ATTR(lne22_badcnt, OCX_LNE_BAD_CNT(22));
+OCX_DEBUGFS_ATTR(lne23_badcnt, OCX_LNE_BAD_CNT(23));
+
+OCX_DEBUGFS_ATTR(com_int, OCX_COM_INT_W1S);
+
+struct debugfs_entry *ocx_dfs_ents[] = {
+ &debugfs_tlk0_ecc_ctl,
+ &debugfs_tlk1_ecc_ctl,
+ &debugfs_tlk2_ecc_ctl,
+
+ &debugfs_rlk0_ecc_ctl,
+ &debugfs_rlk1_ecc_ctl,
+ &debugfs_rlk2_ecc_ctl,
+
+ &debugfs_com_link0_int,
+ &debugfs_com_link1_int,
+ &debugfs_com_link2_int,
+
+ &debugfs_lne00_badcnt,
+ &debugfs_lne01_badcnt,
+ &debugfs_lne02_badcnt,
+ &debugfs_lne03_badcnt,
+ &debugfs_lne04_badcnt,
+ &debugfs_lne05_badcnt,
+ &debugfs_lne06_badcnt,
+ &debugfs_lne07_badcnt,
+ &debugfs_lne08_badcnt,
+ &debugfs_lne09_badcnt,
+ &debugfs_lne10_badcnt,
+ &debugfs_lne11_badcnt,
+ &debugfs_lne12_badcnt,
+ &debugfs_lne13_badcnt,
+ &debugfs_lne14_badcnt,
+ &debugfs_lne15_badcnt,
+ &debugfs_lne16_badcnt,
+ &debugfs_lne17_badcnt,
+ &debugfs_lne18_badcnt,
+ &debugfs_lne19_badcnt,
+ &debugfs_lne20_badcnt,
+ &debugfs_lne21_badcnt,
+ &debugfs_lne22_badcnt,
+ &debugfs_lne23_badcnt,
+
+ &debugfs_com_int,
+};
+
+static const struct pci_device_id thunderx_ocx_pci_tbl[] = {
+ { PCI_DEVICE(PCI_VENDOR_ID_CAVIUM, PCI_DEVICE_ID_THUNDER_OCX) },
+ { 0, },
+};
+
+static void thunderx_ocx_clearstats(struct thunderx_ocx *ocx)
+{
+ int lane, stat, cfg;
+
+ for (lane = 0; lane < OCX_RX_LANES; lane++) {
+ cfg = readq(ocx->regs + OCX_LNE_CFG(lane));
+ cfg |= OCX_LNE_CFG_RX_STAT_RDCLR;
+ cfg &= ~OCX_LNE_CFG_RX_STAT_ENA;
+ writeq(cfg, ocx->regs + OCX_LNE_CFG(lane));
+
+ for (stat = 0; stat < OCX_RX_LANE_STATS; stat++)
+ readq(ocx->regs + OCX_LNE_STAT(lane, stat));
+ }
+}
+
+static int thunderx_ocx_probe(struct pci_dev *pdev,
+ const struct pci_device_id *id)
+{
+ struct thunderx_ocx *ocx;
+ struct edac_device_ctl_info *edac_dev;
+ char name[32];
+ int idx;
+ int i;
+ int ret;
+ u64 reg;
+
+ ret = pcim_enable_device(pdev);
+ if (ret) {
+ dev_err(&pdev->dev, "Cannot enable PCI device: %d\n", ret);
+ return ret;
+ }
+
+ ret = pcim_iomap_regions(pdev, BIT(0), "thunderx_ocx");
+ if (ret) {
+ dev_err(&pdev->dev, "Cannot map PCI resources: %d\n", ret);
+ return ret;
+ }
+
+ idx = edac_device_alloc_index();
+ snprintf(name, sizeof(name), "OCX%d", idx);
+ edac_dev = edac_device_alloc_ctl_info(sizeof(struct thunderx_ocx),
+ name, 1, "CCPI", 1,
+ 0, NULL, 0, idx);
+ if (!edac_dev) {
+ dev_err(&pdev->dev, "Cannot allocate EDAC device: %d\n", ret);
+ return -ENOMEM;
+ }
+ ocx = edac_dev->pvt_info;
+ ocx->edac_dev = edac_dev;
+ ocx->com_ring_head = 0;
+ ocx->com_ring_tail = 0;
+ ocx->link_ring_head = 0;
+ ocx->link_ring_tail = 0;
+
+ ocx->regs = pcim_iomap_table(pdev)[0];
+ if (!ocx->regs) {
+ dev_err(&pdev->dev, "Cannot map PCI resources: %d\n", ret);
+ ret = -ENODEV;
+ goto err_free;
+ }
+
+ ocx->pdev = pdev;
+
+ for (i = 0; i < OCX_INTS; i++) {
+ ocx->msix_ent[i].entry = i;
+ ocx->msix_ent[i].vector = 0;
+ }
+
+ ret = pci_enable_msix_exact(pdev, ocx->msix_ent, OCX_INTS);
+ if (ret) {
+ dev_err(&pdev->dev, "Cannot enable interrupt: %d\n", ret);
+ goto err_free;
+ }
+
+ for (i = 0; i < OCX_INTS; i++) {
+ ret = devm_request_threaded_irq(&pdev->dev,
+ ocx->msix_ent[i].vector,
+ (i == 3) ?
+ thunderx_ocx_com_isr :
+ thunderx_ocx_lnk_isr,
+ (i == 3) ?
+ thunderx_ocx_com_threaded_isr :
+ thunderx_ocx_lnk_threaded_isr,
+ 0, "[EDAC] ThunderX OCX",
+ &ocx->msix_ent[i]);
+ if (ret)
+ goto err_free;
+ }
+
+ edac_dev->dev = &pdev->dev;
+ edac_dev->dev_name = dev_name(&pdev->dev);
+ edac_dev->mod_name = "thunderx-ocx";
+ edac_dev->ctl_name = "thunderx-ocx";
+
+ ret = edac_device_add_device(edac_dev);
+ if (ret) {
+ dev_err(&pdev->dev, "Cannot add EDAC device: %d\n", ret);
+ goto err_free;
+ }
+
+ if (IS_ENABLED(CONFIG_EDAC_DEBUG)) {
+ ocx->debugfs = edac_debugfs_create_dir(pdev->dev.kobj.name);
+
+ ret = thunderx_create_debugfs_nodes(ocx->debugfs,
+ ocx_dfs_ents,
+ ocx,
+ ARRAY_SIZE(ocx_dfs_ents));
+ if (ret != ARRAY_SIZE(ocx_dfs_ents)) {
+ dev_warn(&pdev->dev, "Error creating debugfs entries: %d%s\n",
+ ret, ret >= 0 ? " created" : "");
+ }
+ }
+
+ pci_set_drvdata(pdev, edac_dev);
+
+ thunderx_ocx_clearstats(ocx);
+
+ for (i = 0; i < OCX_RX_LANES; i++) {
+ writeq(OCX_LNE_INT_ENA_ALL,
+ ocx->regs + OCX_LNE_INT_EN(i));
+
+ reg = readq(ocx->regs + OCX_LNE_INT(i));
+ writeq(reg, ocx->regs + OCX_LNE_INT(i));
+
+ }
+
+ for (i = 0; i < OCX_LINK_INTS; i++) {
+ reg = readq(ocx->regs + OCX_COM_LINKX_INT(i));
+ writeq(reg, ocx->regs + OCX_COM_LINKX_INT(i));
+
+ writeq(OCX_COM_LINKX_INT_ENA_ALL,
+ ocx->regs + OCX_COM_LINKX_INT_ENA_W1S(i));
+ }
+
+ reg = readq(ocx->regs + OCX_COM_INT);
+ writeq(reg, ocx->regs + OCX_COM_INT);
+
+ writeq(OCX_COM_INT_ENA_ALL, ocx->regs + OCX_COM_INT_ENA_W1S);
+
+ return 0;
+err_free:
+ edac_device_free_ctl_info(edac_dev);
+
+ return ret;
+}
+
+static void thunderx_ocx_remove(struct pci_dev *pdev)
+{
+ struct edac_device_ctl_info *edac_dev = pci_get_drvdata(pdev);
+ struct thunderx_ocx *ocx = edac_dev->pvt_info;
+ int i;
+
+ writeq(OCX_COM_INT_ENA_ALL, ocx->regs + OCX_COM_INT_ENA_W1C);
+
+ for (i = 0; i < OCX_INTS; i++) {
+ writeq(OCX_COM_LINKX_INT_ENA_ALL,
+ ocx->regs + OCX_COM_LINKX_INT_ENA_W1C(i));
+ }
+
+ edac_debugfs_remove_recursive(ocx->debugfs);
+
+ edac_device_del_device(&pdev->dev);
+ edac_device_free_ctl_info(edac_dev);
+}
+
+MODULE_DEVICE_TABLE(pci, thunderx_ocx_pci_tbl);
+
+static struct pci_driver thunderx_ocx_driver = {
+ .name = "thunderx_ocx_edac",
+ .probe = thunderx_ocx_probe,
+ .remove = thunderx_ocx_remove,
+ .id_table = thunderx_ocx_pci_tbl,
+};
+
+/*---------------------- L2C driver ---------------------------------*/
+
+#define PCI_DEVICE_ID_THUNDER_L2C_TAD 0xa02e
+#define PCI_DEVICE_ID_THUNDER_L2C_CBC 0xa02f
+#define PCI_DEVICE_ID_THUNDER_L2C_MCI 0xa030
+
+#define L2C_TAD_INT_W1C 0x40000
+#define L2C_TAD_INT_W1S 0x40008
+
+#define L2C_TAD_INT_ENA_W1C 0x40020
+#define L2C_TAD_INT_ENA_W1S 0x40028
+
+
+#define L2C_TAD_INT_L2DDBE BIT(1)
+#define L2C_TAD_INT_SBFSBE BIT(2)
+#define L2C_TAD_INT_SBFDBE BIT(3)
+#define L2C_TAD_INT_FBFSBE BIT(4)
+#define L2C_TAD_INT_FBFDBE BIT(5)
+#define L2C_TAD_INT_TAGDBE BIT(9)
+#define L2C_TAD_INT_RDDISLMC BIT(15)
+#define L2C_TAD_INT_WRDISLMC BIT(16)
+#define L2C_TAD_INT_LFBTO BIT(17)
+#define L2C_TAD_INT_GSYNCTO BIT(18)
+#define L2C_TAD_INT_RTGSBE BIT(32)
+#define L2C_TAD_INT_RTGDBE BIT(33)
+#define L2C_TAD_INT_RDDISOCI BIT(34)
+#define L2C_TAD_INT_WRDISOCI BIT(35)
+
+#define L2C_TAD_INT_ECC (L2C_TAD_INT_L2DDBE | \
+ L2C_TAD_INT_SBFSBE | L2C_TAD_INT_SBFDBE | \
+ L2C_TAD_INT_FBFSBE | L2C_TAD_INT_FBFDBE)
+
+#define L2C_TAD_INT_CE (L2C_TAD_INT_SBFSBE | \
+ L2C_TAD_INT_FBFSBE)
+
+#define L2C_TAD_INT_UE (L2C_TAD_INT_L2DDBE | \
+ L2C_TAD_INT_SBFDBE | \
+ L2C_TAD_INT_FBFDBE | \
+ L2C_TAD_INT_TAGDBE | \
+ L2C_TAD_INT_RTGDBE | \
+ L2C_TAD_INT_WRDISOCI | \
+ L2C_TAD_INT_RDDISOCI | \
+ L2C_TAD_INT_WRDISLMC | \
+ L2C_TAD_INT_RDDISLMC | \
+ L2C_TAD_INT_LFBTO | \
+ L2C_TAD_INT_GSYNCTO)
+
+static const struct error_descr l2_tad_errors[] = {
+ {
+ .type = ERR_CORRECTED,
+ .mask = L2C_TAD_INT_SBFSBE,
+ .descr = "SBF single-bit error",
+ },
+ {
+ .type = ERR_CORRECTED,
+ .mask = L2C_TAD_INT_FBFSBE,
+ .descr = "FBF single-bit error",
+ },
+ {
+ .type = ERR_UNCORRECTED,
+ .mask = L2C_TAD_INT_L2DDBE,
+ .descr = "L2D double-bit error",
+ },
+ {
+ .type = ERR_UNCORRECTED,
+ .mask = L2C_TAD_INT_SBFDBE,
+ .descr = "SBF double-bit error",
+ },
+ {
+ .type = ERR_UNCORRECTED,
+ .mask = L2C_TAD_INT_FBFDBE,
+ .descr = "FBF double-bit error",
+ },
+ {
+ .type = ERR_UNCORRECTED,
+ .mask = L2C_TAD_INT_TAGDBE,
+ .descr = "TAG double-bit error",
+ },
+ {
+ .type = ERR_UNCORRECTED,
+ .mask = L2C_TAD_INT_RTGDBE,
+ .descr = "RTG double-bit error",
+ },
+ {
+ .type = ERR_UNCORRECTED,
+ .mask = L2C_TAD_INT_WRDISOCI,
+ .descr = "Write to a disabled CCPI",
+ },
+ {
+ .type = ERR_UNCORRECTED,
+ .mask = L2C_TAD_INT_RDDISOCI,
+ .descr = "Read from a disabled CCPI",
+ },
+ {
+ .type = ERR_UNCORRECTED,
+ .mask = L2C_TAD_INT_WRDISLMC,
+ .descr = "Write to a disabled LMC",
+ },
+ {
+ .type = ERR_UNCORRECTED,
+ .mask = L2C_TAD_INT_RDDISLMC,
+ .descr = "Read from a disabled LMC",
+ },
+ {
+ .type = ERR_UNCORRECTED,
+ .mask = L2C_TAD_INT_LFBTO,
+ .descr = "LFB entry timeout",
+ },
+ {
+ .type = ERR_UNCORRECTED,
+ .mask = L2C_TAD_INT_GSYNCTO,
+ .descr = "Global sync CCPI timeout",
+ },
+ {0, 0, NULL},
+};
+
+#define L2C_TAD_INT_TAG (L2C_TAD_INT_TAGDBE)
+
+#define L2C_TAD_INT_RTG (L2C_TAD_INT_RTGDBE)
+
+#define L2C_TAD_INT_DISLMC (L2C_TAD_INT_WRDISLMC | L2C_TAD_INT_RDDISLMC)
+
+#define L2C_TAD_INT_DISOCI (L2C_TAD_INT_WRDISOCI | L2C_TAD_INT_RDDISOCI)
+
+#define L2C_TAD_INT_ENA_ALL (L2C_TAD_INT_ECC | L2C_TAD_INT_TAG | \
+ L2C_TAD_INT_RTG | \
+ L2C_TAD_INT_DISLMC | L2C_TAD_INT_DISOCI | \
+ L2C_TAD_INT_LFBTO)
+
+#define L2C_TAD_TIMETWO 0x50000
+#define L2C_TAD_TIMEOUT 0x50100
+#define L2C_TAD_ERR 0x60000
+#define L2C_TAD_TQD_ERR 0x60100
+#define L2C_TAD_TTG_ERR 0x60200
+
+
+#define L2C_CBC_INT_W1C 0x60000
+
+#define L2C_CBC_INT_RSDSBE BIT(0)
+#define L2C_CBC_INT_RSDDBE BIT(1)
+
+#define L2C_CBC_INT_RSD (L2C_CBC_INT_RSDSBE | L2C_CBC_INT_RSDDBE)
+
+#define L2C_CBC_INT_MIBSBE BIT(4)
+#define L2C_CBC_INT_MIBDBE BIT(5)
+
+#define L2C_CBC_INT_MIB (L2C_CBC_INT_MIBSBE | L2C_CBC_INT_MIBDBE)
+
+#define L2C_CBC_INT_IORDDISOCI BIT(6)
+#define L2C_CBC_INT_IOWRDISOCI BIT(7)
+
+#define L2C_CBC_INT_IODISOCI (L2C_CBC_INT_IORDDISOCI | \
+ L2C_CBC_INT_IOWRDISOCI)
+
+#define L2C_CBC_INT_CE (L2C_CBC_INT_RSDSBE | L2C_CBC_INT_MIBSBE)
+#define L2C_CBC_INT_UE (L2C_CBC_INT_RSDDBE | L2C_CBC_INT_MIBDBE)
+
+
+static const struct error_descr l2_cbc_errors[] = {
+ {
+ .type = ERR_CORRECTED,
+ .mask = L2C_CBC_INT_RSDSBE,
+ .descr = "RSD single-bit error",
+ },
+ {
+ .type = ERR_CORRECTED,
+ .mask = L2C_CBC_INT_MIBSBE,
+ .descr = "MIB single-bit error",
+ },
+ {
+ .type = ERR_UNCORRECTED,
+ .mask = L2C_CBC_INT_RSDDBE,
+ .descr = "RSD double-bit error",
+ },
+ {
+ .type = ERR_UNCORRECTED,
+ .mask = L2C_CBC_INT_MIBDBE,
+ .descr = "MIB double-bit error",
+ },
+ {
+ .type = ERR_UNCORRECTED,
+ .mask = L2C_CBC_INT_IORDDISOCI,
+ .descr = "Read from a disabled CCPI",
+ },
+ {
+ .type = ERR_UNCORRECTED,
+ .mask = L2C_CBC_INT_IOWRDISOCI,
+ .descr = "Write to a disabled CCPI",
+ },
+ {0, 0, NULL},
+};
+
+#define L2C_CBC_INT_W1S 0x60008
+#define L2C_CBC_INT_ENA_W1C 0x60020
+
+#define L2C_CBC_INT_ENA_ALL (L2C_CBC_INT_RSD | L2C_CBC_INT_MIB | \
+ L2C_CBC_INT_IODISOCI)
+
+#define L2C_CBC_INT_ENA_W1S 0x60028
+
+#define L2C_CBC_IODISOCIERR 0x80008
+#define L2C_CBC_IOCERR 0x80010
+#define L2C_CBC_RSDERR 0x80018
+#define L2C_CBC_MIBERR 0x80020
+
+
+#define L2C_MCI_INT_W1C 0x0
+
+#define L2C_MCI_INT_VBFSBE BIT(0)
+#define L2C_MCI_INT_VBFDBE BIT(1)
+
+static const struct error_descr l2_mci_errors[] = {
+ {
+ .type = ERR_CORRECTED,
+ .mask = L2C_MCI_INT_VBFSBE,
+ .descr = "VBF single-bit error",
+ },
+ {
+ .type = ERR_UNCORRECTED,
+ .mask = L2C_MCI_INT_VBFDBE,
+ .descr = "VBF double-bit error",
+ },
+ {0, 0, NULL},
+};
+
+#define L2C_MCI_INT_W1S 0x8
+#define L2C_MCI_INT_ENA_W1C 0x20
+
+#define L2C_MCI_INT_ENA_ALL (L2C_MCI_INT_VBFSBE | L2C_MCI_INT_VBFDBE)
+
+#define L2C_MCI_INT_ENA_W1S 0x28
+
+#define L2C_MCI_ERR 0x10000
+
+#define L2C_MESSAGE_SIZE SZ_1K
+#define L2C_OTHER_SIZE (50 * ARRAY_SIZE(l2_tad_errors))
+
+struct l2c_err_ctx {
+ char *reg_ext_name;
+ u64 reg_int;
+ u64 reg_ext;
+};
+
+struct thunderx_l2c {
+ void __iomem *regs;
+ struct pci_dev *pdev;
+ struct edac_device_ctl_info *edac_dev;
+
+ struct dentry *debugfs;
+
+ int index;
+
+ struct msix_entry msix_ent;
+
+ struct l2c_err_ctx err_ctx[RING_ENTRIES];
+ unsigned long ring_head;
+ unsigned long ring_tail;
+};
+
+static irqreturn_t thunderx_l2c_tad_isr(int irq, void *irq_id)
+{
+ struct msix_entry *msix = irq_id;
+ struct thunderx_l2c *tad = container_of(msix, struct thunderx_l2c,
+ msix_ent);
+
+ unsigned long head = ring_pos(tad->ring_head, ARRAY_SIZE(tad->err_ctx));
+ struct l2c_err_ctx *ctx = &tad->err_ctx[head];
+
+ ctx->reg_int = readq(tad->regs + L2C_TAD_INT_W1C);
+
+ if (ctx->reg_int & L2C_TAD_INT_ECC) {
+ ctx->reg_ext_name = "TQD_ERR";
+ ctx->reg_ext = readq(tad->regs + L2C_TAD_TQD_ERR);
+ } else if (ctx->reg_int & L2C_TAD_INT_TAG) {
+ ctx->reg_ext_name = "TTG_ERR";
+ ctx->reg_ext = readq(tad->regs + L2C_TAD_TTG_ERR);
+ } else if (ctx->reg_int & L2C_TAD_INT_LFBTO) {
+ ctx->reg_ext_name = "TIMEOUT";
+ ctx->reg_ext = readq(tad->regs + L2C_TAD_TIMEOUT);
+ } else if (ctx->reg_int & L2C_TAD_INT_DISOCI) {
+ ctx->reg_ext_name = "ERR";
+ ctx->reg_ext = readq(tad->regs + L2C_TAD_ERR);
+ }
+
+ writeq(ctx->reg_int, tad->regs + L2C_TAD_INT_W1C);
+
+ tad->ring_head++;
+
+ return IRQ_WAKE_THREAD;
+}
+
+static irqreturn_t thunderx_l2c_cbc_isr(int irq, void *irq_id)
+{
+ struct msix_entry *msix = irq_id;
+ struct thunderx_l2c *cbc = container_of(msix, struct thunderx_l2c,
+ msix_ent);
+
+ unsigned long head = ring_pos(cbc->ring_head, ARRAY_SIZE(cbc->err_ctx));
+ struct l2c_err_ctx *ctx = &cbc->err_ctx[head];
+
+ ctx->reg_int = readq(cbc->regs + L2C_CBC_INT_W1C);
+
+ if (ctx->reg_int & L2C_CBC_INT_RSD) {
+ ctx->reg_ext_name = "RSDERR";
+ ctx->reg_ext = readq(cbc->regs + L2C_CBC_RSDERR);
+ } else if (ctx->reg_int & L2C_CBC_INT_MIB) {
+ ctx->reg_ext_name = "MIBERR";
+ ctx->reg_ext = readq(cbc->regs + L2C_CBC_MIBERR);
+ } else if (ctx->reg_int & L2C_CBC_INT_IODISOCI) {
+ ctx->reg_ext_name = "IODISOCIERR";
+ ctx->reg_ext = readq(cbc->regs + L2C_CBC_IODISOCIERR);
+ }
+
+ writeq(ctx->reg_int, cbc->regs + L2C_CBC_INT_W1C);
+
+ cbc->ring_head++;
+
+ return IRQ_WAKE_THREAD;
+}
+
+static irqreturn_t thunderx_l2c_mci_isr(int irq, void *irq_id)
+{
+ struct msix_entry *msix = irq_id;
+ struct thunderx_l2c *mci = container_of(msix, struct thunderx_l2c,
+ msix_ent);
+
+ unsigned long head = ring_pos(mci->ring_head, ARRAY_SIZE(mci->err_ctx));
+ struct l2c_err_ctx *ctx = &mci->err_ctx[head];
+
+ ctx->reg_int = readq(mci->regs + L2C_MCI_INT_W1C);
+ ctx->reg_ext = readq(mci->regs + L2C_MCI_ERR);
+
+ writeq(ctx->reg_int, mci->regs + L2C_MCI_INT_W1C);
+
+ ctx->reg_ext_name = "ERR";
+
+ mci->ring_head++;
+
+ return IRQ_WAKE_THREAD;
+}
+
+static irqreturn_t thunderx_l2c_threaded_isr(int irq, void *irq_id)
+{
+ struct msix_entry *msix = irq_id;
+ struct thunderx_l2c *l2c = container_of(msix, struct thunderx_l2c,
+ msix_ent);
+
+ unsigned long tail = ring_pos(l2c->ring_tail, ARRAY_SIZE(l2c->err_ctx));
+ struct l2c_err_ctx *ctx = &l2c->err_ctx[tail];
+ irqreturn_t ret = IRQ_NONE;
+
+ u64 mask_ue, mask_ce;
+ const struct error_descr *l2_errors;
+ char *reg_int_name;
+
+ char *msg;
+ char *other;
+
+ msg = kmalloc(OCX_MESSAGE_SIZE, GFP_KERNEL);
+ other = kmalloc(OCX_OTHER_SIZE, GFP_KERNEL);
+
+ if (!msg || !other)
+ goto err_free;
+
+ switch (l2c->pdev->device) {
+ case PCI_DEVICE_ID_THUNDER_L2C_TAD:
+ reg_int_name = "L2C_TAD_INT";
+ mask_ue = L2C_TAD_INT_UE;
+ mask_ce = L2C_TAD_INT_CE;
+ l2_errors = l2_tad_errors;
+ break;
+ case PCI_DEVICE_ID_THUNDER_L2C_CBC:
+ reg_int_name = "L2C_CBC_INT";
+ mask_ue = L2C_CBC_INT_UE;
+ mask_ce = L2C_CBC_INT_CE;
+ l2_errors = l2_cbc_errors;
+ break;
+ case PCI_DEVICE_ID_THUNDER_L2C_MCI:
+ reg_int_name = "L2C_MCI_INT";
+ mask_ue = L2C_MCI_INT_VBFDBE;
+ mask_ce = L2C_MCI_INT_VBFSBE;
+ l2_errors = l2_mci_errors;
+ break;
+ default:
+ dev_err(&l2c->pdev->dev, "Unsupported device: %04x\n",
+ l2c->pdev->device);
+ return IRQ_NONE;
+ }
+
+ while (CIRC_CNT(l2c->ring_head, l2c->ring_tail,
+ ARRAY_SIZE(l2c->err_ctx))) {
+ snprintf(msg, L2C_MESSAGE_SIZE,
+ "%s: %s: %016llx, %s: %016llx",
+ l2c->edac_dev->ctl_name, reg_int_name, ctx->reg_int,
+ ctx->reg_ext_name, ctx->reg_ext);
+
+ decode_register(other, L2C_OTHER_SIZE, l2_errors, ctx->reg_int);
+
+ strncat(msg, other, L2C_MESSAGE_SIZE);
+
+ if (ctx->reg_int & mask_ue)
+ edac_device_handle_ue(l2c->edac_dev, 0, 0, msg);
+ else if (ctx->reg_int & mask_ce)
+ edac_device_handle_ce(l2c->edac_dev, 0, 0, msg);
+
+ l2c->ring_tail++;
+ }
+
+ return IRQ_HANDLED;
+
+err_free:
+ kfree(other);
+ kfree(msg);
+
+ return ret;
+}
+
+#define L2C_DEBUGFS_ATTR(_name, _reg) DEBUGFS_REG_ATTR(l2c, _name, _reg)
+
+L2C_DEBUGFS_ATTR(tad_int, L2C_TAD_INT_W1S);
+
+struct debugfs_entry *l2c_tad_dfs_ents[] = {
+ &debugfs_tad_int,
+};
+
+L2C_DEBUGFS_ATTR(cbc_int, L2C_CBC_INT_W1S);
+
+struct debugfs_entry *l2c_cbc_dfs_ents[] = {
+ &debugfs_cbc_int,
+};
+
+L2C_DEBUGFS_ATTR(mci_int, L2C_MCI_INT_W1S);
+
+struct debugfs_entry *l2c_mci_dfs_ents[] = {
+ &debugfs_mci_int,
+};
+
+static const struct pci_device_id thunderx_l2c_pci_tbl[] = {
+ { PCI_DEVICE(PCI_VENDOR_ID_CAVIUM, PCI_DEVICE_ID_THUNDER_L2C_TAD), },
+ { PCI_DEVICE(PCI_VENDOR_ID_CAVIUM, PCI_DEVICE_ID_THUNDER_L2C_CBC), },
+ { PCI_DEVICE(PCI_VENDOR_ID_CAVIUM, PCI_DEVICE_ID_THUNDER_L2C_MCI), },
+ { 0, },
+};
+
+static int thunderx_l2c_probe(struct pci_dev *pdev,
+ const struct pci_device_id *id)
+{
+ struct thunderx_l2c *l2c;
+ struct edac_device_ctl_info *edac_dev;
+ struct debugfs_entry **l2c_devattr;
+ size_t dfs_entries;
+ irqreturn_t (*thunderx_l2c_isr)(int, void *) = NULL;
+ char name[32];
+ const char *fmt;
+ u64 reg_en_offs, reg_en_mask;
+ int idx;
+ int ret;
+
+ ret = pcim_enable_device(pdev);
+ if (ret) {
+ dev_err(&pdev->dev, "Cannot enable PCI device: %d\n", ret);
+ return ret;
+ }
+
+ ret = pcim_iomap_regions(pdev, BIT(0), "thunderx_l2c");
+ if (ret) {
+ dev_err(&pdev->dev, "Cannot map PCI resources: %d\n", ret);
+ return ret;
+ }
+
+ switch (pdev->device) {
+ case PCI_DEVICE_ID_THUNDER_L2C_TAD:
+ thunderx_l2c_isr = thunderx_l2c_tad_isr;
+ l2c_devattr = l2c_tad_dfs_ents;
+ dfs_entries = ARRAY_SIZE(l2c_tad_dfs_ents);
+ fmt = "L2C-TAD%d";
+ reg_en_offs = L2C_TAD_INT_ENA_W1S;
+ reg_en_mask = L2C_TAD_INT_ENA_ALL;
+ break;
+ case PCI_DEVICE_ID_THUNDER_L2C_CBC:
+ thunderx_l2c_isr = thunderx_l2c_cbc_isr;
+ l2c_devattr = l2c_cbc_dfs_ents;
+ dfs_entries = ARRAY_SIZE(l2c_cbc_dfs_ents);
+ fmt = "L2C-CBC%d";
+ reg_en_offs = L2C_CBC_INT_ENA_W1S;
+ reg_en_mask = L2C_CBC_INT_ENA_ALL;
+ break;
+ case PCI_DEVICE_ID_THUNDER_L2C_MCI:
+ thunderx_l2c_isr = thunderx_l2c_mci_isr;
+ l2c_devattr = l2c_mci_dfs_ents;
+ dfs_entries = ARRAY_SIZE(l2c_mci_dfs_ents);
+ fmt = "L2C-MCI%d";
+ reg_en_offs = L2C_MCI_INT_ENA_W1S;
+ reg_en_mask = L2C_MCI_INT_ENA_ALL;
+ break;
+ default:
+ //Should never ever get here
+ dev_err(&pdev->dev, "Unsupported PCI device: %04x\n",
+ pdev->device);
+ return -EINVAL;
+ }
+
+ idx = edac_device_alloc_index();
+ snprintf(name, sizeof(name), fmt, idx);
+
+ edac_dev = edac_device_alloc_ctl_info(sizeof(struct thunderx_l2c),
+ name, 1, "L2C", 1, 0,
+ NULL, 0, idx);
+ if (!edac_dev) {
+ dev_err(&pdev->dev, "Cannot allocate EDAC device\n");
+ return -ENOMEM;
+ }
+
+ l2c = edac_dev->pvt_info;
+ l2c->edac_dev = edac_dev;
+
+ l2c->regs = pcim_iomap_table(pdev)[0];
+ if (!l2c->regs) {
+ dev_err(&pdev->dev, "Cannot map PCI resources\n");
+ ret = -ENODEV;
+ goto err_free;
+ }
+
+ l2c->pdev = pdev;
+
+ l2c->ring_head = 0;
+ l2c->ring_tail = 0;
+
+ l2c->msix_ent.entry = 0;
+ l2c->msix_ent.vector = 0;
+
+ ret = pci_enable_msix_exact(pdev, &l2c->msix_ent, 1);
+ if (ret) {
+ dev_err(&pdev->dev, "Cannot enable interrupt: %d\n", ret);
+ goto err_free;
+ }
+
+ ret = devm_request_threaded_irq(&pdev->dev, l2c->msix_ent.vector,
+ thunderx_l2c_isr,
+ thunderx_l2c_threaded_isr,
+ 0, "[EDAC] ThunderX L2C",
+ &l2c->msix_ent);
+ if (ret)
+ goto err_free;
+
+ edac_dev->dev = &pdev->dev;
+ edac_dev->dev_name = dev_name(&pdev->dev);
+ edac_dev->mod_name = "thunderx-l2c";
+ edac_dev->ctl_name = "thunderx-l2c";
+
+ ret = edac_device_add_device(edac_dev);
+ if (ret) {
+ dev_err(&pdev->dev, "Cannot add EDAC device: %d\n", ret);
+ goto err_free;
+ }
+
+ if (IS_ENABLED(CONFIG_EDAC_DEBUG)) {
+ l2c->debugfs = edac_debugfs_create_dir(pdev->dev.kobj.name);
+
+ thunderx_create_debugfs_nodes(l2c->debugfs, l2c_devattr,
+ l2c, dfs_entries);
+
+ if (ret != dfs_entries) {
+ dev_warn(&pdev->dev, "Error creating debugfs entries: %d%s\n",
+ ret, ret >= 0 ? " created" : "");
+ }
+ }
+
+ pci_set_drvdata(pdev, edac_dev);
+
+ writeq(reg_en_mask, l2c->regs + reg_en_offs);
+
+ return 0;
+
+err_free:
+ edac_device_free_ctl_info(edac_dev);
+
+ return ret;
+}
+
+static void thunderx_l2c_remove(struct pci_dev *pdev)
+{
+ struct edac_device_ctl_info *edac_dev = pci_get_drvdata(pdev);
+ struct thunderx_l2c *l2c = edac_dev->pvt_info;
+
+ switch (pdev->device) {
+ case PCI_DEVICE_ID_THUNDER_L2C_TAD:
+ writeq(L2C_TAD_INT_ENA_ALL, l2c->regs + L2C_TAD_INT_ENA_W1C);
+ break;
+ case PCI_DEVICE_ID_THUNDER_L2C_CBC:
+ writeq(L2C_CBC_INT_ENA_ALL, l2c->regs + L2C_CBC_INT_ENA_W1C);
+ break;
+ case PCI_DEVICE_ID_THUNDER_L2C_MCI:
+ writeq(L2C_MCI_INT_ENA_ALL, l2c->regs + L2C_MCI_INT_ENA_W1C);
+ break;
+ }
+
+ edac_debugfs_remove_recursive(l2c->debugfs);
+
+ edac_device_del_device(&pdev->dev);
+ edac_device_free_ctl_info(edac_dev);
+}
+
+MODULE_DEVICE_TABLE(pci, thunderx_l2c_pci_tbl);
+
+static struct pci_driver thunderx_l2c_driver = {
+ .name = "thunderx_l2c_edac",
+ .probe = thunderx_l2c_probe,
+ .remove = thunderx_l2c_remove,
+ .id_table = thunderx_l2c_pci_tbl,
+};
+
+static int __init thunderx_edac_init(void)
+{
+ int rc = 0;
+
+ rc = pci_register_driver(&thunderx_lmc_driver);
+ if (rc)
+ return rc;
+
+ rc = pci_register_driver(&thunderx_ocx_driver);
+ if (rc)
+ goto err_lmc;
+
+ rc = pci_register_driver(&thunderx_l2c_driver);
+ if (rc)
+ goto err_ocx;
+
+ return rc;
+err_ocx:
+ pci_unregister_driver(&thunderx_ocx_driver);
+err_lmc:
+ pci_unregister_driver(&thunderx_lmc_driver);
+
+ return rc;
+}
+
+static void __exit thunderx_edac_exit(void)
+{
+ pci_unregister_driver(&thunderx_l2c_driver);
+ pci_unregister_driver(&thunderx_ocx_driver);
+ pci_unregister_driver(&thunderx_lmc_driver);
+
+}
+
+module_init(thunderx_edac_init);
+module_exit(thunderx_edac_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Cavium, Inc.");
+MODULE_DESCRIPTION("EDAC Driver for Cavium ThunderX");
diff --git a/drivers/extcon/devres.c b/drivers/extcon/devres.c
index b40eb1805927..186fd735eb28 100644
--- a/drivers/extcon/devres.c
+++ b/drivers/extcon/devres.c
@@ -50,6 +50,13 @@ static void devm_extcon_dev_notifier_unreg(struct device *dev, void *res)
extcon_unregister_notifier(this->edev, this->id, this->nb);
}
+static void devm_extcon_dev_notifier_all_unreg(struct device *dev, void *res)
+{
+ struct extcon_dev_notifier_devres *this = res;
+
+ extcon_unregister_notifier_all(this->edev, this->nb);
+}
+
/**
* devm_extcon_dev_allocate - Allocate managed extcon device
* @dev: device owning the extcon device being created
@@ -214,3 +221,57 @@ void devm_extcon_unregister_notifier(struct device *dev,
devm_extcon_dev_match, edev));
}
EXPORT_SYMBOL(devm_extcon_unregister_notifier);
+
+/**
+ * devm_extcon_register_notifier_all()
+ * - Resource-managed extcon_register_notifier_all()
+ * @dev: device to allocate extcon device
+ * @edev: the extcon device that has the external connecotr.
+ * @nb: a notifier block to be registered.
+ *
+ * This function manages automatically the notifier of extcon device using
+ * device resource management and simplify the control of unregistering
+ * the notifier of extcon device. To get more information, refer that function.
+ *
+ * Returns 0 if success or negaive error number if failure.
+ */
+int devm_extcon_register_notifier_all(struct device *dev, struct extcon_dev *edev,
+ struct notifier_block *nb)
+{
+ struct extcon_dev_notifier_devres *ptr;
+ int ret;
+
+ ptr = devres_alloc(devm_extcon_dev_notifier_all_unreg, sizeof(*ptr),
+ GFP_KERNEL);
+ if (!ptr)
+ return -ENOMEM;
+
+ ret = extcon_register_notifier_all(edev, nb);
+ if (ret) {
+ devres_free(ptr);
+ return ret;
+ }
+
+ ptr->edev = edev;
+ ptr->nb = nb;
+ devres_add(dev, ptr);
+
+ return 0;
+}
+EXPORT_SYMBOL(devm_extcon_register_notifier_all);
+
+/**
+ * devm_extcon_unregister_notifier_all()
+ * - Resource-managed extcon_unregister_notifier_all()
+ * @dev: device to allocate extcon device
+ * @edev: the extcon device that has the external connecotr.
+ * @nb: a notifier block to be registered.
+ */
+void devm_extcon_unregister_notifier_all(struct device *dev,
+ struct extcon_dev *edev,
+ struct notifier_block *nb)
+{
+ WARN_ON(devres_release(dev, devm_extcon_dev_notifier_all_unreg,
+ devm_extcon_dev_match, edev));
+}
+EXPORT_SYMBOL(devm_extcon_unregister_notifier_all);
diff --git a/drivers/extcon/extcon.c b/drivers/extcon/extcon.c
index 09ac5e70c2f3..e7750545469f 100644
--- a/drivers/extcon/extcon.c
+++ b/drivers/extcon/extcon.c
@@ -448,8 +448,19 @@ int extcon_sync(struct extcon_dev *edev, unsigned int id)
spin_lock_irqsave(&edev->lock, flags);
state = !!(edev->state & BIT(index));
+
+ /*
+ * Call functions in a raw notifier chain for the specific one
+ * external connector.
+ */
raw_notifier_call_chain(&edev->nh[index], state, edev);
+ /*
+ * Call functions in a raw notifier chain for the all supported
+ * external connectors.
+ */
+ raw_notifier_call_chain(&edev->nh_all, state, edev);
+
/* This could be in interrupt handler */
prop_buf = (char *)get_zeroed_page(GFP_ATOMIC);
if (!prop_buf) {
@@ -954,6 +965,59 @@ int extcon_unregister_notifier(struct extcon_dev *edev, unsigned int id,
}
EXPORT_SYMBOL_GPL(extcon_unregister_notifier);
+/**
+ * extcon_register_notifier_all() - Register a notifier block for all connectors
+ * @edev: the extcon device that has the external connecotr.
+ * @nb: a notifier block to be registered.
+ *
+ * This fucntion registers a notifier block in order to receive the state
+ * change of all supported external connectors from extcon device.
+ * And The second parameter given to the callback of nb (val) is
+ * the current state and third parameter is the edev pointer.
+ *
+ * Returns 0 if success or error number if fail
+ */
+int extcon_register_notifier_all(struct extcon_dev *edev,
+ struct notifier_block *nb)
+{
+ unsigned long flags;
+ int ret;
+
+ if (!edev || !nb)
+ return -EINVAL;
+
+ spin_lock_irqsave(&edev->lock, flags);
+ ret = raw_notifier_chain_register(&edev->nh_all, nb);
+ spin_unlock_irqrestore(&edev->lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(extcon_register_notifier_all);
+
+/**
+ * extcon_unregister_notifier_all() - Unregister a notifier block from extcon.
+ * @edev: the extcon device that has the external connecotr.
+ * @nb: a notifier block to be registered.
+ *
+ * Returns 0 if success or error number if fail
+ */
+int extcon_unregister_notifier_all(struct extcon_dev *edev,
+ struct notifier_block *nb)
+{
+ unsigned long flags;
+ int ret;
+
+ if (!edev || !nb)
+ return -EINVAL;
+
+ spin_lock_irqsave(&edev->lock, flags);
+ ret = raw_notifier_chain_unregister(&edev->nh_all, nb);
+ spin_unlock_irqrestore(&edev->lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(extcon_unregister_notifier_all);
+
static struct attribute *extcon_attrs[] = {
&dev_attr_state.attr,
&dev_attr_name.attr,
@@ -1212,6 +1276,8 @@ int extcon_dev_register(struct extcon_dev *edev)
for (index = 0; index < edev->max_supported; index++)
RAW_INIT_NOTIFIER_HEAD(&edev->nh[index]);
+ RAW_INIT_NOTIFIER_HEAD(&edev->nh_all);
+
dev_set_drvdata(&edev->dev, edev);
edev->state = 0;
diff --git a/drivers/extcon/extcon.h b/drivers/extcon/extcon.h
index 993ddccafe11..dddddcfa0587 100644
--- a/drivers/extcon/extcon.h
+++ b/drivers/extcon/extcon.h
@@ -21,6 +21,8 @@
* @dev: Device of this extcon.
* @state: Attach/detach state of this extcon. Do not provide at
* register-time.
+ * @nh_all: Notifier for the state change events for all supported
+ * external connectors from this extcon.
* @nh: Notifier for the state change events from this extcon
* @entry: To support list of extcon devices so that users can
* search for extcon devices based on the extcon name.
@@ -43,6 +45,7 @@ struct extcon_dev {
/* Internal data. Please do not set. */
struct device dev;
+ struct raw_notifier_head nh_all;
struct raw_notifier_head *nh;
struct list_head entry;
int max_supported;
diff --git a/drivers/hsi/clients/ssi_protocol.c b/drivers/hsi/clients/ssi_protocol.c
index 7ef819680acd..26b05106f0d3 100644
--- a/drivers/hsi/clients/ssi_protocol.c
+++ b/drivers/hsi/clients/ssi_protocol.c
@@ -980,7 +980,7 @@ static int ssip_pn_xmit(struct sk_buff *skb, struct net_device *dev)
goto drop;
/* Pad to 32-bits - FIXME: Revisit*/
if ((skb->len & 3) && skb_pad(skb, 4 - (skb->len & 3)))
- goto drop;
+ goto inc_dropped;
/*
* Modem sends Phonet messages over SSI with its own endianess...
@@ -1032,8 +1032,9 @@ static int ssip_pn_xmit(struct sk_buff *skb, struct net_device *dev)
drop2:
hsi_free_msg(msg);
drop:
- dev->stats.tx_dropped++;
dev_kfree_skb(skb);
+inc_dropped:
+ dev->stats.tx_dropped++;
return 0;
}
diff --git a/drivers/hwmon/Kconfig b/drivers/hwmon/Kconfig
index 0649d53f3d16..22d5eafd6815 100644
--- a/drivers/hwmon/Kconfig
+++ b/drivers/hwmon/Kconfig
@@ -341,6 +341,15 @@ config SENSORS_ASB100
This driver can also be built as a module. If so, the module
will be called asb100.
+config SENSORS_ASPEED
+ tristate "ASPEED AST2400/AST2500 PWM and Fan tach driver"
+ help
+ This driver provides support for ASPEED AST2400/AST2500 PWM
+ and Fan Tacho controllers.
+
+ This driver can also be built as a module. If so, the module
+ will be called aspeed_pwm_tacho.
+
config SENSORS_ATXP1
tristate "Attansic ATXP1 VID controller"
depends on I2C
@@ -1643,16 +1652,6 @@ config SENSORS_TMP421
This driver can also be built as a module. If so, the module
will be called tmp421.
-config SENSORS_TWL4030_MADC
- tristate "Texas Instruments TWL4030 MADC Hwmon"
- depends on TWL4030_MADC
- help
- If you say yes here you get hwmon support for triton
- TWL4030-MADC.
-
- This driver can also be built as a module. If so it will be called
- twl4030-madc-hwmon.
-
config SENSORS_VEXPRESS
tristate "Versatile Express"
depends on VEXPRESS_CONFIG
diff --git a/drivers/hwmon/Makefile b/drivers/hwmon/Makefile
index 5509edf6186a..d4641a9f16c1 100644
--- a/drivers/hwmon/Makefile
+++ b/drivers/hwmon/Makefile
@@ -46,6 +46,7 @@ obj-$(CONFIG_SENSORS_ADT7475) += adt7475.o
obj-$(CONFIG_SENSORS_APPLESMC) += applesmc.o
obj-$(CONFIG_SENSORS_ARM_SCPI) += scpi-hwmon.o
obj-$(CONFIG_SENSORS_ASC7621) += asc7621.o
+obj-$(CONFIG_SENSORS_ASPEED) += aspeed-pwm-tacho.o
obj-$(CONFIG_SENSORS_ATXP1) += atxp1.o
obj-$(CONFIG_SENSORS_CORETEMP) += coretemp.o
obj-$(CONFIG_SENSORS_DA9052_ADC)+= da9052-hwmon.o
@@ -157,7 +158,6 @@ obj-$(CONFIG_SENSORS_TMP103) += tmp103.o
obj-$(CONFIG_SENSORS_TMP108) += tmp108.o
obj-$(CONFIG_SENSORS_TMP401) += tmp401.o
obj-$(CONFIG_SENSORS_TMP421) += tmp421.o
-obj-$(CONFIG_SENSORS_TWL4030_MADC)+= twl4030-madc-hwmon.o
obj-$(CONFIG_SENSORS_VEXPRESS) += vexpress-hwmon.o
obj-$(CONFIG_SENSORS_VIA_CPUTEMP)+= via-cputemp.o
obj-$(CONFIG_SENSORS_VIA686A) += via686a.o
diff --git a/drivers/hwmon/ad7414.c b/drivers/hwmon/ad7414.c
index 763490acc0df..cec227f13874 100644
--- a/drivers/hwmon/ad7414.c
+++ b/drivers/hwmon/ad7414.c
@@ -217,9 +217,16 @@ static const struct i2c_device_id ad7414_id[] = {
};
MODULE_DEVICE_TABLE(i2c, ad7414_id);
+static const struct of_device_id ad7414_of_match[] = {
+ { .compatible = "ad,ad7414" },
+ { },
+};
+MODULE_DEVICE_TABLE(of, ad7414_of_match);
+
static struct i2c_driver ad7414_driver = {
.driver = {
.name = "ad7414",
+ .of_match_table = of_match_ptr(ad7414_of_match),
},
.probe = ad7414_probe,
.id_table = ad7414_id,
diff --git a/drivers/hwmon/adc128d818.c b/drivers/hwmon/adc128d818.c
index bbe3a5c5b3f5..a557b46dbe8e 100644
--- a/drivers/hwmon/adc128d818.c
+++ b/drivers/hwmon/adc128d818.c
@@ -546,10 +546,17 @@ static const struct i2c_device_id adc128_id[] = {
};
MODULE_DEVICE_TABLE(i2c, adc128_id);
+static const struct of_device_id adc128_of_match[] = {
+ { .compatible = "ti,adc128d818" },
+ { },
+};
+MODULE_DEVICE_TABLE(of, adc128_of_match);
+
static struct i2c_driver adc128_driver = {
.class = I2C_CLASS_HWMON,
.driver = {
.name = "adc128d818",
+ .of_match_table = of_match_ptr(adc128_of_match),
},
.probe = adc128_probe,
.remove = adc128_remove,
diff --git a/drivers/hwmon/ads1015.c b/drivers/hwmon/ads1015.c
index 2b3105c8aed3..5140c27d16dd 100644
--- a/drivers/hwmon/ads1015.c
+++ b/drivers/hwmon/ads1015.c
@@ -31,6 +31,7 @@
#include <linux/hwmon-sysfs.h>
#include <linux/err.h>
#include <linux/mutex.h>
+#include <linux/of_device.h>
#include <linux/of.h>
#include <linux/i2c/ads1015.h>
@@ -268,7 +269,12 @@ static int ads1015_probe(struct i2c_client *client,
GFP_KERNEL);
if (!data)
return -ENOMEM;
- data->id = id->driver_data;
+
+ if (client->dev.of_node)
+ data->id = (enum ads1015_chips)
+ of_device_get_match_data(&client->dev);
+ else
+ data->id = id->driver_data;
i2c_set_clientdata(client, data);
mutex_init(&data->update_lock);
@@ -303,9 +309,23 @@ static const struct i2c_device_id ads1015_id[] = {
};
MODULE_DEVICE_TABLE(i2c, ads1015_id);
+static const struct of_device_id ads1015_of_match[] = {
+ {
+ .compatible = "ti,ads1015",
+ .data = (void *)ads1015
+ },
+ {
+ .compatible = "ti,ads1115",
+ .data = (void *)ads1115
+ },
+ { },
+};
+MODULE_DEVICE_TABLE(of, ads1015_of_match);
+
static struct i2c_driver ads1015_driver = {
.driver = {
.name = "ads1015",
+ .of_match_table = of_match_ptr(ads1015_of_match),
},
.probe = ads1015_probe,
.remove = ads1015_remove,
diff --git a/drivers/hwmon/ads7828.c b/drivers/hwmon/ads7828.c
index ee396ff167d9..898607bf682b 100644
--- a/drivers/hwmon/ads7828.c
+++ b/drivers/hwmon/ads7828.c
@@ -31,9 +31,11 @@
#include <linux/i2c.h>
#include <linux/init.h>
#include <linux/module.h>
+#include <linux/of_device.h>
#include <linux/platform_data/ads7828.h>
#include <linux/regmap.h>
#include <linux/slab.h>
+#include <linux/regulator/consumer.h>
/* The ADS7828 registers */
#define ADS7828_CMD_SD_SE 0x80 /* Single ended inputs */
@@ -118,9 +120,12 @@ static int ads7828_probe(struct i2c_client *client,
struct ads7828_data *data;
struct device *hwmon_dev;
unsigned int vref_mv = ADS7828_INT_VREF_MV;
+ unsigned int vref_uv;
bool diff_input = false;
bool ext_vref = false;
unsigned int regval;
+ enum ads7828_chips chip;
+ struct regulator *reg;
data = devm_kzalloc(dev, sizeof(struct ads7828_data), GFP_KERNEL);
if (!data)
@@ -131,14 +136,32 @@ static int ads7828_probe(struct i2c_client *client,
ext_vref = pdata->ext_vref;
if (ext_vref && pdata->vref_mv)
vref_mv = pdata->vref_mv;
+ } else if (dev->of_node) {
+ diff_input = of_property_read_bool(dev->of_node,
+ "ti,differential-input");
+ reg = devm_regulator_get_optional(dev, "vref");
+ if (!IS_ERR(reg)) {
+ vref_uv = regulator_get_voltage(reg);
+ vref_mv = DIV_ROUND_CLOSEST(vref_uv, 1000);
+ if (vref_mv < ADS7828_EXT_VREF_MV_MIN ||
+ vref_mv > ADS7828_EXT_VREF_MV_MAX)
+ return -EINVAL;
+ ext_vref = true;
+ }
}
+ if (client->dev.of_node)
+ chip = (enum ads7828_chips)
+ of_device_get_match_data(&client->dev);
+ else
+ chip = id->driver_data;
+
/* Bound Vref with min/max values */
vref_mv = clamp_val(vref_mv, ADS7828_EXT_VREF_MV_MIN,
ADS7828_EXT_VREF_MV_MAX);
/* ADS7828 uses 12-bit samples, while ADS7830 is 8-bit */
- if (id->driver_data == ads7828) {
+ if (chip == ads7828) {
data->lsb_resol = DIV_ROUND_CLOSEST(vref_mv * 1000, 4096);
data->regmap = devm_regmap_init_i2c(client,
&ads2828_regmap_config);
@@ -177,9 +200,23 @@ static const struct i2c_device_id ads7828_device_ids[] = {
};
MODULE_DEVICE_TABLE(i2c, ads7828_device_ids);
+static const struct of_device_id ads7828_of_match[] = {
+ {
+ .compatible = "ti,ads7828",
+ .data = (void *)ads7828
+ },
+ {
+ .compatible = "ti,ads7830",
+ .data = (void *)ads7830
+ },
+ { },
+};
+MODULE_DEVICE_TABLE(of, ads7828_of_match);
+
static struct i2c_driver ads7828_driver = {
.driver = {
.name = "ads7828",
+ .of_match_table = of_match_ptr(ads7828_of_match),
},
.id_table = ads7828_device_ids,
diff --git a/drivers/hwmon/adt7475.c b/drivers/hwmon/adt7475.c
index c646670b9ea9..c803e3c5fcd4 100644
--- a/drivers/hwmon/adt7475.c
+++ b/drivers/hwmon/adt7475.c
@@ -13,6 +13,7 @@
*/
#include <linux/module.h>
+#include <linux/of_device.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/i2c.h>
@@ -58,6 +59,8 @@
#define REG_VENDID 0x3E
#define REG_DEVID2 0x3F
+#define REG_CONFIG1 0x40
+
#define REG_STATUS1 0x41
#define REG_STATUS2 0x42
@@ -161,6 +164,27 @@ static const struct i2c_device_id adt7475_id[] = {
};
MODULE_DEVICE_TABLE(i2c, adt7475_id);
+static const struct of_device_id adt7475_of_match[] = {
+ {
+ .compatible = "adi,adt7473",
+ .data = (void *)adt7473
+ },
+ {
+ .compatible = "adi,adt7475",
+ .data = (void *)adt7475
+ },
+ {
+ .compatible = "adi,adt7476",
+ .data = (void *)adt7476
+ },
+ {
+ .compatible = "adi,adt7490",
+ .data = (void *)adt7490
+ },
+ { },
+};
+MODULE_DEVICE_TABLE(of, adt7475_of_match);
+
struct adt7475_data {
struct device *hwmon_dev;
struct mutex lock;
@@ -1250,6 +1274,7 @@ static void adt7475_remove_files(struct i2c_client *client,
static int adt7475_probe(struct i2c_client *client,
const struct i2c_device_id *id)
{
+ enum chips chip;
static const char * const names[] = {
[adt7473] = "ADT7473",
[adt7475] = "ADT7475",
@@ -1268,8 +1293,13 @@ static int adt7475_probe(struct i2c_client *client,
mutex_init(&data->lock);
i2c_set_clientdata(client, data);
+ if (client->dev.of_node)
+ chip = (enum chips)of_device_get_match_data(&client->dev);
+ else
+ chip = id->driver_data;
+
/* Initialize device-specific values */
- switch (id->driver_data) {
+ switch (chip) {
case adt7476:
data->has_voltage = 0x0e; /* in1 to in3 */
revision = adt7475_read(REG_DEVID2) & 0x07;
@@ -1343,6 +1373,17 @@ static int adt7475_probe(struct i2c_client *client,
for (i = 0; i < ADT7475_PWM_COUNT; i++)
adt7475_read_pwm(client, i);
+ /* Start monitoring */
+ switch (chip) {
+ case adt7475:
+ case adt7476:
+ i2c_smbus_write_byte_data(client, REG_CONFIG1,
+ adt7475_read(REG_CONFIG1) | 0x01);
+ break;
+ default:
+ break;
+ }
+
ret = sysfs_create_group(&client->dev.kobj, &adt7475_attr_group);
if (ret)
return ret;
@@ -1428,6 +1469,7 @@ static struct i2c_driver adt7475_driver = {
.class = I2C_CLASS_HWMON,
.driver = {
.name = "adt7475",
+ .of_match_table = of_match_ptr(adt7475_of_match),
},
.probe = adt7475_probe,
.remove = adt7475_remove,
diff --git a/drivers/hwmon/aspeed-pwm-tacho.c b/drivers/hwmon/aspeed-pwm-tacho.c
new file mode 100644
index 000000000000..48403a2115be
--- /dev/null
+++ b/drivers/hwmon/aspeed-pwm-tacho.c
@@ -0,0 +1,835 @@
+/*
+ * Copyright (c) 2016 Google, Inc
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 or later as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/clk.h>
+#include <linux/gpio/consumer.h>
+#include <linux/delay.h>
+#include <linux/hwmon.h>
+#include <linux/hwmon-sysfs.h>
+#include <linux/io.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/of_platform.h>
+#include <linux/of_device.h>
+#include <linux/platform_device.h>
+#include <linux/sysfs.h>
+#include <linux/regmap.h>
+
+/* ASPEED PWM & FAN Tach Register Definition */
+#define ASPEED_PTCR_CTRL 0x00
+#define ASPEED_PTCR_CLK_CTRL 0x04
+#define ASPEED_PTCR_DUTY0_CTRL 0x08
+#define ASPEED_PTCR_DUTY1_CTRL 0x0c
+#define ASPEED_PTCR_TYPEM_CTRL 0x10
+#define ASPEED_PTCR_TYPEM_CTRL1 0x14
+#define ASPEED_PTCR_TYPEN_CTRL 0x18
+#define ASPEED_PTCR_TYPEN_CTRL1 0x1c
+#define ASPEED_PTCR_TACH_SOURCE 0x20
+#define ASPEED_PTCR_TRIGGER 0x28
+#define ASPEED_PTCR_RESULT 0x2c
+#define ASPEED_PTCR_INTR_CTRL 0x30
+#define ASPEED_PTCR_INTR_STS 0x34
+#define ASPEED_PTCR_TYPEM_LIMIT 0x38
+#define ASPEED_PTCR_TYPEN_LIMIT 0x3C
+#define ASPEED_PTCR_CTRL_EXT 0x40
+#define ASPEED_PTCR_CLK_CTRL_EXT 0x44
+#define ASPEED_PTCR_DUTY2_CTRL 0x48
+#define ASPEED_PTCR_DUTY3_CTRL 0x4c
+#define ASPEED_PTCR_TYPEO_CTRL 0x50
+#define ASPEED_PTCR_TYPEO_CTRL1 0x54
+#define ASPEED_PTCR_TACH_SOURCE_EXT 0x60
+#define ASPEED_PTCR_TYPEO_LIMIT 0x78
+
+/* ASPEED_PTCR_CTRL : 0x00 - General Control Register */
+#define ASPEED_PTCR_CTRL_SET_PWMD_TYPE_PART1 15
+#define ASPEED_PTCR_CTRL_SET_PWMD_TYPE_PART2 6
+#define ASPEED_PTCR_CTRL_SET_PWMD_TYPE_MASK (BIT(7) | BIT(15))
+
+#define ASPEED_PTCR_CTRL_SET_PWMC_TYPE_PART1 14
+#define ASPEED_PTCR_CTRL_SET_PWMC_TYPE_PART2 5
+#define ASPEED_PTCR_CTRL_SET_PWMC_TYPE_MASK (BIT(6) | BIT(14))
+
+#define ASPEED_PTCR_CTRL_SET_PWMB_TYPE_PART1 13
+#define ASPEED_PTCR_CTRL_SET_PWMB_TYPE_PART2 4
+#define ASPEED_PTCR_CTRL_SET_PWMB_TYPE_MASK (BIT(5) | BIT(13))
+
+#define ASPEED_PTCR_CTRL_SET_PWMA_TYPE_PART1 12
+#define ASPEED_PTCR_CTRL_SET_PWMA_TYPE_PART2 3
+#define ASPEED_PTCR_CTRL_SET_PWMA_TYPE_MASK (BIT(4) | BIT(12))
+
+#define ASPEED_PTCR_CTRL_FAN_NUM_EN(x) BIT(16 + (x))
+
+#define ASPEED_PTCR_CTRL_PWMD_EN BIT(11)
+#define ASPEED_PTCR_CTRL_PWMC_EN BIT(10)
+#define ASPEED_PTCR_CTRL_PWMB_EN BIT(9)
+#define ASPEED_PTCR_CTRL_PWMA_EN BIT(8)
+
+#define ASPEED_PTCR_CTRL_CLK_SRC BIT(1)
+#define ASPEED_PTCR_CTRL_CLK_EN BIT(0)
+
+/* ASPEED_PTCR_CLK_CTRL : 0x04 - Clock Control Register */
+/* TYPE N */
+#define ASPEED_PTCR_CLK_CTRL_TYPEN_MASK GENMASK(31, 16)
+#define ASPEED_PTCR_CLK_CTRL_TYPEN_UNIT 24
+#define ASPEED_PTCR_CLK_CTRL_TYPEN_H 20
+#define ASPEED_PTCR_CLK_CTRL_TYPEN_L 16
+/* TYPE M */
+#define ASPEED_PTCR_CLK_CTRL_TYPEM_MASK GENMASK(15, 0)
+#define ASPEED_PTCR_CLK_CTRL_TYPEM_UNIT 8
+#define ASPEED_PTCR_CLK_CTRL_TYPEM_H 4
+#define ASPEED_PTCR_CLK_CTRL_TYPEM_L 0
+
+/*
+ * ASPEED_PTCR_DUTY_CTRL/1/2/3 : 0x08/0x0C/0x48/0x4C - PWM-FAN duty control
+ * 0/1/2/3 register
+ */
+#define DUTY_CTRL_PWM2_FALL_POINT 24
+#define DUTY_CTRL_PWM2_RISE_POINT 16
+#define DUTY_CTRL_PWM2_RISE_FALL_MASK GENMASK(31, 16)
+#define DUTY_CTRL_PWM1_FALL_POINT 8
+#define DUTY_CTRL_PWM1_RISE_POINT 0
+#define DUTY_CTRL_PWM1_RISE_FALL_MASK GENMASK(15, 0)
+
+/* ASPEED_PTCR_TYPEM_CTRL : 0x10/0x18/0x50 - Type M/N/O Ctrl 0 Register */
+#define TYPE_CTRL_FAN_MASK (GENMASK(5, 1) | GENMASK(31, 16))
+#define TYPE_CTRL_FAN1_MASK GENMASK(31, 0)
+#define TYPE_CTRL_FAN_PERIOD 16
+#define TYPE_CTRL_FAN_MODE 4
+#define TYPE_CTRL_FAN_DIVISION 1
+#define TYPE_CTRL_FAN_TYPE_EN 1
+
+/* ASPEED_PTCR_TACH_SOURCE : 0x20/0x60 - Tach Source Register */
+/* bit [0,1] at 0x20, bit [2] at 0x60 */
+#define TACH_PWM_SOURCE_BIT01(x) ((x) * 2)
+#define TACH_PWM_SOURCE_BIT2(x) ((x) * 2)
+#define TACH_PWM_SOURCE_MASK_BIT01(x) (0x3 << ((x) * 2))
+#define TACH_PWM_SOURCE_MASK_BIT2(x) BIT((x) * 2)
+
+/* ASPEED_PTCR_RESULT : 0x2c - Result Register */
+#define RESULT_STATUS_MASK BIT(31)
+#define RESULT_VALUE_MASK 0xfffff
+
+/* ASPEED_PTCR_CTRL_EXT : 0x40 - General Control Extension #1 Register */
+#define ASPEED_PTCR_CTRL_SET_PWMH_TYPE_PART1 15
+#define ASPEED_PTCR_CTRL_SET_PWMH_TYPE_PART2 6
+#define ASPEED_PTCR_CTRL_SET_PWMH_TYPE_MASK (BIT(7) | BIT(15))
+
+#define ASPEED_PTCR_CTRL_SET_PWMG_TYPE_PART1 14
+#define ASPEED_PTCR_CTRL_SET_PWMG_TYPE_PART2 5
+#define ASPEED_PTCR_CTRL_SET_PWMG_TYPE_MASK (BIT(6) | BIT(14))
+
+#define ASPEED_PTCR_CTRL_SET_PWMF_TYPE_PART1 13
+#define ASPEED_PTCR_CTRL_SET_PWMF_TYPE_PART2 4
+#define ASPEED_PTCR_CTRL_SET_PWMF_TYPE_MASK (BIT(5) | BIT(13))
+
+#define ASPEED_PTCR_CTRL_SET_PWME_TYPE_PART1 12
+#define ASPEED_PTCR_CTRL_SET_PWME_TYPE_PART2 3
+#define ASPEED_PTCR_CTRL_SET_PWME_TYPE_MASK (BIT(4) | BIT(12))
+
+#define ASPEED_PTCR_CTRL_PWMH_EN BIT(11)
+#define ASPEED_PTCR_CTRL_PWMG_EN BIT(10)
+#define ASPEED_PTCR_CTRL_PWMF_EN BIT(9)
+#define ASPEED_PTCR_CTRL_PWME_EN BIT(8)
+
+/* ASPEED_PTCR_CLK_EXT_CTRL : 0x44 - Clock Control Extension #1 Register */
+/* TYPE O */
+#define ASPEED_PTCR_CLK_CTRL_TYPEO_MASK GENMASK(15, 0)
+#define ASPEED_PTCR_CLK_CTRL_TYPEO_UNIT 8
+#define ASPEED_PTCR_CLK_CTRL_TYPEO_H 4
+#define ASPEED_PTCR_CLK_CTRL_TYPEO_L 0
+
+#define PWM_MAX 255
+
+#define M_PWM_DIV_H 0x00
+#define M_PWM_DIV_L 0x05
+#define M_PWM_PERIOD 0x5F
+#define M_TACH_CLK_DIV 0x00
+#define M_TACH_MODE 0x00
+#define M_TACH_UNIT 0x1000
+#define INIT_FAN_CTRL 0xFF
+
+struct aspeed_pwm_tacho_data {
+ struct regmap *regmap;
+ unsigned long clk_freq;
+ bool pwm_present[8];
+ bool fan_tach_present[16];
+ u8 type_pwm_clock_unit[3];
+ u8 type_pwm_clock_division_h[3];
+ u8 type_pwm_clock_division_l[3];
+ u8 type_fan_tach_clock_division[3];
+ u16 type_fan_tach_unit[3];
+ u8 pwm_port_type[8];
+ u8 pwm_port_fan_ctrl[8];
+ u8 fan_tach_ch_source[16];
+ const struct attribute_group *groups[3];
+};
+
+enum type { TYPEM, TYPEN, TYPEO };
+
+struct type_params {
+ u32 l_value;
+ u32 h_value;
+ u32 unit_value;
+ u32 clk_ctrl_mask;
+ u32 clk_ctrl_reg;
+ u32 ctrl_reg;
+ u32 ctrl_reg1;
+};
+
+static const struct type_params type_params[] = {
+ [TYPEM] = {
+ .l_value = ASPEED_PTCR_CLK_CTRL_TYPEM_L,
+ .h_value = ASPEED_PTCR_CLK_CTRL_TYPEM_H,
+ .unit_value = ASPEED_PTCR_CLK_CTRL_TYPEM_UNIT,
+ .clk_ctrl_mask = ASPEED_PTCR_CLK_CTRL_TYPEM_MASK,
+ .clk_ctrl_reg = ASPEED_PTCR_CLK_CTRL,
+ .ctrl_reg = ASPEED_PTCR_TYPEM_CTRL,
+ .ctrl_reg1 = ASPEED_PTCR_TYPEM_CTRL1,
+ },
+ [TYPEN] = {
+ .l_value = ASPEED_PTCR_CLK_CTRL_TYPEN_L,
+ .h_value = ASPEED_PTCR_CLK_CTRL_TYPEN_H,
+ .unit_value = ASPEED_PTCR_CLK_CTRL_TYPEN_UNIT,
+ .clk_ctrl_mask = ASPEED_PTCR_CLK_CTRL_TYPEN_MASK,
+ .clk_ctrl_reg = ASPEED_PTCR_CLK_CTRL,
+ .ctrl_reg = ASPEED_PTCR_TYPEN_CTRL,
+ .ctrl_reg1 = ASPEED_PTCR_TYPEN_CTRL1,
+ },
+ [TYPEO] = {
+ .l_value = ASPEED_PTCR_CLK_CTRL_TYPEO_L,
+ .h_value = ASPEED_PTCR_CLK_CTRL_TYPEO_H,
+ .unit_value = ASPEED_PTCR_CLK_CTRL_TYPEO_UNIT,
+ .clk_ctrl_mask = ASPEED_PTCR_CLK_CTRL_TYPEO_MASK,
+ .clk_ctrl_reg = ASPEED_PTCR_CLK_CTRL_EXT,
+ .ctrl_reg = ASPEED_PTCR_TYPEO_CTRL,
+ .ctrl_reg1 = ASPEED_PTCR_TYPEO_CTRL1,
+ }
+};
+
+enum pwm_port { PWMA, PWMB, PWMC, PWMD, PWME, PWMF, PWMG, PWMH };
+
+struct pwm_port_params {
+ u32 pwm_en;
+ u32 ctrl_reg;
+ u32 type_part1;
+ u32 type_part2;
+ u32 type_mask;
+ u32 duty_ctrl_rise_point;
+ u32 duty_ctrl_fall_point;
+ u32 duty_ctrl_reg;
+ u32 duty_ctrl_rise_fall_mask;
+};
+
+static const struct pwm_port_params pwm_port_params[] = {
+ [PWMA] = {
+ .pwm_en = ASPEED_PTCR_CTRL_PWMA_EN,
+ .ctrl_reg = ASPEED_PTCR_CTRL,
+ .type_part1 = ASPEED_PTCR_CTRL_SET_PWMA_TYPE_PART1,
+ .type_part2 = ASPEED_PTCR_CTRL_SET_PWMA_TYPE_PART2,
+ .type_mask = ASPEED_PTCR_CTRL_SET_PWMA_TYPE_MASK,
+ .duty_ctrl_rise_point = DUTY_CTRL_PWM1_RISE_POINT,
+ .duty_ctrl_fall_point = DUTY_CTRL_PWM1_FALL_POINT,
+ .duty_ctrl_reg = ASPEED_PTCR_DUTY0_CTRL,
+ .duty_ctrl_rise_fall_mask = DUTY_CTRL_PWM1_RISE_FALL_MASK,
+ },
+ [PWMB] = {
+ .pwm_en = ASPEED_PTCR_CTRL_PWMB_EN,
+ .ctrl_reg = ASPEED_PTCR_CTRL,
+ .type_part1 = ASPEED_PTCR_CTRL_SET_PWMB_TYPE_PART1,
+ .type_part2 = ASPEED_PTCR_CTRL_SET_PWMB_TYPE_PART2,
+ .type_mask = ASPEED_PTCR_CTRL_SET_PWMB_TYPE_MASK,
+ .duty_ctrl_rise_point = DUTY_CTRL_PWM2_RISE_POINT,
+ .duty_ctrl_fall_point = DUTY_CTRL_PWM2_FALL_POINT,
+ .duty_ctrl_reg = ASPEED_PTCR_DUTY0_CTRL,
+ .duty_ctrl_rise_fall_mask = DUTY_CTRL_PWM2_RISE_FALL_MASK,
+ },
+ [PWMC] = {
+ .pwm_en = ASPEED_PTCR_CTRL_PWMC_EN,
+ .ctrl_reg = ASPEED_PTCR_CTRL,
+ .type_part1 = ASPEED_PTCR_CTRL_SET_PWMC_TYPE_PART1,
+ .type_part2 = ASPEED_PTCR_CTRL_SET_PWMC_TYPE_PART2,
+ .type_mask = ASPEED_PTCR_CTRL_SET_PWMC_TYPE_MASK,
+ .duty_ctrl_rise_point = DUTY_CTRL_PWM1_RISE_POINT,
+ .duty_ctrl_fall_point = DUTY_CTRL_PWM1_FALL_POINT,
+ .duty_ctrl_reg = ASPEED_PTCR_DUTY1_CTRL,
+ .duty_ctrl_rise_fall_mask = DUTY_CTRL_PWM1_RISE_FALL_MASK,
+ },
+ [PWMD] = {
+ .pwm_en = ASPEED_PTCR_CTRL_PWMD_EN,
+ .ctrl_reg = ASPEED_PTCR_CTRL,
+ .type_part1 = ASPEED_PTCR_CTRL_SET_PWMD_TYPE_PART1,
+ .type_part2 = ASPEED_PTCR_CTRL_SET_PWMD_TYPE_PART2,
+ .type_mask = ASPEED_PTCR_CTRL_SET_PWMD_TYPE_MASK,
+ .duty_ctrl_rise_point = DUTY_CTRL_PWM2_RISE_POINT,
+ .duty_ctrl_fall_point = DUTY_CTRL_PWM2_FALL_POINT,
+ .duty_ctrl_reg = ASPEED_PTCR_DUTY1_CTRL,
+ .duty_ctrl_rise_fall_mask = DUTY_CTRL_PWM2_RISE_FALL_MASK,
+ },
+ [PWME] = {
+ .pwm_en = ASPEED_PTCR_CTRL_PWME_EN,
+ .ctrl_reg = ASPEED_PTCR_CTRL_EXT,
+ .type_part1 = ASPEED_PTCR_CTRL_SET_PWME_TYPE_PART1,
+ .type_part2 = ASPEED_PTCR_CTRL_SET_PWME_TYPE_PART2,
+ .type_mask = ASPEED_PTCR_CTRL_SET_PWME_TYPE_MASK,
+ .duty_ctrl_rise_point = DUTY_CTRL_PWM1_RISE_POINT,
+ .duty_ctrl_fall_point = DUTY_CTRL_PWM1_FALL_POINT,
+ .duty_ctrl_reg = ASPEED_PTCR_DUTY2_CTRL,
+ .duty_ctrl_rise_fall_mask = DUTY_CTRL_PWM1_RISE_FALL_MASK,
+ },
+ [PWMF] = {
+ .pwm_en = ASPEED_PTCR_CTRL_PWMF_EN,
+ .ctrl_reg = ASPEED_PTCR_CTRL_EXT,
+ .type_part1 = ASPEED_PTCR_CTRL_SET_PWMF_TYPE_PART1,
+ .type_part2 = ASPEED_PTCR_CTRL_SET_PWMF_TYPE_PART2,
+ .type_mask = ASPEED_PTCR_CTRL_SET_PWMF_TYPE_MASK,
+ .duty_ctrl_rise_point = DUTY_CTRL_PWM2_RISE_POINT,
+ .duty_ctrl_fall_point = DUTY_CTRL_PWM2_FALL_POINT,
+ .duty_ctrl_reg = ASPEED_PTCR_DUTY2_CTRL,
+ .duty_ctrl_rise_fall_mask = DUTY_CTRL_PWM2_RISE_FALL_MASK,
+ },
+ [PWMG] = {
+ .pwm_en = ASPEED_PTCR_CTRL_PWMG_EN,
+ .ctrl_reg = ASPEED_PTCR_CTRL_EXT,
+ .type_part1 = ASPEED_PTCR_CTRL_SET_PWMG_TYPE_PART1,
+ .type_part2 = ASPEED_PTCR_CTRL_SET_PWMG_TYPE_PART2,
+ .type_mask = ASPEED_PTCR_CTRL_SET_PWMG_TYPE_MASK,
+ .duty_ctrl_rise_point = DUTY_CTRL_PWM1_RISE_POINT,
+ .duty_ctrl_fall_point = DUTY_CTRL_PWM1_FALL_POINT,
+ .duty_ctrl_reg = ASPEED_PTCR_DUTY3_CTRL,
+ .duty_ctrl_rise_fall_mask = DUTY_CTRL_PWM1_RISE_FALL_MASK,
+ },
+ [PWMH] = {
+ .pwm_en = ASPEED_PTCR_CTRL_PWMH_EN,
+ .ctrl_reg = ASPEED_PTCR_CTRL_EXT,
+ .type_part1 = ASPEED_PTCR_CTRL_SET_PWMH_TYPE_PART1,
+ .type_part2 = ASPEED_PTCR_CTRL_SET_PWMH_TYPE_PART2,
+ .type_mask = ASPEED_PTCR_CTRL_SET_PWMH_TYPE_MASK,
+ .duty_ctrl_rise_point = DUTY_CTRL_PWM2_RISE_POINT,
+ .duty_ctrl_fall_point = DUTY_CTRL_PWM2_FALL_POINT,
+ .duty_ctrl_reg = ASPEED_PTCR_DUTY3_CTRL,
+ .duty_ctrl_rise_fall_mask = DUTY_CTRL_PWM2_RISE_FALL_MASK,
+ }
+};
+
+static int regmap_aspeed_pwm_tacho_reg_write(void *context, unsigned int reg,
+ unsigned int val)
+{
+ void __iomem *regs = (void __iomem *)context;
+
+ writel(val, regs + reg);
+ return 0;
+}
+
+static int regmap_aspeed_pwm_tacho_reg_read(void *context, unsigned int reg,
+ unsigned int *val)
+{
+ void __iomem *regs = (void __iomem *)context;
+
+ *val = readl(regs + reg);
+ return 0;
+}
+
+static const struct regmap_config aspeed_pwm_tacho_regmap_config = {
+ .reg_bits = 32,
+ .val_bits = 32,
+ .reg_stride = 4,
+ .max_register = ASPEED_PTCR_TYPEO_LIMIT,
+ .reg_write = regmap_aspeed_pwm_tacho_reg_write,
+ .reg_read = regmap_aspeed_pwm_tacho_reg_read,
+ .fast_io = true,
+};
+
+static void aspeed_set_clock_enable(struct regmap *regmap, bool val)
+{
+ regmap_update_bits(regmap, ASPEED_PTCR_CTRL,
+ ASPEED_PTCR_CTRL_CLK_EN,
+ val ? ASPEED_PTCR_CTRL_CLK_EN : 0);
+}
+
+static void aspeed_set_clock_source(struct regmap *regmap, int val)
+{
+ regmap_update_bits(regmap, ASPEED_PTCR_CTRL,
+ ASPEED_PTCR_CTRL_CLK_SRC,
+ val ? ASPEED_PTCR_CTRL_CLK_SRC : 0);
+}
+
+static void aspeed_set_pwm_clock_values(struct regmap *regmap, u8 type,
+ u8 div_high, u8 div_low, u8 unit)
+{
+ u32 reg_value = ((div_high << type_params[type].h_value) |
+ (div_low << type_params[type].l_value) |
+ (unit << type_params[type].unit_value));
+
+ regmap_update_bits(regmap, type_params[type].clk_ctrl_reg,
+ type_params[type].clk_ctrl_mask, reg_value);
+}
+
+static void aspeed_set_pwm_port_enable(struct regmap *regmap, u8 pwm_port,
+ bool enable)
+{
+ regmap_update_bits(regmap, pwm_port_params[pwm_port].ctrl_reg,
+ pwm_port_params[pwm_port].pwm_en,
+ enable ? pwm_port_params[pwm_port].pwm_en : 0);
+}
+
+static void aspeed_set_pwm_port_type(struct regmap *regmap,
+ u8 pwm_port, u8 type)
+{
+ u32 reg_value = (type & 0x1) << pwm_port_params[pwm_port].type_part1;
+
+ reg_value |= (type & 0x2) << pwm_port_params[pwm_port].type_part2;
+
+ regmap_update_bits(regmap, pwm_port_params[pwm_port].ctrl_reg,
+ pwm_port_params[pwm_port].type_mask, reg_value);
+}
+
+static void aspeed_set_pwm_port_duty_rising_falling(struct regmap *regmap,
+ u8 pwm_port, u8 rising,
+ u8 falling)
+{
+ u32 reg_value = (rising <<
+ pwm_port_params[pwm_port].duty_ctrl_rise_point);
+ reg_value |= (falling <<
+ pwm_port_params[pwm_port].duty_ctrl_fall_point);
+
+ regmap_update_bits(regmap, pwm_port_params[pwm_port].duty_ctrl_reg,
+ pwm_port_params[pwm_port].duty_ctrl_rise_fall_mask,
+ reg_value);
+}
+
+static void aspeed_set_tacho_type_enable(struct regmap *regmap, u8 type,
+ bool enable)
+{
+ regmap_update_bits(regmap, type_params[type].ctrl_reg,
+ TYPE_CTRL_FAN_TYPE_EN,
+ enable ? TYPE_CTRL_FAN_TYPE_EN : 0);
+}
+
+static void aspeed_set_tacho_type_values(struct regmap *regmap, u8 type,
+ u8 mode, u16 unit, u8 division)
+{
+ u32 reg_value = ((mode << TYPE_CTRL_FAN_MODE) |
+ (unit << TYPE_CTRL_FAN_PERIOD) |
+ (division << TYPE_CTRL_FAN_DIVISION));
+
+ regmap_update_bits(regmap, type_params[type].ctrl_reg,
+ TYPE_CTRL_FAN_MASK, reg_value);
+ regmap_update_bits(regmap, type_params[type].ctrl_reg1,
+ TYPE_CTRL_FAN1_MASK, unit << 16);
+}
+
+static void aspeed_set_fan_tach_ch_enable(struct regmap *regmap, u8 fan_tach_ch,
+ bool enable)
+{
+ regmap_update_bits(regmap, ASPEED_PTCR_CTRL,
+ ASPEED_PTCR_CTRL_FAN_NUM_EN(fan_tach_ch),
+ enable ?
+ ASPEED_PTCR_CTRL_FAN_NUM_EN(fan_tach_ch) : 0);
+}
+
+static void aspeed_set_fan_tach_ch_source(struct regmap *regmap, u8 fan_tach_ch,
+ u8 fan_tach_ch_source)
+{
+ u32 reg_value1 = ((fan_tach_ch_source & 0x3) <<
+ TACH_PWM_SOURCE_BIT01(fan_tach_ch));
+ u32 reg_value2 = (((fan_tach_ch_source & 0x4) >> 2) <<
+ TACH_PWM_SOURCE_BIT2(fan_tach_ch));
+
+ regmap_update_bits(regmap, ASPEED_PTCR_TACH_SOURCE,
+ TACH_PWM_SOURCE_MASK_BIT01(fan_tach_ch),
+ reg_value1);
+
+ regmap_update_bits(regmap, ASPEED_PTCR_TACH_SOURCE_EXT,
+ TACH_PWM_SOURCE_MASK_BIT2(fan_tach_ch),
+ reg_value2);
+}
+
+static void aspeed_set_pwm_port_fan_ctrl(struct aspeed_pwm_tacho_data *priv,
+ u8 index, u8 fan_ctrl)
+{
+ u16 period, dc_time_on;
+
+ period = priv->type_pwm_clock_unit[priv->pwm_port_type[index]];
+ period += 1;
+ dc_time_on = (fan_ctrl * period) / PWM_MAX;
+
+ if (dc_time_on == 0) {
+ aspeed_set_pwm_port_enable(priv->regmap, index, false);
+ } else {
+ if (dc_time_on == period)
+ dc_time_on = 0;
+
+ aspeed_set_pwm_port_duty_rising_falling(priv->regmap, index, 0,
+ dc_time_on);
+ aspeed_set_pwm_port_enable(priv->regmap, index, true);
+ }
+}
+
+static u32 aspeed_get_fan_tach_ch_measure_period(struct aspeed_pwm_tacho_data
+ *priv, u8 type)
+{
+ u32 clk;
+ u16 tacho_unit;
+ u8 clk_unit, div_h, div_l, tacho_div;
+
+ clk = priv->clk_freq;
+ clk_unit = priv->type_pwm_clock_unit[type];
+ div_h = priv->type_pwm_clock_division_h[type];
+ div_h = 0x1 << div_h;
+ div_l = priv->type_pwm_clock_division_l[type];
+ if (div_l == 0)
+ div_l = 1;
+ else
+ div_l = div_l * 2;
+
+ tacho_unit = priv->type_fan_tach_unit[type];
+ tacho_div = priv->type_fan_tach_clock_division[type];
+
+ tacho_div = 0x4 << (tacho_div * 2);
+ return clk / (clk_unit * div_h * div_l * tacho_div * tacho_unit);
+}
+
+static u32 aspeed_get_fan_tach_ch_rpm(struct aspeed_pwm_tacho_data *priv,
+ u8 fan_tach_ch)
+{
+ u32 raw_data, tach_div, clk_source, sec, val;
+ u8 fan_tach_ch_source, type;
+
+ regmap_write(priv->regmap, ASPEED_PTCR_TRIGGER, 0);
+ regmap_write(priv->regmap, ASPEED_PTCR_TRIGGER, 0x1 << fan_tach_ch);
+
+ fan_tach_ch_source = priv->fan_tach_ch_source[fan_tach_ch];
+ type = priv->pwm_port_type[fan_tach_ch_source];
+
+ sec = (1000 / aspeed_get_fan_tach_ch_measure_period(priv, type));
+ msleep(sec);
+
+ regmap_read(priv->regmap, ASPEED_PTCR_RESULT, &val);
+ raw_data = val & RESULT_VALUE_MASK;
+ tach_div = priv->type_fan_tach_clock_division[type];
+ tach_div = 0x4 << (tach_div * 2);
+ clk_source = priv->clk_freq;
+
+ if (raw_data == 0)
+ return 0;
+
+ return (clk_source * 60) / (2 * raw_data * tach_div);
+}
+
+static ssize_t set_pwm(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct sensor_device_attribute *sensor_attr = to_sensor_dev_attr(attr);
+ int index = sensor_attr->index;
+ int ret;
+ struct aspeed_pwm_tacho_data *priv = dev_get_drvdata(dev);
+ long fan_ctrl;
+
+ ret = kstrtol(buf, 10, &fan_ctrl);
+ if (ret != 0)
+ return ret;
+
+ if (fan_ctrl < 0 || fan_ctrl > PWM_MAX)
+ return -EINVAL;
+
+ if (priv->pwm_port_fan_ctrl[index] == fan_ctrl)
+ return count;
+
+ priv->pwm_port_fan_ctrl[index] = fan_ctrl;
+ aspeed_set_pwm_port_fan_ctrl(priv, index, fan_ctrl);
+
+ return count;
+}
+
+static ssize_t show_pwm(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct sensor_device_attribute *sensor_attr = to_sensor_dev_attr(attr);
+ int index = sensor_attr->index;
+ struct aspeed_pwm_tacho_data *priv = dev_get_drvdata(dev);
+
+ return sprintf(buf, "%u\n", priv->pwm_port_fan_ctrl[index]);
+}
+
+static ssize_t show_rpm(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct sensor_device_attribute *sensor_attr = to_sensor_dev_attr(attr);
+ int index = sensor_attr->index;
+ u32 rpm;
+ struct aspeed_pwm_tacho_data *priv = dev_get_drvdata(dev);
+
+ rpm = aspeed_get_fan_tach_ch_rpm(priv, index);
+
+ return sprintf(buf, "%u\n", rpm);
+}
+
+static umode_t pwm_is_visible(struct kobject *kobj,
+ struct attribute *a, int index)
+{
+ struct device *dev = container_of(kobj, struct device, kobj);
+ struct aspeed_pwm_tacho_data *priv = dev_get_drvdata(dev);
+
+ if (!priv->pwm_present[index])
+ return 0;
+ return a->mode;
+}
+
+static umode_t fan_dev_is_visible(struct kobject *kobj,
+ struct attribute *a, int index)
+{
+ struct device *dev = container_of(kobj, struct device, kobj);
+ struct aspeed_pwm_tacho_data *priv = dev_get_drvdata(dev);
+
+ if (!priv->fan_tach_present[index])
+ return 0;
+ return a->mode;
+}
+
+static SENSOR_DEVICE_ATTR(pwm0, 0644,
+ show_pwm, set_pwm, 0);
+static SENSOR_DEVICE_ATTR(pwm1, 0644,
+ show_pwm, set_pwm, 1);
+static SENSOR_DEVICE_ATTR(pwm2, 0644,
+ show_pwm, set_pwm, 2);
+static SENSOR_DEVICE_ATTR(pwm3, 0644,
+ show_pwm, set_pwm, 3);
+static SENSOR_DEVICE_ATTR(pwm4, 0644,
+ show_pwm, set_pwm, 4);
+static SENSOR_DEVICE_ATTR(pwm5, 0644,
+ show_pwm, set_pwm, 5);
+static SENSOR_DEVICE_ATTR(pwm6, 0644,
+ show_pwm, set_pwm, 6);
+static SENSOR_DEVICE_ATTR(pwm7, 0644,
+ show_pwm, set_pwm, 7);
+static struct attribute *pwm_dev_attrs[] = {
+ &sensor_dev_attr_pwm0.dev_attr.attr,
+ &sensor_dev_attr_pwm1.dev_attr.attr,
+ &sensor_dev_attr_pwm2.dev_attr.attr,
+ &sensor_dev_attr_pwm3.dev_attr.attr,
+ &sensor_dev_attr_pwm4.dev_attr.attr,
+ &sensor_dev_attr_pwm5.dev_attr.attr,
+ &sensor_dev_attr_pwm6.dev_attr.attr,
+ &sensor_dev_attr_pwm7.dev_attr.attr,
+ NULL,
+};
+
+static const struct attribute_group pwm_dev_group = {
+ .attrs = pwm_dev_attrs,
+ .is_visible = pwm_is_visible,
+};
+
+static SENSOR_DEVICE_ATTR(fan0_input, 0444,
+ show_rpm, NULL, 0);
+static SENSOR_DEVICE_ATTR(fan1_input, 0444,
+ show_rpm, NULL, 1);
+static SENSOR_DEVICE_ATTR(fan2_input, 0444,
+ show_rpm, NULL, 2);
+static SENSOR_DEVICE_ATTR(fan3_input, 0444,
+ show_rpm, NULL, 3);
+static SENSOR_DEVICE_ATTR(fan4_input, 0444,
+ show_rpm, NULL, 4);
+static SENSOR_DEVICE_ATTR(fan5_input, 0444,
+ show_rpm, NULL, 5);
+static SENSOR_DEVICE_ATTR(fan6_input, 0444,
+ show_rpm, NULL, 6);
+static SENSOR_DEVICE_ATTR(fan7_input, 0444,
+ show_rpm, NULL, 7);
+static SENSOR_DEVICE_ATTR(fan8_input, 0444,
+ show_rpm, NULL, 8);
+static SENSOR_DEVICE_ATTR(fan9_input, 0444,
+ show_rpm, NULL, 9);
+static SENSOR_DEVICE_ATTR(fan10_input, 0444,
+ show_rpm, NULL, 10);
+static SENSOR_DEVICE_ATTR(fan11_input, 0444,
+ show_rpm, NULL, 11);
+static SENSOR_DEVICE_ATTR(fan12_input, 0444,
+ show_rpm, NULL, 12);
+static SENSOR_DEVICE_ATTR(fan13_input, 0444,
+ show_rpm, NULL, 13);
+static SENSOR_DEVICE_ATTR(fan14_input, 0444,
+ show_rpm, NULL, 14);
+static SENSOR_DEVICE_ATTR(fan15_input, 0444,
+ show_rpm, NULL, 15);
+static struct attribute *fan_dev_attrs[] = {
+ &sensor_dev_attr_fan0_input.dev_attr.attr,
+ &sensor_dev_attr_fan1_input.dev_attr.attr,
+ &sensor_dev_attr_fan2_input.dev_attr.attr,
+ &sensor_dev_attr_fan3_input.dev_attr.attr,
+ &sensor_dev_attr_fan4_input.dev_attr.attr,
+ &sensor_dev_attr_fan5_input.dev_attr.attr,
+ &sensor_dev_attr_fan6_input.dev_attr.attr,
+ &sensor_dev_attr_fan7_input.dev_attr.attr,
+ &sensor_dev_attr_fan8_input.dev_attr.attr,
+ &sensor_dev_attr_fan9_input.dev_attr.attr,
+ &sensor_dev_attr_fan10_input.dev_attr.attr,
+ &sensor_dev_attr_fan11_input.dev_attr.attr,
+ &sensor_dev_attr_fan12_input.dev_attr.attr,
+ &sensor_dev_attr_fan13_input.dev_attr.attr,
+ &sensor_dev_attr_fan14_input.dev_attr.attr,
+ &sensor_dev_attr_fan15_input.dev_attr.attr,
+ NULL
+};
+
+static const struct attribute_group fan_dev_group = {
+ .attrs = fan_dev_attrs,
+ .is_visible = fan_dev_is_visible,
+};
+
+/*
+ * The clock type is type M :
+ * The PWM frequency = 24MHz / (type M clock division L bit *
+ * type M clock division H bit * (type M PWM period bit + 1))
+ */
+static void aspeed_create_type(struct aspeed_pwm_tacho_data *priv)
+{
+ priv->type_pwm_clock_division_h[TYPEM] = M_PWM_DIV_H;
+ priv->type_pwm_clock_division_l[TYPEM] = M_PWM_DIV_L;
+ priv->type_pwm_clock_unit[TYPEM] = M_PWM_PERIOD;
+ aspeed_set_pwm_clock_values(priv->regmap, TYPEM, M_PWM_DIV_H,
+ M_PWM_DIV_L, M_PWM_PERIOD);
+ aspeed_set_tacho_type_enable(priv->regmap, TYPEM, true);
+ priv->type_fan_tach_clock_division[TYPEM] = M_TACH_CLK_DIV;
+ priv->type_fan_tach_unit[TYPEM] = M_TACH_UNIT;
+ aspeed_set_tacho_type_values(priv->regmap, TYPEM, M_TACH_MODE,
+ M_TACH_UNIT, M_TACH_CLK_DIV);
+}
+
+static void aspeed_create_pwm_port(struct aspeed_pwm_tacho_data *priv,
+ u8 pwm_port)
+{
+ aspeed_set_pwm_port_enable(priv->regmap, pwm_port, true);
+ priv->pwm_present[pwm_port] = true;
+
+ priv->pwm_port_type[pwm_port] = TYPEM;
+ aspeed_set_pwm_port_type(priv->regmap, pwm_port, TYPEM);
+
+ priv->pwm_port_fan_ctrl[pwm_port] = INIT_FAN_CTRL;
+ aspeed_set_pwm_port_fan_ctrl(priv, pwm_port, INIT_FAN_CTRL);
+}
+
+static void aspeed_create_fan_tach_channel(struct aspeed_pwm_tacho_data *priv,
+ u8 *fan_tach_ch,
+ int count,
+ u8 pwm_source)
+{
+ u8 val, index;
+
+ for (val = 0; val < count; val++) {
+ index = fan_tach_ch[val];
+ aspeed_set_fan_tach_ch_enable(priv->regmap, index, true);
+ priv->fan_tach_present[index] = true;
+ priv->fan_tach_ch_source[index] = pwm_source;
+ aspeed_set_fan_tach_ch_source(priv->regmap, index, pwm_source);
+ }
+}
+
+static int aspeed_create_fan(struct device *dev,
+ struct device_node *child,
+ struct aspeed_pwm_tacho_data *priv)
+{
+ u8 *fan_tach_ch;
+ u32 pwm_port;
+ int ret, count;
+
+ ret = of_property_read_u32(child, "reg", &pwm_port);
+ if (ret)
+ return ret;
+ aspeed_create_pwm_port(priv, (u8)pwm_port);
+
+ count = of_property_count_u8_elems(child, "aspeed,fan-tach-ch");
+ if (count < 1)
+ return -EINVAL;
+ fan_tach_ch = devm_kzalloc(dev, sizeof(*fan_tach_ch) * count,
+ GFP_KERNEL);
+ if (!fan_tach_ch)
+ return -ENOMEM;
+ ret = of_property_read_u8_array(child, "aspeed,fan-tach-ch",
+ fan_tach_ch, count);
+ if (ret)
+ return ret;
+ aspeed_create_fan_tach_channel(priv, fan_tach_ch, count, pwm_port);
+
+ return 0;
+}
+
+static int aspeed_pwm_tacho_probe(struct platform_device *pdev)
+{
+ struct device *dev = &pdev->dev;
+ struct device_node *np, *child;
+ struct aspeed_pwm_tacho_data *priv;
+ void __iomem *regs;
+ struct resource *res;
+ struct device *hwmon;
+ struct clk *clk;
+ int ret;
+
+ np = dev->of_node;
+
+ res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+ if (!res)
+ return -ENOENT;
+ regs = devm_ioremap_resource(dev, res);
+ if (IS_ERR(regs))
+ return PTR_ERR(regs);
+ priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL);
+ if (!priv)
+ return -ENOMEM;
+ priv->regmap = devm_regmap_init(dev, NULL, (__force void *)regs,
+ &aspeed_pwm_tacho_regmap_config);
+ if (IS_ERR(priv->regmap))
+ return PTR_ERR(priv->regmap);
+ regmap_write(priv->regmap, ASPEED_PTCR_TACH_SOURCE, 0);
+ regmap_write(priv->regmap, ASPEED_PTCR_TACH_SOURCE_EXT, 0);
+
+ clk = devm_clk_get(dev, NULL);
+ if (IS_ERR(clk))
+ return -ENODEV;
+ priv->clk_freq = clk_get_rate(clk);
+ aspeed_set_clock_enable(priv->regmap, true);
+ aspeed_set_clock_source(priv->regmap, 0);
+
+ aspeed_create_type(priv);
+
+ for_each_child_of_node(np, child) {
+ ret = aspeed_create_fan(dev, child, priv);
+ of_node_put(child);
+ if (ret)
+ return ret;
+ }
+ of_node_put(np);
+
+ priv->groups[0] = &pwm_dev_group;
+ priv->groups[1] = &fan_dev_group;
+ priv->groups[2] = NULL;
+ hwmon = devm_hwmon_device_register_with_groups(dev,
+ "aspeed_pwm_tacho",
+ priv, priv->groups);
+ return PTR_ERR_OR_ZERO(hwmon);
+}
+
+static const struct of_device_id of_pwm_tacho_match_table[] = {
+ { .compatible = "aspeed,ast2400-pwm-tacho", },
+ { .compatible = "aspeed,ast2500-pwm-tacho", },
+ {},
+};
+MODULE_DEVICE_TABLE(of, of_pwm_tacho_match_table);
+
+static struct platform_driver aspeed_pwm_tacho_driver = {
+ .probe = aspeed_pwm_tacho_probe,
+ .driver = {
+ .name = "aspeed_pwm_tacho",
+ .of_match_table = of_pwm_tacho_match_table,
+ },
+};
+
+module_platform_driver(aspeed_pwm_tacho_driver);
+
+MODULE_AUTHOR("Jaghathiswari Rankappagounder Natarajan <jaghu@google.com>");
+MODULE_DESCRIPTION("ASPEED PWM and Fan Tacho device driver");
+MODULE_LICENSE("GPL");
diff --git a/drivers/hwmon/dell-smm-hwmon.c b/drivers/hwmon/dell-smm-hwmon.c
index 34704b0451b4..3189246302a6 100644
--- a/drivers/hwmon/dell-smm-hwmon.c
+++ b/drivers/hwmon/dell-smm-hwmon.c
@@ -995,6 +995,13 @@ static struct dmi_system_id i8k_dmi_table[] __initdata = {
},
.driver_data = (void *)&i8k_config_data[DELL_XPS],
},
+ {
+ .ident = "Dell XPS 15 9560",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "XPS 15 9560"),
+ },
+ },
{ }
};
diff --git a/drivers/hwmon/hwmon.c b/drivers/hwmon/hwmon.c
index 28375d59cc36..dd6e17c1076b 100644
--- a/drivers/hwmon/hwmon.c
+++ b/drivers/hwmon/hwmon.c
@@ -186,7 +186,7 @@ static ssize_t hwmon_attr_show_string(struct device *dev,
char *buf)
{
struct hwmon_device_attribute *hattr = to_hwmon_attr(devattr);
- char *s;
+ const char *s;
int ret;
ret = hattr->ops->read_string(dev, hattr->type, hattr->attr,
diff --git a/drivers/hwmon/ina209.c b/drivers/hwmon/ina209.c
index 5378fdefc1f7..aa0768ce8aea 100644
--- a/drivers/hwmon/ina209.c
+++ b/drivers/hwmon/ina209.c
@@ -117,7 +117,7 @@ static long ina209_from_reg(const u8 reg, const u16 val)
case INA209_SHUNT_VOLTAGE_POS_WARN:
case INA209_SHUNT_VOLTAGE_NEG_WARN:
/* LSB=10 uV. Convert to mV. */
- return DIV_ROUND_CLOSEST(val, 100);
+ return DIV_ROUND_CLOSEST((s16)val, 100);
case INA209_BUS_VOLTAGE:
case INA209_BUS_VOLTAGE_MAX_PEAK:
@@ -146,7 +146,7 @@ static long ina209_from_reg(const u8 reg, const u16 val)
case INA209_CURRENT:
/* LSB=1 mA (selected). Is in mA */
- return val;
+ return (s16)val;
}
/* programmer goofed */
@@ -608,11 +608,18 @@ static const struct i2c_device_id ina209_id[] = {
};
MODULE_DEVICE_TABLE(i2c, ina209_id);
+static const struct of_device_id ina209_of_match[] = {
+ { .compatible = "ti,ina209" },
+ { },
+};
+MODULE_DEVICE_TABLE(of, ina209_of_match);
+
/* This is the driver that will be inserted */
static struct i2c_driver ina209_driver = {
.class = I2C_CLASS_HWMON,
.driver = {
.name = "ina209",
+ .of_match_table = of_match_ptr(ina209_of_match),
},
.probe = ina209_probe,
.remove = ina209_remove,
diff --git a/drivers/hwmon/ina2xx.c b/drivers/hwmon/ina2xx.c
index b24f1d3045f0..62e38fa8cda2 100644
--- a/drivers/hwmon/ina2xx.c
+++ b/drivers/hwmon/ina2xx.c
@@ -34,6 +34,7 @@
#include <linux/hwmon.h>
#include <linux/hwmon-sysfs.h>
#include <linux/jiffies.h>
+#include <linux/of_device.h>
#include <linux/of.h>
#include <linux/delay.h>
#include <linux/util_macros.h>
@@ -424,13 +425,19 @@ static int ina2xx_probe(struct i2c_client *client,
struct device *hwmon_dev;
u32 val;
int ret, group = 0;
+ enum ina2xx_ids chip;
+
+ if (client->dev.of_node)
+ chip = (enum ina2xx_ids)of_device_get_match_data(&client->dev);
+ else
+ chip = id->driver_data;
data = devm_kzalloc(dev, sizeof(*data), GFP_KERNEL);
if (!data)
return -ENOMEM;
/* set the device type */
- data->config = &ina2xx_config[id->driver_data];
+ data->config = &ina2xx_config[chip];
if (of_property_read_u32(dev->of_node, "shunt-resistor", &val) < 0) {
struct ina2xx_platform_data *pdata = dev_get_platdata(dev);
@@ -487,9 +494,35 @@ static const struct i2c_device_id ina2xx_id[] = {
};
MODULE_DEVICE_TABLE(i2c, ina2xx_id);
+static const struct of_device_id ina2xx_of_match[] = {
+ {
+ .compatible = "ti,ina219",
+ .data = (void *)ina219
+ },
+ {
+ .compatible = "ti,ina220",
+ .data = (void *)ina219
+ },
+ {
+ .compatible = "ti,ina226",
+ .data = (void *)ina226
+ },
+ {
+ .compatible = "ti,ina230",
+ .data = (void *)ina226
+ },
+ {
+ .compatible = "ti,ina231",
+ .data = (void *)ina226
+ },
+ { },
+};
+MODULE_DEVICE_TABLE(of, ina2xx_of_match);
+
static struct i2c_driver ina2xx_driver = {
.driver = {
.name = "ina2xx",
+ .of_match_table = of_match_ptr(ina2xx_of_match),
},
.probe = ina2xx_probe,
.id_table = ina2xx_id,
diff --git a/drivers/hwmon/lm63.c b/drivers/hwmon/lm63.c
index 2e1948699114..4c1770920d29 100644
--- a/drivers/hwmon/lm63.c
+++ b/drivers/hwmon/lm63.c
@@ -46,6 +46,7 @@
#include <linux/hwmon.h>
#include <linux/err.h>
#include <linux/mutex.h>
+#include <linux/of_device.h>
#include <linux/sysfs.h>
#include <linux/types.h>
@@ -1115,6 +1116,10 @@ static int lm63_probe(struct i2c_client *client,
mutex_init(&data->update_lock);
/* Set the device type */
+ if (client->dev.of_node)
+ data->kind = (enum chips)of_device_get_match_data(&client->dev);
+ else
+ data->kind = id->driver_data;
data->kind = id->driver_data;
if (data->kind == lm64)
data->temp2_offset = 16000;
@@ -1149,10 +1154,28 @@ static const struct i2c_device_id lm63_id[] = {
};
MODULE_DEVICE_TABLE(i2c, lm63_id);
+static const struct of_device_id lm63_of_match[] = {
+ {
+ .compatible = "national,lm63",
+ .data = (void *)lm63
+ },
+ {
+ .compatible = "national,lm64",
+ .data = (void *)lm64
+ },
+ {
+ .compatible = "national,lm96163",
+ .data = (void *)lm96163
+ },
+ { },
+};
+MODULE_DEVICE_TABLE(of, lm63_of_match);
+
static struct i2c_driver lm63_driver = {
.class = I2C_CLASS_HWMON,
.driver = {
.name = "lm63",
+ .of_match_table = of_match_ptr(lm63_of_match),
},
.probe = lm63_probe,
.id_table = lm63_id,
diff --git a/drivers/hwmon/lm75.c b/drivers/hwmon/lm75.c
index eff3b24d8473..005ffb5ffa92 100644
--- a/drivers/hwmon/lm75.c
+++ b/drivers/hwmon/lm75.c
@@ -26,6 +26,7 @@
#include <linux/hwmon.h>
#include <linux/hwmon-sysfs.h>
#include <linux/err.h>
+#include <linux/of_device.h>
#include <linux/of.h>
#include <linux/regmap.h>
#include "lm75.h"
@@ -273,7 +274,12 @@ lm75_probe(struct i2c_client *client, const struct i2c_device_id *id)
int status, err;
u8 set_mask, clr_mask;
int new;
- enum lm75_type kind = id->driver_data;
+ enum lm75_type kind;
+
+ if (client->dev.of_node)
+ kind = (enum lm75_type)of_device_get_match_data(&client->dev);
+ else
+ kind = id->driver_data;
if (!i2c_check_functionality(client->adapter,
I2C_FUNC_SMBUS_BYTE_DATA | I2C_FUNC_SMBUS_WORD_DATA))
@@ -424,6 +430,95 @@ static const struct i2c_device_id lm75_ids[] = {
};
MODULE_DEVICE_TABLE(i2c, lm75_ids);
+static const struct of_device_id lm75_of_match[] = {
+ {
+ .compatible = "adi,adt75",
+ .data = (void *)adt75
+ },
+ {
+ .compatible = "dallas,ds1775",
+ .data = (void *)ds1775
+ },
+ {
+ .compatible = "dallas,ds75",
+ .data = (void *)ds75
+ },
+ {
+ .compatible = "dallas,ds7505",
+ .data = (void *)ds7505
+ },
+ {
+ .compatible = "gmt,g751",
+ .data = (void *)g751
+ },
+ {
+ .compatible = "national,lm75",
+ .data = (void *)lm75
+ },
+ {
+ .compatible = "national,lm75a",
+ .data = (void *)lm75a
+ },
+ {
+ .compatible = "national,lm75b",
+ .data = (void *)lm75b
+ },
+ {
+ .compatible = "maxim,max6625",
+ .data = (void *)max6625
+ },
+ {
+ .compatible = "maxim,max6626",
+ .data = (void *)max6626
+ },
+ {
+ .compatible = "maxim,mcp980x",
+ .data = (void *)mcp980x
+ },
+ {
+ .compatible = "st,stds75",
+ .data = (void *)stds75
+ },
+ {
+ .compatible = "microchip,tcn75",
+ .data = (void *)tcn75
+ },
+ {
+ .compatible = "ti,tmp100",
+ .data = (void *)tmp100
+ },
+ {
+ .compatible = "ti,tmp101",
+ .data = (void *)tmp101
+ },
+ {
+ .compatible = "ti,tmp105",
+ .data = (void *)tmp105
+ },
+ {
+ .compatible = "ti,tmp112",
+ .data = (void *)tmp112
+ },
+ {
+ .compatible = "ti,tmp175",
+ .data = (void *)tmp175
+ },
+ {
+ .compatible = "ti,tmp275",
+ .data = (void *)tmp275
+ },
+ {
+ .compatible = "ti,tmp75",
+ .data = (void *)tmp75
+ },
+ {
+ .compatible = "ti,tmp75c",
+ .data = (void *)tmp75c
+ },
+ { },
+};
+MODULE_DEVICE_TABLE(of, lm75_of_match);
+
#define LM75A_ID 0xA1
/* Return 0 if detection is successful, -ENODEV otherwise */
@@ -560,6 +655,7 @@ static struct i2c_driver lm75_driver = {
.class = I2C_CLASS_HWMON,
.driver = {
.name = "lm75",
+ .of_match_table = of_match_ptr(lm75_of_match),
.pm = LM75_DEV_PM_OPS,
},
.probe = lm75_probe,
diff --git a/drivers/hwmon/lm85.c b/drivers/hwmon/lm85.c
index 691469ffa24e..0a325878e8f5 100644
--- a/drivers/hwmon/lm85.c
+++ b/drivers/hwmon/lm85.c
@@ -25,6 +25,7 @@
*/
#include <linux/module.h>
+#include <linux/of_device.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/jiffies.h>
@@ -1552,7 +1553,10 @@ static int lm85_probe(struct i2c_client *client, const struct i2c_device_id *id)
return -ENOMEM;
data->client = client;
- data->type = id->driver_data;
+ if (client->dev.of_node)
+ data->type = (enum chips)of_device_get_match_data(&client->dev);
+ else
+ data->type = id->driver_data;
mutex_init(&data->update_lock);
/* Fill in the chip specific driver values */
@@ -1623,10 +1627,60 @@ static const struct i2c_device_id lm85_id[] = {
};
MODULE_DEVICE_TABLE(i2c, lm85_id);
+static const struct of_device_id lm85_of_match[] = {
+ {
+ .compatible = "adi,adm1027",
+ .data = (void *)adm1027
+ },
+ {
+ .compatible = "adi,adt7463",
+ .data = (void *)adt7463
+ },
+ {
+ .compatible = "adi,adt7468",
+ .data = (void *)adt7468
+ },
+ {
+ .compatible = "national,lm85",
+ .data = (void *)lm85
+ },
+ {
+ .compatible = "national,lm85b",
+ .data = (void *)lm85
+ },
+ {
+ .compatible = "national,lm85c",
+ .data = (void *)lm85
+ },
+ {
+ .compatible = "smsc,emc6d100",
+ .data = (void *)emc6d100
+ },
+ {
+ .compatible = "smsc,emc6d101",
+ .data = (void *)emc6d100
+ },
+ {
+ .compatible = "smsc,emc6d102",
+ .data = (void *)emc6d102
+ },
+ {
+ .compatible = "smsc,emc6d103",
+ .data = (void *)emc6d103
+ },
+ {
+ .compatible = "smsc,emc6d103s",
+ .data = (void *)emc6d103s
+ },
+ { },
+};
+MODULE_DEVICE_TABLE(of, lm85_of_match);
+
static struct i2c_driver lm85_driver = {
.class = I2C_CLASS_HWMON,
.driver = {
.name = "lm85",
+ .of_match_table = of_match_ptr(lm85_of_match),
},
.probe = lm85_probe,
.id_table = lm85_id,
diff --git a/drivers/hwmon/lm87.c b/drivers/hwmon/lm87.c
index e06faf9d3f0f..b48d30760388 100644
--- a/drivers/hwmon/lm87.c
+++ b/drivers/hwmon/lm87.c
@@ -66,6 +66,7 @@
#include <linux/hwmon-vid.h>
#include <linux/err.h>
#include <linux/mutex.h>
+#include <linux/regulator/consumer.h>
/*
* Addresses to scan
@@ -74,8 +75,6 @@
static const unsigned short normal_i2c[] = { 0x2c, 0x2d, 0x2e, I2C_CLIENT_END };
-enum chips { lm87, adm1024 };
-
/*
* The LM87 registers
*/
@@ -855,8 +854,26 @@ static int lm87_init_client(struct i2c_client *client)
{
struct lm87_data *data = i2c_get_clientdata(client);
int rc;
-
- if (dev_get_platdata(&client->dev)) {
+ struct device_node *of_node = client->dev.of_node;
+ u8 val = 0;
+ struct regulator *vcc = NULL;
+
+ if (of_node) {
+ if (of_property_read_bool(of_node, "has-temp3"))
+ val |= CHAN_TEMP3;
+ if (of_property_read_bool(of_node, "has-in6"))
+ val |= CHAN_NO_FAN(0);
+ if (of_property_read_bool(of_node, "has-in7"))
+ val |= CHAN_NO_FAN(1);
+ vcc = devm_regulator_get_optional(&client->dev, "vcc");
+ if (!IS_ERR(vcc)) {
+ if (regulator_get_voltage(vcc) == 5000000)
+ val |= CHAN_VCC_5V;
+ }
+ data->channel = val;
+ lm87_write_value(client,
+ LM87_REG_CHANNEL_MODE, data->channel);
+ } else if (dev_get_platdata(&client->dev)) {
data->channel = *(u8 *)dev_get_platdata(&client->dev);
lm87_write_value(client,
LM87_REG_CHANNEL_MODE, data->channel);
@@ -962,16 +979,24 @@ static int lm87_probe(struct i2c_client *client, const struct i2c_device_id *id)
*/
static const struct i2c_device_id lm87_id[] = {
- { "lm87", lm87 },
- { "adm1024", adm1024 },
+ { "lm87", 0 },
+ { "adm1024", 0 },
{ }
};
MODULE_DEVICE_TABLE(i2c, lm87_id);
+static const struct of_device_id lm87_of_match[] = {
+ { .compatible = "ti,lm87" },
+ { .compatible = "adi,adm1024" },
+ { },
+};
+MODULE_DEVICE_TABLE(of, lm87_of_match);
+
static struct i2c_driver lm87_driver = {
.class = I2C_CLASS_HWMON,
.driver = {
.name = "lm87",
+ .of_match_table = lm87_of_match,
},
.probe = lm87_probe,
.id_table = lm87_id,
diff --git a/drivers/hwmon/lm90.c b/drivers/hwmon/lm90.c
index aff5297bc2bc..c2f411c290bf 100644
--- a/drivers/hwmon/lm90.c
+++ b/drivers/hwmon/lm90.c
@@ -92,6 +92,7 @@
#include <linux/hwmon.h>
#include <linux/err.h>
#include <linux/mutex.h>
+#include <linux/of_device.h>
#include <linux/sysfs.h>
#include <linux/interrupt.h>
#include <linux/regulator/consumer.h>
@@ -235,6 +236,99 @@ static const struct i2c_device_id lm90_id[] = {
};
MODULE_DEVICE_TABLE(i2c, lm90_id);
+static const struct of_device_id lm90_of_match[] = {
+ {
+ .compatible = "adi,adm1032",
+ .data = (void *)adm1032
+ },
+ {
+ .compatible = "adi,adt7461",
+ .data = (void *)adt7461
+ },
+ {
+ .compatible = "adi,adt7461a",
+ .data = (void *)adt7461
+ },
+ {
+ .compatible = "gmt,g781",
+ .data = (void *)g781
+ },
+ {
+ .compatible = "national,lm90",
+ .data = (void *)lm90
+ },
+ {
+ .compatible = "national,lm86",
+ .data = (void *)lm86
+ },
+ {
+ .compatible = "national,lm89",
+ .data = (void *)lm86
+ },
+ {
+ .compatible = "national,lm99",
+ .data = (void *)lm99
+ },
+ {
+ .compatible = "dallas,max6646",
+ .data = (void *)max6646
+ },
+ {
+ .compatible = "dallas,max6647",
+ .data = (void *)max6646
+ },
+ {
+ .compatible = "dallas,max6649",
+ .data = (void *)max6646
+ },
+ {
+ .compatible = "dallas,max6657",
+ .data = (void *)max6657
+ },
+ {
+ .compatible = "dallas,max6658",
+ .data = (void *)max6657
+ },
+ {
+ .compatible = "dallas,max6659",
+ .data = (void *)max6659
+ },
+ {
+ .compatible = "dallas,max6680",
+ .data = (void *)max6680
+ },
+ {
+ .compatible = "dallas,max6681",
+ .data = (void *)max6680
+ },
+ {
+ .compatible = "dallas,max6695",
+ .data = (void *)max6696
+ },
+ {
+ .compatible = "dallas,max6696",
+ .data = (void *)max6696
+ },
+ {
+ .compatible = "onnn,nct1008",
+ .data = (void *)adt7461
+ },
+ {
+ .compatible = "winbond,w83l771",
+ .data = (void *)w83l771
+ },
+ {
+ .compatible = "nxp,sa56004",
+ .data = (void *)sa56004
+ },
+ {
+ .compatible = "ti,tmp451",
+ .data = (void *)tmp451
+ },
+ { },
+};
+MODULE_DEVICE_TABLE(of, lm90_of_match);
+
/*
* chip type specific parameters
*/
@@ -1677,7 +1771,10 @@ static int lm90_probe(struct i2c_client *client,
mutex_init(&data->update_lock);
/* Set the device type */
- data->kind = id->driver_data;
+ if (client->dev.of_node)
+ data->kind = (enum chips)of_device_get_match_data(&client->dev);
+ else
+ data->kind = id->driver_data;
if (data->kind == adm1032) {
if (!i2c_check_functionality(adapter, I2C_FUNC_SMBUS_BYTE))
client->flags &= ~I2C_CLIENT_PEC;
@@ -1816,6 +1913,7 @@ static struct i2c_driver lm90_driver = {
.class = I2C_CLASS_HWMON,
.driver = {
.name = "lm90",
+ .of_match_table = of_match_ptr(lm90_of_match),
},
.probe = lm90_probe,
.alert = lm90_alert,
diff --git a/drivers/hwmon/lm95245.c b/drivers/hwmon/lm95245.c
index a3bfd88752ca..27cb06d65594 100644
--- a/drivers/hwmon/lm95245.c
+++ b/drivers/hwmon/lm95245.c
@@ -622,10 +622,18 @@ static const struct i2c_device_id lm95245_id[] = {
};
MODULE_DEVICE_TABLE(i2c, lm95245_id);
+static const struct of_device_id lm95245_of_match[] = {
+ { .compatible = "national,lm95235" },
+ { .compatible = "national,lm95245" },
+ { },
+};
+MODULE_DEVICE_TABLE(of, lm95245_of_match);
+
static struct i2c_driver lm95245_driver = {
.class = I2C_CLASS_HWMON,
.driver = {
.name = "lm95245",
+ .of_match_table = of_match_ptr(lm95245_of_match),
},
.probe = lm95245_probe,
.id_table = lm95245_id,
diff --git a/drivers/hwmon/max6697.c b/drivers/hwmon/max6697.c
index f03a71722849..221fd1492057 100644
--- a/drivers/hwmon/max6697.c
+++ b/drivers/hwmon/max6697.c
@@ -24,6 +24,7 @@
#include <linux/hwmon-sysfs.h>
#include <linux/err.h>
#include <linux/mutex.h>
+#include <linux/of_device.h>
#include <linux/of.h>
#include <linux/platform_data/max6697.h>
@@ -632,7 +633,10 @@ static int max6697_probe(struct i2c_client *client,
if (!data)
return -ENOMEM;
- data->type = id->driver_data;
+ if (client->dev.of_node)
+ data->type = (enum chips)of_device_get_match_data(&client->dev);
+ else
+ data->type = id->driver_data;
data->chip = &max6697_chip_data[data->type];
data->client = client;
mutex_init(&data->update_lock);
@@ -662,10 +666,56 @@ static const struct i2c_device_id max6697_id[] = {
};
MODULE_DEVICE_TABLE(i2c, max6697_id);
+static const struct of_device_id max6697_of_match[] = {
+ {
+ .compatible = "maxim,max6581",
+ .data = (void *)max6581
+ },
+ {
+ .compatible = "maxim,max6602",
+ .data = (void *)max6602
+ },
+ {
+ .compatible = "maxim,max6622",
+ .data = (void *)max6622
+ },
+ {
+ .compatible = "maxim,max6636",
+ .data = (void *)max6636
+ },
+ {
+ .compatible = "maxim,max6689",
+ .data = (void *)max6689
+ },
+ {
+ .compatible = "maxim,max6693",
+ .data = (void *)max6693
+ },
+ {
+ .compatible = "maxim,max6694",
+ .data = (void *)max6694
+ },
+ {
+ .compatible = "maxim,max6697",
+ .data = (void *)max6697
+ },
+ {
+ .compatible = "maxim,max6698",
+ .data = (void *)max6698
+ },
+ {
+ .compatible = "maxim,max6699",
+ .data = (void *)max6699
+ },
+ { },
+};
+MODULE_DEVICE_TABLE(of, max6697_of_match);
+
static struct i2c_driver max6697_driver = {
.class = I2C_CLASS_HWMON,
.driver = {
.name = "max6697",
+ .of_match_table = of_match_ptr(max6697_of_match),
},
.probe = max6697_probe,
.id_table = max6697_id,
diff --git a/drivers/hwmon/pmbus/adm1275.c b/drivers/hwmon/pmbus/adm1275.c
index 4ab5293c7bf0..00d6995af4c2 100644
--- a/drivers/hwmon/pmbus/adm1275.c
+++ b/drivers/hwmon/pmbus/adm1275.c
@@ -101,8 +101,8 @@ static const struct coefficients adm1075_coefficients[] = {
[0] = { 27169, 0, -1 }, /* voltage */
[1] = { 806, 20475, -1 }, /* current, irange25 */
[2] = { 404, 20475, -1 }, /* current, irange50 */
- [3] = { 0, -1, 8549 }, /* power, irange25 */
- [4] = { 0, -1, 4279 }, /* power, irange50 */
+ [3] = { 8549, 0, -1 }, /* power, irange25 */
+ [4] = { 4279, 0, -1 }, /* power, irange50 */
};
static const struct coefficients adm1275_coefficients[] = {
diff --git a/drivers/hwmon/pmbus/ucd9000.c b/drivers/hwmon/pmbus/ucd9000.c
index 3e3aa950277f..3518f0c08934 100644
--- a/drivers/hwmon/pmbus/ucd9000.c
+++ b/drivers/hwmon/pmbus/ucd9000.c
@@ -21,6 +21,7 @@
#include <linux/kernel.h>
#include <linux/module.h>
+#include <linux/of_device.h>
#include <linux/init.h>
#include <linux/err.h>
#include <linux/slab.h>
@@ -119,6 +120,35 @@ static const struct i2c_device_id ucd9000_id[] = {
};
MODULE_DEVICE_TABLE(i2c, ucd9000_id);
+static const struct of_device_id ucd9000_of_match[] = {
+ {
+ .compatible = "ti,ucd9000",
+ .data = (void *)ucd9000
+ },
+ {
+ .compatible = "ti,ucd90120",
+ .data = (void *)ucd90120
+ },
+ {
+ .compatible = "ti,ucd90124",
+ .data = (void *)ucd90124
+ },
+ {
+ .compatible = "ti,ucd90160",
+ .data = (void *)ucd90160
+ },
+ {
+ .compatible = "ti,ucd9090",
+ .data = (void *)ucd9090
+ },
+ {
+ .compatible = "ti,ucd90910",
+ .data = (void *)ucd90910
+ },
+ { },
+};
+MODULE_DEVICE_TABLE(of, ucd9000_of_match);
+
static int ucd9000_probe(struct i2c_client *client,
const struct i2c_device_id *id)
{
@@ -126,6 +156,7 @@ static int ucd9000_probe(struct i2c_client *client,
struct ucd9000_data *data;
struct pmbus_driver_info *info;
const struct i2c_device_id *mid;
+ enum chips chip;
int i, ret;
if (!i2c_check_functionality(client->adapter,
@@ -151,7 +182,12 @@ static int ucd9000_probe(struct i2c_client *client,
return -ENODEV;
}
- if (id->driver_data != ucd9000 && id->driver_data != mid->driver_data)
+ if (client->dev.of_node)
+ chip = (enum chips)of_device_get_match_data(&client->dev);
+ else
+ chip = id->driver_data;
+
+ if (chip != ucd9000 && chip != mid->driver_data)
dev_notice(&client->dev,
"Device mismatch: Configured %s, detected %s\n",
id->name, mid->name);
@@ -234,6 +270,7 @@ static int ucd9000_probe(struct i2c_client *client,
static struct i2c_driver ucd9000_driver = {
.driver = {
.name = "ucd9000",
+ .of_match_table = of_match_ptr(ucd9000_of_match),
},
.probe = ucd9000_probe,
.remove = pmbus_do_remove,
diff --git a/drivers/hwmon/pmbus/ucd9200.c b/drivers/hwmon/pmbus/ucd9200.c
index 033d6aca47d3..a8712c5ded4e 100644
--- a/drivers/hwmon/pmbus/ucd9200.c
+++ b/drivers/hwmon/pmbus/ucd9200.c
@@ -20,6 +20,7 @@
#include <linux/kernel.h>
#include <linux/module.h>
+#include <linux/of_device.h>
#include <linux/init.h>
#include <linux/err.h>
#include <linux/slab.h>
@@ -46,12 +47,50 @@ static const struct i2c_device_id ucd9200_id[] = {
};
MODULE_DEVICE_TABLE(i2c, ucd9200_id);
+static const struct of_device_id ucd9200_of_match[] = {
+ {
+ .compatible = "ti,cd9200",
+ .data = (void *)ucd9200
+ },
+ {
+ .compatible = "ti,cd9220",
+ .data = (void *)ucd9220
+ },
+ {
+ .compatible = "ti,cd9222",
+ .data = (void *)ucd9222
+ },
+ {
+ .compatible = "ti,cd9224",
+ .data = (void *)ucd9224
+ },
+ {
+ .compatible = "ti,cd9240",
+ .data = (void *)ucd9240
+ },
+ {
+ .compatible = "ti,cd9244",
+ .data = (void *)ucd9244
+ },
+ {
+ .compatible = "ti,cd9246",
+ .data = (void *)ucd9246
+ },
+ {
+ .compatible = "ti,cd9248",
+ .data = (void *)ucd9248
+ },
+ { },
+};
+MODULE_DEVICE_TABLE(of, ucd9200_of_match);
+
static int ucd9200_probe(struct i2c_client *client,
const struct i2c_device_id *id)
{
u8 block_buffer[I2C_SMBUS_BLOCK_MAX + 1];
struct pmbus_driver_info *info;
const struct i2c_device_id *mid;
+ enum chips chip;
int i, j, ret;
if (!i2c_check_functionality(client->adapter,
@@ -76,7 +115,13 @@ static int ucd9200_probe(struct i2c_client *client,
dev_err(&client->dev, "Unsupported device\n");
return -ENODEV;
}
- if (id->driver_data != ucd9200 && id->driver_data != mid->driver_data)
+
+ if (client->dev.of_node)
+ chip = (enum chips)of_device_get_match_data(&client->dev);
+ else
+ chip = id->driver_data;
+
+ if (chip != ucd9200 && chip != mid->driver_data)
dev_notice(&client->dev,
"Device mismatch: Configured %s, detected %s\n",
id->name, mid->name);
@@ -167,6 +212,7 @@ static int ucd9200_probe(struct i2c_client *client,
static struct i2c_driver ucd9200_driver = {
.driver = {
.name = "ucd9200",
+ .of_match_table = of_match_ptr(ucd9200_of_match),
},
.probe = ucd9200_probe,
.remove = pmbus_do_remove,
diff --git a/drivers/hwmon/stts751.c b/drivers/hwmon/stts751.c
index 55450680fb58..d56251d6eec2 100644
--- a/drivers/hwmon/stts751.c
+++ b/drivers/hwmon/stts751.c
@@ -85,6 +85,12 @@ static const struct i2c_device_id stts751_id[] = {
{ }
};
+static const struct of_device_id stts751_of_match[] = {
+ { .compatible = "stts751" },
+ { },
+};
+MODULE_DEVICE_TABLE(of, stts751_of_match);
+
struct stts751_priv {
struct device *dev;
struct i2c_client *client;
@@ -819,6 +825,7 @@ static struct i2c_driver stts751_driver = {
.class = I2C_CLASS_HWMON,
.driver = {
.name = DEVNAME,
+ .of_match_table = of_match_ptr(stts751_of_match),
},
.probe = stts751_probe,
.id_table = stts751_id,
diff --git a/drivers/hwmon/tmp102.c b/drivers/hwmon/tmp102.c
index 36bba2a816a4..5eafbaada795 100644
--- a/drivers/hwmon/tmp102.c
+++ b/drivers/hwmon/tmp102.c
@@ -323,8 +323,15 @@ static const struct i2c_device_id tmp102_id[] = {
};
MODULE_DEVICE_TABLE(i2c, tmp102_id);
+static const struct of_device_id tmp102_of_match[] = {
+ { .compatible = "ti,tmp102" },
+ { },
+};
+MODULE_DEVICE_TABLE(of, tmp102_of_match);
+
static struct i2c_driver tmp102_driver = {
.driver.name = DRIVER_NAME,
+ .driver.of_match_table = of_match_ptr(tmp102_of_match),
.driver.pm = &tmp102_dev_pm_ops,
.probe = tmp102_probe,
.id_table = tmp102_id,
diff --git a/drivers/hwmon/tmp103.c b/drivers/hwmon/tmp103.c
index ad571ec795a3..7f85b14544df 100644
--- a/drivers/hwmon/tmp103.c
+++ b/drivers/hwmon/tmp103.c
@@ -150,8 +150,7 @@ static int tmp103_probe(struct i2c_client *client,
return PTR_ERR_OR_ZERO(hwmon_dev);
}
-#ifdef CONFIG_PM
-static int tmp103_suspend(struct device *dev)
+static int __maybe_unused tmp103_suspend(struct device *dev)
{
struct regmap *regmap = dev_get_drvdata(dev);
@@ -159,7 +158,7 @@ static int tmp103_suspend(struct device *dev)
TMP103_CONF_SD_MASK, 0);
}
-static int tmp103_resume(struct device *dev)
+static int __maybe_unused tmp103_resume(struct device *dev)
{
struct regmap *regmap = dev_get_drvdata(dev);
@@ -167,15 +166,7 @@ static int tmp103_resume(struct device *dev)
TMP103_CONF_SD_MASK, TMP103_CONF_SD);
}
-static const struct dev_pm_ops tmp103_dev_pm_ops = {
- .suspend = tmp103_suspend,
- .resume = tmp103_resume,
-};
-
-#define TMP103_DEV_PM_OPS (&tmp103_dev_pm_ops)
-#else
-#define TMP103_DEV_PM_OPS NULL
-#endif /* CONFIG_PM */
+static SIMPLE_DEV_PM_OPS(tmp103_dev_pm_ops, tmp103_suspend, tmp103_resume);
static const struct i2c_device_id tmp103_id[] = {
{ "tmp103", 0 },
@@ -183,10 +174,17 @@ static const struct i2c_device_id tmp103_id[] = {
};
MODULE_DEVICE_TABLE(i2c, tmp103_id);
+static const struct of_device_id tmp103_of_match[] = {
+ { .compatible = "ti,tmp103" },
+ { },
+};
+MODULE_DEVICE_TABLE(of, tmp103_of_match);
+
static struct i2c_driver tmp103_driver = {
.driver = {
.name = "tmp103",
- .pm = TMP103_DEV_PM_OPS,
+ .of_match_table = of_match_ptr(tmp103_of_match),
+ .pm = &tmp103_dev_pm_ops,
},
.probe = tmp103_probe,
.id_table = tmp103_id,
diff --git a/drivers/hwmon/tmp421.c b/drivers/hwmon/tmp421.c
index bfb98b96c781..e36399213324 100644
--- a/drivers/hwmon/tmp421.c
+++ b/drivers/hwmon/tmp421.c
@@ -29,6 +29,7 @@
#include <linux/hwmon-sysfs.h>
#include <linux/err.h>
#include <linux/mutex.h>
+#include <linux/of_device.h>
#include <linux/sysfs.h>
/* Addresses to scan */
@@ -69,6 +70,31 @@ static const struct i2c_device_id tmp421_id[] = {
};
MODULE_DEVICE_TABLE(i2c, tmp421_id);
+static const struct of_device_id tmp421_of_match[] = {
+ {
+ .compatible = "ti,tmp421",
+ .data = (void *)2
+ },
+ {
+ .compatible = "ti,tmp422",
+ .data = (void *)3
+ },
+ {
+ .compatible = "ti,tmp423",
+ .data = (void *)4
+ },
+ {
+ .compatible = "ti,tmp441",
+ .data = (void *)2
+ },
+ {
+ .compatible = "ti,tmp422",
+ .data = (void *)3
+ },
+ { },
+};
+MODULE_DEVICE_TABLE(of, tmp421_of_match);
+
struct tmp421_data {
struct i2c_client *client;
struct mutex update_lock;
@@ -78,7 +104,7 @@ struct tmp421_data {
struct hwmon_chip_info chip;
char valid;
unsigned long last_updated;
- int channels;
+ unsigned long channels;
u8 config;
s16 temp[4];
};
@@ -272,7 +298,11 @@ static int tmp421_probe(struct i2c_client *client,
return -ENOMEM;
mutex_init(&data->update_lock);
- data->channels = id->driver_data;
+ if (client->dev.of_node)
+ data->channels = (unsigned long)
+ of_device_get_match_data(&client->dev);
+ else
+ data->channels = id->driver_data;
data->client = client;
err = tmp421_init_client(client);
@@ -301,6 +331,7 @@ static struct i2c_driver tmp421_driver = {
.class = I2C_CLASS_HWMON,
.driver = {
.name = "tmp421",
+ .of_match_table = of_match_ptr(tmp421_of_match),
},
.probe = tmp421_probe,
.id_table = tmp421_id,
diff --git a/drivers/hwmon/twl4030-madc-hwmon.c b/drivers/hwmon/twl4030-madc-hwmon.c
deleted file mode 100644
index b5caf7fdb487..000000000000
--- a/drivers/hwmon/twl4030-madc-hwmon.c
+++ /dev/null
@@ -1,118 +0,0 @@
-/*
- *
- * TWL4030 MADC Hwmon driver-This driver monitors the real time
- * conversion of analog signals like battery temperature,
- * battery type, battery level etc. User can ask for the conversion on a
- * particular channel using the sysfs nodes.
- *
- * Copyright (C) 2011 Texas Instruments Incorporated - http://www.ti.com/
- * J Keerthy <j-keerthy@ti.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * version 2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
- * 02110-1301 USA
- *
- */
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/i2c/twl.h>
-#include <linux/device.h>
-#include <linux/platform_device.h>
-#include <linux/i2c/twl4030-madc.h>
-#include <linux/hwmon.h>
-#include <linux/hwmon-sysfs.h>
-#include <linux/stddef.h>
-#include <linux/sysfs.h>
-#include <linux/err.h>
-#include <linux/types.h>
-
-/*
- * sysfs hook function
- */
-static ssize_t madc_read(struct device *dev,
- struct device_attribute *devattr, char *buf)
-{
- struct sensor_device_attribute *attr = to_sensor_dev_attr(devattr);
- struct twl4030_madc_request req = {
- .channels = 1 << attr->index,
- .method = TWL4030_MADC_SW2,
- .type = TWL4030_MADC_WAIT,
- };
- long val;
-
- val = twl4030_madc_conversion(&req);
- if (val < 0)
- return val;
-
- return sprintf(buf, "%d\n", req.rbuf[attr->index]);
-}
-
-/* sysfs nodes to read individual channels from user side */
-static SENSOR_DEVICE_ATTR(in0_input, S_IRUGO, madc_read, NULL, 0);
-static SENSOR_DEVICE_ATTR(temp1_input, S_IRUGO, madc_read, NULL, 1);
-static SENSOR_DEVICE_ATTR(in2_input, S_IRUGO, madc_read, NULL, 2);
-static SENSOR_DEVICE_ATTR(in3_input, S_IRUGO, madc_read, NULL, 3);
-static SENSOR_DEVICE_ATTR(in4_input, S_IRUGO, madc_read, NULL, 4);
-static SENSOR_DEVICE_ATTR(in5_input, S_IRUGO, madc_read, NULL, 5);
-static SENSOR_DEVICE_ATTR(in6_input, S_IRUGO, madc_read, NULL, 6);
-static SENSOR_DEVICE_ATTR(in7_input, S_IRUGO, madc_read, NULL, 7);
-static SENSOR_DEVICE_ATTR(in8_input, S_IRUGO, madc_read, NULL, 8);
-static SENSOR_DEVICE_ATTR(in9_input, S_IRUGO, madc_read, NULL, 9);
-static SENSOR_DEVICE_ATTR(curr10_input, S_IRUGO, madc_read, NULL, 10);
-static SENSOR_DEVICE_ATTR(in11_input, S_IRUGO, madc_read, NULL, 11);
-static SENSOR_DEVICE_ATTR(in12_input, S_IRUGO, madc_read, NULL, 12);
-static SENSOR_DEVICE_ATTR(in15_input, S_IRUGO, madc_read, NULL, 15);
-
-static struct attribute *twl4030_madc_attrs[] = {
- &sensor_dev_attr_in0_input.dev_attr.attr,
- &sensor_dev_attr_temp1_input.dev_attr.attr,
- &sensor_dev_attr_in2_input.dev_attr.attr,
- &sensor_dev_attr_in3_input.dev_attr.attr,
- &sensor_dev_attr_in4_input.dev_attr.attr,
- &sensor_dev_attr_in5_input.dev_attr.attr,
- &sensor_dev_attr_in6_input.dev_attr.attr,
- &sensor_dev_attr_in7_input.dev_attr.attr,
- &sensor_dev_attr_in8_input.dev_attr.attr,
- &sensor_dev_attr_in9_input.dev_attr.attr,
- &sensor_dev_attr_curr10_input.dev_attr.attr,
- &sensor_dev_attr_in11_input.dev_attr.attr,
- &sensor_dev_attr_in12_input.dev_attr.attr,
- &sensor_dev_attr_in15_input.dev_attr.attr,
- NULL
-};
-ATTRIBUTE_GROUPS(twl4030_madc);
-
-static int twl4030_madc_hwmon_probe(struct platform_device *pdev)
-{
- struct device *hwmon;
-
- hwmon = devm_hwmon_device_register_with_groups(&pdev->dev,
- "twl4030_madc", NULL,
- twl4030_madc_groups);
- return PTR_ERR_OR_ZERO(hwmon);
-}
-
-static struct platform_driver twl4030_madc_hwmon_driver = {
- .probe = twl4030_madc_hwmon_probe,
- .driver = {
- .name = "twl4030_madc_hwmon",
- },
-};
-
-module_platform_driver(twl4030_madc_hwmon_driver);
-
-MODULE_DESCRIPTION("TWL4030 ADC Hwmon driver");
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("J Keerthy");
-MODULE_ALIAS("platform:twl4030_madc_hwmon");
diff --git a/drivers/hwmon/w83627ehf.c b/drivers/hwmon/w83627ehf.c
index ab346ed142de..ad68b6d9ff17 100644
--- a/drivers/hwmon/w83627ehf.c
+++ b/drivers/hwmon/w83627ehf.c
@@ -135,11 +135,16 @@ superio_select(int ioreg, int ld)
outb(ld, ioreg + 1);
}
-static inline void
+static inline int
superio_enter(int ioreg)
{
+ if (!request_muxed_region(ioreg, 2, DRVNAME))
+ return -EBUSY;
+
outb(0x87, ioreg);
outb(0x87, ioreg);
+
+ return 0;
}
static inline void
@@ -148,6 +153,7 @@ superio_exit(int ioreg)
outb(0xaa, ioreg);
outb(0x02, ioreg);
outb(0x02, ioreg + 1);
+ release_region(ioreg, 2);
}
/*
@@ -1970,8 +1976,6 @@ w83627ehf_check_fan_inputs(const struct w83627ehf_sio_data *sio_data,
return;
}
- superio_enter(sio_data->sioreg);
-
/* fan4 and fan5 share some pins with the GPIO and serial flash */
if (sio_data->kind == nct6775) {
/* On NCT6775, fan4 shares pins with the fdc interface */
@@ -2013,8 +2017,6 @@ w83627ehf_check_fan_inputs(const struct w83627ehf_sio_data *sio_data,
fan4min = fan4pin;
}
- superio_exit(sio_data->sioreg);
-
data->has_fan = data->has_fan_min = 0x03; /* fan1 and fan2 */
data->has_fan |= (fan3pin << 2);
data->has_fan_min |= (fan3pin << 2);
@@ -2352,7 +2354,11 @@ static int w83627ehf_probe(struct platform_device *pdev)
w83627ehf_init_device(data, sio_data->kind);
data->vrm = vid_which_vrm();
- superio_enter(sio_data->sioreg);
+
+ err = superio_enter(sio_data->sioreg);
+ if (err)
+ goto exit_release;
+
/* Read VID value */
if (sio_data->kind == w83667hg || sio_data->kind == w83667hg_b ||
sio_data->kind == nct6775 || sio_data->kind == nct6776) {
@@ -2364,8 +2370,10 @@ static int w83627ehf_probe(struct platform_device *pdev)
superio_select(sio_data->sioreg, W83667HG_LD_VID);
data->vid = superio_inb(sio_data->sioreg, 0xe3);
err = device_create_file(dev, &dev_attr_cpu0_vid);
- if (err)
+ if (err) {
+ superio_exit(sio_data->sioreg);
goto exit_release;
+ }
} else if (sio_data->kind != w83627uhg) {
superio_select(sio_data->sioreg, W83627EHF_LD_HWM);
if (superio_inb(sio_data->sioreg, SIO_REG_VID_CTRL) & 0x80) {
@@ -2401,8 +2409,10 @@ static int w83627ehf_probe(struct platform_device *pdev)
data->vid &= 0x3f;
err = device_create_file(dev, &dev_attr_cpu0_vid);
- if (err)
+ if (err) {
+ superio_exit(sio_data->sioreg);
goto exit_release;
+ }
} else {
dev_info(dev,
"VID pins in output mode, CPU VID not available\n");
@@ -2424,10 +2434,10 @@ static int w83627ehf_probe(struct platform_device *pdev)
pr_info("Enabled fan debounce for chip %s\n", data->name);
}
- superio_exit(sio_data->sioreg);
-
w83627ehf_check_fan_inputs(sio_data, data);
+ superio_exit(sio_data->sioreg);
+
/* Read fan clock dividers immediately */
w83627ehf_update_fan_div_common(dev, data);
@@ -2712,8 +2722,11 @@ static int __init w83627ehf_find(int sioaddr, unsigned short *addr,
u16 val;
const char *sio_name;
+ int err;
- superio_enter(sioaddr);
+ err = superio_enter(sioaddr);
+ if (err)
+ return err;
if (force_id)
val = force_id;
diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index feb30061123b..5901937284e7 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -107,7 +107,8 @@ int ide_queue_pc_tail(ide_drive_t *drive, struct gendisk *disk,
memcpy(scsi_req(rq)->cmd, pc->c, 12);
if (drive->media == ide_tape)
scsi_req(rq)->cmd[13] = REQ_IDETAPE_PC1;
- error = blk_execute_rq(drive->queue, disk, rq, 0);
+ blk_execute_rq(drive->queue, disk, rq, 0);
+ error = scsi_req(rq)->result ? -EIO : 0;
put_req:
blk_put_request(rq);
return error;
@@ -454,7 +455,7 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
debug_log("%s: I/O error\n", drive->name);
if (drive->media != ide_tape)
- pc->rq->errors++;
+ scsi_req(pc->rq)->result++;
if (scsi_req(rq)->cmd[0] == REQUEST_SENSE) {
printk(KERN_ERR PFX "%s: I/O error in request "
@@ -488,13 +489,13 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
drive->failed_pc = NULL;
if (ata_misc_request(rq)) {
- rq->errors = 0;
+ scsi_req(rq)->result = 0;
error = 0;
} else {
if (blk_rq_is_passthrough(rq) && uptodate <= 0) {
- if (rq->errors == 0)
- rq->errors = -EIO;
+ if (scsi_req(rq)->result == 0)
+ scsi_req(rq)->result = -EIO;
}
error = uptodate ? 0 : -EIO;
diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 74f1b7dc03f7..07e5ff3a64c3 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -247,10 +247,10 @@ static int ide_cd_breathe(ide_drive_t *drive, struct request *rq)
struct cdrom_info *info = drive->driver_data;
- if (!rq->errors)
+ if (!scsi_req(rq)->result)
info->write_timeout = jiffies + ATAPI_WAIT_WRITE_BUSY;
- rq->errors = 1;
+ scsi_req(rq)->result = 1;
if (time_after(jiffies, info->write_timeout))
return 0;
@@ -294,8 +294,8 @@ static int cdrom_decode_status(ide_drive_t *drive, u8 stat)
}
/* if we have an error, pass CHECK_CONDITION as the SCSI status byte */
- if (blk_rq_is_scsi(rq) && !rq->errors)
- rq->errors = SAM_STAT_CHECK_CONDITION;
+ if (blk_rq_is_scsi(rq) && !scsi_req(rq)->result)
+ scsi_req(rq)->result = SAM_STAT_CHECK_CONDITION;
if (blk_noretry_request(rq))
do_end_request = 1;
@@ -325,7 +325,7 @@ static int cdrom_decode_status(ide_drive_t *drive, u8 stat)
* Arrange to retry the request but be sure to give up if we've
* retried too many times.
*/
- if (++rq->errors > ERROR_MAX)
+ if (++scsi_req(rq)->result > ERROR_MAX)
do_end_request = 1;
break;
case ILLEGAL_REQUEST:
@@ -372,7 +372,7 @@ static int cdrom_decode_status(ide_drive_t *drive, u8 stat)
/* go to the default handler for other errors */
ide_error(drive, "cdrom_decode_status", stat);
return 1;
- } else if (++rq->errors > ERROR_MAX)
+ } else if (++scsi_req(rq)->result > ERROR_MAX)
/* we've racked up too many retries, abort */
do_end_request = 1;
}
@@ -452,7 +452,8 @@ int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd,
}
}
- error = blk_execute_rq(drive->queue, info->disk, rq, 0);
+ blk_execute_rq(drive->queue, info->disk, rq, 0);
+ error = scsi_req(rq)->result ? -EIO : 0;
if (buffer)
*bufflen = scsi_req(rq)->resid_len;
@@ -683,8 +684,8 @@ out_end:
if (cmd->nleft == 0)
uptodate = 1;
} else {
- if (uptodate <= 0 && rq->errors == 0)
- rq->errors = -EIO;
+ if (uptodate <= 0 && scsi_req(rq)->result == 0)
+ scsi_req(rq)->result = -EIO;
}
if (uptodate == 0 && rq->bio)
@@ -1379,7 +1380,7 @@ static int ide_cdrom_prep_pc(struct request *rq)
* appropriate action
*/
if (c[0] == MODE_SENSE || c[0] == MODE_SELECT) {
- rq->errors = ILLEGAL_REQUEST;
+ scsi_req(rq)->result = ILLEGAL_REQUEST;
return BLKPREP_KILL;
}
diff --git a/drivers/ide/ide-cd_ioctl.c b/drivers/ide/ide-cd_ioctl.c
index 9fcefbc8425e..55cd736c39c6 100644
--- a/drivers/ide/ide-cd_ioctl.c
+++ b/drivers/ide/ide-cd_ioctl.c
@@ -307,7 +307,8 @@ int ide_cdrom_reset(struct cdrom_device_info *cdi)
scsi_req_init(rq);
ide_req(rq)->type = ATA_PRIV_MISC;
rq->rq_flags = RQF_QUIET;
- ret = blk_execute_rq(drive->queue, cd->disk, rq, 0);
+ blk_execute_rq(drive->queue, cd->disk, rq, 0);
+ ret = scsi_req(rq)->result ? -EIO : 0;
blk_put_request(rq);
/*
* A reset will unlock the door. If it was previously locked,
diff --git a/drivers/ide/ide-devsets.c b/drivers/ide/ide-devsets.c
index a45dda5386e4..9b69c32ee560 100644
--- a/drivers/ide/ide-devsets.c
+++ b/drivers/ide/ide-devsets.c
@@ -173,8 +173,8 @@ int ide_devset_execute(ide_drive_t *drive, const struct ide_devset *setting,
*(int *)&scsi_req(rq)->cmd[1] = arg;
rq->special = setting->set;
- if (blk_execute_rq(q, NULL, rq, 0))
- ret = rq->errors;
+ blk_execute_rq(q, NULL, rq, 0);
+ ret = scsi_req(rq)->result;
blk_put_request(rq);
return ret;
@@ -186,7 +186,7 @@ ide_startstop_t ide_do_devset(ide_drive_t *drive, struct request *rq)
err = setfunc(drive, *(int *)&scsi_req(rq)->cmd[1]);
if (err)
- rq->errors = err;
- ide_complete_rq(drive, err, blk_rq_bytes(rq));
+ scsi_req(rq)->result = err;
+ ide_complete_rq(drive, 0, blk_rq_bytes(rq));
return ide_stopped;
}
diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c
index 186159715b71..7c06237f3479 100644
--- a/drivers/ide/ide-disk.c
+++ b/drivers/ide/ide-disk.c
@@ -470,7 +470,6 @@ ide_devset_get(multcount, mult_count);
static int set_multcount(ide_drive_t *drive, int arg)
{
struct request *rq;
- int error;
if (arg < 0 || arg > (drive->id[ATA_ID_MAX_MULTSECT] & 0xff))
return -EINVAL;
@@ -484,7 +483,7 @@ static int set_multcount(ide_drive_t *drive, int arg)
drive->mult_req = arg;
drive->special_flags |= IDE_SFLAG_SET_MULTMODE;
- error = blk_execute_rq(drive->queue, NULL, rq, 0);
+ blk_execute_rq(drive->queue, NULL, rq, 0);
blk_put_request(rq);
return (drive->mult_count == arg) ? 0 : -EIO;
diff --git a/drivers/ide/ide-dma.c b/drivers/ide/ide-dma.c
index 17a65ac56491..51c81223e56d 100644
--- a/drivers/ide/ide-dma.c
+++ b/drivers/ide/ide-dma.c
@@ -490,7 +490,7 @@ ide_startstop_t ide_dma_timeout_retry(ide_drive_t *drive, int error)
* make sure request is sane
*/
if (hwif->rq)
- hwif->rq->errors = 0;
+ scsi_req(hwif->rq)->result = 0;
return ret;
}
diff --git a/drivers/ide/ide-eh.c b/drivers/ide/ide-eh.c
index cf3af6840368..4b7ffd7d158d 100644
--- a/drivers/ide/ide-eh.c
+++ b/drivers/ide/ide-eh.c
@@ -12,7 +12,7 @@ static ide_startstop_t ide_ata_error(ide_drive_t *drive, struct request *rq,
if ((stat & ATA_BUSY) ||
((stat & ATA_DF) && (drive->dev_flags & IDE_DFLAG_NOWERR) == 0)) {
/* other bits are useless when BUSY */
- rq->errors |= ERROR_RESET;
+ scsi_req(rq)->result |= ERROR_RESET;
} else if (stat & ATA_ERR) {
/* err has different meaning on cdrom and tape */
if (err == ATA_ABORTED) {
@@ -25,10 +25,10 @@ static ide_startstop_t ide_ata_error(ide_drive_t *drive, struct request *rq,
drive->crc_count++;
} else if (err & (ATA_BBK | ATA_UNC)) {
/* retries won't help these */
- rq->errors = ERROR_MAX;
+ scsi_req(rq)->result = ERROR_MAX;
} else if (err & ATA_TRK0NF) {
/* help it find track zero */
- rq->errors |= ERROR_RECAL;
+ scsi_req(rq)->result |= ERROR_RECAL;
}
}
@@ -39,23 +39,23 @@ static ide_startstop_t ide_ata_error(ide_drive_t *drive, struct request *rq,
ide_pad_transfer(drive, READ, nsect * SECTOR_SIZE);
}
- if (rq->errors >= ERROR_MAX || blk_noretry_request(rq)) {
+ if (scsi_req(rq)->result >= ERROR_MAX || blk_noretry_request(rq)) {
ide_kill_rq(drive, rq);
return ide_stopped;
}
if (hwif->tp_ops->read_status(hwif) & (ATA_BUSY | ATA_DRQ))
- rq->errors |= ERROR_RESET;
+ scsi_req(rq)->result |= ERROR_RESET;
- if ((rq->errors & ERROR_RESET) == ERROR_RESET) {
- ++rq->errors;
+ if ((scsi_req(rq)->result & ERROR_RESET) == ERROR_RESET) {
+ ++scsi_req(rq)->result;
return ide_do_reset(drive);
}
- if ((rq->errors & ERROR_RECAL) == ERROR_RECAL)
+ if ((scsi_req(rq)->result & ERROR_RECAL) == ERROR_RECAL)
drive->special_flags |= IDE_SFLAG_RECALIBRATE;
- ++rq->errors;
+ ++scsi_req(rq)->result;
return ide_stopped;
}
@@ -68,7 +68,7 @@ static ide_startstop_t ide_atapi_error(ide_drive_t *drive, struct request *rq,
if ((stat & ATA_BUSY) ||
((stat & ATA_DF) && (drive->dev_flags & IDE_DFLAG_NOWERR) == 0)) {
/* other bits are useless when BUSY */
- rq->errors |= ERROR_RESET;
+ scsi_req(rq)->result |= ERROR_RESET;
} else {
/* add decoding error stuff */
}
@@ -77,14 +77,14 @@ static ide_startstop_t ide_atapi_error(ide_drive_t *drive, struct request *rq,
/* force an abort */
hwif->tp_ops->exec_command(hwif, ATA_CMD_IDLEIMMEDIATE);
- if (rq->errors >= ERROR_MAX) {
+ if (scsi_req(rq)->result >= ERROR_MAX) {
ide_kill_rq(drive, rq);
} else {
- if ((rq->errors & ERROR_RESET) == ERROR_RESET) {
- ++rq->errors;
+ if ((scsi_req(rq)->result & ERROR_RESET) == ERROR_RESET) {
+ ++scsi_req(rq)->result;
return ide_do_reset(drive);
}
- ++rq->errors;
+ ++scsi_req(rq)->result;
}
return ide_stopped;
@@ -130,11 +130,11 @@ ide_startstop_t ide_error(ide_drive_t *drive, const char *msg, u8 stat)
if (cmd)
ide_complete_cmd(drive, cmd, stat, err);
} else if (ata_pm_request(rq)) {
- rq->errors = 1;
+ scsi_req(rq)->result = 1;
ide_complete_pm_rq(drive, rq);
return ide_stopped;
}
- rq->errors = err;
+ scsi_req(rq)->result = err;
ide_complete_rq(drive, err ? -EIO : 0, blk_rq_bytes(rq));
return ide_stopped;
}
@@ -149,8 +149,8 @@ static inline void ide_complete_drive_reset(ide_drive_t *drive, int err)
if (rq && ata_misc_request(rq) &&
scsi_req(rq)->cmd[0] == REQ_DRIVE_RESET) {
- if (err <= 0 && rq->errors == 0)
- rq->errors = -EIO;
+ if (err <= 0 && scsi_req(rq)->result == 0)
+ scsi_req(rq)->result = -EIO;
ide_complete_rq(drive, err ? err : 0, blk_rq_bytes(rq));
}
}
diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c
index a69e8013f1df..8ac6048cd2df 100644
--- a/drivers/ide/ide-floppy.c
+++ b/drivers/ide/ide-floppy.c
@@ -98,7 +98,7 @@ static int ide_floppy_callback(ide_drive_t *drive, int dsc)
}
if (ata_misc_request(rq))
- rq->errors = uptodate ? 0 : IDE_DRV_ERROR_GENERAL;
+ scsi_req(rq)->result = uptodate ? 0 : IDE_DRV_ERROR_GENERAL;
return uptodate;
}
@@ -239,7 +239,7 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive,
? rq->rq_disk->disk_name
: "dev?"));
- if (rq->errors >= ERROR_MAX) {
+ if (scsi_req(rq)->result >= ERROR_MAX) {
if (drive->failed_pc) {
ide_floppy_report_error(floppy, drive->failed_pc);
drive->failed_pc = NULL;
@@ -247,7 +247,7 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive,
printk(KERN_ERR PFX "%s: I/O error\n", drive->name);
if (ata_misc_request(rq)) {
- rq->errors = 0;
+ scsi_req(rq)->result = 0;
ide_complete_rq(drive, 0, blk_rq_bytes(rq));
return ide_stopped;
} else
@@ -301,8 +301,8 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive,
return ide_floppy_issue_pc(drive, &cmd, pc);
out_end:
drive->failed_pc = NULL;
- if (blk_rq_is_passthrough(rq) && rq->errors == 0)
- rq->errors = -EIO;
+ if (blk_rq_is_passthrough(rq) && scsi_req(rq)->result == 0)
+ scsi_req(rq)->result = -EIO;
ide_complete_rq(drive, -EIO, blk_rq_bytes(rq));
return ide_stopped;
}
diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index 043b1fb963cb..45b3f41a43d4 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -141,12 +141,12 @@ void ide_kill_rq(ide_drive_t *drive, struct request *rq)
drive->failed_pc = NULL;
if ((media == ide_floppy || media == ide_tape) && drv_req) {
- rq->errors = 0;
+ scsi_req(rq)->result = 0;
} else {
if (media == ide_tape)
- rq->errors = IDE_DRV_ERROR_GENERAL;
- else if (blk_rq_is_passthrough(rq) && rq->errors == 0)
- rq->errors = -EIO;
+ scsi_req(rq)->result = IDE_DRV_ERROR_GENERAL;
+ else if (blk_rq_is_passthrough(rq) && scsi_req(rq)->result == 0)
+ scsi_req(rq)->result = -EIO;
}
ide_complete_rq(drive, -EIO, blk_rq_bytes(rq));
@@ -271,7 +271,7 @@ static ide_startstop_t execute_drive_cmd (ide_drive_t *drive,
#ifdef DEBUG
printk("%s: DRIVE_CMD (null)\n", drive->name);
#endif
- rq->errors = 0;
+ scsi_req(rq)->result = 0;
ide_complete_rq(drive, 0, blk_rq_bytes(rq));
return ide_stopped;
diff --git a/drivers/ide/ide-ioctls.c b/drivers/ide/ide-ioctls.c
index 248a3e0ceb46..8c0d17297a7a 100644
--- a/drivers/ide/ide-ioctls.c
+++ b/drivers/ide/ide-ioctls.c
@@ -128,7 +128,8 @@ static int ide_cmd_ioctl(ide_drive_t *drive, unsigned long arg)
rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, __GFP_RECLAIM);
scsi_req_init(rq);
ide_req(rq)->type = ATA_PRIV_TASKFILE;
- err = blk_execute_rq(drive->queue, NULL, rq, 0);
+ blk_execute_rq(drive->queue, NULL, rq, 0);
+ err = scsi_req(rq)->result ? -EIO : 0;
blk_put_request(rq);
return err;
@@ -227,8 +228,8 @@ static int generic_drive_reset(ide_drive_t *drive)
ide_req(rq)->type = ATA_PRIV_MISC;
scsi_req(rq)->cmd_len = 1;
scsi_req(rq)->cmd[0] = REQ_DRIVE_RESET;
- if (blk_execute_rq(drive->queue, NULL, rq, 1))
- ret = rq->errors;
+ blk_execute_rq(drive->queue, NULL, rq, 1);
+ ret = scsi_req(rq)->result;
blk_put_request(rq);
return ret;
}
diff --git a/drivers/ide/ide-park.c b/drivers/ide/ide-park.c
index 101aed9a61ca..94e3107f59b9 100644
--- a/drivers/ide/ide-park.c
+++ b/drivers/ide/ide-park.c
@@ -37,7 +37,8 @@ static void issue_park_cmd(ide_drive_t *drive, unsigned long timeout)
scsi_req(rq)->cmd_len = 1;
ide_req(rq)->type = ATA_PRIV_MISC;
rq->special = &timeout;
- rc = blk_execute_rq(q, NULL, rq, 1);
+ blk_execute_rq(q, NULL, rq, 1);
+ rc = scsi_req(rq)->result ? -EIO : 0;
blk_put_request(rq);
if (rc)
goto out;
diff --git a/drivers/ide/ide-pm.c b/drivers/ide/ide-pm.c
index ec951be4b0c8..0977fc1f40ce 100644
--- a/drivers/ide/ide-pm.c
+++ b/drivers/ide/ide-pm.c
@@ -27,7 +27,8 @@ int generic_ide_suspend(struct device *dev, pm_message_t mesg)
mesg.event = PM_EVENT_FREEZE;
rqpm.pm_state = mesg.event;
- ret = blk_execute_rq(drive->queue, NULL, rq, 0);
+ blk_execute_rq(drive->queue, NULL, rq, 0);
+ ret = scsi_req(rq)->result ? -EIO : 0;
blk_put_request(rq);
if (ret == 0 && ide_port_acpi(hwif)) {
@@ -55,8 +56,8 @@ static int ide_pm_execute_rq(struct request *rq)
spin_lock_irq(q->queue_lock);
if (unlikely(blk_queue_dying(q))) {
rq->rq_flags |= RQF_QUIET;
- rq->errors = -ENXIO;
- __blk_end_request_all(rq, rq->errors);
+ scsi_req(rq)->result = -ENXIO;
+ __blk_end_request_all(rq, 0);
spin_unlock_irq(q->queue_lock);
return -ENXIO;
}
@@ -66,7 +67,7 @@ static int ide_pm_execute_rq(struct request *rq)
wait_for_completion_io(&wait);
- return rq->errors ? -EIO : 0;
+ return scsi_req(rq)->result ? -EIO : 0;
}
int generic_ide_resume(struct device *dev)
diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index d8a552b47718..a0651f948b76 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -366,7 +366,7 @@ static int ide_tape_callback(ide_drive_t *drive, int dsc)
err = pc->error;
}
}
- rq->errors = err;
+ scsi_req(rq)->result = err;
return uptodate;
}
@@ -879,7 +879,7 @@ static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int size)
tape->valid = 0;
ret = size;
- if (rq->errors == IDE_DRV_ERROR_GENERAL)
+ if (scsi_req(rq)->result == IDE_DRV_ERROR_GENERAL)
ret = -EIO;
out_put:
blk_put_request(rq);
diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c
index 4c0007cb74e3..d71199d23c9e 100644
--- a/drivers/ide/ide-taskfile.c
+++ b/drivers/ide/ide-taskfile.c
@@ -287,7 +287,7 @@ static void ide_pio_datablock(ide_drive_t *drive, struct ide_cmd *cmd,
u8 saved_io_32bit = drive->io_32bit;
if (cmd->tf_flags & IDE_TFLAG_FS)
- cmd->rq->errors = 0;
+ scsi_req(cmd->rq)->result = 0;
if (cmd->tf_flags & IDE_TFLAG_IO_16BIT)
drive->io_32bit = 0;
@@ -329,7 +329,7 @@ void ide_finish_cmd(ide_drive_t *drive, struct ide_cmd *cmd, u8 stat)
u8 set_xfer = !!(cmd->tf_flags & IDE_TFLAG_SET_XFER);
ide_complete_cmd(drive, cmd, stat, err);
- rq->errors = err;
+ scsi_req(rq)->result = err;
if (err == 0 && set_xfer) {
ide_set_xfer_rate(drive, nsect);
@@ -452,8 +452,8 @@ int ide_raw_taskfile(ide_drive_t *drive, struct ide_cmd *cmd, u8 *buf,
rq->special = cmd;
cmd->rq = rq;
- error = blk_execute_rq(drive->queue, NULL, rq, 0);
-
+ blk_execute_rq(drive->queue, NULL, rq, 0);
+ error = scsi_req(rq)->result ? -EIO : 0;
put_req:
blk_put_request(rq);
return error;
diff --git a/drivers/input/serio/i8042-x86ia64io.h b/drivers/input/serio/i8042-x86ia64io.h
index 312bd6ca9198..09720d950686 100644
--- a/drivers/input/serio/i8042-x86ia64io.h
+++ b/drivers/input/serio/i8042-x86ia64io.h
@@ -620,6 +620,13 @@ static const struct dmi_system_id __initconst i8042_dmi_reset_table[] = {
DMI_MATCH(DMI_PRODUCT_NAME, "20046"),
},
},
+ {
+ /* Clevo P650RS, 650RP6, Sager NP8152-S, and others */
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Notebook"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "P65xRP"),
+ },
+ },
{ }
};
diff --git a/drivers/leds/Kconfig b/drivers/leds/Kconfig
index 275f467956ee..6c2999872090 100644
--- a/drivers/leds/Kconfig
+++ b/drivers/leds/Kconfig
@@ -76,6 +76,15 @@ config LEDS_BCM6358
This option enables support for LEDs connected to the BCM6358
LED HW controller accessed via MMIO registers.
+config LEDS_CPCAP
+ tristate "LED Support for Motorola CPCAP"
+ depends on LEDS_CLASS
+ depends on MFD_CPCAP
+ depends on OF
+ help
+ This option enables support for LEDs offered by Motorola's
+ CPCAP PMIC.
+
config LEDS_LM3530
tristate "LCD Backlight driver for LM3530"
depends on LEDS_CLASS
@@ -126,6 +135,14 @@ config LEDS_MIKROTIK_RB532
This option enables support for the so called "User LED" of
Mikrotik's Routerboard 532.
+config LEDS_MT6323
+ tristate "LED Support for Mediatek MT6323 PMIC"
+ depends on LEDS_CLASS
+ depends on MFD_MT6397
+ help
+ This option enables support for on-chip LED drivers found on
+ Mediatek MT6323 PMIC.
+
config LEDS_S3C24XX
tristate "LED Support for Samsung S3C24XX GPIO LEDs"
depends on LEDS_CLASS
@@ -241,7 +258,6 @@ config LEDS_LP3952
tristate "LED Support for TI LP3952 2 channel LED driver"
depends on LEDS_CLASS
depends on I2C
- depends on ACPI
depends on GPIOLIB
select REGMAP_I2C
help
@@ -463,15 +479,6 @@ config LEDS_ADP5520
To compile this driver as a module, choose M here: the module will
be called leds-adp5520.
-config LEDS_DELL_NETBOOKS
- tristate "External LED on Dell Business Netbooks"
- depends on LEDS_CLASS
- depends on X86 && ACPI_WMI
- depends on DELL_SMBIOS
- help
- This adds support for the Latitude 2100 and similar
- notebooks that have an external LED.
-
config LEDS_MC13783
tristate "LED Support for MC13XXX PMIC"
depends on LEDS_CLASS
diff --git a/drivers/leds/Makefile b/drivers/leds/Makefile
index 6b8273736478..45f133962ed8 100644
--- a/drivers/leds/Makefile
+++ b/drivers/leds/Makefile
@@ -11,6 +11,7 @@ obj-$(CONFIG_LEDS_AAT1290) += leds-aat1290.o
obj-$(CONFIG_LEDS_BCM6328) += leds-bcm6328.o
obj-$(CONFIG_LEDS_BCM6358) += leds-bcm6358.o
obj-$(CONFIG_LEDS_BD2802) += leds-bd2802.o
+obj-$(CONFIG_LEDS_CPCAP) += leds-cpcap.o
obj-$(CONFIG_LEDS_LOCOMO) += leds-locomo.o
obj-$(CONFIG_LEDS_LM3530) += leds-lm3530.o
obj-$(CONFIG_LEDS_LM3533) += leds-lm3533.o
@@ -52,7 +53,6 @@ obj-$(CONFIG_LEDS_REGULATOR) += leds-regulator.o
obj-$(CONFIG_LEDS_INTEL_SS4200) += leds-ss4200.o
obj-$(CONFIG_LEDS_LT3593) += leds-lt3593.o
obj-$(CONFIG_LEDS_ADP5520) += leds-adp5520.o
-obj-$(CONFIG_LEDS_DELL_NETBOOKS) += dell-led.o
obj-$(CONFIG_LEDS_MC13783) += leds-mc13783.o
obj-$(CONFIG_LEDS_NS2) += leds-ns2.o
obj-$(CONFIG_LEDS_NETXBIG) += leds-netxbig.o
@@ -72,6 +72,7 @@ obj-$(CONFIG_LEDS_IS31FL32XX) += leds-is31fl32xx.o
obj-$(CONFIG_LEDS_PM8058) += leds-pm8058.o
obj-$(CONFIG_LEDS_MLXCPLD) += leds-mlxcpld.o
obj-$(CONFIG_LEDS_NIC78BX) += leds-nic78bx.o
+obj-$(CONFIG_LEDS_MT6323) += leds-mt6323.o
# LED SPI Drivers
obj-$(CONFIG_LEDS_DAC124S085) += leds-dac124s085.o
diff --git a/drivers/leds/led-class.c b/drivers/leds/led-class.c
index f2b0a80a62b4..b0e2d55acbd6 100644
--- a/drivers/leds/led-class.c
+++ b/drivers/leds/led-class.c
@@ -244,11 +244,14 @@ static int led_classdev_next_name(const char *init_name, char *name,
}
/**
- * led_classdev_register - register a new object of led_classdev class.
- * @parent: The device to register.
+ * of_led_classdev_register - register a new object of led_classdev class.
+ *
+ * @parent: parent of LED device
* @led_cdev: the led_classdev structure for this device.
+ * @np: DT node describing this LED
*/
-int led_classdev_register(struct device *parent, struct led_classdev *led_cdev)
+int of_led_classdev_register(struct device *parent, struct device_node *np,
+ struct led_classdev *led_cdev)
{
char name[LED_MAX_NAME_SIZE];
int ret;
@@ -261,6 +264,7 @@ int led_classdev_register(struct device *parent, struct led_classdev *led_cdev)
led_cdev, led_cdev->groups, "%s", name);
if (IS_ERR(led_cdev->dev))
return PTR_ERR(led_cdev->dev);
+ led_cdev->dev->of_node = np;
if (ret)
dev_warn(parent, "Led %s renamed to %s due to name collision",
@@ -303,7 +307,7 @@ int led_classdev_register(struct device *parent, struct led_classdev *led_cdev)
return 0;
}
-EXPORT_SYMBOL_GPL(led_classdev_register);
+EXPORT_SYMBOL_GPL(of_led_classdev_register);
/**
* led_classdev_unregister - unregisters a object of led_properties class.
@@ -348,12 +352,14 @@ static void devm_led_classdev_release(struct device *dev, void *res)
}
/**
- * devm_led_classdev_register - resource managed led_classdev_register()
- * @parent: The device to register.
+ * devm_of_led_classdev_register - resource managed led_classdev_register()
+ *
+ * @parent: parent of LED device
* @led_cdev: the led_classdev structure for this device.
*/
-int devm_led_classdev_register(struct device *parent,
- struct led_classdev *led_cdev)
+int devm_of_led_classdev_register(struct device *parent,
+ struct device_node *np,
+ struct led_classdev *led_cdev)
{
struct led_classdev **dr;
int rc;
@@ -362,7 +368,7 @@ int devm_led_classdev_register(struct device *parent,
if (!dr)
return -ENOMEM;
- rc = led_classdev_register(parent, led_cdev);
+ rc = of_led_classdev_register(parent, np, led_cdev);
if (rc) {
devres_free(dr);
return rc;
@@ -373,7 +379,7 @@ int devm_led_classdev_register(struct device *parent,
return 0;
}
-EXPORT_SYMBOL_GPL(devm_led_classdev_register);
+EXPORT_SYMBOL_GPL(devm_of_led_classdev_register);
static int devm_led_classdev_match(struct device *dev, void *res, void *data)
{
diff --git a/drivers/leds/leds-cpcap.c b/drivers/leds/leds-cpcap.c
new file mode 100644
index 000000000000..f0f28c442807
--- /dev/null
+++ b/drivers/leds/leds-cpcap.c
@@ -0,0 +1,239 @@
+/*
+ * Copyright (c) 2017 Sebastian Reichel <sre@kernel.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 or
+ * later as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/leds.h>
+#include <linux/mfd/motorola-cpcap.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/of_device.h>
+#include <linux/platform_device.h>
+#include <linux/regmap.h>
+#include <linux/regulator/consumer.h>
+
+#define CPCAP_LED_NO_CURRENT 0x0001
+
+struct cpcap_led_info {
+ u16 reg;
+ u16 mask;
+ u16 limit;
+ u16 init_mask;
+ u16 init_val;
+};
+
+static const struct cpcap_led_info cpcap_led_red = {
+ .reg = CPCAP_REG_REDC,
+ .mask = 0x03FF,
+ .limit = 31,
+};
+
+static const struct cpcap_led_info cpcap_led_green = {
+ .reg = CPCAP_REG_GREENC,
+ .mask = 0x03FF,
+ .limit = 31,
+};
+
+static const struct cpcap_led_info cpcap_led_blue = {
+ .reg = CPCAP_REG_BLUEC,
+ .mask = 0x03FF,
+ .limit = 31,
+};
+
+/* aux display light */
+static const struct cpcap_led_info cpcap_led_adl = {
+ .reg = CPCAP_REG_ADLC,
+ .mask = 0x000F,
+ .limit = 1,
+ .init_mask = 0x7FFF,
+ .init_val = 0x5FF0,
+};
+
+/* camera privacy led */
+static const struct cpcap_led_info cpcap_led_cp = {
+ .reg = CPCAP_REG_CLEDC,
+ .mask = 0x0007,
+ .limit = 1,
+ .init_mask = 0x03FF,
+ .init_val = 0x0008,
+};
+
+struct cpcap_led {
+ struct led_classdev led;
+ const struct cpcap_led_info *info;
+ struct device *dev;
+ struct regmap *regmap;
+ struct mutex update_lock;
+ struct regulator *vdd;
+ bool powered;
+
+ u32 current_limit;
+};
+
+static u16 cpcap_led_val(u8 current_limit, u8 duty_cycle)
+{
+ current_limit &= 0x1f; /* 5 bit */
+ duty_cycle &= 0x0f; /* 4 bit */
+
+ return current_limit << 4 | duty_cycle;
+}
+
+static int cpcap_led_set_power(struct cpcap_led *led, bool status)
+{
+ int err;
+
+ if (status == led->powered)
+ return 0;
+
+ if (status)
+ err = regulator_enable(led->vdd);
+ else
+ err = regulator_disable(led->vdd);
+
+ if (err) {
+ dev_err(led->dev, "regulator failure: %d", err);
+ return err;
+ }
+
+ led->powered = status;
+
+ return 0;
+}
+
+static int cpcap_led_set(struct led_classdev *ledc, enum led_brightness value)
+{
+ struct cpcap_led *led = container_of(ledc, struct cpcap_led, led);
+ int brightness;
+ int err;
+
+ mutex_lock(&led->update_lock);
+
+ if (value > LED_OFF) {
+ err = cpcap_led_set_power(led, true);
+ if (err)
+ goto exit;
+ }
+
+ if (value == LED_OFF) {
+ /* Avoid HW issue by turning off current before duty cycle */
+ err = regmap_update_bits(led->regmap,
+ led->info->reg, led->info->mask, CPCAP_LED_NO_CURRENT);
+ if (err) {
+ dev_err(led->dev, "regmap failed: %d", err);
+ goto exit;
+ }
+
+ brightness = cpcap_led_val(value, LED_OFF);
+ } else {
+ brightness = cpcap_led_val(value, LED_ON);
+ }
+
+ err = regmap_update_bits(led->regmap, led->info->reg, led->info->mask,
+ brightness);
+ if (err) {
+ dev_err(led->dev, "regmap failed: %d", err);
+ goto exit;
+ }
+
+ if (value == LED_OFF) {
+ err = cpcap_led_set_power(led, false);
+ if (err)
+ goto exit;
+ }
+
+exit:
+ mutex_unlock(&led->update_lock);
+ return err;
+}
+
+static const struct of_device_id cpcap_led_of_match[] = {
+ { .compatible = "motorola,cpcap-led-red", .data = &cpcap_led_red },
+ { .compatible = "motorola,cpcap-led-green", .data = &cpcap_led_green },
+ { .compatible = "motorola,cpcap-led-blue", .data = &cpcap_led_blue },
+ { .compatible = "motorola,cpcap-led-adl", .data = &cpcap_led_adl },
+ { .compatible = "motorola,cpcap-led-cp", .data = &cpcap_led_cp },
+ {},
+};
+MODULE_DEVICE_TABLE(of, cpcap_led_of_match);
+
+static int cpcap_led_probe(struct platform_device *pdev)
+{
+ const struct of_device_id *match;
+ struct cpcap_led *led;
+ int err;
+
+ match = of_match_device(of_match_ptr(cpcap_led_of_match), &pdev->dev);
+ if (!match || !match->data)
+ return -EINVAL;
+
+ led = devm_kzalloc(&pdev->dev, sizeof(*led), GFP_KERNEL);
+ if (!led)
+ return -ENOMEM;
+ platform_set_drvdata(pdev, led);
+ led->info = match->data;
+ led->dev = &pdev->dev;
+
+ if (led->info->reg == 0x0000) {
+ dev_err(led->dev, "Unsupported LED");
+ return -ENODEV;
+ }
+
+ led->regmap = dev_get_regmap(pdev->dev.parent, NULL);
+ if (!led->regmap)
+ return -ENODEV;
+
+ led->vdd = devm_regulator_get(&pdev->dev, "vdd");
+ if (IS_ERR(led->vdd)) {
+ err = PTR_ERR(led->vdd);
+ dev_err(led->dev, "Couldn't get regulator: %d", err);
+ return err;
+ }
+
+ err = device_property_read_string(&pdev->dev, "label", &led->led.name);
+ if (err) {
+ dev_err(led->dev, "Couldn't read LED label: %d", err);
+ return err;
+ }
+
+ if (led->info->init_mask) {
+ err = regmap_update_bits(led->regmap, led->info->reg,
+ led->info->init_mask, led->info->init_val);
+ if (err) {
+ dev_err(led->dev, "regmap failed: %d", err);
+ return err;
+ }
+ }
+
+ mutex_init(&led->update_lock);
+
+ led->led.max_brightness = led->info->limit;
+ led->led.brightness_set_blocking = cpcap_led_set;
+ err = devm_led_classdev_register(&pdev->dev, &led->led);
+ if (err) {
+ dev_err(led->dev, "Couldn't register LED: %d", err);
+ return err;
+ }
+
+ return 0;
+}
+
+static struct platform_driver cpcap_led_driver = {
+ .probe = cpcap_led_probe,
+ .driver = {
+ .name = "cpcap-led",
+ .of_match_table = cpcap_led_of_match,
+ },
+};
+module_platform_driver(cpcap_led_driver);
+
+MODULE_DESCRIPTION("CPCAP LED driver");
+MODULE_AUTHOR("Sebastian Reichel <sre@kernel.org>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/leds/leds-gpio.c b/drivers/leds/leds-gpio.c
index 066fc7590729..e753ba93ba1e 100644
--- a/drivers/leds/leds-gpio.c
+++ b/drivers/leds/leds-gpio.c
@@ -77,7 +77,7 @@ static int gpio_blink_set(struct led_classdev *led_cdev,
static int create_gpio_led(const struct gpio_led *template,
struct gpio_led_data *led_dat, struct device *parent,
- gpio_blink_set_t blink_set)
+ struct device_node *np, gpio_blink_set_t blink_set)
{
int ret, state;
@@ -139,7 +139,7 @@ static int create_gpio_led(const struct gpio_led *template,
if (ret < 0)
return ret;
- return devm_led_classdev_register(parent, &led_dat->cdev);
+ return devm_of_led_classdev_register(parent, np, &led_dat->cdev);
}
struct gpio_leds_priv {
@@ -208,7 +208,7 @@ static struct gpio_leds_priv *gpio_leds_create(struct platform_device *pdev)
if (fwnode_property_present(child, "panic-indicator"))
led.panic_indicator = 1;
- ret = create_gpio_led(&led, led_dat, dev, NULL);
+ ret = create_gpio_led(&led, led_dat, dev, np, NULL);
if (ret < 0) {
fwnode_handle_put(child);
return ERR_PTR(ret);
@@ -242,9 +242,9 @@ static int gpio_led_probe(struct platform_device *pdev)
priv->num_leds = pdata->num_leds;
for (i = 0; i < priv->num_leds; i++) {
- ret = create_gpio_led(&pdata->leds[i],
- &priv->leds[i],
- &pdev->dev, pdata->gpio_blink_set);
+ ret = create_gpio_led(&pdata->leds[i], &priv->leds[i],
+ &pdev->dev, NULL,
+ pdata->gpio_blink_set);
if (ret < 0)
return ret;
}
diff --git a/drivers/leds/leds-lp3952.c b/drivers/leds/leds-lp3952.c
index 4847e89883a7..847f7f282126 100644
--- a/drivers/leds/leds-lp3952.c
+++ b/drivers/leds/leds-lp3952.c
@@ -10,7 +10,6 @@
*
*/
-#include <linux/acpi.h>
#include <linux/delay.h>
#include <linux/gpio.h>
#include <linux/i2c.h>
@@ -103,10 +102,11 @@ static int lp3952_get_label(struct device *dev, const char *label, char *dest)
const char *str;
ret = device_property_read_string(dev, label, &str);
- if (!ret)
- strncpy(dest, str, LP3952_LABEL_MAX_LEN);
+ if (ret)
+ return ret;
- return ret;
+ strncpy(dest, str, LP3952_LABEL_MAX_LEN);
+ return 0;
}
static int lp3952_register_led_classdev(struct lp3952_led_array *priv)
@@ -276,19 +276,9 @@ static const struct i2c_device_id lp3952_id[] = {
};
MODULE_DEVICE_TABLE(i2c, lp3952_id);
-#ifdef CONFIG_ACPI
-static const struct acpi_device_id lp3952_acpi_match[] = {
- {"TXNW3952", 0},
- {}
-};
-
-MODULE_DEVICE_TABLE(acpi, lp3952_acpi_match);
-#endif
-
static struct i2c_driver lp3952_i2c_driver = {
.driver = {
.name = LP3952_NAME,
- .acpi_match_table = ACPI_PTR(lp3952_acpi_match),
},
.probe = lp3952_probe,
.remove = lp3952_remove,
diff --git a/drivers/leds/leds-mt6323.c b/drivers/leds/leds-mt6323.c
new file mode 100644
index 000000000000..8893c74e9a1f
--- /dev/null
+++ b/drivers/leds/leds-mt6323.c
@@ -0,0 +1,502 @@
+/*
+ * LED driver for Mediatek MT6323 PMIC
+ *
+ * Copyright (C) 2017 Sean Wang <sean.wang@mediatek.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+#include <linux/kernel.h>
+#include <linux/leds.h>
+#include <linux/mfd/mt6323/registers.h>
+#include <linux/mfd/mt6397/core.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/platform_device.h>
+#include <linux/regmap.h>
+
+/*
+ * Register field for MT6323_TOP_CKPDN0 to enable
+ * 32K clock common for LED device.
+ */
+#define MT6323_RG_DRV_32K_CK_PDN BIT(11)
+#define MT6323_RG_DRV_32K_CK_PDN_MASK BIT(11)
+
+/*
+ * Register field for MT6323_TOP_CKPDN2 to enable
+ * individual clock for LED device.
+ */
+#define MT6323_RG_ISINK_CK_PDN(i) BIT(i)
+#define MT6323_RG_ISINK_CK_PDN_MASK(i) BIT(i)
+
+/*
+ * Register field for MT6323_TOP_CKCON1 to select
+ * clock source.
+ */
+#define MT6323_RG_ISINK_CK_SEL_MASK(i) (BIT(10) << (i))
+
+/*
+ * Register for MT6323_ISINK_CON0 to setup the
+ * duty cycle of the blink.
+ */
+#define MT6323_ISINK_CON0(i) (MT6323_ISINK0_CON0 + 0x8 * (i))
+#define MT6323_ISINK_DIM_DUTY_MASK (0x1f << 8)
+#define MT6323_ISINK_DIM_DUTY(i) (((i) << 8) & \
+ MT6323_ISINK_DIM_DUTY_MASK)
+
+/* Register to setup the period of the blink. */
+#define MT6323_ISINK_CON1(i) (MT6323_ISINK0_CON1 + 0x8 * (i))
+#define MT6323_ISINK_DIM_FSEL_MASK (0xffff)
+#define MT6323_ISINK_DIM_FSEL(i) ((i) & MT6323_ISINK_DIM_FSEL_MASK)
+
+/* Register to control the brightness. */
+#define MT6323_ISINK_CON2(i) (MT6323_ISINK0_CON2 + 0x8 * (i))
+#define MT6323_ISINK_CH_STEP_SHIFT 12
+#define MT6323_ISINK_CH_STEP_MASK (0x7 << 12)
+#define MT6323_ISINK_CH_STEP(i) (((i) << 12) & \
+ MT6323_ISINK_CH_STEP_MASK)
+#define MT6323_ISINK_SFSTR0_TC_MASK (0x3 << 1)
+#define MT6323_ISINK_SFSTR0_TC(i) (((i) << 1) & \
+ MT6323_ISINK_SFSTR0_TC_MASK)
+#define MT6323_ISINK_SFSTR0_EN_MASK BIT(0)
+#define MT6323_ISINK_SFSTR0_EN BIT(0)
+
+/* Register to LED channel enablement. */
+#define MT6323_ISINK_CH_EN_MASK(i) BIT(i)
+#define MT6323_ISINK_CH_EN(i) BIT(i)
+
+#define MT6323_MAX_PERIOD 10000
+#define MT6323_MAX_LEDS 4
+#define MT6323_MAX_BRIGHTNESS 6
+#define MT6323_UNIT_DUTY 3125
+#define MT6323_CAL_HW_DUTY(o, p) DIV_ROUND_CLOSEST((o) * 100000ul,\
+ (p) * MT6323_UNIT_DUTY)
+
+struct mt6323_leds;
+
+/**
+ * struct mt6323_led - state container for the LED device
+ * @id: the identifier in MT6323 LED device
+ * @parent: the pointer to MT6323 LED controller
+ * @cdev: LED class device for this LED device
+ * @current_brightness: current state of the LED device
+ */
+struct mt6323_led {
+ int id;
+ struct mt6323_leds *parent;
+ struct led_classdev cdev;
+ enum led_brightness current_brightness;
+};
+
+/**
+ * struct mt6323_leds - state container for holding LED controller
+ * of the driver
+ * @dev: the device pointer
+ * @hw: the underlying hardware providing shared
+ * bus for the register operations
+ * @lock: the lock among process context
+ * @led: the array that contains the state of individual
+ * LED device
+ */
+struct mt6323_leds {
+ struct device *dev;
+ struct mt6397_chip *hw;
+ /* protect among process context */
+ struct mutex lock;
+ struct mt6323_led *led[MT6323_MAX_LEDS];
+};
+
+static int mt6323_led_hw_brightness(struct led_classdev *cdev,
+ enum led_brightness brightness)
+{
+ struct mt6323_led *led = container_of(cdev, struct mt6323_led, cdev);
+ struct mt6323_leds *leds = led->parent;
+ struct regmap *regmap = leds->hw->regmap;
+ u32 con2_mask = 0, con2_val = 0;
+ int ret;
+
+ /*
+ * Setup current output for the corresponding
+ * brightness level.
+ */
+ con2_mask |= MT6323_ISINK_CH_STEP_MASK |
+ MT6323_ISINK_SFSTR0_TC_MASK |
+ MT6323_ISINK_SFSTR0_EN_MASK;
+ con2_val |= MT6323_ISINK_CH_STEP(brightness - 1) |
+ MT6323_ISINK_SFSTR0_TC(2) |
+ MT6323_ISINK_SFSTR0_EN;
+
+ ret = regmap_update_bits(regmap, MT6323_ISINK_CON2(led->id),
+ con2_mask, con2_val);
+ return ret;
+}
+
+static int mt6323_led_hw_off(struct led_classdev *cdev)
+{
+ struct mt6323_led *led = container_of(cdev, struct mt6323_led, cdev);
+ struct mt6323_leds *leds = led->parent;
+ struct regmap *regmap = leds->hw->regmap;
+ unsigned int status;
+ int ret;
+
+ status = MT6323_ISINK_CH_EN(led->id);
+ ret = regmap_update_bits(regmap, MT6323_ISINK_EN_CTRL,
+ MT6323_ISINK_CH_EN_MASK(led->id), ~status);
+ if (ret < 0)
+ return ret;
+
+ usleep_range(100, 300);
+ ret = regmap_update_bits(regmap, MT6323_TOP_CKPDN2,
+ MT6323_RG_ISINK_CK_PDN_MASK(led->id),
+ MT6323_RG_ISINK_CK_PDN(led->id));
+ if (ret < 0)
+ return ret;
+
+ return 0;
+}
+
+static enum led_brightness
+mt6323_get_led_hw_brightness(struct led_classdev *cdev)
+{
+ struct mt6323_led *led = container_of(cdev, struct mt6323_led, cdev);
+ struct mt6323_leds *leds = led->parent;
+ struct regmap *regmap = leds->hw->regmap;
+ unsigned int status;
+ int ret;
+
+ ret = regmap_read(regmap, MT6323_TOP_CKPDN2, &status);
+ if (ret < 0)
+ return ret;
+
+ if (status & MT6323_RG_ISINK_CK_PDN_MASK(led->id))
+ return 0;
+
+ ret = regmap_read(regmap, MT6323_ISINK_EN_CTRL, &status);
+ if (ret < 0)
+ return ret;
+
+ if (!(status & MT6323_ISINK_CH_EN(led->id)))
+ return 0;
+
+ ret = regmap_read(regmap, MT6323_ISINK_CON2(led->id), &status);
+ if (ret < 0)
+ return ret;
+
+ return ((status & MT6323_ISINK_CH_STEP_MASK)
+ >> MT6323_ISINK_CH_STEP_SHIFT) + 1;
+}
+
+static int mt6323_led_hw_on(struct led_classdev *cdev,
+ enum led_brightness brightness)
+{
+ struct mt6323_led *led = container_of(cdev, struct mt6323_led, cdev);
+ struct mt6323_leds *leds = led->parent;
+ struct regmap *regmap = leds->hw->regmap;
+ unsigned int status;
+ int ret;
+
+ /*
+ * Setup required clock source, enable the corresponding
+ * clock and channel and let work with continuous blink as
+ * the default.
+ */
+ ret = regmap_update_bits(regmap, MT6323_TOP_CKCON1,
+ MT6323_RG_ISINK_CK_SEL_MASK(led->id), 0);
+ if (ret < 0)
+ return ret;
+
+ status = MT6323_RG_ISINK_CK_PDN(led->id);
+ ret = regmap_update_bits(regmap, MT6323_TOP_CKPDN2,
+ MT6323_RG_ISINK_CK_PDN_MASK(led->id),
+ ~status);
+ if (ret < 0)
+ return ret;
+
+ usleep_range(100, 300);
+
+ ret = regmap_update_bits(regmap, MT6323_ISINK_EN_CTRL,
+ MT6323_ISINK_CH_EN_MASK(led->id),
+ MT6323_ISINK_CH_EN(led->id));
+ if (ret < 0)
+ return ret;
+
+ ret = mt6323_led_hw_brightness(cdev, brightness);
+ if (ret < 0)
+ return ret;
+
+ ret = regmap_update_bits(regmap, MT6323_ISINK_CON0(led->id),
+ MT6323_ISINK_DIM_DUTY_MASK,
+ MT6323_ISINK_DIM_DUTY(31));
+ if (ret < 0)
+ return ret;
+
+ ret = regmap_update_bits(regmap, MT6323_ISINK_CON1(led->id),
+ MT6323_ISINK_DIM_FSEL_MASK,
+ MT6323_ISINK_DIM_FSEL(1000));
+ if (ret < 0)
+ return ret;
+
+ return 0;
+}
+
+static int mt6323_led_set_blink(struct led_classdev *cdev,
+ unsigned long *delay_on,
+ unsigned long *delay_off)
+{
+ struct mt6323_led *led = container_of(cdev, struct mt6323_led, cdev);
+ struct mt6323_leds *leds = led->parent;
+ struct regmap *regmap = leds->hw->regmap;
+ unsigned long period;
+ u8 duty_hw;
+ int ret;
+
+ /*
+ * Units are in ms, if over the hardware able
+ * to support, fallback into software blink
+ */
+ period = *delay_on + *delay_off;
+
+ if (period > MT6323_MAX_PERIOD)
+ return -EINVAL;
+
+ /*
+ * LED subsystem requires a default user
+ * friendly blink pattern for the LED so using
+ * 1Hz duty cycle 50% here if without specific
+ * value delay_on and delay off being assigned.
+ */
+ if (!*delay_on && !*delay_off) {
+ *delay_on = 500;
+ *delay_off = 500;
+ }
+
+ /*
+ * Calculate duty_hw based on the percentage of period during
+ * which the led is ON.
+ */
+ duty_hw = MT6323_CAL_HW_DUTY(*delay_on, period);
+
+ /* hardware doesn't support zero duty cycle. */
+ if (!duty_hw)
+ return -EINVAL;
+
+ mutex_lock(&leds->lock);
+ /*
+ * Set max_brightness as the software blink behavior
+ * when no blink brightness.
+ */
+ if (!led->current_brightness) {
+ ret = mt6323_led_hw_on(cdev, cdev->max_brightness);
+ if (ret < 0)
+ goto out;
+ led->current_brightness = cdev->max_brightness;
+ }
+
+ ret = regmap_update_bits(regmap, MT6323_ISINK_CON0(led->id),
+ MT6323_ISINK_DIM_DUTY_MASK,
+ MT6323_ISINK_DIM_DUTY(duty_hw - 1));
+ if (ret < 0)
+ goto out;
+
+ ret = regmap_update_bits(regmap, MT6323_ISINK_CON1(led->id),
+ MT6323_ISINK_DIM_FSEL_MASK,
+ MT6323_ISINK_DIM_FSEL(period - 1));
+out:
+ mutex_unlock(&leds->lock);
+
+ return ret;
+}
+
+static int mt6323_led_set_brightness(struct led_classdev *cdev,
+ enum led_brightness brightness)
+{
+ struct mt6323_led *led = container_of(cdev, struct mt6323_led, cdev);
+ struct mt6323_leds *leds = led->parent;
+ int ret;
+
+ mutex_lock(&leds->lock);
+
+ if (!led->current_brightness && brightness) {
+ ret = mt6323_led_hw_on(cdev, brightness);
+ if (ret < 0)
+ goto out;
+ } else if (brightness) {
+ ret = mt6323_led_hw_brightness(cdev, brightness);
+ if (ret < 0)
+ goto out;
+ } else {
+ ret = mt6323_led_hw_off(cdev);
+ if (ret < 0)
+ goto out;
+ }
+
+ led->current_brightness = brightness;
+out:
+ mutex_unlock(&leds->lock);
+
+ return ret;
+}
+
+static int mt6323_led_set_dt_default(struct led_classdev *cdev,
+ struct device_node *np)
+{
+ struct mt6323_led *led = container_of(cdev, struct mt6323_led, cdev);
+ const char *state;
+ int ret = 0;
+
+ led->cdev.name = of_get_property(np, "label", NULL) ? : np->name;
+ led->cdev.default_trigger = of_get_property(np,
+ "linux,default-trigger",
+ NULL);
+
+ state = of_get_property(np, "default-state", NULL);
+ if (state) {
+ if (!strcmp(state, "keep")) {
+ ret = mt6323_get_led_hw_brightness(cdev);
+ if (ret < 0)
+ return ret;
+ led->current_brightness = ret;
+ ret = 0;
+ } else if (!strcmp(state, "on")) {
+ ret =
+ mt6323_led_set_brightness(cdev, cdev->max_brightness);
+ } else {
+ ret = mt6323_led_set_brightness(cdev, LED_OFF);
+ }
+ }
+
+ return ret;
+}
+
+static int mt6323_led_probe(struct platform_device *pdev)
+{
+ struct device *dev = &pdev->dev;
+ struct device_node *np = pdev->dev.of_node;
+ struct device_node *child;
+ struct mt6397_chip *hw = dev_get_drvdata(pdev->dev.parent);
+ struct mt6323_leds *leds;
+ struct mt6323_led *led;
+ int ret;
+ unsigned int status;
+ u32 reg;
+
+ leds = devm_kzalloc(dev, sizeof(*leds), GFP_KERNEL);
+ if (!leds)
+ return -ENOMEM;
+
+ platform_set_drvdata(pdev, leds);
+ leds->dev = dev;
+
+ /*
+ * leds->hw points to the underlying bus for the register
+ * controlled.
+ */
+ leds->hw = hw;
+ mutex_init(&leds->lock);
+
+ status = MT6323_RG_DRV_32K_CK_PDN;
+ ret = regmap_update_bits(leds->hw->regmap, MT6323_TOP_CKPDN0,
+ MT6323_RG_DRV_32K_CK_PDN_MASK, ~status);
+ if (ret < 0) {
+ dev_err(leds->dev,
+ "Failed to update MT6323_TOP_CKPDN0 Register\n");
+ return ret;
+ }
+
+ for_each_available_child_of_node(np, child) {
+ ret = of_property_read_u32(child, "reg", &reg);
+ if (ret) {
+ dev_err(dev, "Failed to read led 'reg' property\n");
+ goto put_child_node;
+ }
+
+ if (reg >= MT6323_MAX_LEDS || leds->led[reg]) {
+ dev_err(dev, "Invalid led reg %u\n", reg);
+ ret = -EINVAL;
+ goto put_child_node;
+ }
+
+ led = devm_kzalloc(dev, sizeof(*led), GFP_KERNEL);
+ if (!led) {
+ ret = -ENOMEM;
+ goto put_child_node;
+ }
+
+ leds->led[reg] = led;
+ leds->led[reg]->id = reg;
+ leds->led[reg]->cdev.max_brightness = MT6323_MAX_BRIGHTNESS;
+ leds->led[reg]->cdev.brightness_set_blocking =
+ mt6323_led_set_brightness;
+ leds->led[reg]->cdev.blink_set = mt6323_led_set_blink;
+ leds->led[reg]->cdev.brightness_get =
+ mt6323_get_led_hw_brightness;
+ leds->led[reg]->parent = leds;
+
+ ret = mt6323_led_set_dt_default(&leds->led[reg]->cdev, child);
+ if (ret < 0) {
+ dev_err(leds->dev,
+ "Failed to LED set default from devicetree\n");
+ goto put_child_node;
+ }
+
+ ret = devm_led_classdev_register(dev, &leds->led[reg]->cdev);
+ if (ret) {
+ dev_err(&pdev->dev, "Failed to register LED: %d\n",
+ ret);
+ goto put_child_node;
+ }
+ leds->led[reg]->cdev.dev->of_node = child;
+ }
+
+ return 0;
+
+put_child_node:
+ of_node_put(child);
+ return ret;
+}
+
+static int mt6323_led_remove(struct platform_device *pdev)
+{
+ struct mt6323_leds *leds = platform_get_drvdata(pdev);
+ int i;
+
+ /* Turn the LEDs off on driver removal. */
+ for (i = 0 ; leds->led[i] ; i++)
+ mt6323_led_hw_off(&leds->led[i]->cdev);
+
+ regmap_update_bits(leds->hw->regmap, MT6323_TOP_CKPDN0,
+ MT6323_RG_DRV_32K_CK_PDN_MASK,
+ MT6323_RG_DRV_32K_CK_PDN);
+
+ mutex_destroy(&leds->lock);
+
+ return 0;
+}
+
+static const struct of_device_id mt6323_led_dt_match[] = {
+ { .compatible = "mediatek,mt6323-led" },
+ {},
+};
+MODULE_DEVICE_TABLE(of, mt6323_led_dt_match);
+
+static struct platform_driver mt6323_led_driver = {
+ .probe = mt6323_led_probe,
+ .remove = mt6323_led_remove,
+ .driver = {
+ .name = "mt6323-led",
+ .of_match_table = mt6323_led_dt_match,
+ },
+};
+
+module_platform_driver(mt6323_led_driver);
+
+MODULE_DESCRIPTION("LED driver for Mediatek MT6323 PMIC");
+MODULE_AUTHOR("Sean Wang <sean.wang@mediatek.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/leds/leds-pca9532.c b/drivers/leds/leds-pca9532.c
index 06e63106ae1e..7fea18b0c15d 100644
--- a/drivers/leds/leds-pca9532.c
+++ b/drivers/leds/leds-pca9532.c
@@ -254,6 +254,21 @@ static void pca9532_input_work(struct work_struct *work)
mutex_unlock(&data->update_lock);
}
+static enum pca9532_state pca9532_getled(struct pca9532_led *led)
+{
+ struct i2c_client *client = led->client;
+ struct pca9532_data *data = i2c_get_clientdata(client);
+ u8 maxleds = data->chip_info->num_leds;
+ char reg;
+ enum pca9532_state ret;
+
+ mutex_lock(&data->update_lock);
+ reg = i2c_smbus_read_byte_data(client, LED_REG(maxleds, led->id));
+ ret = reg >> LED_NUM(led->id)/2;
+ mutex_unlock(&data->update_lock);
+ return ret;
+}
+
#ifdef CONFIG_LEDS_PCA9532_GPIO
static int pca9532_gpio_request_pin(struct gpio_chip *gc, unsigned offset)
{
@@ -366,7 +381,10 @@ static int pca9532_configure(struct i2c_client *client,
gpios++;
break;
case PCA9532_TYPE_LED:
- led->state = pled->state;
+ if (pled->state == PCA9532_KEEP)
+ led->state = pca9532_getled(led);
+ else
+ led->state = pled->state;
led->name = pled->name;
led->ldev.name = led->name;
led->ldev.default_trigger = pled->default_trigger;
@@ -456,6 +474,7 @@ pca9532_of_populate_pdata(struct device *dev, struct device_node *np)
const struct of_device_id *match;
int devid, maxleds;
int i = 0;
+ const char *state;
match = of_match_device(of_pca9532_leds_match, dev);
if (!match)
@@ -475,6 +494,12 @@ pca9532_of_populate_pdata(struct device *dev, struct device_node *np)
of_property_read_u32(child, "type", &pdata->leds[i].type);
of_property_read_string(child, "linux,default-trigger",
&pdata->leds[i].default_trigger);
+ if (!of_property_read_string(child, "default-state", &state)) {
+ if (!strcmp(state, "on"))
+ pdata->leds[i].state = PCA9532_ON;
+ else if (!strcmp(state, "keep"))
+ pdata->leds[i].state = PCA9532_KEEP;
+ }
if (++i >= maxleds) {
of_node_put(child);
break;
diff --git a/drivers/leds/trigger/ledtrig-cpu.c b/drivers/leds/trigger/ledtrig-cpu.c
index a41896468cb3..66a626091936 100644
--- a/drivers/leds/trigger/ledtrig-cpu.c
+++ b/drivers/leds/trigger/ledtrig-cpu.c
@@ -31,12 +31,16 @@
#define MAX_NAME_LEN 8
struct led_trigger_cpu {
+ bool is_active;
char name[MAX_NAME_LEN];
struct led_trigger *_trig;
};
static DEFINE_PER_CPU(struct led_trigger_cpu, cpu_trig);
+static struct led_trigger *trig_cpu_all;
+static atomic_t num_active_cpus = ATOMIC_INIT(0);
+
/**
* ledtrig_cpu - emit a CPU event as a trigger
* @evt: CPU event to be emitted
@@ -47,26 +51,46 @@ static DEFINE_PER_CPU(struct led_trigger_cpu, cpu_trig);
void ledtrig_cpu(enum cpu_led_event ledevt)
{
struct led_trigger_cpu *trig = this_cpu_ptr(&cpu_trig);
+ bool is_active = trig->is_active;
/* Locate the correct CPU LED */
switch (ledevt) {
case CPU_LED_IDLE_END:
case CPU_LED_START:
/* Will turn the LED on, max brightness */
- led_trigger_event(trig->_trig, LED_FULL);
+ is_active = true;
break;
case CPU_LED_IDLE_START:
case CPU_LED_STOP:
case CPU_LED_HALTED:
/* Will turn the LED off */
- led_trigger_event(trig->_trig, LED_OFF);
+ is_active = false;
break;
default:
/* Will leave the LED as it is */
break;
}
+
+ if (is_active != trig->is_active) {
+ unsigned int active_cpus;
+ unsigned int total_cpus;
+
+ /* Update trigger state */
+ trig->is_active = is_active;
+ atomic_add(is_active ? 1 : -1, &num_active_cpus);
+ active_cpus = atomic_read(&num_active_cpus);
+ total_cpus = num_present_cpus();
+
+ led_trigger_event(trig->_trig,
+ is_active ? LED_FULL : LED_OFF);
+
+
+ led_trigger_event(trig_cpu_all,
+ DIV_ROUND_UP(LED_FULL * active_cpus, total_cpus));
+
+ }
}
EXPORT_SYMBOL(ledtrig_cpu);
@@ -113,6 +137,11 @@ static int __init ledtrig_cpu_init(void)
BUILD_BUG_ON(CONFIG_NR_CPUS > 9999);
/*
+ * Registering a trigger for all CPUs.
+ */
+ led_trigger_register_simple("cpu", &trig_cpu_all);
+
+ /*
* Registering CPU led trigger for each CPU core here
* ignores CPU hotplug, but after this CPU hotplug works
* fine with this trigger.
diff --git a/drivers/lightnvm/Kconfig b/drivers/lightnvm/Kconfig
index 052714106b7b..ead61a93cb4e 100644
--- a/drivers/lightnvm/Kconfig
+++ b/drivers/lightnvm/Kconfig
@@ -33,4 +33,13 @@ config NVM_RRPC
host. The target is implemented using a linear mapping table and
cost-based garbage collection. It is optimized for 4K IO sizes.
+config NVM_PBLK
+ tristate "Physical Block Device Open-Channel SSD target"
+ ---help---
+ Allows an open-channel SSD to be exposed as a block device to the
+ host. The target assumes the device exposes raw flash and must be
+ explicitly managed by the host.
+
+ Please note the disk format is considered EXPERIMENTAL for now.
+
endif # NVM
diff --git a/drivers/lightnvm/Makefile b/drivers/lightnvm/Makefile
index b2a39e2d2895..82d1a117fb27 100644
--- a/drivers/lightnvm/Makefile
+++ b/drivers/lightnvm/Makefile
@@ -4,3 +4,8 @@
obj-$(CONFIG_NVM) := core.o
obj-$(CONFIG_NVM_RRPC) += rrpc.o
+obj-$(CONFIG_NVM_PBLK) += pblk.o
+pblk-y := pblk-init.o pblk-core.o pblk-rb.o \
+ pblk-write.o pblk-cache.o pblk-read.o \
+ pblk-gc.o pblk-recovery.o pblk-map.o \
+ pblk-rl.o pblk-sysfs.o
diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c
index 5262ba66a7a7..54a06c3a2b8c 100644
--- a/drivers/lightnvm/core.c
+++ b/drivers/lightnvm/core.c
@@ -89,7 +89,7 @@ static void nvm_release_luns_err(struct nvm_dev *dev, int lun_begin,
WARN_ON(!test_and_clear_bit(i, dev->lun_map));
}
-static void nvm_remove_tgt_dev(struct nvm_tgt_dev *tgt_dev)
+static void nvm_remove_tgt_dev(struct nvm_tgt_dev *tgt_dev, int clear)
{
struct nvm_dev *dev = tgt_dev->parent;
struct nvm_dev_map *dev_map = tgt_dev->map;
@@ -100,11 +100,14 @@ static void nvm_remove_tgt_dev(struct nvm_tgt_dev *tgt_dev)
int *lun_offs = ch_map->lun_offs;
int ch = i + ch_map->ch_off;
- for (j = 0; j < ch_map->nr_luns; j++) {
- int lun = j + lun_offs[j];
- int lunid = (ch * dev->geo.luns_per_chnl) + lun;
+ if (clear) {
+ for (j = 0; j < ch_map->nr_luns; j++) {
+ int lun = j + lun_offs[j];
+ int lunid = (ch * dev->geo.luns_per_chnl) + lun;
- WARN_ON(!test_and_clear_bit(lunid, dev->lun_map));
+ WARN_ON(!test_and_clear_bit(lunid,
+ dev->lun_map));
+ }
}
kfree(ch_map->lun_offs);
@@ -232,6 +235,7 @@ static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create)
struct nvm_target *t;
struct nvm_tgt_dev *tgt_dev;
void *targetdata;
+ int ret;
tt = nvm_find_target_type(create->tgttype, 1);
if (!tt) {
@@ -252,34 +256,43 @@ static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create)
return -ENOMEM;
t = kmalloc(sizeof(struct nvm_target), GFP_KERNEL);
- if (!t)
+ if (!t) {
+ ret = -ENOMEM;
goto err_reserve;
+ }
tgt_dev = nvm_create_tgt_dev(dev, s->lun_begin, s->lun_end);
if (!tgt_dev) {
pr_err("nvm: could not create target device\n");
+ ret = -ENOMEM;
goto err_t;
}
- tqueue = blk_alloc_queue_node(GFP_KERNEL, dev->q->node);
- if (!tqueue)
+ tdisk = alloc_disk(0);
+ if (!tdisk) {
+ ret = -ENOMEM;
goto err_dev;
- blk_queue_make_request(tqueue, tt->make_rq);
+ }
- tdisk = alloc_disk(0);
- if (!tdisk)
- goto err_queue;
+ tqueue = blk_alloc_queue_node(GFP_KERNEL, dev->q->node);
+ if (!tqueue) {
+ ret = -ENOMEM;
+ goto err_disk;
+ }
+ blk_queue_make_request(tqueue, tt->make_rq);
- sprintf(tdisk->disk_name, "%s", create->tgtname);
+ strlcpy(tdisk->disk_name, create->tgtname, sizeof(tdisk->disk_name));
tdisk->flags = GENHD_FL_EXT_DEVT;
tdisk->major = 0;
tdisk->first_minor = 0;
tdisk->fops = &nvm_fops;
tdisk->queue = tqueue;
- targetdata = tt->init(tgt_dev, tdisk);
- if (IS_ERR(targetdata))
+ targetdata = tt->init(tgt_dev, tdisk, create->flags);
+ if (IS_ERR(targetdata)) {
+ ret = PTR_ERR(targetdata);
goto err_init;
+ }
tdisk->private_data = targetdata;
tqueue->queuedata = targetdata;
@@ -289,8 +302,10 @@ static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create)
set_capacity(tdisk, tt->capacity(targetdata));
add_disk(tdisk);
- if (tt->sysfs_init && tt->sysfs_init(tdisk))
+ if (tt->sysfs_init && tt->sysfs_init(tdisk)) {
+ ret = -ENOMEM;
goto err_sysfs;
+ }
t->type = tt;
t->disk = tdisk;
@@ -305,16 +320,17 @@ err_sysfs:
if (tt->exit)
tt->exit(targetdata);
err_init:
- put_disk(tdisk);
-err_queue:
blk_cleanup_queue(tqueue);
+ tdisk->queue = NULL;
+err_disk:
+ put_disk(tdisk);
err_dev:
- nvm_remove_tgt_dev(tgt_dev);
+ nvm_remove_tgt_dev(tgt_dev, 0);
err_t:
kfree(t);
err_reserve:
nvm_release_luns_err(dev, s->lun_begin, s->lun_end);
- return -ENOMEM;
+ return ret;
}
static void __nvm_remove_target(struct nvm_target *t)
@@ -332,7 +348,7 @@ static void __nvm_remove_target(struct nvm_target *t)
if (tt->exit)
tt->exit(tdisk->private_data);
- nvm_remove_tgt_dev(t->dev);
+ nvm_remove_tgt_dev(t->dev, 1);
put_disk(tdisk);
list_del(&t->list);
@@ -411,6 +427,18 @@ err_rmap:
return -ENOMEM;
}
+static void nvm_unregister_map(struct nvm_dev *dev)
+{
+ struct nvm_dev_map *rmap = dev->rmap;
+ int i;
+
+ for (i = 0; i < dev->geo.nr_chnls; i++)
+ kfree(rmap->chnls[i].lun_offs);
+
+ kfree(rmap->chnls);
+ kfree(rmap);
+}
+
static void nvm_map_to_dev(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *p)
{
struct nvm_dev_map *dev_map = tgt_dev->map;
@@ -486,7 +514,6 @@ void nvm_part_to_tgt(struct nvm_dev *dev, sector_t *entries,
int *lun_roffs;
struct ppa_addr gaddr;
u64 pba = le64_to_cpu(entries[i]);
- int off;
u64 diff;
if (!pba)
@@ -496,8 +523,6 @@ void nvm_part_to_tgt(struct nvm_dev *dev, sector_t *entries,
ch_rmap = &dev_rmap->chnls[gaddr.g.ch];
lun_roffs = ch_rmap->lun_offs;
- off = gaddr.g.ch * geo->luns_per_chnl + gaddr.g.lun;
-
diff = ((ch_rmap->ch_off * geo->luns_per_chnl) +
(lun_roffs[gaddr.g.lun])) * geo->sec_per_lun;
@@ -590,11 +615,11 @@ int nvm_set_tgt_bb_tbl(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas,
memset(&rqd, 0, sizeof(struct nvm_rq));
- nvm_set_rqd_ppalist(dev, &rqd, ppas, nr_ppas, 1);
+ nvm_set_rqd_ppalist(tgt_dev, &rqd, ppas, nr_ppas, 1);
nvm_rq_tgt_to_dev(tgt_dev, &rqd);
ret = dev->ops->set_bb_tbl(dev, &rqd.ppa_addr, rqd.nr_ppas, type);
- nvm_free_rqd_ppalist(dev, &rqd);
+ nvm_free_rqd_ppalist(tgt_dev, &rqd);
if (ret) {
pr_err("nvm: failed bb mark\n");
return -EINVAL;
@@ -626,34 +651,45 @@ int nvm_submit_io(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd)
}
EXPORT_SYMBOL(nvm_submit_io);
-int nvm_erase_blk(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas, int flags)
+static void nvm_end_io_sync(struct nvm_rq *rqd)
{
- struct nvm_dev *dev = tgt_dev->parent;
- struct nvm_rq rqd;
- int ret;
+ struct completion *waiting = rqd->private;
- if (!dev->ops->erase_block)
- return 0;
+ complete(waiting);
+}
- nvm_map_to_dev(tgt_dev, ppas);
+int nvm_erase_sync(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas,
+ int nr_ppas)
+{
+ struct nvm_geo *geo = &tgt_dev->geo;
+ struct nvm_rq rqd;
+ int ret;
+ DECLARE_COMPLETION_ONSTACK(wait);
memset(&rqd, 0, sizeof(struct nvm_rq));
- ret = nvm_set_rqd_ppalist(dev, &rqd, ppas, 1, 1);
+ rqd.opcode = NVM_OP_ERASE;
+ rqd.end_io = nvm_end_io_sync;
+ rqd.private = &wait;
+ rqd.flags = geo->plane_mode >> 1;
+
+ ret = nvm_set_rqd_ppalist(tgt_dev, &rqd, ppas, nr_ppas, 1);
if (ret)
return ret;
- nvm_rq_tgt_to_dev(tgt_dev, &rqd);
-
- rqd.flags = flags;
-
- ret = dev->ops->erase_block(dev, &rqd);
+ ret = nvm_submit_io(tgt_dev, &rqd);
+ if (ret) {
+ pr_err("rrpr: erase I/O submission failed: %d\n", ret);
+ goto free_ppa_list;
+ }
+ wait_for_completion_io(&wait);
- nvm_free_rqd_ppalist(dev, &rqd);
+free_ppa_list:
+ nvm_free_rqd_ppalist(tgt_dev, &rqd);
return ret;
}
-EXPORT_SYMBOL(nvm_erase_blk);
+EXPORT_SYMBOL(nvm_erase_sync);
int nvm_get_l2p_tbl(struct nvm_tgt_dev *tgt_dev, u64 slba, u32 nlb,
nvm_l2p_update_fn *update_l2p, void *priv)
@@ -732,10 +768,11 @@ void nvm_put_area(struct nvm_tgt_dev *tgt_dev, sector_t begin)
}
EXPORT_SYMBOL(nvm_put_area);
-int nvm_set_rqd_ppalist(struct nvm_dev *dev, struct nvm_rq *rqd,
+int nvm_set_rqd_ppalist(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd,
const struct ppa_addr *ppas, int nr_ppas, int vblk)
{
- struct nvm_geo *geo = &dev->geo;
+ struct nvm_dev *dev = tgt_dev->parent;
+ struct nvm_geo *geo = &tgt_dev->geo;
int i, plane_cnt, pl_idx;
struct ppa_addr ppa;
@@ -773,12 +810,12 @@ int nvm_set_rqd_ppalist(struct nvm_dev *dev, struct nvm_rq *rqd,
}
EXPORT_SYMBOL(nvm_set_rqd_ppalist);
-void nvm_free_rqd_ppalist(struct nvm_dev *dev, struct nvm_rq *rqd)
+void nvm_free_rqd_ppalist(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd)
{
if (!rqd->ppa_list)
return;
- nvm_dev_dma_free(dev, rqd->ppa_list, rqd->dma_ppa_list);
+ nvm_dev_dma_free(tgt_dev->parent, rqd->ppa_list, rqd->dma_ppa_list);
}
EXPORT_SYMBOL(nvm_free_rqd_ppalist);
@@ -972,7 +1009,7 @@ err_fmtype:
return ret;
}
-void nvm_free(struct nvm_dev *dev)
+static void nvm_free(struct nvm_dev *dev)
{
if (!dev)
return;
@@ -980,7 +1017,7 @@ void nvm_free(struct nvm_dev *dev)
if (dev->dma_pool)
dev->ops->destroy_dma_pool(dev->dma_pool);
- kfree(dev->rmap);
+ nvm_unregister_map(dev);
kfree(dev->lptbl);
kfree(dev->lun_map);
kfree(dev);
@@ -1174,13 +1211,13 @@ static long nvm_ioctl_get_devices(struct file *file, void __user *arg)
list_for_each_entry(dev, &nvm_devices, devices) {
struct nvm_ioctl_device_info *info = &devices->info[i];
- sprintf(info->devname, "%s", dev->name);
+ strlcpy(info->devname, dev->name, sizeof(info->devname));
/* kept for compatibility */
info->bmversion[0] = 1;
info->bmversion[1] = 0;
info->bmversion[2] = 0;
- sprintf(info->bmname, "%s", "gennvm");
+ strlcpy(info->bmname, "gennvm", sizeof(info->bmname));
i++;
if (i > 31) {
@@ -1217,8 +1254,16 @@ static long nvm_ioctl_dev_create(struct file *file, void __user *arg)
create.tgtname[DISK_NAME_LEN - 1] = '\0';
if (create.flags != 0) {
- pr_err("nvm: no flags supported\n");
- return -EINVAL;
+ __u32 flags = create.flags;
+
+ /* Check for valid flags */
+ if (flags & NVM_TARGET_FACTORY)
+ flags &= ~NVM_TARGET_FACTORY;
+
+ if (flags) {
+ pr_err("nvm: flag not supported\n");
+ return -EINVAL;
+ }
}
return __nvm_configure_create(&create);
diff --git a/drivers/lightnvm/pblk-cache.c b/drivers/lightnvm/pblk-cache.c
new file mode 100644
index 000000000000..59bcea88db84
--- /dev/null
+++ b/drivers/lightnvm/pblk-cache.c
@@ -0,0 +1,114 @@
+/*
+ * Copyright (C) 2016 CNEX Labs
+ * Initial release: Javier Gonzalez <javier@cnexlabs.com>
+ * Matias Bjorling <matias@cnexlabs.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * pblk-cache.c - pblk's write cache
+ */
+
+#include "pblk.h"
+
+int pblk_write_to_cache(struct pblk *pblk, struct bio *bio, unsigned long flags)
+{
+ struct pblk_w_ctx w_ctx;
+ sector_t lba = pblk_get_lba(bio);
+ unsigned int bpos, pos;
+ int nr_entries = pblk_get_secs(bio);
+ int i, ret;
+
+ /* Update the write buffer head (mem) with the entries that we can
+ * write. The write in itself cannot fail, so there is no need to
+ * rollback from here on.
+ */
+retry:
+ ret = pblk_rb_may_write_user(&pblk->rwb, bio, nr_entries, &bpos);
+ if (ret == NVM_IO_REQUEUE) {
+ io_schedule();
+ goto retry;
+ }
+
+ if (unlikely(!bio_has_data(bio)))
+ goto out;
+
+ w_ctx.flags = flags;
+ pblk_ppa_set_empty(&w_ctx.ppa);
+
+ for (i = 0; i < nr_entries; i++) {
+ void *data = bio_data(bio);
+
+ w_ctx.lba = lba + i;
+
+ pos = pblk_rb_wrap_pos(&pblk->rwb, bpos + i);
+ pblk_rb_write_entry_user(&pblk->rwb, data, w_ctx, pos);
+
+ bio_advance(bio, PBLK_EXPOSED_PAGE_SIZE);
+ }
+
+#ifdef CONFIG_NVM_DEBUG
+ atomic_long_add(nr_entries, &pblk->inflight_writes);
+ atomic_long_add(nr_entries, &pblk->req_writes);
+#endif
+
+out:
+ pblk_write_should_kick(pblk);
+ return ret;
+}
+
+/*
+ * On GC the incoming lbas are not necessarily sequential. Also, some of the
+ * lbas might not be valid entries, which are marked as empty by the GC thread
+ */
+int pblk_write_gc_to_cache(struct pblk *pblk, void *data, u64 *lba_list,
+ unsigned int nr_entries, unsigned int nr_rec_entries,
+ struct pblk_line *gc_line, unsigned long flags)
+{
+ struct pblk_w_ctx w_ctx;
+ unsigned int bpos, pos;
+ int i, valid_entries;
+
+ /* Update the write buffer head (mem) with the entries that we can
+ * write. The write in itself cannot fail, so there is no need to
+ * rollback from here on.
+ */
+retry:
+ if (!pblk_rb_may_write_gc(&pblk->rwb, nr_rec_entries, &bpos)) {
+ io_schedule();
+ goto retry;
+ }
+
+ w_ctx.flags = flags;
+ pblk_ppa_set_empty(&w_ctx.ppa);
+
+ for (i = 0, valid_entries = 0; i < nr_entries; i++) {
+ if (lba_list[i] == ADDR_EMPTY)
+ continue;
+
+ w_ctx.lba = lba_list[i];
+
+ pos = pblk_rb_wrap_pos(&pblk->rwb, bpos + valid_entries);
+ pblk_rb_write_entry_gc(&pblk->rwb, data, w_ctx, gc_line, pos);
+
+ data += PBLK_EXPOSED_PAGE_SIZE;
+ valid_entries++;
+ }
+
+ WARN_ONCE(nr_rec_entries != valid_entries,
+ "pblk: inconsistent GC write\n");
+
+#ifdef CONFIG_NVM_DEBUG
+ atomic_long_add(valid_entries, &pblk->inflight_writes);
+ atomic_long_add(valid_entries, &pblk->recov_gc_writes);
+#endif
+
+ pblk_write_should_kick(pblk);
+ return NVM_IO_OK;
+}
diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c
new file mode 100644
index 000000000000..5e44768ccffa
--- /dev/null
+++ b/drivers/lightnvm/pblk-core.c
@@ -0,0 +1,1667 @@
+/*
+ * Copyright (C) 2016 CNEX Labs
+ * Initial release: Javier Gonzalez <javier@cnexlabs.com>
+ * Matias Bjorling <matias@cnexlabs.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * pblk-core.c - pblk's core functionality
+ *
+ */
+
+#include "pblk.h"
+#include <linux/time.h>
+
+static void pblk_mark_bb(struct pblk *pblk, struct pblk_line *line,
+ struct ppa_addr *ppa)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct nvm_geo *geo = &dev->geo;
+ int pos = pblk_dev_ppa_to_pos(geo, *ppa);
+
+ pr_debug("pblk: erase failed: line:%d, pos:%d\n", line->id, pos);
+ atomic_long_inc(&pblk->erase_failed);
+
+ atomic_dec(&line->blk_in_line);
+ if (test_and_set_bit(pos, line->blk_bitmap))
+ pr_err("pblk: attempted to erase bb: line:%d, pos:%d\n",
+ line->id, pos);
+
+ pblk_line_run_ws(pblk, NULL, ppa, pblk_line_mark_bb);
+}
+
+static void __pblk_end_io_erase(struct pblk *pblk, struct nvm_rq *rqd)
+{
+ struct pblk_line *line;
+
+ line = &pblk->lines[pblk_dev_ppa_to_line(rqd->ppa_addr)];
+ atomic_dec(&line->left_seblks);
+
+ if (rqd->error) {
+ struct ppa_addr *ppa;
+
+ ppa = kmalloc(sizeof(struct ppa_addr), GFP_ATOMIC);
+ if (!ppa)
+ return;
+
+ *ppa = rqd->ppa_addr;
+ pblk_mark_bb(pblk, line, ppa);
+ }
+}
+
+/* Erase completion assumes that only one block is erased at the time */
+static void pblk_end_io_erase(struct nvm_rq *rqd)
+{
+ struct pblk *pblk = rqd->private;
+
+ up(&pblk->erase_sem);
+ __pblk_end_io_erase(pblk, rqd);
+ mempool_free(rqd, pblk->r_rq_pool);
+}
+
+static void __pblk_map_invalidate(struct pblk *pblk, struct pblk_line *line,
+ u64 paddr)
+{
+ struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+ struct list_head *move_list = NULL;
+
+ /* Lines being reclaimed (GC'ed) cannot be invalidated. Before the L2P
+ * table is modified with reclaimed sectors, a check is done to endure
+ * that newer updates are not overwritten.
+ */
+ spin_lock(&line->lock);
+ if (line->state == PBLK_LINESTATE_GC ||
+ line->state == PBLK_LINESTATE_FREE) {
+ spin_unlock(&line->lock);
+ return;
+ }
+
+ if (test_and_set_bit(paddr, line->invalid_bitmap)) {
+ WARN_ONCE(1, "pblk: double invalidate\n");
+ spin_unlock(&line->lock);
+ return;
+ }
+ line->vsc--;
+
+ if (line->state == PBLK_LINESTATE_CLOSED)
+ move_list = pblk_line_gc_list(pblk, line);
+ spin_unlock(&line->lock);
+
+ if (move_list) {
+ spin_lock(&l_mg->gc_lock);
+ spin_lock(&line->lock);
+ /* Prevent moving a line that has just been chosen for GC */
+ if (line->state == PBLK_LINESTATE_GC ||
+ line->state == PBLK_LINESTATE_FREE) {
+ spin_unlock(&line->lock);
+ spin_unlock(&l_mg->gc_lock);
+ return;
+ }
+ spin_unlock(&line->lock);
+
+ list_move_tail(&line->list, move_list);
+ spin_unlock(&l_mg->gc_lock);
+ }
+}
+
+void pblk_map_invalidate(struct pblk *pblk, struct ppa_addr ppa)
+{
+ struct pblk_line *line;
+ u64 paddr;
+ int line_id;
+
+#ifdef CONFIG_NVM_DEBUG
+ /* Callers must ensure that the ppa points to a device address */
+ BUG_ON(pblk_addr_in_cache(ppa));
+ BUG_ON(pblk_ppa_empty(ppa));
+#endif
+
+ line_id = pblk_tgt_ppa_to_line(ppa);
+ line = &pblk->lines[line_id];
+ paddr = pblk_dev_ppa_to_line_addr(pblk, ppa);
+
+ __pblk_map_invalidate(pblk, line, paddr);
+}
+
+void pblk_map_pad_invalidate(struct pblk *pblk, struct pblk_line *line,
+ u64 paddr)
+{
+ __pblk_map_invalidate(pblk, line, paddr);
+
+ pblk_rb_sync_init(&pblk->rwb, NULL);
+ line->left_ssecs--;
+ if (!line->left_ssecs)
+ pblk_line_run_ws(pblk, line, NULL, pblk_line_close_ws);
+ pblk_rb_sync_end(&pblk->rwb, NULL);
+}
+
+static void pblk_invalidate_range(struct pblk *pblk, sector_t slba,
+ unsigned int nr_secs)
+{
+ sector_t lba;
+
+ spin_lock(&pblk->trans_lock);
+ for (lba = slba; lba < slba + nr_secs; lba++) {
+ struct ppa_addr ppa;
+
+ ppa = pblk_trans_map_get(pblk, lba);
+
+ if (!pblk_addr_in_cache(ppa) && !pblk_ppa_empty(ppa))
+ pblk_map_invalidate(pblk, ppa);
+
+ pblk_ppa_set_empty(&ppa);
+ pblk_trans_map_set(pblk, lba, ppa);
+ }
+ spin_unlock(&pblk->trans_lock);
+}
+
+struct nvm_rq *pblk_alloc_rqd(struct pblk *pblk, int rw)
+{
+ mempool_t *pool;
+ struct nvm_rq *rqd;
+ int rq_size;
+
+ if (rw == WRITE) {
+ pool = pblk->w_rq_pool;
+ rq_size = pblk_w_rq_size;
+ } else {
+ pool = pblk->r_rq_pool;
+ rq_size = pblk_r_rq_size;
+ }
+
+ rqd = mempool_alloc(pool, GFP_KERNEL);
+ memset(rqd, 0, rq_size);
+
+ return rqd;
+}
+
+void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, int rw)
+{
+ mempool_t *pool;
+
+ if (rw == WRITE)
+ pool = pblk->w_rq_pool;
+ else
+ pool = pblk->r_rq_pool;
+
+ mempool_free(rqd, pool);
+}
+
+void pblk_bio_free_pages(struct pblk *pblk, struct bio *bio, int off,
+ int nr_pages)
+{
+ struct bio_vec bv;
+ int i;
+
+ WARN_ON(off + nr_pages != bio->bi_vcnt);
+
+ bio_advance(bio, off * PBLK_EXPOSED_PAGE_SIZE);
+ for (i = off; i < nr_pages + off; i++) {
+ bv = bio->bi_io_vec[i];
+ mempool_free(bv.bv_page, pblk->page_pool);
+ }
+}
+
+int pblk_bio_add_pages(struct pblk *pblk, struct bio *bio, gfp_t flags,
+ int nr_pages)
+{
+ struct request_queue *q = pblk->dev->q;
+ struct page *page;
+ int i, ret;
+
+ for (i = 0; i < nr_pages; i++) {
+ page = mempool_alloc(pblk->page_pool, flags);
+ if (!page)
+ goto err;
+
+ ret = bio_add_pc_page(q, bio, page, PBLK_EXPOSED_PAGE_SIZE, 0);
+ if (ret != PBLK_EXPOSED_PAGE_SIZE) {
+ pr_err("pblk: could not add page to bio\n");
+ mempool_free(page, pblk->page_pool);
+ goto err;
+ }
+ }
+
+ return 0;
+err:
+ pblk_bio_free_pages(pblk, bio, 0, i - 1);
+ return -1;
+}
+
+static void pblk_write_kick(struct pblk *pblk)
+{
+ wake_up_process(pblk->writer_ts);
+ mod_timer(&pblk->wtimer, jiffies + msecs_to_jiffies(1000));
+}
+
+void pblk_write_timer_fn(unsigned long data)
+{
+ struct pblk *pblk = (struct pblk *)data;
+
+ /* kick the write thread every tick to flush outstanding data */
+ pblk_write_kick(pblk);
+}
+
+void pblk_write_should_kick(struct pblk *pblk)
+{
+ unsigned int secs_avail = pblk_rb_read_count(&pblk->rwb);
+
+ if (secs_avail >= pblk->min_write_pgs)
+ pblk_write_kick(pblk);
+}
+
+void pblk_end_bio_sync(struct bio *bio)
+{
+ struct completion *waiting = bio->bi_private;
+
+ complete(waiting);
+}
+
+void pblk_end_io_sync(struct nvm_rq *rqd)
+{
+ struct completion *waiting = rqd->private;
+
+ complete(waiting);
+}
+
+void pblk_flush_writer(struct pblk *pblk)
+{
+ struct bio *bio;
+ int ret;
+ DECLARE_COMPLETION_ONSTACK(wait);
+
+ bio = bio_alloc(GFP_KERNEL, 1);
+ if (!bio)
+ return;
+
+ bio->bi_iter.bi_sector = 0; /* internal bio */
+ bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_OP_FLUSH);
+ bio->bi_private = &wait;
+ bio->bi_end_io = pblk_end_bio_sync;
+
+ ret = pblk_write_to_cache(pblk, bio, 0);
+ if (ret == NVM_IO_OK) {
+ if (!wait_for_completion_io_timeout(&wait,
+ msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) {
+ pr_err("pblk: flush cache timed out\n");
+ }
+ } else if (ret != NVM_IO_DONE) {
+ pr_err("pblk: tear down bio failed\n");
+ }
+
+ if (bio->bi_error)
+ pr_err("pblk: flush sync write failed (%u)\n", bio->bi_error);
+
+ bio_put(bio);
+}
+
+struct list_head *pblk_line_gc_list(struct pblk *pblk, struct pblk_line *line)
+{
+ struct pblk_line_meta *lm = &pblk->lm;
+ struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+ struct list_head *move_list = NULL;
+
+ if (!line->vsc) {
+ if (line->gc_group != PBLK_LINEGC_FULL) {
+ line->gc_group = PBLK_LINEGC_FULL;
+ move_list = &l_mg->gc_full_list;
+ }
+ } else if (line->vsc < lm->mid_thrs) {
+ if (line->gc_group != PBLK_LINEGC_HIGH) {
+ line->gc_group = PBLK_LINEGC_HIGH;
+ move_list = &l_mg->gc_high_list;
+ }
+ } else if (line->vsc < lm->high_thrs) {
+ if (line->gc_group != PBLK_LINEGC_MID) {
+ line->gc_group = PBLK_LINEGC_MID;
+ move_list = &l_mg->gc_mid_list;
+ }
+ } else if (line->vsc < line->sec_in_line) {
+ if (line->gc_group != PBLK_LINEGC_LOW) {
+ line->gc_group = PBLK_LINEGC_LOW;
+ move_list = &l_mg->gc_low_list;
+ }
+ } else if (line->vsc == line->sec_in_line) {
+ if (line->gc_group != PBLK_LINEGC_EMPTY) {
+ line->gc_group = PBLK_LINEGC_EMPTY;
+ move_list = &l_mg->gc_empty_list;
+ }
+ } else {
+ line->state = PBLK_LINESTATE_CORRUPT;
+ line->gc_group = PBLK_LINEGC_NONE;
+ move_list = &l_mg->corrupt_list;
+ pr_err("pblk: corrupted vsc for line %d, vsc:%d (%d/%d/%d)\n",
+ line->id, line->vsc,
+ line->sec_in_line,
+ lm->high_thrs, lm->mid_thrs);
+ }
+
+ return move_list;
+}
+
+void pblk_discard(struct pblk *pblk, struct bio *bio)
+{
+ sector_t slba = pblk_get_lba(bio);
+ sector_t nr_secs = pblk_get_secs(bio);
+
+ pblk_invalidate_range(pblk, slba, nr_secs);
+}
+
+struct ppa_addr pblk_get_lba_map(struct pblk *pblk, sector_t lba)
+{
+ struct ppa_addr ppa;
+
+ spin_lock(&pblk->trans_lock);
+ ppa = pblk_trans_map_get(pblk, lba);
+ spin_unlock(&pblk->trans_lock);
+
+ return ppa;
+}
+
+void pblk_log_write_err(struct pblk *pblk, struct nvm_rq *rqd)
+{
+ atomic_long_inc(&pblk->write_failed);
+#ifdef CONFIG_NVM_DEBUG
+ pblk_print_failed_rqd(pblk, rqd, rqd->error);
+#endif
+}
+
+void pblk_log_read_err(struct pblk *pblk, struct nvm_rq *rqd)
+{
+ /* Empty page read is not necessarily an error (e.g., L2P recovery) */
+ if (rqd->error == NVM_RSP_ERR_EMPTYPAGE) {
+ atomic_long_inc(&pblk->read_empty);
+ return;
+ }
+
+ switch (rqd->error) {
+ case NVM_RSP_WARN_HIGHECC:
+ atomic_long_inc(&pblk->read_high_ecc);
+ break;
+ case NVM_RSP_ERR_FAILECC:
+ case NVM_RSP_ERR_FAILCRC:
+ atomic_long_inc(&pblk->read_failed);
+ break;
+ default:
+ pr_err("pblk: unknown read error:%d\n", rqd->error);
+ }
+#ifdef CONFIG_NVM_DEBUG
+ pblk_print_failed_rqd(pblk, rqd, rqd->error);
+#endif
+}
+
+int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+
+#ifdef CONFIG_NVM_DEBUG
+ struct ppa_addr *ppa_list;
+
+ ppa_list = (rqd->nr_ppas > 1) ? rqd->ppa_list : &rqd->ppa_addr;
+ if (pblk_boundary_ppa_checks(dev, ppa_list, rqd->nr_ppas)) {
+ WARN_ON(1);
+ return -EINVAL;
+ }
+
+ if (rqd->opcode == NVM_OP_PWRITE) {
+ struct pblk_line *line;
+ struct ppa_addr ppa;
+ int i;
+
+ for (i = 0; i < rqd->nr_ppas; i++) {
+ ppa = ppa_list[i];
+ line = &pblk->lines[pblk_dev_ppa_to_line(ppa)];
+
+ spin_lock(&line->lock);
+ if (line->state != PBLK_LINESTATE_OPEN) {
+ pr_err("pblk: bad ppa: line:%d,state:%d\n",
+ line->id, line->state);
+ WARN_ON(1);
+ spin_unlock(&line->lock);
+ return -EINVAL;
+ }
+ spin_unlock(&line->lock);
+ }
+ }
+#endif
+ return nvm_submit_io(dev, rqd);
+}
+
+struct bio *pblk_bio_map_addr(struct pblk *pblk, void *data,
+ unsigned int nr_secs, unsigned int len,
+ gfp_t gfp_mask)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+ void *kaddr = data;
+ struct page *page;
+ struct bio *bio;
+ int i, ret;
+
+ if (l_mg->emeta_alloc_type == PBLK_KMALLOC_META)
+ return bio_map_kern(dev->q, kaddr, len, gfp_mask);
+
+ bio = bio_kmalloc(gfp_mask, nr_secs);
+ if (!bio)
+ return ERR_PTR(-ENOMEM);
+
+ for (i = 0; i < nr_secs; i++) {
+ page = vmalloc_to_page(kaddr);
+ if (!page) {
+ pr_err("pblk: could not map vmalloc bio\n");
+ bio_put(bio);
+ bio = ERR_PTR(-ENOMEM);
+ goto out;
+ }
+
+ ret = bio_add_pc_page(dev->q, bio, page, PAGE_SIZE, 0);
+ if (ret != PAGE_SIZE) {
+ pr_err("pblk: could not add page to bio\n");
+ bio_put(bio);
+ bio = ERR_PTR(-ENOMEM);
+ goto out;
+ }
+
+ kaddr += PAGE_SIZE;
+ }
+out:
+ return bio;
+}
+
+int pblk_calc_secs(struct pblk *pblk, unsigned long secs_avail,
+ unsigned long secs_to_flush)
+{
+ int max = pblk->max_write_pgs;
+ int min = pblk->min_write_pgs;
+ int secs_to_sync = 0;
+
+ if (secs_avail >= max)
+ secs_to_sync = max;
+ else if (secs_avail >= min)
+ secs_to_sync = min * (secs_avail / min);
+ else if (secs_to_flush)
+ secs_to_sync = min;
+
+ return secs_to_sync;
+}
+
+static u64 __pblk_alloc_page(struct pblk *pblk, struct pblk_line *line,
+ int nr_secs)
+{
+ u64 addr;
+ int i;
+
+ /* logic error: ppa out-of-bounds. Prevent generating bad address */
+ if (line->cur_sec + nr_secs > pblk->lm.sec_per_line) {
+ WARN(1, "pblk: page allocation out of bounds\n");
+ nr_secs = pblk->lm.sec_per_line - line->cur_sec;
+ }
+
+ line->cur_sec = addr = find_next_zero_bit(line->map_bitmap,
+ pblk->lm.sec_per_line, line->cur_sec);
+ for (i = 0; i < nr_secs; i++, line->cur_sec++)
+ WARN_ON(test_and_set_bit(line->cur_sec, line->map_bitmap));
+
+ return addr;
+}
+
+u64 pblk_alloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs)
+{
+ u64 addr;
+
+ /* Lock needed in case a write fails and a recovery needs to remap
+ * failed write buffer entries
+ */
+ spin_lock(&line->lock);
+ addr = __pblk_alloc_page(pblk, line, nr_secs);
+ line->left_msecs -= nr_secs;
+ WARN(line->left_msecs < 0, "pblk: page allocation out of bounds\n");
+ spin_unlock(&line->lock);
+
+ return addr;
+}
+
+/*
+ * Submit emeta to one LUN in the raid line at the time to avoid a deadlock when
+ * taking the per LUN semaphore.
+ */
+static int pblk_line_submit_emeta_io(struct pblk *pblk, struct pblk_line *line,
+ u64 paddr, int dir)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct nvm_geo *geo = &dev->geo;
+ struct pblk_line_meta *lm = &pblk->lm;
+ struct bio *bio;
+ struct nvm_rq rqd;
+ struct ppa_addr *ppa_list;
+ dma_addr_t dma_ppa_list;
+ void *emeta = line->emeta;
+ int min = pblk->min_write_pgs;
+ int left_ppas = lm->emeta_sec;
+ int id = line->id;
+ int rq_ppas, rq_len;
+ int cmd_op, bio_op;
+ int flags;
+ int i, j;
+ int ret;
+ DECLARE_COMPLETION_ONSTACK(wait);
+
+ if (dir == WRITE) {
+ bio_op = REQ_OP_WRITE;
+ cmd_op = NVM_OP_PWRITE;
+ flags = pblk_set_progr_mode(pblk, WRITE);
+ } else if (dir == READ) {
+ bio_op = REQ_OP_READ;
+ cmd_op = NVM_OP_PREAD;
+ flags = pblk_set_read_mode(pblk);
+ } else
+ return -EINVAL;
+
+ ppa_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, &dma_ppa_list);
+ if (!ppa_list)
+ return -ENOMEM;
+
+next_rq:
+ memset(&rqd, 0, sizeof(struct nvm_rq));
+
+ rq_ppas = pblk_calc_secs(pblk, left_ppas, 0);
+ rq_len = rq_ppas * geo->sec_size;
+
+ bio = pblk_bio_map_addr(pblk, emeta, rq_ppas, rq_len, GFP_KERNEL);
+ if (IS_ERR(bio)) {
+ ret = PTR_ERR(bio);
+ goto free_rqd_dma;
+ }
+
+ bio->bi_iter.bi_sector = 0; /* internal bio */
+ bio_set_op_attrs(bio, bio_op, 0);
+
+ rqd.bio = bio;
+ rqd.opcode = cmd_op;
+ rqd.flags = flags;
+ rqd.nr_ppas = rq_ppas;
+ rqd.ppa_list = ppa_list;
+ rqd.dma_ppa_list = dma_ppa_list;
+ rqd.end_io = pblk_end_io_sync;
+ rqd.private = &wait;
+
+ if (dir == WRITE) {
+ for (i = 0; i < rqd.nr_ppas; ) {
+ spin_lock(&line->lock);
+ paddr = __pblk_alloc_page(pblk, line, min);
+ spin_unlock(&line->lock);
+ for (j = 0; j < min; j++, i++, paddr++)
+ rqd.ppa_list[i] =
+ addr_to_gen_ppa(pblk, paddr, id);
+ }
+ } else {
+ for (i = 0; i < rqd.nr_ppas; ) {
+ struct ppa_addr ppa = addr_to_gen_ppa(pblk, paddr, id);
+ int pos = pblk_dev_ppa_to_pos(geo, ppa);
+
+ while (test_bit(pos, line->blk_bitmap)) {
+ paddr += min;
+ if (pblk_boundary_paddr_checks(pblk, paddr)) {
+ pr_err("pblk: corrupt emeta line:%d\n",
+ line->id);
+ bio_put(bio);
+ ret = -EINTR;
+ goto free_rqd_dma;
+ }
+
+ ppa = addr_to_gen_ppa(pblk, paddr, id);
+ pos = pblk_dev_ppa_to_pos(geo, ppa);
+ }
+
+ if (pblk_boundary_paddr_checks(pblk, paddr + min)) {
+ pr_err("pblk: corrupt emeta line:%d\n",
+ line->id);
+ bio_put(bio);
+ ret = -EINTR;
+ goto free_rqd_dma;
+ }
+
+ for (j = 0; j < min; j++, i++, paddr++)
+ rqd.ppa_list[i] =
+ addr_to_gen_ppa(pblk, paddr, line->id);
+ }
+ }
+
+ ret = pblk_submit_io(pblk, &rqd);
+ if (ret) {
+ pr_err("pblk: emeta I/O submission failed: %d\n", ret);
+ bio_put(bio);
+ goto free_rqd_dma;
+ }
+
+ if (!wait_for_completion_io_timeout(&wait,
+ msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) {
+ pr_err("pblk: emeta I/O timed out\n");
+ }
+ reinit_completion(&wait);
+
+ bio_put(bio);
+
+ if (rqd.error) {
+ if (dir == WRITE)
+ pblk_log_write_err(pblk, &rqd);
+ else
+ pblk_log_read_err(pblk, &rqd);
+ }
+
+ emeta += rq_len;
+ left_ppas -= rq_ppas;
+ if (left_ppas)
+ goto next_rq;
+free_rqd_dma:
+ nvm_dev_dma_free(dev->parent, ppa_list, dma_ppa_list);
+ return ret;
+}
+
+u64 pblk_line_smeta_start(struct pblk *pblk, struct pblk_line *line)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct nvm_geo *geo = &dev->geo;
+ struct pblk_line_meta *lm = &pblk->lm;
+ int bit;
+
+ /* This usually only happens on bad lines */
+ bit = find_first_zero_bit(line->blk_bitmap, lm->blk_per_line);
+ if (bit >= lm->blk_per_line)
+ return -1;
+
+ return bit * geo->sec_per_pl;
+}
+
+static int pblk_line_submit_smeta_io(struct pblk *pblk, struct pblk_line *line,
+ u64 paddr, int dir)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct pblk_line_meta *lm = &pblk->lm;
+ struct bio *bio;
+ struct nvm_rq rqd;
+ __le64 *lba_list = NULL;
+ int i, ret;
+ int cmd_op, bio_op;
+ int flags;
+ DECLARE_COMPLETION_ONSTACK(wait);
+
+ if (dir == WRITE) {
+ bio_op = REQ_OP_WRITE;
+ cmd_op = NVM_OP_PWRITE;
+ flags = pblk_set_progr_mode(pblk, WRITE);
+ lba_list = pblk_line_emeta_to_lbas(line->emeta);
+ } else if (dir == READ) {
+ bio_op = REQ_OP_READ;
+ cmd_op = NVM_OP_PREAD;
+ flags = pblk_set_read_mode(pblk);
+ } else
+ return -EINVAL;
+
+ memset(&rqd, 0, sizeof(struct nvm_rq));
+
+ rqd.ppa_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL,
+ &rqd.dma_ppa_list);
+ if (!rqd.ppa_list)
+ return -ENOMEM;
+
+ bio = bio_map_kern(dev->q, line->smeta, lm->smeta_len, GFP_KERNEL);
+ if (IS_ERR(bio)) {
+ ret = PTR_ERR(bio);
+ goto free_ppa_list;
+ }
+
+ bio->bi_iter.bi_sector = 0; /* internal bio */
+ bio_set_op_attrs(bio, bio_op, 0);
+
+ rqd.bio = bio;
+ rqd.opcode = cmd_op;
+ rqd.flags = flags;
+ rqd.nr_ppas = lm->smeta_sec;
+ rqd.end_io = pblk_end_io_sync;
+ rqd.private = &wait;
+
+ for (i = 0; i < lm->smeta_sec; i++, paddr++) {
+ rqd.ppa_list[i] = addr_to_gen_ppa(pblk, paddr, line->id);
+ if (dir == WRITE)
+ lba_list[paddr] = cpu_to_le64(ADDR_EMPTY);
+ }
+
+ /*
+ * This I/O is sent by the write thread when a line is replace. Since
+ * the write thread is the only one sending write and erase commands,
+ * there is no need to take the LUN semaphore.
+ */
+ ret = pblk_submit_io(pblk, &rqd);
+ if (ret) {
+ pr_err("pblk: smeta I/O submission failed: %d\n", ret);
+ bio_put(bio);
+ goto free_ppa_list;
+ }
+
+ if (!wait_for_completion_io_timeout(&wait,
+ msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) {
+ pr_err("pblk: smeta I/O timed out\n");
+ }
+
+ if (rqd.error) {
+ if (dir == WRITE)
+ pblk_log_write_err(pblk, &rqd);
+ else
+ pblk_log_read_err(pblk, &rqd);
+ }
+
+free_ppa_list:
+ nvm_dev_dma_free(dev->parent, rqd.ppa_list, rqd.dma_ppa_list);
+
+ return ret;
+}
+
+int pblk_line_read_smeta(struct pblk *pblk, struct pblk_line *line)
+{
+ u64 bpaddr = pblk_line_smeta_start(pblk, line);
+
+ return pblk_line_submit_smeta_io(pblk, line, bpaddr, READ);
+}
+
+int pblk_line_read_emeta(struct pblk *pblk, struct pblk_line *line)
+{
+ return pblk_line_submit_emeta_io(pblk, line, line->emeta_ssec, READ);
+}
+
+static void pblk_setup_e_rq(struct pblk *pblk, struct nvm_rq *rqd,
+ struct ppa_addr ppa)
+{
+ rqd->opcode = NVM_OP_ERASE;
+ rqd->ppa_addr = ppa;
+ rqd->nr_ppas = 1;
+ rqd->flags = pblk_set_progr_mode(pblk, ERASE);
+ rqd->bio = NULL;
+}
+
+static int pblk_blk_erase_sync(struct pblk *pblk, struct ppa_addr ppa)
+{
+ struct nvm_rq rqd;
+ int ret;
+ DECLARE_COMPLETION_ONSTACK(wait);
+
+ memset(&rqd, 0, sizeof(struct nvm_rq));
+
+ pblk_setup_e_rq(pblk, &rqd, ppa);
+
+ rqd.end_io = pblk_end_io_sync;
+ rqd.private = &wait;
+
+ /* The write thread schedules erases so that it minimizes disturbances
+ * with writes. Thus, there is no need to take the LUN semaphore.
+ */
+ ret = pblk_submit_io(pblk, &rqd);
+ if (ret) {
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct nvm_geo *geo = &dev->geo;
+
+ pr_err("pblk: could not sync erase line:%d,blk:%d\n",
+ pblk_dev_ppa_to_line(ppa),
+ pblk_dev_ppa_to_pos(geo, ppa));
+
+ rqd.error = ret;
+ goto out;
+ }
+
+ if (!wait_for_completion_io_timeout(&wait,
+ msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) {
+ pr_err("pblk: sync erase timed out\n");
+ }
+
+out:
+ rqd.private = pblk;
+ __pblk_end_io_erase(pblk, &rqd);
+
+ return 0;
+}
+
+int pblk_line_erase(struct pblk *pblk, struct pblk_line *line)
+{
+ struct pblk_line_meta *lm = &pblk->lm;
+ struct ppa_addr ppa;
+ int bit = -1;
+
+ /* Erase only good blocks, one at a time */
+ do {
+ spin_lock(&line->lock);
+ bit = find_next_zero_bit(line->erase_bitmap, lm->blk_per_line,
+ bit + 1);
+ if (bit >= lm->blk_per_line) {
+ spin_unlock(&line->lock);
+ break;
+ }
+
+ ppa = pblk->luns[bit].bppa; /* set ch and lun */
+ ppa.g.blk = line->id;
+
+ atomic_dec(&line->left_eblks);
+ WARN_ON(test_and_set_bit(bit, line->erase_bitmap));
+ spin_unlock(&line->lock);
+
+ if (pblk_blk_erase_sync(pblk, ppa)) {
+ pr_err("pblk: failed to erase line %d\n", line->id);
+ return -ENOMEM;
+ }
+ } while (1);
+
+ return 0;
+}
+
+/* For now lines are always assumed full lines. Thus, smeta former and current
+ * lun bitmaps are omitted.
+ */
+static int pblk_line_set_metadata(struct pblk *pblk, struct pblk_line *line,
+ struct pblk_line *cur)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct nvm_geo *geo = &dev->geo;
+ struct pblk_line_meta *lm = &pblk->lm;
+ struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+ struct line_smeta *smeta = line->smeta;
+ struct line_emeta *emeta = line->emeta;
+ int nr_blk_line;
+
+ /* After erasing the line, new bad blocks might appear and we risk
+ * having an invalid line
+ */
+ nr_blk_line = lm->blk_per_line -
+ bitmap_weight(line->blk_bitmap, lm->blk_per_line);
+ if (nr_blk_line < lm->min_blk_line) {
+ spin_lock(&l_mg->free_lock);
+ spin_lock(&line->lock);
+ line->state = PBLK_LINESTATE_BAD;
+ spin_unlock(&line->lock);
+
+ list_add_tail(&line->list, &l_mg->bad_list);
+ spin_unlock(&l_mg->free_lock);
+
+ pr_debug("pblk: line %d is bad\n", line->id);
+
+ return 0;
+ }
+
+ /* Run-time metadata */
+ line->lun_bitmap = ((void *)(smeta)) + sizeof(struct line_smeta);
+
+ /* Mark LUNs allocated in this line (all for now) */
+ bitmap_set(line->lun_bitmap, 0, lm->lun_bitmap_len);
+
+ smeta->header.identifier = cpu_to_le32(PBLK_MAGIC);
+ memcpy(smeta->header.uuid, pblk->instance_uuid, 16);
+ smeta->header.id = cpu_to_le32(line->id);
+ smeta->header.type = cpu_to_le16(line->type);
+ smeta->header.version = cpu_to_le16(1);
+
+ /* Start metadata */
+ smeta->seq_nr = cpu_to_le64(line->seq_nr);
+ smeta->window_wr_lun = cpu_to_le32(geo->nr_luns);
+
+ /* Fill metadata among lines */
+ if (cur) {
+ memcpy(line->lun_bitmap, cur->lun_bitmap, lm->lun_bitmap_len);
+ smeta->prev_id = cpu_to_le32(cur->id);
+ cur->emeta->next_id = cpu_to_le32(line->id);
+ } else {
+ smeta->prev_id = cpu_to_le32(PBLK_LINE_EMPTY);
+ }
+
+ /* All smeta must be set at this point */
+ smeta->header.crc = cpu_to_le32(pblk_calc_meta_header_crc(pblk, smeta));
+ smeta->crc = cpu_to_le32(pblk_calc_smeta_crc(pblk, smeta));
+
+ /* End metadata */
+ memcpy(&emeta->header, &smeta->header, sizeof(struct line_header));
+ emeta->seq_nr = cpu_to_le64(line->seq_nr);
+ emeta->nr_lbas = cpu_to_le64(line->sec_in_line);
+ emeta->nr_valid_lbas = cpu_to_le64(0);
+ emeta->next_id = cpu_to_le32(PBLK_LINE_EMPTY);
+ emeta->crc = cpu_to_le32(0);
+ emeta->prev_id = smeta->prev_id;
+
+ return 1;
+}
+
+/* For now lines are always assumed full lines. Thus, smeta former and current
+ * lun bitmaps are omitted.
+ */
+static int pblk_line_init_bb(struct pblk *pblk, struct pblk_line *line,
+ int init)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct nvm_geo *geo = &dev->geo;
+ struct pblk_line_meta *lm = &pblk->lm;
+ struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+ int nr_bb = 0;
+ u64 off;
+ int bit = -1;
+
+ line->sec_in_line = lm->sec_per_line;
+
+ /* Capture bad block information on line mapping bitmaps */
+ while ((bit = find_next_bit(line->blk_bitmap, lm->blk_per_line,
+ bit + 1)) < lm->blk_per_line) {
+ off = bit * geo->sec_per_pl;
+ bitmap_shift_left(l_mg->bb_aux, l_mg->bb_template, off,
+ lm->sec_per_line);
+ bitmap_or(line->map_bitmap, line->map_bitmap, l_mg->bb_aux,
+ lm->sec_per_line);
+ line->sec_in_line -= geo->sec_per_blk;
+ if (bit >= lm->emeta_bb)
+ nr_bb++;
+ }
+
+ /* Mark smeta metadata sectors as bad sectors */
+ bit = find_first_zero_bit(line->blk_bitmap, lm->blk_per_line);
+ off = bit * geo->sec_per_pl;
+retry_smeta:
+ bitmap_set(line->map_bitmap, off, lm->smeta_sec);
+ line->sec_in_line -= lm->smeta_sec;
+ line->smeta_ssec = off;
+ line->cur_sec = off + lm->smeta_sec;
+
+ if (init && pblk_line_submit_smeta_io(pblk, line, off, WRITE)) {
+ pr_debug("pblk: line smeta I/O failed. Retry\n");
+ off += geo->sec_per_pl;
+ goto retry_smeta;
+ }
+
+ bitmap_copy(line->invalid_bitmap, line->map_bitmap, lm->sec_per_line);
+
+ /* Mark emeta metadata sectors as bad sectors. We need to consider bad
+ * blocks to make sure that there are enough sectors to store emeta
+ */
+ bit = lm->sec_per_line;
+ off = lm->sec_per_line - lm->emeta_sec;
+ bitmap_set(line->invalid_bitmap, off, lm->emeta_sec);
+ while (nr_bb) {
+ off -= geo->sec_per_pl;
+ if (!test_bit(off, line->invalid_bitmap)) {
+ bitmap_set(line->invalid_bitmap, off, geo->sec_per_pl);
+ nr_bb--;
+ }
+ }
+
+ line->sec_in_line -= lm->emeta_sec;
+ line->emeta_ssec = off;
+ line->vsc = line->left_ssecs = line->left_msecs = line->sec_in_line;
+
+ if (lm->sec_per_line - line->sec_in_line !=
+ bitmap_weight(line->invalid_bitmap, lm->sec_per_line)) {
+ spin_lock(&line->lock);
+ line->state = PBLK_LINESTATE_BAD;
+ spin_unlock(&line->lock);
+
+ list_add_tail(&line->list, &l_mg->bad_list);
+ pr_err("pblk: unexpected line %d is bad\n", line->id);
+
+ return 0;
+ }
+
+ return 1;
+}
+
+static int pblk_line_prepare(struct pblk *pblk, struct pblk_line *line)
+{
+ struct pblk_line_meta *lm = &pblk->lm;
+ int blk_in_line = atomic_read(&line->blk_in_line);
+
+ line->map_bitmap = mempool_alloc(pblk->line_meta_pool, GFP_ATOMIC);
+ if (!line->map_bitmap)
+ return -ENOMEM;
+ memset(line->map_bitmap, 0, lm->sec_bitmap_len);
+
+ /* invalid_bitmap is special since it is used when line is closed. No
+ * need to zeroized; it will be initialized using bb info form
+ * map_bitmap
+ */
+ line->invalid_bitmap = mempool_alloc(pblk->line_meta_pool, GFP_ATOMIC);
+ if (!line->invalid_bitmap) {
+ mempool_free(line->map_bitmap, pblk->line_meta_pool);
+ return -ENOMEM;
+ }
+
+ spin_lock(&line->lock);
+ if (line->state != PBLK_LINESTATE_FREE) {
+ spin_unlock(&line->lock);
+ WARN(1, "pblk: corrupted line state\n");
+ return -EINTR;
+ }
+ line->state = PBLK_LINESTATE_OPEN;
+
+ atomic_set(&line->left_eblks, blk_in_line);
+ atomic_set(&line->left_seblks, blk_in_line);
+ spin_unlock(&line->lock);
+
+ /* Bad blocks do not need to be erased */
+ bitmap_copy(line->erase_bitmap, line->blk_bitmap, lm->blk_per_line);
+
+ kref_init(&line->ref);
+
+ return 0;
+}
+
+int pblk_line_recov_alloc(struct pblk *pblk, struct pblk_line *line)
+{
+ struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+ int ret;
+
+ spin_lock(&l_mg->free_lock);
+ l_mg->data_line = line;
+ list_del(&line->list);
+
+ ret = pblk_line_prepare(pblk, line);
+ if (ret) {
+ list_add(&line->list, &l_mg->free_list);
+ spin_unlock(&l_mg->free_lock);
+ return ret;
+ }
+ spin_unlock(&l_mg->free_lock);
+
+ pblk_rl_free_lines_dec(&pblk->rl, line);
+
+ if (!pblk_line_init_bb(pblk, line, 0)) {
+ list_add(&line->list, &l_mg->free_list);
+ return -EINTR;
+ }
+
+ return 0;
+}
+
+void pblk_line_recov_close(struct pblk *pblk, struct pblk_line *line)
+{
+ mempool_free(line->map_bitmap, pblk->line_meta_pool);
+ line->map_bitmap = NULL;
+ line->smeta = NULL;
+ line->emeta = NULL;
+}
+
+struct pblk_line *pblk_line_get(struct pblk *pblk)
+{
+ struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+ struct pblk_line_meta *lm = &pblk->lm;
+ struct pblk_line *line = NULL;
+ int bit;
+
+ lockdep_assert_held(&l_mg->free_lock);
+
+retry_get:
+ if (list_empty(&l_mg->free_list)) {
+ pr_err("pblk: no free lines\n");
+ goto out;
+ }
+
+ line = list_first_entry(&l_mg->free_list, struct pblk_line, list);
+ list_del(&line->list);
+ l_mg->nr_free_lines--;
+
+ bit = find_first_zero_bit(line->blk_bitmap, lm->blk_per_line);
+ if (unlikely(bit >= lm->blk_per_line)) {
+ spin_lock(&line->lock);
+ line->state = PBLK_LINESTATE_BAD;
+ spin_unlock(&line->lock);
+
+ list_add_tail(&line->list, &l_mg->bad_list);
+
+ pr_debug("pblk: line %d is bad\n", line->id);
+ goto retry_get;
+ }
+
+ if (pblk_line_prepare(pblk, line)) {
+ pr_err("pblk: failed to prepare line %d\n", line->id);
+ list_add(&line->list, &l_mg->free_list);
+ return NULL;
+ }
+
+out:
+ return line;
+}
+
+static struct pblk_line *pblk_line_retry(struct pblk *pblk,
+ struct pblk_line *line)
+{
+ struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+ struct pblk_line *retry_line;
+
+ spin_lock(&l_mg->free_lock);
+ retry_line = pblk_line_get(pblk);
+ if (!retry_line) {
+ l_mg->data_line = NULL;
+ spin_unlock(&l_mg->free_lock);
+ return NULL;
+ }
+
+ retry_line->smeta = line->smeta;
+ retry_line->emeta = line->emeta;
+ retry_line->meta_line = line->meta_line;
+
+ pblk_line_free(pblk, line);
+ l_mg->data_line = retry_line;
+ spin_unlock(&l_mg->free_lock);
+
+ if (pblk_line_erase(pblk, retry_line)) {
+ spin_lock(&l_mg->free_lock);
+ l_mg->data_line = NULL;
+ spin_unlock(&l_mg->free_lock);
+ return NULL;
+ }
+
+ pblk_rl_free_lines_dec(&pblk->rl, retry_line);
+
+ return retry_line;
+}
+
+struct pblk_line *pblk_line_get_first_data(struct pblk *pblk)
+{
+ struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+ struct pblk_line *line;
+ int meta_line;
+ int is_next = 0;
+
+ spin_lock(&l_mg->free_lock);
+ line = pblk_line_get(pblk);
+ if (!line) {
+ spin_unlock(&l_mg->free_lock);
+ return NULL;
+ }
+
+ line->seq_nr = l_mg->d_seq_nr++;
+ line->type = PBLK_LINETYPE_DATA;
+ l_mg->data_line = line;
+
+ meta_line = find_first_zero_bit(&l_mg->meta_bitmap, PBLK_DATA_LINES);
+ set_bit(meta_line, &l_mg->meta_bitmap);
+ line->smeta = l_mg->sline_meta[meta_line].meta;
+ line->emeta = l_mg->eline_meta[meta_line].meta;
+ line->meta_line = meta_line;
+
+ /* Allocate next line for preparation */
+ l_mg->data_next = pblk_line_get(pblk);
+ if (l_mg->data_next) {
+ l_mg->data_next->seq_nr = l_mg->d_seq_nr++;
+ l_mg->data_next->type = PBLK_LINETYPE_DATA;
+ is_next = 1;
+ }
+ spin_unlock(&l_mg->free_lock);
+
+ pblk_rl_free_lines_dec(&pblk->rl, line);
+ if (is_next)
+ pblk_rl_free_lines_dec(&pblk->rl, l_mg->data_next);
+
+ if (pblk_line_erase(pblk, line))
+ return NULL;
+
+retry_setup:
+ if (!pblk_line_set_metadata(pblk, line, NULL)) {
+ line = pblk_line_retry(pblk, line);
+ if (!line)
+ return NULL;
+
+ goto retry_setup;
+ }
+
+ if (!pblk_line_init_bb(pblk, line, 1)) {
+ line = pblk_line_retry(pblk, line);
+ if (!line)
+ return NULL;
+
+ goto retry_setup;
+ }
+
+ return line;
+}
+
+struct pblk_line *pblk_line_replace_data(struct pblk *pblk)
+{
+ struct pblk_line_meta *lm = &pblk->lm;
+ struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+ struct pblk_line *cur, *new;
+ unsigned int left_seblks;
+ int meta_line;
+ int is_next = 0;
+
+ cur = l_mg->data_line;
+ new = l_mg->data_next;
+ if (!new)
+ return NULL;
+ l_mg->data_line = new;
+
+retry_line:
+ left_seblks = atomic_read(&new->left_seblks);
+ if (left_seblks) {
+ /* If line is not fully erased, erase it */
+ if (atomic_read(&new->left_eblks)) {
+ if (pblk_line_erase(pblk, new))
+ return NULL;
+ } else {
+ io_schedule();
+ }
+ goto retry_line;
+ }
+
+ spin_lock(&l_mg->free_lock);
+ /* Allocate next line for preparation */
+ l_mg->data_next = pblk_line_get(pblk);
+ if (l_mg->data_next) {
+ l_mg->data_next->seq_nr = l_mg->d_seq_nr++;
+ l_mg->data_next->type = PBLK_LINETYPE_DATA;
+ is_next = 1;
+ }
+
+retry_meta:
+ meta_line = find_first_zero_bit(&l_mg->meta_bitmap, PBLK_DATA_LINES);
+ if (meta_line == PBLK_DATA_LINES) {
+ spin_unlock(&l_mg->free_lock);
+ io_schedule();
+ spin_lock(&l_mg->free_lock);
+ goto retry_meta;
+ }
+
+ set_bit(meta_line, &l_mg->meta_bitmap);
+ new->smeta = l_mg->sline_meta[meta_line].meta;
+ new->emeta = l_mg->eline_meta[meta_line].meta;
+ new->meta_line = meta_line;
+
+ memset(new->smeta, 0, lm->smeta_len);
+ memset(new->emeta, 0, lm->emeta_len);
+ spin_unlock(&l_mg->free_lock);
+
+ if (is_next)
+ pblk_rl_free_lines_dec(&pblk->rl, l_mg->data_next);
+
+retry_setup:
+ if (!pblk_line_set_metadata(pblk, new, cur)) {
+ new = pblk_line_retry(pblk, new);
+ if (!new)
+ return NULL;
+
+ goto retry_setup;
+ }
+
+ if (!pblk_line_init_bb(pblk, new, 1)) {
+ new = pblk_line_retry(pblk, new);
+ if (!new)
+ return NULL;
+
+ goto retry_setup;
+ }
+
+ return new;
+}
+
+void pblk_line_free(struct pblk *pblk, struct pblk_line *line)
+{
+ if (line->map_bitmap)
+ mempool_free(line->map_bitmap, pblk->line_meta_pool);
+ if (line->invalid_bitmap)
+ mempool_free(line->invalid_bitmap, pblk->line_meta_pool);
+
+ line->map_bitmap = NULL;
+ line->invalid_bitmap = NULL;
+ line->smeta = NULL;
+ line->emeta = NULL;
+}
+
+void pblk_line_put(struct kref *ref)
+{
+ struct pblk_line *line = container_of(ref, struct pblk_line, ref);
+ struct pblk *pblk = line->pblk;
+ struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+
+ spin_lock(&line->lock);
+ WARN_ON(line->state != PBLK_LINESTATE_GC);
+ line->state = PBLK_LINESTATE_FREE;
+ line->gc_group = PBLK_LINEGC_NONE;
+ pblk_line_free(pblk, line);
+ spin_unlock(&line->lock);
+
+ spin_lock(&l_mg->free_lock);
+ list_add_tail(&line->list, &l_mg->free_list);
+ l_mg->nr_free_lines++;
+ spin_unlock(&l_mg->free_lock);
+
+ pblk_rl_free_lines_inc(&pblk->rl, line);
+}
+
+int pblk_blk_erase_async(struct pblk *pblk, struct ppa_addr ppa)
+{
+ struct nvm_rq *rqd;
+ int err;
+
+ rqd = mempool_alloc(pblk->r_rq_pool, GFP_KERNEL);
+ memset(rqd, 0, pblk_r_rq_size);
+
+ pblk_setup_e_rq(pblk, rqd, ppa);
+
+ rqd->end_io = pblk_end_io_erase;
+ rqd->private = pblk;
+
+ /* The write thread schedules erases so that it minimizes disturbances
+ * with writes. Thus, there is no need to take the LUN semaphore.
+ */
+ err = pblk_submit_io(pblk, rqd);
+ if (err) {
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct nvm_geo *geo = &dev->geo;
+
+ pr_err("pblk: could not async erase line:%d,blk:%d\n",
+ pblk_dev_ppa_to_line(ppa),
+ pblk_dev_ppa_to_pos(geo, ppa));
+ }
+
+ return err;
+}
+
+struct pblk_line *pblk_line_get_data(struct pblk *pblk)
+{
+ return pblk->l_mg.data_line;
+}
+
+struct pblk_line *pblk_line_get_data_next(struct pblk *pblk)
+{
+ return pblk->l_mg.data_next;
+}
+
+int pblk_line_is_full(struct pblk_line *line)
+{
+ return (line->left_msecs == 0);
+}
+
+void pblk_line_close(struct pblk *pblk, struct pblk_line *line)
+{
+ struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+ struct list_head *move_list;
+
+ line->emeta->crc = cpu_to_le32(pblk_calc_emeta_crc(pblk, line->emeta));
+
+ if (pblk_line_submit_emeta_io(pblk, line, line->cur_sec, WRITE))
+ pr_err("pblk: line %d close I/O failed\n", line->id);
+
+ WARN(!bitmap_full(line->map_bitmap, line->sec_in_line),
+ "pblk: corrupt closed line %d\n", line->id);
+
+ spin_lock(&l_mg->free_lock);
+ WARN_ON(!test_and_clear_bit(line->meta_line, &l_mg->meta_bitmap));
+ spin_unlock(&l_mg->free_lock);
+
+ spin_lock(&l_mg->gc_lock);
+ spin_lock(&line->lock);
+ WARN_ON(line->state != PBLK_LINESTATE_OPEN);
+ line->state = PBLK_LINESTATE_CLOSED;
+ move_list = pblk_line_gc_list(pblk, line);
+
+ list_add_tail(&line->list, move_list);
+
+ mempool_free(line->map_bitmap, pblk->line_meta_pool);
+ line->map_bitmap = NULL;
+ line->smeta = NULL;
+ line->emeta = NULL;
+
+ spin_unlock(&line->lock);
+ spin_unlock(&l_mg->gc_lock);
+}
+
+void pblk_line_close_ws(struct work_struct *work)
+{
+ struct pblk_line_ws *line_ws = container_of(work, struct pblk_line_ws,
+ ws);
+ struct pblk *pblk = line_ws->pblk;
+ struct pblk_line *line = line_ws->line;
+
+ pblk_line_close(pblk, line);
+ mempool_free(line_ws, pblk->line_ws_pool);
+}
+
+void pblk_line_mark_bb(struct work_struct *work)
+{
+ struct pblk_line_ws *line_ws = container_of(work, struct pblk_line_ws,
+ ws);
+ struct pblk *pblk = line_ws->pblk;
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct ppa_addr *ppa = line_ws->priv;
+ int ret;
+
+ ret = nvm_set_tgt_bb_tbl(dev, ppa, 1, NVM_BLK_T_GRWN_BAD);
+ if (ret) {
+ struct pblk_line *line;
+ int pos;
+
+ line = &pblk->lines[pblk_dev_ppa_to_line(*ppa)];
+ pos = pblk_dev_ppa_to_pos(&dev->geo, *ppa);
+
+ pr_err("pblk: failed to mark bb, line:%d, pos:%d\n",
+ line->id, pos);
+ }
+
+ kfree(ppa);
+ mempool_free(line_ws, pblk->line_ws_pool);
+}
+
+void pblk_line_run_ws(struct pblk *pblk, struct pblk_line *line, void *priv,
+ void (*work)(struct work_struct *))
+{
+ struct pblk_line_ws *line_ws;
+
+ line_ws = mempool_alloc(pblk->line_ws_pool, GFP_ATOMIC);
+ if (!line_ws)
+ return;
+
+ line_ws->pblk = pblk;
+ line_ws->line = line;
+ line_ws->priv = priv;
+
+ INIT_WORK(&line_ws->ws, work);
+ queue_work(pblk->kw_wq, &line_ws->ws);
+}
+
+void pblk_down_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas,
+ unsigned long *lun_bitmap)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct nvm_geo *geo = &dev->geo;
+ struct pblk_lun *rlun;
+ int lun_id = ppa_list[0].g.ch * geo->luns_per_chnl + ppa_list[0].g.lun;
+ int ret;
+
+ /*
+ * Only send one inflight I/O per LUN. Since we map at a page
+ * granurality, all ppas in the I/O will map to the same LUN
+ */
+#ifdef CONFIG_NVM_DEBUG
+ int i;
+
+ for (i = 1; i < nr_ppas; i++)
+ WARN_ON(ppa_list[0].g.lun != ppa_list[i].g.lun ||
+ ppa_list[0].g.ch != ppa_list[i].g.ch);
+#endif
+ /* If the LUN has been locked for this same request, do no attempt to
+ * lock it again
+ */
+ if (test_and_set_bit(lun_id, lun_bitmap))
+ return;
+
+ rlun = &pblk->luns[lun_id];
+ ret = down_timeout(&rlun->wr_sem, msecs_to_jiffies(5000));
+ if (ret) {
+ switch (ret) {
+ case -ETIME:
+ pr_err("pblk: lun semaphore timed out\n");
+ break;
+ case -EINTR:
+ pr_err("pblk: lun semaphore timed out\n");
+ break;
+ }
+ }
+}
+
+void pblk_up_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas,
+ unsigned long *lun_bitmap)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct nvm_geo *geo = &dev->geo;
+ struct pblk_lun *rlun;
+ int nr_luns = geo->nr_luns;
+ int bit = -1;
+
+ while ((bit = find_next_bit(lun_bitmap, nr_luns, bit + 1)) < nr_luns) {
+ rlun = &pblk->luns[bit];
+ up(&rlun->wr_sem);
+ }
+
+ kfree(lun_bitmap);
+}
+
+void pblk_update_map(struct pblk *pblk, sector_t lba, struct ppa_addr ppa)
+{
+ struct ppa_addr l2p_ppa;
+
+ /* logic error: lba out-of-bounds. Ignore update */
+ if (!(lba < pblk->rl.nr_secs)) {
+ WARN(1, "pblk: corrupted L2P map request\n");
+ return;
+ }
+
+ spin_lock(&pblk->trans_lock);
+ l2p_ppa = pblk_trans_map_get(pblk, lba);
+
+ if (!pblk_addr_in_cache(l2p_ppa) && !pblk_ppa_empty(l2p_ppa))
+ pblk_map_invalidate(pblk, l2p_ppa);
+
+ pblk_trans_map_set(pblk, lba, ppa);
+ spin_unlock(&pblk->trans_lock);
+}
+
+void pblk_update_map_cache(struct pblk *pblk, sector_t lba, struct ppa_addr ppa)
+{
+#ifdef CONFIG_NVM_DEBUG
+ /* Callers must ensure that the ppa points to a cache address */
+ BUG_ON(!pblk_addr_in_cache(ppa));
+ BUG_ON(pblk_rb_pos_oob(&pblk->rwb, pblk_addr_to_cacheline(ppa)));
+#endif
+
+ pblk_update_map(pblk, lba, ppa);
+}
+
+int pblk_update_map_gc(struct pblk *pblk, sector_t lba, struct ppa_addr ppa,
+ struct pblk_line *gc_line)
+{
+ struct ppa_addr l2p_ppa;
+ int ret = 1;
+
+#ifdef CONFIG_NVM_DEBUG
+ /* Callers must ensure that the ppa points to a cache address */
+ BUG_ON(!pblk_addr_in_cache(ppa));
+ BUG_ON(pblk_rb_pos_oob(&pblk->rwb, pblk_addr_to_cacheline(ppa)));
+#endif
+
+ /* logic error: lba out-of-bounds. Ignore update */
+ if (!(lba < pblk->rl.nr_secs)) {
+ WARN(1, "pblk: corrupted L2P map request\n");
+ return 0;
+ }
+
+ spin_lock(&pblk->trans_lock);
+ l2p_ppa = pblk_trans_map_get(pblk, lba);
+
+ /* Prevent updated entries to be overwritten by GC */
+ if (pblk_addr_in_cache(l2p_ppa) || pblk_ppa_empty(l2p_ppa) ||
+ pblk_tgt_ppa_to_line(l2p_ppa) != gc_line->id) {
+ ret = 0;
+ goto out;
+ }
+
+ pblk_trans_map_set(pblk, lba, ppa);
+out:
+ spin_unlock(&pblk->trans_lock);
+ return ret;
+}
+
+void pblk_update_map_dev(struct pblk *pblk, sector_t lba, struct ppa_addr ppa,
+ struct ppa_addr entry_line)
+{
+ struct ppa_addr l2p_line;
+
+#ifdef CONFIG_NVM_DEBUG
+ /* Callers must ensure that the ppa points to a device address */
+ BUG_ON(pblk_addr_in_cache(ppa));
+#endif
+ /* Invalidate and discard padded entries */
+ if (lba == ADDR_EMPTY) {
+#ifdef CONFIG_NVM_DEBUG
+ atomic_long_inc(&pblk->padded_wb);
+#endif
+ pblk_map_invalidate(pblk, ppa);
+ return;
+ }
+
+ /* logic error: lba out-of-bounds. Ignore update */
+ if (!(lba < pblk->rl.nr_secs)) {
+ WARN(1, "pblk: corrupted L2P map request\n");
+ return;
+ }
+
+ spin_lock(&pblk->trans_lock);
+ l2p_line = pblk_trans_map_get(pblk, lba);
+
+ /* Do not update L2P if the cacheline has been updated. In this case,
+ * the mapped ppa must be invalidated
+ */
+ if (l2p_line.ppa != entry_line.ppa) {
+ if (!pblk_ppa_empty(ppa))
+ pblk_map_invalidate(pblk, ppa);
+ goto out;
+ }
+
+#ifdef CONFIG_NVM_DEBUG
+ WARN_ON(!pblk_addr_in_cache(l2p_line) && !pblk_ppa_empty(l2p_line));
+#endif
+
+ pblk_trans_map_set(pblk, lba, ppa);
+out:
+ spin_unlock(&pblk->trans_lock);
+}
+
+void pblk_lookup_l2p_seq(struct pblk *pblk, struct ppa_addr *ppas,
+ sector_t blba, int nr_secs)
+{
+ int i;
+
+ spin_lock(&pblk->trans_lock);
+ for (i = 0; i < nr_secs; i++)
+ ppas[i] = pblk_trans_map_get(pblk, blba + i);
+ spin_unlock(&pblk->trans_lock);
+}
+
+void pblk_lookup_l2p_rand(struct pblk *pblk, struct ppa_addr *ppas,
+ u64 *lba_list, int nr_secs)
+{
+ sector_t lba;
+ int i;
+
+ spin_lock(&pblk->trans_lock);
+ for (i = 0; i < nr_secs; i++) {
+ lba = lba_list[i];
+ if (lba == ADDR_EMPTY) {
+ ppas[i].ppa = ADDR_EMPTY;
+ } else {
+ /* logic error: lba out-of-bounds. Ignore update */
+ if (!(lba < pblk->rl.nr_secs)) {
+ WARN(1, "pblk: corrupted L2P map request\n");
+ continue;
+ }
+ ppas[i] = pblk_trans_map_get(pblk, lba);
+ }
+ }
+ spin_unlock(&pblk->trans_lock);
+}
diff --git a/drivers/lightnvm/pblk-gc.c b/drivers/lightnvm/pblk-gc.c
new file mode 100644
index 000000000000..eaf479c6b63c
--- /dev/null
+++ b/drivers/lightnvm/pblk-gc.c
@@ -0,0 +1,555 @@
+/*
+ * Copyright (C) 2016 CNEX Labs
+ * Initial release: Javier Gonzalez <javier@cnexlabs.com>
+ * Matias Bjorling <matias@cnexlabs.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * pblk-gc.c - pblk's garbage collector
+ */
+
+#include "pblk.h"
+#include <linux/delay.h>
+
+static void pblk_gc_free_gc_rq(struct pblk_gc_rq *gc_rq)
+{
+ kfree(gc_rq->data);
+ kfree(gc_rq->lba_list);
+ kfree(gc_rq);
+}
+
+static int pblk_gc_write(struct pblk *pblk)
+{
+ struct pblk_gc *gc = &pblk->gc;
+ struct pblk_gc_rq *gc_rq, *tgc_rq;
+ LIST_HEAD(w_list);
+
+ spin_lock(&gc->w_lock);
+ if (list_empty(&gc->w_list)) {
+ spin_unlock(&gc->w_lock);
+ return 1;
+ }
+
+ list_for_each_entry_safe(gc_rq, tgc_rq, &gc->w_list, list) {
+ list_move_tail(&gc_rq->list, &w_list);
+ gc->w_entries--;
+ }
+ spin_unlock(&gc->w_lock);
+
+ list_for_each_entry_safe(gc_rq, tgc_rq, &w_list, list) {
+ pblk_write_gc_to_cache(pblk, gc_rq->data, gc_rq->lba_list,
+ gc_rq->nr_secs, gc_rq->secs_to_gc,
+ gc_rq->line, PBLK_IOTYPE_GC);
+
+ kref_put(&gc_rq->line->ref, pblk_line_put);
+
+ list_del(&gc_rq->list);
+ pblk_gc_free_gc_rq(gc_rq);
+ }
+
+ return 0;
+}
+
+static void pblk_gc_writer_kick(struct pblk_gc *gc)
+{
+ wake_up_process(gc->gc_writer_ts);
+}
+
+/*
+ * Responsible for managing all memory related to a gc request. Also in case of
+ * failure
+ */
+static int pblk_gc_move_valid_secs(struct pblk *pblk, struct pblk_line *line,
+ u64 *lba_list, unsigned int nr_secs)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct nvm_geo *geo = &dev->geo;
+ struct pblk_gc *gc = &pblk->gc;
+ struct pblk_gc_rq *gc_rq;
+ void *data;
+ unsigned int secs_to_gc;
+ int ret = NVM_IO_OK;
+
+ data = kmalloc(nr_secs * geo->sec_size, GFP_KERNEL);
+ if (!data) {
+ ret = NVM_IO_ERR;
+ goto free_lba_list;
+ }
+
+ /* Read from GC victim block */
+ if (pblk_submit_read_gc(pblk, lba_list, data, nr_secs,
+ &secs_to_gc, line)) {
+ ret = NVM_IO_ERR;
+ goto free_data;
+ }
+
+ if (!secs_to_gc)
+ goto free_data;
+
+ gc_rq = kmalloc(sizeof(struct pblk_gc_rq), GFP_KERNEL);
+ if (!gc_rq) {
+ ret = NVM_IO_ERR;
+ goto free_data;
+ }
+
+ gc_rq->line = line;
+ gc_rq->data = data;
+ gc_rq->lba_list = lba_list;
+ gc_rq->nr_secs = nr_secs;
+ gc_rq->secs_to_gc = secs_to_gc;
+
+ kref_get(&line->ref);
+
+retry:
+ spin_lock(&gc->w_lock);
+ if (gc->w_entries > 256) {
+ spin_unlock(&gc->w_lock);
+ usleep_range(256, 1024);
+ goto retry;
+ }
+ gc->w_entries++;
+ list_add_tail(&gc_rq->list, &gc->w_list);
+ spin_unlock(&gc->w_lock);
+
+ pblk_gc_writer_kick(&pblk->gc);
+
+ return NVM_IO_OK;
+
+free_data:
+ kfree(data);
+free_lba_list:
+ kfree(lba_list);
+
+ return ret;
+}
+
+static void pblk_put_line_back(struct pblk *pblk, struct pblk_line *line)
+{
+ struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+ struct list_head *move_list;
+
+ spin_lock(&line->lock);
+ WARN_ON(line->state != PBLK_LINESTATE_GC);
+ line->state = PBLK_LINESTATE_CLOSED;
+ move_list = pblk_line_gc_list(pblk, line);
+ spin_unlock(&line->lock);
+
+ if (move_list) {
+ spin_lock(&l_mg->gc_lock);
+ list_add_tail(&line->list, move_list);
+ spin_unlock(&l_mg->gc_lock);
+ }
+}
+
+static void pblk_gc_line_ws(struct work_struct *work)
+{
+ struct pblk_line_ws *line_ws = container_of(work, struct pblk_line_ws,
+ ws);
+ struct pblk *pblk = line_ws->pblk;
+ struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+ struct pblk_line *line = line_ws->line;
+ struct pblk_line_meta *lm = &pblk->lm;
+ __le64 *lba_list = line_ws->priv;
+ u64 *gc_list;
+ int sec_left;
+ int nr_ppas, bit;
+ int put_line = 1;
+
+ pr_debug("pblk: line '%d' being reclaimed for GC\n", line->id);
+
+ spin_lock(&line->lock);
+ sec_left = line->vsc;
+ if (!sec_left) {
+ /* Lines are erased before being used (l_mg->data_/log_next) */
+ spin_unlock(&line->lock);
+ goto out;
+ }
+ spin_unlock(&line->lock);
+
+ if (sec_left < 0) {
+ pr_err("pblk: corrupted GC line (%d)\n", line->id);
+ put_line = 0;
+ pblk_put_line_back(pblk, line);
+ goto out;
+ }
+
+ bit = -1;
+next_rq:
+ gc_list = kmalloc_array(pblk->max_write_pgs, sizeof(u64), GFP_KERNEL);
+ if (!gc_list) {
+ put_line = 0;
+ pblk_put_line_back(pblk, line);
+ goto out;
+ }
+
+ nr_ppas = 0;
+ do {
+ bit = find_next_zero_bit(line->invalid_bitmap, lm->sec_per_line,
+ bit + 1);
+ if (bit > line->emeta_ssec)
+ break;
+
+ gc_list[nr_ppas++] = le64_to_cpu(lba_list[bit]);
+ } while (nr_ppas < pblk->max_write_pgs);
+
+ if (unlikely(!nr_ppas)) {
+ kfree(gc_list);
+ goto out;
+ }
+
+ if (pblk_gc_move_valid_secs(pblk, line, gc_list, nr_ppas)) {
+ pr_err("pblk: could not GC all sectors: line:%d (%d/%d/%d)\n",
+ line->id, line->vsc,
+ nr_ppas, nr_ppas);
+ put_line = 0;
+ pblk_put_line_back(pblk, line);
+ goto out;
+ }
+
+ sec_left -= nr_ppas;
+ if (sec_left > 0)
+ goto next_rq;
+
+out:
+ pblk_mfree(line->emeta, l_mg->emeta_alloc_type);
+ mempool_free(line_ws, pblk->line_ws_pool);
+ atomic_dec(&pblk->gc.inflight_gc);
+ if (put_line)
+ kref_put(&line->ref, pblk_line_put);
+}
+
+static int pblk_gc_line(struct pblk *pblk, struct pblk_line *line)
+{
+ struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+ struct pblk_line_meta *lm = &pblk->lm;
+ struct pblk_line_ws *line_ws;
+ __le64 *lba_list;
+ int ret;
+
+ line_ws = mempool_alloc(pblk->line_ws_pool, GFP_KERNEL);
+ line->emeta = pblk_malloc(lm->emeta_len, l_mg->emeta_alloc_type,
+ GFP_KERNEL);
+ if (!line->emeta) {
+ pr_err("pblk: cannot use GC emeta\n");
+ goto fail_free_ws;
+ }
+
+ ret = pblk_line_read_emeta(pblk, line);
+ if (ret) {
+ pr_err("pblk: line %d read emeta failed (%d)\n", line->id, ret);
+ goto fail_free_emeta;
+ }
+
+ /* If this read fails, it means that emeta is corrupted. For now, leave
+ * the line untouched. TODO: Implement a recovery routine that scans and
+ * moves all sectors on the line.
+ */
+ lba_list = pblk_recov_get_lba_list(pblk, line->emeta);
+ if (!lba_list) {
+ pr_err("pblk: could not interpret emeta (line %d)\n", line->id);
+ goto fail_free_emeta;
+ }
+
+ line_ws->pblk = pblk;
+ line_ws->line = line;
+ line_ws->priv = lba_list;
+
+ INIT_WORK(&line_ws->ws, pblk_gc_line_ws);
+ queue_work(pblk->gc.gc_reader_wq, &line_ws->ws);
+
+ return 0;
+
+fail_free_emeta:
+ pblk_mfree(line->emeta, l_mg->emeta_alloc_type);
+fail_free_ws:
+ mempool_free(line_ws, pblk->line_ws_pool);
+ pblk_put_line_back(pblk, line);
+
+ return 1;
+}
+
+static void pblk_gc_lines(struct pblk *pblk, struct list_head *gc_list)
+{
+ struct pblk_line *line, *tline;
+
+ list_for_each_entry_safe(line, tline, gc_list, list) {
+ if (pblk_gc_line(pblk, line))
+ pr_err("pblk: failed to GC line %d\n", line->id);
+ list_del(&line->list);
+ }
+}
+
+/*
+ * Lines with no valid sectors will be returned to the free list immediately. If
+ * GC is activated - either because the free block count is under the determined
+ * threshold, or because it is being forced from user space - only lines with a
+ * high count of invalid sectors will be recycled.
+ */
+static void pblk_gc_run(struct pblk *pblk)
+{
+ struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+ struct pblk_gc *gc = &pblk->gc;
+ struct pblk_line *line, *tline;
+ unsigned int nr_blocks_free, nr_blocks_need;
+ struct list_head *group_list;
+ int run_gc, gc_group = 0;
+ int prev_gc = 0;
+ int inflight_gc = atomic_read(&gc->inflight_gc);
+ LIST_HEAD(gc_list);
+
+ spin_lock(&l_mg->gc_lock);
+ list_for_each_entry_safe(line, tline, &l_mg->gc_full_list, list) {
+ spin_lock(&line->lock);
+ WARN_ON(line->state != PBLK_LINESTATE_CLOSED);
+ line->state = PBLK_LINESTATE_GC;
+ spin_unlock(&line->lock);
+
+ list_del(&line->list);
+ kref_put(&line->ref, pblk_line_put);
+ }
+ spin_unlock(&l_mg->gc_lock);
+
+ nr_blocks_need = pblk_rl_gc_thrs(&pblk->rl);
+ nr_blocks_free = pblk_rl_nr_free_blks(&pblk->rl);
+ run_gc = (nr_blocks_need > nr_blocks_free || gc->gc_forced);
+
+next_gc_group:
+ group_list = l_mg->gc_lists[gc_group++];
+ spin_lock(&l_mg->gc_lock);
+ while (run_gc && !list_empty(group_list)) {
+ /* No need to queue up more GC lines than we can handle */
+ if (!run_gc || inflight_gc > gc->gc_jobs_active) {
+ spin_unlock(&l_mg->gc_lock);
+ pblk_gc_lines(pblk, &gc_list);
+ return;
+ }
+
+ line = list_first_entry(group_list, struct pblk_line, list);
+ nr_blocks_free += atomic_read(&line->blk_in_line);
+
+ spin_lock(&line->lock);
+ WARN_ON(line->state != PBLK_LINESTATE_CLOSED);
+ line->state = PBLK_LINESTATE_GC;
+ list_move_tail(&line->list, &gc_list);
+ atomic_inc(&gc->inflight_gc);
+ inflight_gc++;
+ spin_unlock(&line->lock);
+
+ prev_gc = 1;
+ run_gc = (nr_blocks_need > nr_blocks_free || gc->gc_forced);
+ }
+ spin_unlock(&l_mg->gc_lock);
+
+ pblk_gc_lines(pblk, &gc_list);
+
+ if (!prev_gc && pblk->rl.rb_state > gc_group &&
+ gc_group < PBLK_NR_GC_LISTS)
+ goto next_gc_group;
+}
+
+
+static void pblk_gc_kick(struct pblk *pblk)
+{
+ struct pblk_gc *gc = &pblk->gc;
+
+ wake_up_process(gc->gc_ts);
+ pblk_gc_writer_kick(gc);
+ mod_timer(&gc->gc_timer, jiffies + msecs_to_jiffies(GC_TIME_MSECS));
+}
+
+static void pblk_gc_timer(unsigned long data)
+{
+ struct pblk *pblk = (struct pblk *)data;
+
+ pblk_gc_kick(pblk);
+}
+
+static int pblk_gc_ts(void *data)
+{
+ struct pblk *pblk = data;
+
+ while (!kthread_should_stop()) {
+ pblk_gc_run(pblk);
+ set_current_state(TASK_INTERRUPTIBLE);
+ io_schedule();
+ }
+
+ return 0;
+}
+
+static int pblk_gc_writer_ts(void *data)
+{
+ struct pblk *pblk = data;
+
+ while (!kthread_should_stop()) {
+ if (!pblk_gc_write(pblk))
+ continue;
+ set_current_state(TASK_INTERRUPTIBLE);
+ io_schedule();
+ }
+
+ return 0;
+}
+
+static void pblk_gc_start(struct pblk *pblk)
+{
+ pblk->gc.gc_active = 1;
+
+ pr_debug("pblk: gc start\n");
+}
+
+int pblk_gc_status(struct pblk *pblk)
+{
+ struct pblk_gc *gc = &pblk->gc;
+ int ret;
+
+ spin_lock(&gc->lock);
+ ret = gc->gc_active;
+ spin_unlock(&gc->lock);
+
+ return ret;
+}
+
+static void __pblk_gc_should_start(struct pblk *pblk)
+{
+ struct pblk_gc *gc = &pblk->gc;
+
+ lockdep_assert_held(&gc->lock);
+
+ if (gc->gc_enabled && !gc->gc_active)
+ pblk_gc_start(pblk);
+}
+
+void pblk_gc_should_start(struct pblk *pblk)
+{
+ struct pblk_gc *gc = &pblk->gc;
+
+ spin_lock(&gc->lock);
+ __pblk_gc_should_start(pblk);
+ spin_unlock(&gc->lock);
+}
+
+/*
+ * If flush_wq == 1 then no lock should be held by the caller since
+ * flush_workqueue can sleep
+ */
+static void pblk_gc_stop(struct pblk *pblk, int flush_wq)
+{
+ spin_lock(&pblk->gc.lock);
+ pblk->gc.gc_active = 0;
+ spin_unlock(&pblk->gc.lock);
+
+ pr_debug("pblk: gc stop\n");
+}
+
+void pblk_gc_should_stop(struct pblk *pblk)
+{
+ struct pblk_gc *gc = &pblk->gc;
+
+ if (gc->gc_active && !gc->gc_forced)
+ pblk_gc_stop(pblk, 0);
+}
+
+void pblk_gc_sysfs_state_show(struct pblk *pblk, int *gc_enabled,
+ int *gc_active)
+{
+ struct pblk_gc *gc = &pblk->gc;
+
+ spin_lock(&gc->lock);
+ *gc_enabled = gc->gc_enabled;
+ *gc_active = gc->gc_active;
+ spin_unlock(&gc->lock);
+}
+
+void pblk_gc_sysfs_force(struct pblk *pblk, int force)
+{
+ struct pblk_gc *gc = &pblk->gc;
+ int rsv = 0;
+
+ spin_lock(&gc->lock);
+ if (force) {
+ gc->gc_enabled = 1;
+ rsv = 64;
+ }
+ pblk_rl_set_gc_rsc(&pblk->rl, rsv);
+ gc->gc_forced = force;
+ __pblk_gc_should_start(pblk);
+ spin_unlock(&gc->lock);
+}
+
+int pblk_gc_init(struct pblk *pblk)
+{
+ struct pblk_gc *gc = &pblk->gc;
+ int ret;
+
+ gc->gc_ts = kthread_create(pblk_gc_ts, pblk, "pblk-gc-ts");
+ if (IS_ERR(gc->gc_ts)) {
+ pr_err("pblk: could not allocate GC main kthread\n");
+ return PTR_ERR(gc->gc_ts);
+ }
+
+ gc->gc_writer_ts = kthread_create(pblk_gc_writer_ts, pblk,
+ "pblk-gc-writer-ts");
+ if (IS_ERR(gc->gc_writer_ts)) {
+ pr_err("pblk: could not allocate GC writer kthread\n");
+ ret = PTR_ERR(gc->gc_writer_ts);
+ goto fail_free_main_kthread;
+ }
+
+ setup_timer(&gc->gc_timer, pblk_gc_timer, (unsigned long)pblk);
+ mod_timer(&gc->gc_timer, jiffies + msecs_to_jiffies(GC_TIME_MSECS));
+
+ gc->gc_active = 0;
+ gc->gc_forced = 0;
+ gc->gc_enabled = 1;
+ gc->gc_jobs_active = 8;
+ gc->w_entries = 0;
+ atomic_set(&gc->inflight_gc, 0);
+
+ gc->gc_reader_wq = alloc_workqueue("pblk-gc-reader-wq",
+ WQ_MEM_RECLAIM | WQ_UNBOUND, gc->gc_jobs_active);
+ if (!gc->gc_reader_wq) {
+ pr_err("pblk: could not allocate GC reader workqueue\n");
+ ret = -ENOMEM;
+ goto fail_free_writer_kthread;
+ }
+
+ spin_lock_init(&gc->lock);
+ spin_lock_init(&gc->w_lock);
+ INIT_LIST_HEAD(&gc->w_list);
+
+ return 0;
+
+fail_free_writer_kthread:
+ kthread_stop(gc->gc_writer_ts);
+fail_free_main_kthread:
+ kthread_stop(gc->gc_ts);
+
+ return ret;
+}
+
+void pblk_gc_exit(struct pblk *pblk)
+{
+ struct pblk_gc *gc = &pblk->gc;
+
+ flush_workqueue(gc->gc_reader_wq);
+
+ del_timer(&gc->gc_timer);
+ pblk_gc_stop(pblk, 1);
+
+ if (gc->gc_ts)
+ kthread_stop(gc->gc_ts);
+
+ if (pblk->gc.gc_reader_wq)
+ destroy_workqueue(pblk->gc.gc_reader_wq);
+
+ if (gc->gc_writer_ts)
+ kthread_stop(gc->gc_writer_ts);
+}
diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c
new file mode 100644
index 000000000000..ae8cd6d5af8b
--- /dev/null
+++ b/drivers/lightnvm/pblk-init.c
@@ -0,0 +1,962 @@
+/*
+ * Copyright (C) 2015 IT University of Copenhagen (rrpc.c)
+ * Copyright (C) 2016 CNEX Labs
+ * Initial release: Javier Gonzalez <javier@cnexlabs.com>
+ * Matias Bjorling <matias@cnexlabs.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Implementation of a physical block-device target for Open-channel SSDs.
+ *
+ * pblk-init.c - pblk's initialization.
+ */
+
+#include "pblk.h"
+
+static struct kmem_cache *pblk_blk_ws_cache, *pblk_rec_cache, *pblk_r_rq_cache,
+ *pblk_w_rq_cache, *pblk_line_meta_cache;
+static DECLARE_RWSEM(pblk_lock);
+
+static int pblk_rw_io(struct request_queue *q, struct pblk *pblk,
+ struct bio *bio)
+{
+ int ret;
+
+ /* Read requests must be <= 256kb due to NVMe's 64 bit completion bitmap
+ * constraint. Writes can be of arbitrary size.
+ */
+ if (bio_data_dir(bio) == READ) {
+ blk_queue_split(q, &bio, q->bio_split);
+ ret = pblk_submit_read(pblk, bio);
+ if (ret == NVM_IO_DONE && bio_flagged(bio, BIO_CLONED))
+ bio_put(bio);
+
+ return ret;
+ }
+
+ /* Prevent deadlock in the case of a modest LUN configuration and large
+ * user I/Os. Unless stalled, the rate limiter leaves at least 256KB
+ * available for user I/O.
+ */
+ if (unlikely(pblk_get_secs(bio) >= pblk_rl_sysfs_rate_show(&pblk->rl)))
+ blk_queue_split(q, &bio, q->bio_split);
+
+ return pblk_write_to_cache(pblk, bio, PBLK_IOTYPE_USER);
+}
+
+static blk_qc_t pblk_make_rq(struct request_queue *q, struct bio *bio)
+{
+ struct pblk *pblk = q->queuedata;
+
+ if (bio_op(bio) == REQ_OP_DISCARD) {
+ pblk_discard(pblk, bio);
+ if (!(bio->bi_opf & REQ_PREFLUSH)) {
+ bio_endio(bio);
+ return BLK_QC_T_NONE;
+ }
+ }
+
+ switch (pblk_rw_io(q, pblk, bio)) {
+ case NVM_IO_ERR:
+ bio_io_error(bio);
+ break;
+ case NVM_IO_DONE:
+ bio_endio(bio);
+ break;
+ }
+
+ return BLK_QC_T_NONE;
+}
+
+static void pblk_l2p_free(struct pblk *pblk)
+{
+ vfree(pblk->trans_map);
+}
+
+static int pblk_l2p_init(struct pblk *pblk)
+{
+ sector_t i;
+ struct ppa_addr ppa;
+ int entry_size = 8;
+
+ if (pblk->ppaf_bitsize < 32)
+ entry_size = 4;
+
+ pblk->trans_map = vmalloc(entry_size * pblk->rl.nr_secs);
+ if (!pblk->trans_map)
+ return -ENOMEM;
+
+ pblk_ppa_set_empty(&ppa);
+
+ for (i = 0; i < pblk->rl.nr_secs; i++)
+ pblk_trans_map_set(pblk, i, ppa);
+
+ return 0;
+}
+
+static void pblk_rwb_free(struct pblk *pblk)
+{
+ if (pblk_rb_tear_down_check(&pblk->rwb))
+ pr_err("pblk: write buffer error on tear down\n");
+
+ pblk_rb_data_free(&pblk->rwb);
+ vfree(pblk_rb_entries_ref(&pblk->rwb));
+}
+
+static int pblk_rwb_init(struct pblk *pblk)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct nvm_geo *geo = &dev->geo;
+ struct pblk_rb_entry *entries;
+ unsigned long nr_entries;
+ unsigned int power_size, power_seg_sz;
+
+ nr_entries = pblk_rb_calculate_size(pblk->pgs_in_buffer);
+
+ entries = vzalloc(nr_entries * sizeof(struct pblk_rb_entry));
+ if (!entries)
+ return -ENOMEM;
+
+ power_size = get_count_order(nr_entries);
+ power_seg_sz = get_count_order(geo->sec_size);
+
+ return pblk_rb_init(&pblk->rwb, entries, power_size, power_seg_sz);
+}
+
+/* Minimum pages needed within a lun */
+#define PAGE_POOL_SIZE 16
+#define ADDR_POOL_SIZE 64
+
+static int pblk_set_ppaf(struct pblk *pblk)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct nvm_geo *geo = &dev->geo;
+ struct nvm_addr_format ppaf = geo->ppaf;
+ int power_len;
+
+ /* Re-calculate channel and lun format to adapt to configuration */
+ power_len = get_count_order(geo->nr_chnls);
+ if (1 << power_len != geo->nr_chnls) {
+ pr_err("pblk: supports only power-of-two channel config.\n");
+ return -EINVAL;
+ }
+ ppaf.ch_len = power_len;
+
+ power_len = get_count_order(geo->luns_per_chnl);
+ if (1 << power_len != geo->luns_per_chnl) {
+ pr_err("pblk: supports only power-of-two LUN config.\n");
+ return -EINVAL;
+ }
+ ppaf.lun_len = power_len;
+
+ pblk->ppaf.sec_offset = 0;
+ pblk->ppaf.pln_offset = ppaf.sect_len;
+ pblk->ppaf.ch_offset = pblk->ppaf.pln_offset + ppaf.pln_len;
+ pblk->ppaf.lun_offset = pblk->ppaf.ch_offset + ppaf.ch_len;
+ pblk->ppaf.pg_offset = pblk->ppaf.lun_offset + ppaf.lun_len;
+ pblk->ppaf.blk_offset = pblk->ppaf.pg_offset + ppaf.pg_len;
+ pblk->ppaf.sec_mask = (1ULL << ppaf.sect_len) - 1;
+ pblk->ppaf.pln_mask = ((1ULL << ppaf.pln_len) - 1) <<
+ pblk->ppaf.pln_offset;
+ pblk->ppaf.ch_mask = ((1ULL << ppaf.ch_len) - 1) <<
+ pblk->ppaf.ch_offset;
+ pblk->ppaf.lun_mask = ((1ULL << ppaf.lun_len) - 1) <<
+ pblk->ppaf.lun_offset;
+ pblk->ppaf.pg_mask = ((1ULL << ppaf.pg_len) - 1) <<
+ pblk->ppaf.pg_offset;
+ pblk->ppaf.blk_mask = ((1ULL << ppaf.blk_len) - 1) <<
+ pblk->ppaf.blk_offset;
+
+ pblk->ppaf_bitsize = pblk->ppaf.blk_offset + ppaf.blk_len;
+
+ return 0;
+}
+
+static int pblk_init_global_caches(struct pblk *pblk)
+{
+ char cache_name[PBLK_CACHE_NAME_LEN];
+
+ down_write(&pblk_lock);
+ pblk_blk_ws_cache = kmem_cache_create("pblk_blk_ws",
+ sizeof(struct pblk_line_ws), 0, 0, NULL);
+ if (!pblk_blk_ws_cache) {
+ up_write(&pblk_lock);
+ return -ENOMEM;
+ }
+
+ pblk_rec_cache = kmem_cache_create("pblk_rec",
+ sizeof(struct pblk_rec_ctx), 0, 0, NULL);
+ if (!pblk_rec_cache) {
+ kmem_cache_destroy(pblk_blk_ws_cache);
+ up_write(&pblk_lock);
+ return -ENOMEM;
+ }
+
+ pblk_r_rq_cache = kmem_cache_create("pblk_r_rq", pblk_r_rq_size,
+ 0, 0, NULL);
+ if (!pblk_r_rq_cache) {
+ kmem_cache_destroy(pblk_blk_ws_cache);
+ kmem_cache_destroy(pblk_rec_cache);
+ up_write(&pblk_lock);
+ return -ENOMEM;
+ }
+
+ pblk_w_rq_cache = kmem_cache_create("pblk_w_rq", pblk_w_rq_size,
+ 0, 0, NULL);
+ if (!pblk_w_rq_cache) {
+ kmem_cache_destroy(pblk_blk_ws_cache);
+ kmem_cache_destroy(pblk_rec_cache);
+ kmem_cache_destroy(pblk_r_rq_cache);
+ up_write(&pblk_lock);
+ return -ENOMEM;
+ }
+
+ snprintf(cache_name, sizeof(cache_name), "pblk_line_m_%s",
+ pblk->disk->disk_name);
+ pblk_line_meta_cache = kmem_cache_create(cache_name,
+ pblk->lm.sec_bitmap_len, 0, 0, NULL);
+ if (!pblk_line_meta_cache) {
+ kmem_cache_destroy(pblk_blk_ws_cache);
+ kmem_cache_destroy(pblk_rec_cache);
+ kmem_cache_destroy(pblk_r_rq_cache);
+ kmem_cache_destroy(pblk_w_rq_cache);
+ up_write(&pblk_lock);
+ return -ENOMEM;
+ }
+ up_write(&pblk_lock);
+
+ return 0;
+}
+
+static int pblk_core_init(struct pblk *pblk)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct nvm_geo *geo = &dev->geo;
+ int max_write_ppas;
+ int mod;
+
+ pblk->min_write_pgs = geo->sec_per_pl * (geo->sec_size / PAGE_SIZE);
+ max_write_ppas = pblk->min_write_pgs * geo->nr_luns;
+ pblk->max_write_pgs = (max_write_ppas < nvm_max_phys_sects(dev)) ?
+ max_write_ppas : nvm_max_phys_sects(dev);
+ pblk->pgs_in_buffer = NVM_MEM_PAGE_WRITE * geo->sec_per_pg *
+ geo->nr_planes * geo->nr_luns;
+
+ if (pblk->max_write_pgs > PBLK_MAX_REQ_ADDRS) {
+ pr_err("pblk: cannot support device max_phys_sect\n");
+ return -EINVAL;
+ }
+
+ div_u64_rem(geo->sec_per_blk, pblk->min_write_pgs, &mod);
+ if (mod) {
+ pr_err("pblk: bad configuration of sectors/pages\n");
+ return -EINVAL;
+ }
+
+ if (pblk_init_global_caches(pblk))
+ return -ENOMEM;
+
+ pblk->page_pool = mempool_create_page_pool(PAGE_POOL_SIZE, 0);
+ if (!pblk->page_pool)
+ return -ENOMEM;
+
+ pblk->line_ws_pool = mempool_create_slab_pool(geo->nr_luns,
+ pblk_blk_ws_cache);
+ if (!pblk->line_ws_pool)
+ goto free_page_pool;
+
+ pblk->rec_pool = mempool_create_slab_pool(geo->nr_luns, pblk_rec_cache);
+ if (!pblk->rec_pool)
+ goto free_blk_ws_pool;
+
+ pblk->r_rq_pool = mempool_create_slab_pool(64, pblk_r_rq_cache);
+ if (!pblk->r_rq_pool)
+ goto free_rec_pool;
+
+ pblk->w_rq_pool = mempool_create_slab_pool(64, pblk_w_rq_cache);
+ if (!pblk->w_rq_pool)
+ goto free_r_rq_pool;
+
+ pblk->line_meta_pool =
+ mempool_create_slab_pool(16, pblk_line_meta_cache);
+ if (!pblk->line_meta_pool)
+ goto free_w_rq_pool;
+
+ pblk->kw_wq = alloc_workqueue("pblk-aux-wq",
+ WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
+ if (!pblk->kw_wq)
+ goto free_line_meta_pool;
+
+ if (pblk_set_ppaf(pblk))
+ goto free_kw_wq;
+
+ if (pblk_rwb_init(pblk))
+ goto free_kw_wq;
+
+ INIT_LIST_HEAD(&pblk->compl_list);
+ return 0;
+
+free_kw_wq:
+ destroy_workqueue(pblk->kw_wq);
+free_line_meta_pool:
+ mempool_destroy(pblk->line_meta_pool);
+free_w_rq_pool:
+ mempool_destroy(pblk->w_rq_pool);
+free_r_rq_pool:
+ mempool_destroy(pblk->r_rq_pool);
+free_rec_pool:
+ mempool_destroy(pblk->rec_pool);
+free_blk_ws_pool:
+ mempool_destroy(pblk->line_ws_pool);
+free_page_pool:
+ mempool_destroy(pblk->page_pool);
+ return -ENOMEM;
+}
+
+static void pblk_core_free(struct pblk *pblk)
+{
+ if (pblk->kw_wq)
+ destroy_workqueue(pblk->kw_wq);
+
+ mempool_destroy(pblk->page_pool);
+ mempool_destroy(pblk->line_ws_pool);
+ mempool_destroy(pblk->rec_pool);
+ mempool_destroy(pblk->r_rq_pool);
+ mempool_destroy(pblk->w_rq_pool);
+ mempool_destroy(pblk->line_meta_pool);
+
+ kmem_cache_destroy(pblk_blk_ws_cache);
+ kmem_cache_destroy(pblk_rec_cache);
+ kmem_cache_destroy(pblk_r_rq_cache);
+ kmem_cache_destroy(pblk_w_rq_cache);
+ kmem_cache_destroy(pblk_line_meta_cache);
+}
+
+static void pblk_luns_free(struct pblk *pblk)
+{
+ kfree(pblk->luns);
+}
+
+static void pblk_lines_free(struct pblk *pblk)
+{
+ struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+ struct pblk_line *line;
+ int i;
+
+ spin_lock(&l_mg->free_lock);
+ for (i = 0; i < l_mg->nr_lines; i++) {
+ line = &pblk->lines[i];
+
+ pblk_line_free(pblk, line);
+ kfree(line->blk_bitmap);
+ kfree(line->erase_bitmap);
+ }
+ spin_unlock(&l_mg->free_lock);
+}
+
+static void pblk_line_meta_free(struct pblk *pblk)
+{
+ struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+ int i;
+
+ kfree(l_mg->bb_template);
+ kfree(l_mg->bb_aux);
+
+ for (i = 0; i < PBLK_DATA_LINES; i++) {
+ pblk_mfree(l_mg->sline_meta[i].meta, l_mg->smeta_alloc_type);
+ pblk_mfree(l_mg->eline_meta[i].meta, l_mg->emeta_alloc_type);
+ }
+
+ kfree(pblk->lines);
+}
+
+static int pblk_bb_discovery(struct nvm_tgt_dev *dev, struct pblk_lun *rlun)
+{
+ struct nvm_geo *geo = &dev->geo;
+ struct ppa_addr ppa;
+ u8 *blks;
+ int nr_blks, ret;
+
+ nr_blks = geo->blks_per_lun * geo->plane_mode;
+ blks = kmalloc(nr_blks, GFP_KERNEL);
+ if (!blks)
+ return -ENOMEM;
+
+ ppa.ppa = 0;
+ ppa.g.ch = rlun->bppa.g.ch;
+ ppa.g.lun = rlun->bppa.g.lun;
+
+ ret = nvm_get_tgt_bb_tbl(dev, ppa, blks);
+ if (ret)
+ goto out;
+
+ nr_blks = nvm_bb_tbl_fold(dev->parent, blks, nr_blks);
+ if (nr_blks < 0) {
+ ret = nr_blks;
+ goto out;
+ }
+
+ rlun->bb_list = blks;
+
+ return 0;
+out:
+ kfree(blks);
+ return ret;
+}
+
+static int pblk_bb_line(struct pblk *pblk, struct pblk_line *line)
+{
+ struct pblk_line_meta *lm = &pblk->lm;
+ struct pblk_lun *rlun;
+ int bb_cnt = 0;
+ int i;
+
+ line->blk_bitmap = kzalloc(lm->blk_bitmap_len, GFP_KERNEL);
+ if (!line->blk_bitmap)
+ return -ENOMEM;
+
+ line->erase_bitmap = kzalloc(lm->blk_bitmap_len, GFP_KERNEL);
+ if (!line->erase_bitmap) {
+ kfree(line->blk_bitmap);
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < lm->blk_per_line; i++) {
+ rlun = &pblk->luns[i];
+ if (rlun->bb_list[line->id] == NVM_BLK_T_FREE)
+ continue;
+
+ set_bit(i, line->blk_bitmap);
+ bb_cnt++;
+ }
+
+ return bb_cnt;
+}
+
+static int pblk_luns_init(struct pblk *pblk, struct ppa_addr *luns)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct nvm_geo *geo = &dev->geo;
+ struct pblk_lun *rlun;
+ int i, ret;
+
+ /* TODO: Implement unbalanced LUN support */
+ if (geo->luns_per_chnl < 0) {
+ pr_err("pblk: unbalanced LUN config.\n");
+ return -EINVAL;
+ }
+
+ pblk->luns = kcalloc(geo->nr_luns, sizeof(struct pblk_lun), GFP_KERNEL);
+ if (!pblk->luns)
+ return -ENOMEM;
+
+ for (i = 0; i < geo->nr_luns; i++) {
+ /* Stripe across channels */
+ int ch = i % geo->nr_chnls;
+ int lun_raw = i / geo->nr_chnls;
+ int lunid = lun_raw + ch * geo->luns_per_chnl;
+
+ rlun = &pblk->luns[i];
+ rlun->bppa = luns[lunid];
+
+ sema_init(&rlun->wr_sem, 1);
+
+ ret = pblk_bb_discovery(dev, rlun);
+ if (ret) {
+ while (--i >= 0)
+ kfree(pblk->luns[i].bb_list);
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+static int pblk_lines_configure(struct pblk *pblk, int flags)
+{
+ struct pblk_line *line = NULL;
+ int ret = 0;
+
+ if (!(flags & NVM_TARGET_FACTORY)) {
+ line = pblk_recov_l2p(pblk);
+ if (IS_ERR(line)) {
+ pr_err("pblk: could not recover l2p table\n");
+ ret = -EFAULT;
+ }
+ }
+
+ if (!line) {
+ /* Configure next line for user data */
+ line = pblk_line_get_first_data(pblk);
+ if (!line) {
+ pr_err("pblk: line list corrupted\n");
+ ret = -EFAULT;
+ }
+ }
+
+ return ret;
+}
+
+/* See comment over struct line_emeta definition */
+static unsigned int calc_emeta_len(struct pblk *pblk, struct pblk_line_meta *lm)
+{
+ return (sizeof(struct line_emeta) +
+ ((lm->sec_per_line - lm->emeta_sec) * sizeof(u64)) +
+ (pblk->l_mg.nr_lines * sizeof(u32)) +
+ lm->blk_bitmap_len);
+}
+
+static void pblk_set_provision(struct pblk *pblk, long nr_free_blks)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct nvm_geo *geo = &dev->geo;
+ sector_t provisioned;
+
+ pblk->over_pct = 20;
+
+ provisioned = nr_free_blks;
+ provisioned *= (100 - pblk->over_pct);
+ sector_div(provisioned, 100);
+
+ /* Internally pblk manages all free blocks, but all calculations based
+ * on user capacity consider only provisioned blocks
+ */
+ pblk->rl.total_blocks = nr_free_blks;
+ pblk->rl.nr_secs = nr_free_blks * geo->sec_per_blk;
+ pblk->capacity = provisioned * geo->sec_per_blk;
+ atomic_set(&pblk->rl.free_blocks, nr_free_blks);
+}
+
+static int pblk_lines_init(struct pblk *pblk)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct nvm_geo *geo = &dev->geo;
+ struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+ struct pblk_line_meta *lm = &pblk->lm;
+ struct pblk_line *line;
+ unsigned int smeta_len, emeta_len;
+ long nr_bad_blks, nr_meta_blks, nr_free_blks;
+ int bb_distance;
+ int i;
+ int ret;
+
+ lm->sec_per_line = geo->sec_per_blk * geo->nr_luns;
+ lm->blk_per_line = geo->nr_luns;
+ lm->blk_bitmap_len = BITS_TO_LONGS(geo->nr_luns) * sizeof(long);
+ lm->sec_bitmap_len = BITS_TO_LONGS(lm->sec_per_line) * sizeof(long);
+ lm->lun_bitmap_len = BITS_TO_LONGS(geo->nr_luns) * sizeof(long);
+ lm->high_thrs = lm->sec_per_line / 2;
+ lm->mid_thrs = lm->sec_per_line / 4;
+
+ /* Calculate necessary pages for smeta. See comment over struct
+ * line_smeta definition
+ */
+ lm->smeta_len = sizeof(struct line_smeta) +
+ PBLK_LINE_NR_LUN_BITMAP * lm->lun_bitmap_len;
+
+ i = 1;
+add_smeta_page:
+ lm->smeta_sec = i * geo->sec_per_pl;
+ lm->smeta_len = lm->smeta_sec * geo->sec_size;
+
+ smeta_len = sizeof(struct line_smeta) +
+ PBLK_LINE_NR_LUN_BITMAP * lm->lun_bitmap_len;
+ if (smeta_len > lm->smeta_len) {
+ i++;
+ goto add_smeta_page;
+ }
+
+ /* Calculate necessary pages for emeta. See comment over struct
+ * line_emeta definition
+ */
+ i = 1;
+add_emeta_page:
+ lm->emeta_sec = i * geo->sec_per_pl;
+ lm->emeta_len = lm->emeta_sec * geo->sec_size;
+
+ emeta_len = calc_emeta_len(pblk, lm);
+ if (emeta_len > lm->emeta_len) {
+ i++;
+ goto add_emeta_page;
+ }
+ lm->emeta_bb = geo->nr_luns - i;
+
+ nr_meta_blks = (lm->smeta_sec + lm->emeta_sec +
+ (geo->sec_per_blk / 2)) / geo->sec_per_blk;
+ lm->min_blk_line = nr_meta_blks + 1;
+
+ l_mg->nr_lines = geo->blks_per_lun;
+ l_mg->log_line = l_mg->data_line = NULL;
+ l_mg->l_seq_nr = l_mg->d_seq_nr = 0;
+ l_mg->nr_free_lines = 0;
+ bitmap_zero(&l_mg->meta_bitmap, PBLK_DATA_LINES);
+
+ /* smeta is always small enough to fit on a kmalloc memory allocation,
+ * emeta depends on the number of LUNs allocated to the pblk instance
+ */
+ l_mg->smeta_alloc_type = PBLK_KMALLOC_META;
+ for (i = 0; i < PBLK_DATA_LINES; i++) {
+ l_mg->sline_meta[i].meta = kmalloc(lm->smeta_len, GFP_KERNEL);
+ if (!l_mg->sline_meta[i].meta)
+ while (--i >= 0) {
+ kfree(l_mg->sline_meta[i].meta);
+ ret = -ENOMEM;
+ goto fail;
+ }
+ }
+
+ if (lm->emeta_len > KMALLOC_MAX_CACHE_SIZE) {
+ l_mg->emeta_alloc_type = PBLK_VMALLOC_META;
+
+ for (i = 0; i < PBLK_DATA_LINES; i++) {
+ l_mg->eline_meta[i].meta = vmalloc(lm->emeta_len);
+ if (!l_mg->eline_meta[i].meta)
+ while (--i >= 0) {
+ vfree(l_mg->eline_meta[i].meta);
+ ret = -ENOMEM;
+ goto fail;
+ }
+ }
+ } else {
+ l_mg->emeta_alloc_type = PBLK_KMALLOC_META;
+
+ for (i = 0; i < PBLK_DATA_LINES; i++) {
+ l_mg->eline_meta[i].meta =
+ kmalloc(lm->emeta_len, GFP_KERNEL);
+ if (!l_mg->eline_meta[i].meta)
+ while (--i >= 0) {
+ kfree(l_mg->eline_meta[i].meta);
+ ret = -ENOMEM;
+ goto fail;
+ }
+ }
+ }
+
+ l_mg->bb_template = kzalloc(lm->sec_bitmap_len, GFP_KERNEL);
+ if (!l_mg->bb_template) {
+ ret = -ENOMEM;
+ goto fail_free_meta;
+ }
+
+ l_mg->bb_aux = kzalloc(lm->sec_bitmap_len, GFP_KERNEL);
+ if (!l_mg->bb_aux) {
+ ret = -ENOMEM;
+ goto fail_free_bb_template;
+ }
+
+ bb_distance = (geo->nr_luns) * geo->sec_per_pl;
+ for (i = 0; i < lm->sec_per_line; i += bb_distance)
+ bitmap_set(l_mg->bb_template, i, geo->sec_per_pl);
+
+ INIT_LIST_HEAD(&l_mg->free_list);
+ INIT_LIST_HEAD(&l_mg->corrupt_list);
+ INIT_LIST_HEAD(&l_mg->bad_list);
+ INIT_LIST_HEAD(&l_mg->gc_full_list);
+ INIT_LIST_HEAD(&l_mg->gc_high_list);
+ INIT_LIST_HEAD(&l_mg->gc_mid_list);
+ INIT_LIST_HEAD(&l_mg->gc_low_list);
+ INIT_LIST_HEAD(&l_mg->gc_empty_list);
+
+ l_mg->gc_lists[0] = &l_mg->gc_high_list;
+ l_mg->gc_lists[1] = &l_mg->gc_mid_list;
+ l_mg->gc_lists[2] = &l_mg->gc_low_list;
+
+ spin_lock_init(&l_mg->free_lock);
+ spin_lock_init(&l_mg->gc_lock);
+
+ pblk->lines = kcalloc(l_mg->nr_lines, sizeof(struct pblk_line),
+ GFP_KERNEL);
+ if (!pblk->lines) {
+ ret = -ENOMEM;
+ goto fail_free_bb_aux;
+ }
+
+ nr_free_blks = 0;
+ for (i = 0; i < l_mg->nr_lines; i++) {
+ int blk_in_line;
+
+ line = &pblk->lines[i];
+
+ line->pblk = pblk;
+ line->id = i;
+ line->type = PBLK_LINETYPE_FREE;
+ line->state = PBLK_LINESTATE_FREE;
+ line->gc_group = PBLK_LINEGC_NONE;
+ spin_lock_init(&line->lock);
+
+ nr_bad_blks = pblk_bb_line(pblk, line);
+ if (nr_bad_blks < 0 || nr_bad_blks > lm->blk_per_line) {
+ ret = -EINVAL;
+ goto fail_free_lines;
+ }
+
+ blk_in_line = lm->blk_per_line - nr_bad_blks;
+ if (blk_in_line < lm->min_blk_line) {
+ line->state = PBLK_LINESTATE_BAD;
+ list_add_tail(&line->list, &l_mg->bad_list);
+ continue;
+ }
+
+ nr_free_blks += blk_in_line;
+ atomic_set(&line->blk_in_line, blk_in_line);
+
+ l_mg->nr_free_lines++;
+ list_add_tail(&line->list, &l_mg->free_list);
+ }
+
+ pblk_set_provision(pblk, nr_free_blks);
+
+ sema_init(&pblk->erase_sem, 1);
+
+ /* Cleanup per-LUN bad block lists - managed within lines on run-time */
+ for (i = 0; i < geo->nr_luns; i++)
+ kfree(pblk->luns[i].bb_list);
+
+ return 0;
+fail_free_lines:
+ kfree(pblk->lines);
+fail_free_bb_aux:
+ kfree(l_mg->bb_aux);
+fail_free_bb_template:
+ kfree(l_mg->bb_template);
+fail_free_meta:
+ for (i = 0; i < PBLK_DATA_LINES; i++) {
+ pblk_mfree(l_mg->sline_meta[i].meta, l_mg->smeta_alloc_type);
+ pblk_mfree(l_mg->eline_meta[i].meta, l_mg->emeta_alloc_type);
+ }
+fail:
+ for (i = 0; i < geo->nr_luns; i++)
+ kfree(pblk->luns[i].bb_list);
+
+ return ret;
+}
+
+static int pblk_writer_init(struct pblk *pblk)
+{
+ setup_timer(&pblk->wtimer, pblk_write_timer_fn, (unsigned long)pblk);
+ mod_timer(&pblk->wtimer, jiffies + msecs_to_jiffies(100));
+
+ pblk->writer_ts = kthread_create(pblk_write_ts, pblk, "pblk-writer-t");
+ if (IS_ERR(pblk->writer_ts)) {
+ pr_err("pblk: could not allocate writer kthread\n");
+ return PTR_ERR(pblk->writer_ts);
+ }
+
+ return 0;
+}
+
+static void pblk_writer_stop(struct pblk *pblk)
+{
+ if (pblk->writer_ts)
+ kthread_stop(pblk->writer_ts);
+ del_timer(&pblk->wtimer);
+}
+
+static void pblk_free(struct pblk *pblk)
+{
+ pblk_luns_free(pblk);
+ pblk_lines_free(pblk);
+ pblk_line_meta_free(pblk);
+ pblk_core_free(pblk);
+ pblk_l2p_free(pblk);
+
+ kfree(pblk);
+}
+
+static void pblk_tear_down(struct pblk *pblk)
+{
+ pblk_flush_writer(pblk);
+ pblk_writer_stop(pblk);
+ pblk_rb_sync_l2p(&pblk->rwb);
+ pblk_recov_pad(pblk);
+ pblk_rwb_free(pblk);
+ pblk_rl_free(&pblk->rl);
+
+ pr_debug("pblk: consistent tear down\n");
+}
+
+static void pblk_exit(void *private)
+{
+ struct pblk *pblk = private;
+
+ down_write(&pblk_lock);
+ pblk_gc_exit(pblk);
+ pblk_tear_down(pblk);
+ pblk_free(pblk);
+ up_write(&pblk_lock);
+}
+
+static sector_t pblk_capacity(void *private)
+{
+ struct pblk *pblk = private;
+
+ return pblk->capacity * NR_PHY_IN_LOG;
+}
+
+static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk,
+ int flags)
+{
+ struct nvm_geo *geo = &dev->geo;
+ struct request_queue *bqueue = dev->q;
+ struct request_queue *tqueue = tdisk->queue;
+ struct pblk *pblk;
+ int ret;
+
+ if (dev->identity.dom & NVM_RSP_L2P) {
+ pr_err("pblk: device-side L2P table not supported. (%x)\n",
+ dev->identity.dom);
+ return ERR_PTR(-EINVAL);
+ }
+
+ pblk = kzalloc(sizeof(struct pblk), GFP_KERNEL);
+ if (!pblk)
+ return ERR_PTR(-ENOMEM);
+
+ pblk->dev = dev;
+ pblk->disk = tdisk;
+
+ spin_lock_init(&pblk->trans_lock);
+ spin_lock_init(&pblk->lock);
+
+ if (flags & NVM_TARGET_FACTORY)
+ pblk_setup_uuid(pblk);
+
+#ifdef CONFIG_NVM_DEBUG
+ atomic_long_set(&pblk->inflight_writes, 0);
+ atomic_long_set(&pblk->padded_writes, 0);
+ atomic_long_set(&pblk->padded_wb, 0);
+ atomic_long_set(&pblk->nr_flush, 0);
+ atomic_long_set(&pblk->req_writes, 0);
+ atomic_long_set(&pblk->sub_writes, 0);
+ atomic_long_set(&pblk->sync_writes, 0);
+ atomic_long_set(&pblk->compl_writes, 0);
+ atomic_long_set(&pblk->inflight_reads, 0);
+ atomic_long_set(&pblk->sync_reads, 0);
+ atomic_long_set(&pblk->recov_writes, 0);
+ atomic_long_set(&pblk->recov_writes, 0);
+ atomic_long_set(&pblk->recov_gc_writes, 0);
+#endif
+
+ atomic_long_set(&pblk->read_failed, 0);
+ atomic_long_set(&pblk->read_empty, 0);
+ atomic_long_set(&pblk->read_high_ecc, 0);
+ atomic_long_set(&pblk->read_failed_gc, 0);
+ atomic_long_set(&pblk->write_failed, 0);
+ atomic_long_set(&pblk->erase_failed, 0);
+
+ ret = pblk_luns_init(pblk, dev->luns);
+ if (ret) {
+ pr_err("pblk: could not initialize luns\n");
+ goto fail;
+ }
+
+ ret = pblk_lines_init(pblk);
+ if (ret) {
+ pr_err("pblk: could not initialize lines\n");
+ goto fail_free_luns;
+ }
+
+ ret = pblk_core_init(pblk);
+ if (ret) {
+ pr_err("pblk: could not initialize core\n");
+ goto fail_free_line_meta;
+ }
+
+ ret = pblk_l2p_init(pblk);
+ if (ret) {
+ pr_err("pblk: could not initialize maps\n");
+ goto fail_free_core;
+ }
+
+ ret = pblk_lines_configure(pblk, flags);
+ if (ret) {
+ pr_err("pblk: could not configure lines\n");
+ goto fail_free_l2p;
+ }
+
+ ret = pblk_writer_init(pblk);
+ if (ret) {
+ pr_err("pblk: could not initialize write thread\n");
+ goto fail_free_lines;
+ }
+
+ ret = pblk_gc_init(pblk);
+ if (ret) {
+ pr_err("pblk: could not initialize gc\n");
+ goto fail_stop_writer;
+ }
+
+ /* inherit the size from the underlying device */
+ blk_queue_logical_block_size(tqueue, queue_physical_block_size(bqueue));
+ blk_queue_max_hw_sectors(tqueue, queue_max_hw_sectors(bqueue));
+
+ blk_queue_write_cache(tqueue, true, false);
+
+ tqueue->limits.discard_granularity = geo->pgs_per_blk * geo->pfpg_size;
+ tqueue->limits.discard_alignment = 0;
+ blk_queue_max_discard_sectors(tqueue, UINT_MAX >> 9);
+ queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, tqueue);
+
+ pr_info("pblk init: luns:%u, lines:%d, secs:%llu, buf entries:%u\n",
+ geo->nr_luns, pblk->l_mg.nr_lines,
+ (unsigned long long)pblk->rl.nr_secs,
+ pblk->rwb.nr_entries);
+
+ wake_up_process(pblk->writer_ts);
+ return pblk;
+
+fail_stop_writer:
+ pblk_writer_stop(pblk);
+fail_free_lines:
+ pblk_lines_free(pblk);
+fail_free_l2p:
+ pblk_l2p_free(pblk);
+fail_free_core:
+ pblk_core_free(pblk);
+fail_free_line_meta:
+ pblk_line_meta_free(pblk);
+fail_free_luns:
+ pblk_luns_free(pblk);
+fail:
+ kfree(pblk);
+ return ERR_PTR(ret);
+}
+
+/* physical block device target */
+static struct nvm_tgt_type tt_pblk = {
+ .name = "pblk",
+ .version = {1, 0, 0},
+
+ .make_rq = pblk_make_rq,
+ .capacity = pblk_capacity,
+
+ .init = pblk_init,
+ .exit = pblk_exit,
+
+ .sysfs_init = pblk_sysfs_init,
+ .sysfs_exit = pblk_sysfs_exit,
+};
+
+static int __init pblk_module_init(void)
+{
+ return nvm_register_tgt_type(&tt_pblk);
+}
+
+static void pblk_module_exit(void)
+{
+ nvm_unregister_tgt_type(&tt_pblk);
+}
+
+module_init(pblk_module_init);
+module_exit(pblk_module_exit);
+MODULE_AUTHOR("Javier Gonzalez <javier@cnexlabs.com>");
+MODULE_AUTHOR("Matias Bjorling <matias@cnexlabs.com>");
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("Physical Block-Device for Open-Channel SSDs");
diff --git a/drivers/lightnvm/pblk-map.c b/drivers/lightnvm/pblk-map.c
new file mode 100644
index 000000000000..17c16955284d
--- /dev/null
+++ b/drivers/lightnvm/pblk-map.c
@@ -0,0 +1,136 @@
+/*
+ * Copyright (C) 2016 CNEX Labs
+ * Initial release: Javier Gonzalez <javier@cnexlabs.com>
+ * Matias Bjorling <matias@cnexlabs.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * pblk-map.c - pblk's lba-ppa mapping strategy
+ *
+ */
+
+#include "pblk.h"
+
+static void pblk_map_page_data(struct pblk *pblk, unsigned int sentry,
+ struct ppa_addr *ppa_list,
+ unsigned long *lun_bitmap,
+ struct pblk_sec_meta *meta_list,
+ unsigned int valid_secs)
+{
+ struct pblk_line *line = pblk_line_get_data(pblk);
+ struct line_emeta *emeta = line->emeta;
+ struct pblk_w_ctx *w_ctx;
+ __le64 *lba_list = pblk_line_emeta_to_lbas(emeta);
+ u64 paddr;
+ int nr_secs = pblk->min_write_pgs;
+ int i;
+
+ paddr = pblk_alloc_page(pblk, line, nr_secs);
+
+ for (i = 0; i < nr_secs; i++, paddr++) {
+ /* ppa to be sent to the device */
+ ppa_list[i] = addr_to_gen_ppa(pblk, paddr, line->id);
+
+ /* Write context for target bio completion on write buffer. Note
+ * that the write buffer is protected by the sync backpointer,
+ * and a single writer thread have access to each specific entry
+ * at a time. Thus, it is safe to modify the context for the
+ * entry we are setting up for submission without taking any
+ * lock or memory barrier.
+ */
+ if (i < valid_secs) {
+ kref_get(&line->ref);
+ w_ctx = pblk_rb_w_ctx(&pblk->rwb, sentry + i);
+ w_ctx->ppa = ppa_list[i];
+ meta_list[i].lba = cpu_to_le64(w_ctx->lba);
+ lba_list[paddr] = cpu_to_le64(w_ctx->lba);
+ le64_add_cpu(&line->emeta->nr_valid_lbas, 1);
+ } else {
+ meta_list[i].lba = cpu_to_le64(ADDR_EMPTY);
+ lba_list[paddr] = cpu_to_le64(ADDR_EMPTY);
+ pblk_map_pad_invalidate(pblk, line, paddr);
+ }
+ }
+
+ if (pblk_line_is_full(line)) {
+ line = pblk_line_replace_data(pblk);
+ if (!line)
+ return;
+ }
+
+ pblk_down_rq(pblk, ppa_list, nr_secs, lun_bitmap);
+}
+
+void pblk_map_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int sentry,
+ unsigned long *lun_bitmap, unsigned int valid_secs,
+ unsigned int off)
+{
+ struct pblk_sec_meta *meta_list = rqd->meta_list;
+ unsigned int map_secs;
+ int min = pblk->min_write_pgs;
+ int i;
+
+ for (i = off; i < rqd->nr_ppas; i += min) {
+ map_secs = (i + min > valid_secs) ? (valid_secs % min) : min;
+ pblk_map_page_data(pblk, sentry + i, &rqd->ppa_list[i],
+ lun_bitmap, &meta_list[i], map_secs);
+ }
+}
+
+/* only if erase_ppa is set, acquire erase semaphore */
+void pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd,
+ unsigned int sentry, unsigned long *lun_bitmap,
+ unsigned int valid_secs, struct ppa_addr *erase_ppa)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct nvm_geo *geo = &dev->geo;
+ struct pblk_line *e_line = pblk_line_get_data_next(pblk);
+ struct pblk_sec_meta *meta_list = rqd->meta_list;
+ unsigned int map_secs;
+ int min = pblk->min_write_pgs;
+ int i, erase_lun;
+
+ for (i = 0; i < rqd->nr_ppas; i += min) {
+ map_secs = (i + min > valid_secs) ? (valid_secs % min) : min;
+ pblk_map_page_data(pblk, sentry + i, &rqd->ppa_list[i],
+ lun_bitmap, &meta_list[i], map_secs);
+
+ erase_lun = rqd->ppa_list[i].g.lun * geo->nr_chnls +
+ rqd->ppa_list[i].g.ch;
+
+ if (!test_bit(erase_lun, e_line->erase_bitmap)) {
+ if (down_trylock(&pblk->erase_sem))
+ continue;
+
+ set_bit(erase_lun, e_line->erase_bitmap);
+ atomic_dec(&e_line->left_eblks);
+ *erase_ppa = rqd->ppa_list[i];
+ erase_ppa->g.blk = e_line->id;
+
+ /* Avoid evaluating e_line->left_eblks */
+ return pblk_map_rq(pblk, rqd, sentry, lun_bitmap,
+ valid_secs, i + min);
+ }
+ }
+
+ /* Erase blocks that are bad in this line but might not be in next */
+ if (unlikely(ppa_empty(*erase_ppa))) {
+ struct pblk_line_meta *lm = &pblk->lm;
+
+ i = find_first_zero_bit(e_line->erase_bitmap, lm->blk_per_line);
+ if (i == lm->blk_per_line)
+ return;
+
+ set_bit(i, e_line->erase_bitmap);
+ atomic_dec(&e_line->left_eblks);
+ *erase_ppa = pblk->luns[i].bppa; /* set ch and lun */
+ erase_ppa->g.blk = e_line->id;
+ }
+}
diff --git a/drivers/lightnvm/pblk-rb.c b/drivers/lightnvm/pblk-rb.c
new file mode 100644
index 000000000000..045384ddc1f9
--- /dev/null
+++ b/drivers/lightnvm/pblk-rb.c
@@ -0,0 +1,852 @@
+/*
+ * Copyright (C) 2016 CNEX Labs
+ * Initial release: Javier Gonzalez <javier@cnexlabs.com>
+ *
+ * Based upon the circular ringbuffer.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * pblk-rb.c - pblk's write buffer
+ */
+
+#include <linux/circ_buf.h>
+
+#include "pblk.h"
+
+static DECLARE_RWSEM(pblk_rb_lock);
+
+void pblk_rb_data_free(struct pblk_rb *rb)
+{
+ struct pblk_rb_pages *p, *t;
+
+ down_write(&pblk_rb_lock);
+ list_for_each_entry_safe(p, t, &rb->pages, list) {
+ free_pages((unsigned long)page_address(p->pages), p->order);
+ list_del(&p->list);
+ kfree(p);
+ }
+ up_write(&pblk_rb_lock);
+}
+
+/*
+ * Initialize ring buffer. The data and metadata buffers must be previously
+ * allocated and their size must be a power of two
+ * (Documentation/circular-buffers.txt)
+ */
+int pblk_rb_init(struct pblk_rb *rb, struct pblk_rb_entry *rb_entry_base,
+ unsigned int power_size, unsigned int power_seg_sz)
+{
+ struct pblk *pblk = container_of(rb, struct pblk, rwb);
+ unsigned int init_entry = 0;
+ unsigned int alloc_order = power_size;
+ unsigned int max_order = MAX_ORDER - 1;
+ unsigned int order, iter;
+
+ down_write(&pblk_rb_lock);
+ rb->entries = rb_entry_base;
+ rb->seg_size = (1 << power_seg_sz);
+ rb->nr_entries = (1 << power_size);
+ rb->mem = rb->subm = rb->sync = rb->l2p_update = 0;
+ rb->sync_point = EMPTY_ENTRY;
+
+ spin_lock_init(&rb->w_lock);
+ spin_lock_init(&rb->s_lock);
+
+ INIT_LIST_HEAD(&rb->pages);
+
+ if (alloc_order >= max_order) {
+ order = max_order;
+ iter = (1 << (alloc_order - max_order));
+ } else {
+ order = alloc_order;
+ iter = 1;
+ }
+
+ do {
+ struct pblk_rb_entry *entry;
+ struct pblk_rb_pages *page_set;
+ void *kaddr;
+ unsigned long set_size;
+ int i;
+
+ page_set = kmalloc(sizeof(struct pblk_rb_pages), GFP_KERNEL);
+ if (!page_set) {
+ up_write(&pblk_rb_lock);
+ return -ENOMEM;
+ }
+
+ page_set->order = order;
+ page_set->pages = alloc_pages(GFP_KERNEL, order);
+ if (!page_set->pages) {
+ kfree(page_set);
+ pblk_rb_data_free(rb);
+ up_write(&pblk_rb_lock);
+ return -ENOMEM;
+ }
+ kaddr = page_address(page_set->pages);
+
+ entry = &rb->entries[init_entry];
+ entry->data = kaddr;
+ entry->cacheline = pblk_cacheline_to_addr(init_entry++);
+ entry->w_ctx.flags = PBLK_WRITABLE_ENTRY;
+
+ set_size = (1 << order);
+ for (i = 1; i < set_size; i++) {
+ entry = &rb->entries[init_entry];
+ entry->cacheline = pblk_cacheline_to_addr(init_entry++);
+ entry->data = kaddr + (i * rb->seg_size);
+ entry->w_ctx.flags = PBLK_WRITABLE_ENTRY;
+ bio_list_init(&entry->w_ctx.bios);
+ }
+
+ list_add_tail(&page_set->list, &rb->pages);
+ iter--;
+ } while (iter > 0);
+ up_write(&pblk_rb_lock);
+
+#ifdef CONFIG_NVM_DEBUG
+ atomic_set(&rb->inflight_sync_point, 0);
+#endif
+
+ /*
+ * Initialize rate-limiter, which controls access to the write buffer
+ * but user and GC I/O
+ */
+ pblk_rl_init(&pblk->rl, rb->nr_entries);
+
+ return 0;
+}
+
+/*
+ * pblk_rb_calculate_size -- calculate the size of the write buffer
+ */
+unsigned int pblk_rb_calculate_size(unsigned int nr_entries)
+{
+ /* Alloc a write buffer that can at least fit 128 entries */
+ return (1 << max(get_count_order(nr_entries), 7));
+}
+
+void *pblk_rb_entries_ref(struct pblk_rb *rb)
+{
+ return rb->entries;
+}
+
+static void clean_wctx(struct pblk_w_ctx *w_ctx)
+{
+ int flags;
+
+try:
+ flags = READ_ONCE(w_ctx->flags);
+ if (!(flags & PBLK_SUBMITTED_ENTRY))
+ goto try;
+
+ /* Release flags on context. Protect from writes and reads */
+ smp_store_release(&w_ctx->flags, PBLK_WRITABLE_ENTRY);
+ pblk_ppa_set_empty(&w_ctx->ppa);
+}
+
+#define pblk_rb_ring_count(head, tail, size) CIRC_CNT(head, tail, size)
+#define pblk_rb_ring_space(rb, head, tail, size) \
+ (CIRC_SPACE(head, tail, size))
+
+/*
+ * Buffer space is calculated with respect to the back pointer signaling
+ * synchronized entries to the media.
+ */
+static unsigned int pblk_rb_space(struct pblk_rb *rb)
+{
+ unsigned int mem = READ_ONCE(rb->mem);
+ unsigned int sync = READ_ONCE(rb->sync);
+
+ return pblk_rb_ring_space(rb, mem, sync, rb->nr_entries);
+}
+
+/*
+ * Buffer count is calculated with respect to the submission entry signaling the
+ * entries that are available to send to the media
+ */
+unsigned int pblk_rb_read_count(struct pblk_rb *rb)
+{
+ unsigned int mem = READ_ONCE(rb->mem);
+ unsigned int subm = READ_ONCE(rb->subm);
+
+ return pblk_rb_ring_count(mem, subm, rb->nr_entries);
+}
+
+unsigned int pblk_rb_read_commit(struct pblk_rb *rb, unsigned int nr_entries)
+{
+ unsigned int subm;
+
+ subm = READ_ONCE(rb->subm);
+ /* Commit read means updating submission pointer */
+ smp_store_release(&rb->subm,
+ (subm + nr_entries) & (rb->nr_entries - 1));
+
+ return subm;
+}
+
+static int __pblk_rb_update_l2p(struct pblk_rb *rb, unsigned int *l2p_upd,
+ unsigned int to_update)
+{
+ struct pblk *pblk = container_of(rb, struct pblk, rwb);
+ struct pblk_line *line;
+ struct pblk_rb_entry *entry;
+ struct pblk_w_ctx *w_ctx;
+ unsigned int i;
+
+ for (i = 0; i < to_update; i++) {
+ entry = &rb->entries[*l2p_upd];
+ w_ctx = &entry->w_ctx;
+
+ pblk_update_map_dev(pblk, w_ctx->lba, w_ctx->ppa,
+ entry->cacheline);
+
+ line = &pblk->lines[pblk_tgt_ppa_to_line(w_ctx->ppa)];
+ kref_put(&line->ref, pblk_line_put);
+ clean_wctx(w_ctx);
+ *l2p_upd = (*l2p_upd + 1) & (rb->nr_entries - 1);
+ }
+
+ return 0;
+}
+
+/*
+ * When we move the l2p_update pointer, we update the l2p table - lookups will
+ * point to the physical address instead of to the cacheline in the write buffer
+ * from this moment on.
+ */
+static int pblk_rb_update_l2p(struct pblk_rb *rb, unsigned int nr_entries,
+ unsigned int mem, unsigned int sync)
+{
+ unsigned int space, count;
+ int ret = 0;
+
+ lockdep_assert_held(&rb->w_lock);
+
+ /* Update l2p only as buffer entries are being overwritten */
+ space = pblk_rb_ring_space(rb, mem, rb->l2p_update, rb->nr_entries);
+ if (space > nr_entries)
+ goto out;
+
+ count = nr_entries - space;
+ /* l2p_update used exclusively under rb->w_lock */
+ ret = __pblk_rb_update_l2p(rb, &rb->l2p_update, count);
+
+out:
+ return ret;
+}
+
+/*
+ * Update the l2p entry for all sectors stored on the write buffer. This means
+ * that all future lookups to the l2p table will point to a device address, not
+ * to the cacheline in the write buffer.
+ */
+void pblk_rb_sync_l2p(struct pblk_rb *rb)
+{
+ unsigned int sync;
+ unsigned int to_update;
+
+ spin_lock(&rb->w_lock);
+
+ /* Protect from reads and writes */
+ sync = smp_load_acquire(&rb->sync);
+
+ to_update = pblk_rb_ring_count(sync, rb->l2p_update, rb->nr_entries);
+ __pblk_rb_update_l2p(rb, &rb->l2p_update, to_update);
+
+ spin_unlock(&rb->w_lock);
+}
+
+/*
+ * Write @nr_entries to ring buffer from @data buffer if there is enough space.
+ * Typically, 4KB data chunks coming from a bio will be copied to the ring
+ * buffer, thus the write will fail if not all incoming data can be copied.
+ *
+ */
+static void __pblk_rb_write_entry(struct pblk_rb *rb, void *data,
+ struct pblk_w_ctx w_ctx,
+ struct pblk_rb_entry *entry)
+{
+ memcpy(entry->data, data, rb->seg_size);
+
+ entry->w_ctx.lba = w_ctx.lba;
+ entry->w_ctx.ppa = w_ctx.ppa;
+}
+
+void pblk_rb_write_entry_user(struct pblk_rb *rb, void *data,
+ struct pblk_w_ctx w_ctx, unsigned int ring_pos)
+{
+ struct pblk *pblk = container_of(rb, struct pblk, rwb);
+ struct pblk_rb_entry *entry;
+ int flags;
+
+ entry = &rb->entries[ring_pos];
+ flags = READ_ONCE(entry->w_ctx.flags);
+#ifdef CONFIG_NVM_DEBUG
+ /* Caller must guarantee that the entry is free */
+ BUG_ON(!(flags & PBLK_WRITABLE_ENTRY));
+#endif
+
+ __pblk_rb_write_entry(rb, data, w_ctx, entry);
+
+ pblk_update_map_cache(pblk, w_ctx.lba, entry->cacheline);
+ flags = w_ctx.flags | PBLK_WRITTEN_DATA;
+
+ /* Release flags on write context. Protect from writes */
+ smp_store_release(&entry->w_ctx.flags, flags);
+}
+
+void pblk_rb_write_entry_gc(struct pblk_rb *rb, void *data,
+ struct pblk_w_ctx w_ctx, struct pblk_line *gc_line,
+ unsigned int ring_pos)
+{
+ struct pblk *pblk = container_of(rb, struct pblk, rwb);
+ struct pblk_rb_entry *entry;
+ int flags;
+
+ entry = &rb->entries[ring_pos];
+ flags = READ_ONCE(entry->w_ctx.flags);
+#ifdef CONFIG_NVM_DEBUG
+ /* Caller must guarantee that the entry is free */
+ BUG_ON(!(flags & PBLK_WRITABLE_ENTRY));
+#endif
+
+ __pblk_rb_write_entry(rb, data, w_ctx, entry);
+
+ if (!pblk_update_map_gc(pblk, w_ctx.lba, entry->cacheline, gc_line))
+ entry->w_ctx.lba = ADDR_EMPTY;
+
+ flags = w_ctx.flags | PBLK_WRITTEN_DATA;
+
+ /* Release flags on write context. Protect from writes */
+ smp_store_release(&entry->w_ctx.flags, flags);
+}
+
+static int pblk_rb_sync_point_set(struct pblk_rb *rb, struct bio *bio,
+ unsigned int pos)
+{
+ struct pblk_rb_entry *entry;
+ unsigned int subm, sync_point;
+ int flags;
+
+ subm = READ_ONCE(rb->subm);
+
+#ifdef CONFIG_NVM_DEBUG
+ atomic_inc(&rb->inflight_sync_point);
+#endif
+
+ if (pos == subm)
+ return 0;
+
+ sync_point = (pos == 0) ? (rb->nr_entries - 1) : (pos - 1);
+ entry = &rb->entries[sync_point];
+
+ flags = READ_ONCE(entry->w_ctx.flags);
+ flags |= PBLK_FLUSH_ENTRY;
+
+ /* Release flags on context. Protect from writes */
+ smp_store_release(&entry->w_ctx.flags, flags);
+
+ /* Protect syncs */
+ smp_store_release(&rb->sync_point, sync_point);
+
+ spin_lock_irq(&rb->s_lock);
+ bio_list_add(&entry->w_ctx.bios, bio);
+ spin_unlock_irq(&rb->s_lock);
+
+ return 1;
+}
+
+static int __pblk_rb_may_write(struct pblk_rb *rb, unsigned int nr_entries,
+ unsigned int *pos)
+{
+ unsigned int mem;
+ unsigned int sync;
+
+ sync = READ_ONCE(rb->sync);
+ mem = READ_ONCE(rb->mem);
+
+ if (pblk_rb_ring_space(rb, mem, sync, rb->nr_entries) < nr_entries)
+ return 0;
+
+ if (pblk_rb_update_l2p(rb, nr_entries, mem, sync))
+ return 0;
+
+ *pos = mem;
+
+ return 1;
+}
+
+static int pblk_rb_may_write(struct pblk_rb *rb, unsigned int nr_entries,
+ unsigned int *pos)
+{
+ if (!__pblk_rb_may_write(rb, nr_entries, pos))
+ return 0;
+
+ /* Protect from read count */
+ smp_store_release(&rb->mem, (*pos + nr_entries) & (rb->nr_entries - 1));
+ return 1;
+}
+
+static int pblk_rb_may_write_flush(struct pblk_rb *rb, unsigned int nr_entries,
+ unsigned int *pos, struct bio *bio,
+ int *io_ret)
+{
+ unsigned int mem;
+
+ if (!__pblk_rb_may_write(rb, nr_entries, pos))
+ return 0;
+
+ mem = (*pos + nr_entries) & (rb->nr_entries - 1);
+ *io_ret = NVM_IO_DONE;
+
+ if (bio->bi_opf & REQ_PREFLUSH) {
+ struct pblk *pblk = container_of(rb, struct pblk, rwb);
+
+#ifdef CONFIG_NVM_DEBUG
+ atomic_long_inc(&pblk->nr_flush);
+#endif
+ if (pblk_rb_sync_point_set(&pblk->rwb, bio, mem))
+ *io_ret = NVM_IO_OK;
+ }
+
+ /* Protect from read count */
+ smp_store_release(&rb->mem, mem);
+ return 1;
+}
+
+/*
+ * Atomically check that (i) there is space on the write buffer for the
+ * incoming I/O, and (ii) the current I/O type has enough budget in the write
+ * buffer (rate-limiter).
+ */
+int pblk_rb_may_write_user(struct pblk_rb *rb, struct bio *bio,
+ unsigned int nr_entries, unsigned int *pos)
+{
+ struct pblk *pblk = container_of(rb, struct pblk, rwb);
+ int flush_done;
+
+ spin_lock(&rb->w_lock);
+ if (!pblk_rl_user_may_insert(&pblk->rl, nr_entries)) {
+ spin_unlock(&rb->w_lock);
+ return NVM_IO_REQUEUE;
+ }
+
+ if (!pblk_rb_may_write_flush(rb, nr_entries, pos, bio, &flush_done)) {
+ spin_unlock(&rb->w_lock);
+ return NVM_IO_REQUEUE;
+ }
+
+ pblk_rl_user_in(&pblk->rl, nr_entries);
+ spin_unlock(&rb->w_lock);
+
+ return flush_done;
+}
+
+/*
+ * Look at pblk_rb_may_write_user comment
+ */
+int pblk_rb_may_write_gc(struct pblk_rb *rb, unsigned int nr_entries,
+ unsigned int *pos)
+{
+ struct pblk *pblk = container_of(rb, struct pblk, rwb);
+
+ spin_lock(&rb->w_lock);
+ if (!pblk_rl_gc_may_insert(&pblk->rl, nr_entries)) {
+ spin_unlock(&rb->w_lock);
+ return 0;
+ }
+
+ if (!pblk_rb_may_write(rb, nr_entries, pos)) {
+ spin_unlock(&rb->w_lock);
+ return 0;
+ }
+
+ pblk_rl_gc_in(&pblk->rl, nr_entries);
+ spin_unlock(&rb->w_lock);
+
+ return 1;
+}
+
+/*
+ * The caller of this function must ensure that the backpointer will not
+ * overwrite the entries passed on the list.
+ */
+unsigned int pblk_rb_read_to_bio_list(struct pblk_rb *rb, struct bio *bio,
+ struct list_head *list,
+ unsigned int max)
+{
+ struct pblk_rb_entry *entry, *tentry;
+ struct page *page;
+ unsigned int read = 0;
+ int ret;
+
+ list_for_each_entry_safe(entry, tentry, list, index) {
+ if (read > max) {
+ pr_err("pblk: too many entries on list\n");
+ goto out;
+ }
+
+ page = virt_to_page(entry->data);
+ if (!page) {
+ pr_err("pblk: could not allocate write bio page\n");
+ goto out;
+ }
+
+ ret = bio_add_page(bio, page, rb->seg_size, 0);
+ if (ret != rb->seg_size) {
+ pr_err("pblk: could not add page to write bio\n");
+ goto out;
+ }
+
+ list_del(&entry->index);
+ read++;
+ }
+
+out:
+ return read;
+}
+
+/*
+ * Read available entries on rb and add them to the given bio. To avoid a memory
+ * copy, a page reference to the write buffer is used to be added to the bio.
+ *
+ * This function is used by the write thread to form the write bio that will
+ * persist data on the write buffer to the media.
+ */
+unsigned int pblk_rb_read_to_bio(struct pblk_rb *rb, struct bio *bio,
+ struct pblk_c_ctx *c_ctx,
+ unsigned int pos,
+ unsigned int nr_entries,
+ unsigned int count)
+{
+ struct pblk *pblk = container_of(rb, struct pblk, rwb);
+ struct pblk_rb_entry *entry;
+ struct page *page;
+ unsigned int pad = 0, read = 0, to_read = nr_entries;
+ unsigned int user_io = 0, gc_io = 0;
+ unsigned int i;
+ int flags;
+ int ret;
+
+ if (count < nr_entries) {
+ pad = nr_entries - count;
+ to_read = count;
+ }
+
+ c_ctx->sentry = pos;
+ c_ctx->nr_valid = to_read;
+ c_ctx->nr_padded = pad;
+
+ for (i = 0; i < to_read; i++) {
+ entry = &rb->entries[pos];
+
+ /* A write has been allowed into the buffer, but data is still
+ * being copied to it. It is ok to busy wait.
+ */
+try:
+ flags = READ_ONCE(entry->w_ctx.flags);
+ if (!(flags & PBLK_WRITTEN_DATA))
+ goto try;
+
+ if (flags & PBLK_IOTYPE_USER)
+ user_io++;
+ else if (flags & PBLK_IOTYPE_GC)
+ gc_io++;
+ else
+ WARN(1, "pblk: unknown IO type\n");
+
+ page = virt_to_page(entry->data);
+ if (!page) {
+ pr_err("pblk: could not allocate write bio page\n");
+ flags &= ~PBLK_WRITTEN_DATA;
+ flags |= PBLK_SUBMITTED_ENTRY;
+ /* Release flags on context. Protect from writes */
+ smp_store_release(&entry->w_ctx.flags, flags);
+ goto out;
+ }
+
+ ret = bio_add_page(bio, page, rb->seg_size, 0);
+ if (ret != rb->seg_size) {
+ pr_err("pblk: could not add page to write bio\n");
+ flags &= ~PBLK_WRITTEN_DATA;
+ flags |= PBLK_SUBMITTED_ENTRY;
+ /* Release flags on context. Protect from writes */
+ smp_store_release(&entry->w_ctx.flags, flags);
+ goto out;
+ }
+
+ if (flags & PBLK_FLUSH_ENTRY) {
+ unsigned int sync_point;
+
+ sync_point = READ_ONCE(rb->sync_point);
+ if (sync_point == pos) {
+ /* Protect syncs */
+ smp_store_release(&rb->sync_point, EMPTY_ENTRY);
+ }
+
+ flags &= ~PBLK_FLUSH_ENTRY;
+#ifdef CONFIG_NVM_DEBUG
+ atomic_dec(&rb->inflight_sync_point);
+#endif
+ }
+
+ flags &= ~PBLK_WRITTEN_DATA;
+ flags |= PBLK_SUBMITTED_ENTRY;
+
+ /* Release flags on context. Protect from writes */
+ smp_store_release(&entry->w_ctx.flags, flags);
+
+ pos = (pos + 1) & (rb->nr_entries - 1);
+ }
+
+ read = to_read;
+ pblk_rl_out(&pblk->rl, user_io, gc_io);
+#ifdef CONFIG_NVM_DEBUG
+ atomic_long_add(pad, &((struct pblk *)
+ (container_of(rb, struct pblk, rwb)))->padded_writes);
+#endif
+out:
+ return read;
+}
+
+/*
+ * Copy to bio only if the lba matches the one on the given cache entry.
+ * Otherwise, it means that the entry has been overwritten, and the bio should
+ * be directed to disk.
+ */
+int pblk_rb_copy_to_bio(struct pblk_rb *rb, struct bio *bio, sector_t lba,
+ u64 pos, int bio_iter)
+{
+ struct pblk_rb_entry *entry;
+ struct pblk_w_ctx *w_ctx;
+ void *data;
+ int flags;
+ int ret = 1;
+
+ spin_lock(&rb->w_lock);
+
+#ifdef CONFIG_NVM_DEBUG
+ /* Caller must ensure that the access will not cause an overflow */
+ BUG_ON(pos >= rb->nr_entries);
+#endif
+ entry = &rb->entries[pos];
+ w_ctx = &entry->w_ctx;
+ flags = READ_ONCE(w_ctx->flags);
+
+ /* Check if the entry has been overwritten or is scheduled to be */
+ if (w_ctx->lba != lba || flags & PBLK_WRITABLE_ENTRY) {
+ ret = 0;
+ goto out;
+ }
+
+ /* Only advance the bio if it hasn't been advanced already. If advanced,
+ * this bio is at least a partial bio (i.e., it has partially been
+ * filled with data from the cache). If part of the data resides on the
+ * media, we will read later on
+ */
+ if (unlikely(!bio->bi_iter.bi_idx))
+ bio_advance(bio, bio_iter * PBLK_EXPOSED_PAGE_SIZE);
+
+ data = bio_data(bio);
+ memcpy(data, entry->data, rb->seg_size);
+
+out:
+ spin_unlock(&rb->w_lock);
+ return ret;
+}
+
+struct pblk_w_ctx *pblk_rb_w_ctx(struct pblk_rb *rb, unsigned int pos)
+{
+ unsigned int entry = pos & (rb->nr_entries - 1);
+
+ return &rb->entries[entry].w_ctx;
+}
+
+unsigned int pblk_rb_sync_init(struct pblk_rb *rb, unsigned long *flags)
+ __acquires(&rb->s_lock)
+{
+ if (flags)
+ spin_lock_irqsave(&rb->s_lock, *flags);
+ else
+ spin_lock_irq(&rb->s_lock);
+
+ return rb->sync;
+}
+
+void pblk_rb_sync_end(struct pblk_rb *rb, unsigned long *flags)
+ __releases(&rb->s_lock)
+{
+ lockdep_assert_held(&rb->s_lock);
+
+ if (flags)
+ spin_unlock_irqrestore(&rb->s_lock, *flags);
+ else
+ spin_unlock_irq(&rb->s_lock);
+}
+
+unsigned int pblk_rb_sync_advance(struct pblk_rb *rb, unsigned int nr_entries)
+{
+ unsigned int sync;
+ unsigned int i;
+
+ lockdep_assert_held(&rb->s_lock);
+
+ sync = READ_ONCE(rb->sync);
+
+ for (i = 0; i < nr_entries; i++)
+ sync = (sync + 1) & (rb->nr_entries - 1);
+
+ /* Protect from counts */
+ smp_store_release(&rb->sync, sync);
+
+ return sync;
+}
+
+unsigned int pblk_rb_sync_point_count(struct pblk_rb *rb)
+{
+ unsigned int subm, sync_point;
+ unsigned int count;
+
+ /* Protect syncs */
+ sync_point = smp_load_acquire(&rb->sync_point);
+ if (sync_point == EMPTY_ENTRY)
+ return 0;
+
+ subm = READ_ONCE(rb->subm);
+
+ /* The sync point itself counts as a sector to sync */
+ count = pblk_rb_ring_count(sync_point, subm, rb->nr_entries) + 1;
+
+ return count;
+}
+
+/*
+ * Scan from the current position of the sync pointer to find the entry that
+ * corresponds to the given ppa. This is necessary since write requests can be
+ * completed out of order. The assumption is that the ppa is close to the sync
+ * pointer thus the search will not take long.
+ *
+ * The caller of this function must guarantee that the sync pointer will no
+ * reach the entry while it is using the metadata associated with it. With this
+ * assumption in mind, there is no need to take the sync lock.
+ */
+struct pblk_rb_entry *pblk_rb_sync_scan_entry(struct pblk_rb *rb,
+ struct ppa_addr *ppa)
+{
+ unsigned int sync, subm, count;
+ unsigned int i;
+
+ sync = READ_ONCE(rb->sync);
+ subm = READ_ONCE(rb->subm);
+ count = pblk_rb_ring_count(subm, sync, rb->nr_entries);
+
+ for (i = 0; i < count; i++)
+ sync = (sync + 1) & (rb->nr_entries - 1);
+
+ return NULL;
+}
+
+int pblk_rb_tear_down_check(struct pblk_rb *rb)
+{
+ struct pblk_rb_entry *entry;
+ int i;
+ int ret = 0;
+
+ spin_lock(&rb->w_lock);
+ spin_lock_irq(&rb->s_lock);
+
+ if ((rb->mem == rb->subm) && (rb->subm == rb->sync) &&
+ (rb->sync == rb->l2p_update) &&
+ (rb->sync_point == EMPTY_ENTRY)) {
+ goto out;
+ }
+
+ if (!rb->entries) {
+ ret = 1;
+ goto out;
+ }
+
+ for (i = 0; i < rb->nr_entries; i++) {
+ entry = &rb->entries[i];
+
+ if (!entry->data) {
+ ret = 1;
+ goto out;
+ }
+ }
+
+out:
+ spin_unlock(&rb->w_lock);
+ spin_unlock_irq(&rb->s_lock);
+
+ return ret;
+}
+
+unsigned int pblk_rb_wrap_pos(struct pblk_rb *rb, unsigned int pos)
+{
+ return (pos & (rb->nr_entries - 1));
+}
+
+int pblk_rb_pos_oob(struct pblk_rb *rb, u64 pos)
+{
+ return (pos >= rb->nr_entries);
+}
+
+ssize_t pblk_rb_sysfs(struct pblk_rb *rb, char *buf)
+{
+ struct pblk *pblk = container_of(rb, struct pblk, rwb);
+ struct pblk_c_ctx *c;
+ ssize_t offset;
+ int queued_entries = 0;
+
+ spin_lock_irq(&rb->s_lock);
+ list_for_each_entry(c, &pblk->compl_list, list)
+ queued_entries++;
+ spin_unlock_irq(&rb->s_lock);
+
+ if (rb->sync_point != EMPTY_ENTRY)
+ offset = scnprintf(buf, PAGE_SIZE,
+ "%u\t%u\t%u\t%u\t%u\t%u\t%u - %u/%u/%u - %d\n",
+ rb->nr_entries,
+ rb->mem,
+ rb->subm,
+ rb->sync,
+ rb->l2p_update,
+#ifdef CONFIG_NVM_DEBUG
+ atomic_read(&rb->inflight_sync_point),
+#else
+ 0,
+#endif
+ rb->sync_point,
+ pblk_rb_read_count(rb),
+ pblk_rb_space(rb),
+ pblk_rb_sync_point_count(rb),
+ queued_entries);
+ else
+ offset = scnprintf(buf, PAGE_SIZE,
+ "%u\t%u\t%u\t%u\t%u\t%u\tNULL - %u/%u/%u - %d\n",
+ rb->nr_entries,
+ rb->mem,
+ rb->subm,
+ rb->sync,
+ rb->l2p_update,
+#ifdef CONFIG_NVM_DEBUG
+ atomic_read(&rb->inflight_sync_point),
+#else
+ 0,
+#endif
+ pblk_rb_read_count(rb),
+ pblk_rb_space(rb),
+ pblk_rb_sync_point_count(rb),
+ queued_entries);
+
+ return offset;
+}
diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c
new file mode 100644
index 000000000000..4a12f14d78c6
--- /dev/null
+++ b/drivers/lightnvm/pblk-read.c
@@ -0,0 +1,529 @@
+/*
+ * Copyright (C) 2016 CNEX Labs
+ * Initial release: Javier Gonzalez <javier@cnexlabs.com>
+ * Matias Bjorling <matias@cnexlabs.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * pblk-read.c - pblk's read path
+ */
+
+#include "pblk.h"
+
+/*
+ * There is no guarantee that the value read from cache has not been updated and
+ * resides at another location in the cache. We guarantee though that if the
+ * value is read from the cache, it belongs to the mapped lba. In order to
+ * guarantee and order between writes and reads are ordered, a flush must be
+ * issued.
+ */
+static int pblk_read_from_cache(struct pblk *pblk, struct bio *bio,
+ sector_t lba, struct ppa_addr ppa,
+ int bio_iter)
+{
+#ifdef CONFIG_NVM_DEBUG
+ /* Callers must ensure that the ppa points to a cache address */
+ BUG_ON(pblk_ppa_empty(ppa));
+ BUG_ON(!pblk_addr_in_cache(ppa));
+#endif
+
+ return pblk_rb_copy_to_bio(&pblk->rwb, bio, lba,
+ pblk_addr_to_cacheline(ppa), bio_iter);
+}
+
+static void pblk_read_ppalist_rq(struct pblk *pblk, struct nvm_rq *rqd,
+ unsigned long *read_bitmap)
+{
+ struct bio *bio = rqd->bio;
+ struct ppa_addr ppas[PBLK_MAX_REQ_ADDRS];
+ sector_t blba = pblk_get_lba(bio);
+ int nr_secs = rqd->nr_ppas;
+ int advanced_bio = 0;
+ int i, j = 0;
+
+ /* logic error: lba out-of-bounds. Ignore read request */
+ if (blba + nr_secs >= pblk->rl.nr_secs) {
+ WARN(1, "pblk: read lbas out of bounds\n");
+ return;
+ }
+
+ pblk_lookup_l2p_seq(pblk, ppas, blba, nr_secs);
+
+ for (i = 0; i < nr_secs; i++) {
+ struct ppa_addr p = ppas[i];
+ sector_t lba = blba + i;
+
+retry:
+ if (pblk_ppa_empty(p)) {
+ WARN_ON(test_and_set_bit(i, read_bitmap));
+ continue;
+ }
+
+ /* Try to read from write buffer. The address is later checked
+ * on the write buffer to prevent retrieving overwritten data.
+ */
+ if (pblk_addr_in_cache(p)) {
+ if (!pblk_read_from_cache(pblk, bio, lba, p, i)) {
+ pblk_lookup_l2p_seq(pblk, &p, lba, 1);
+ goto retry;
+ }
+ WARN_ON(test_and_set_bit(i, read_bitmap));
+ advanced_bio = 1;
+ } else {
+ /* Read from media non-cached sectors */
+ rqd->ppa_list[j++] = p;
+ }
+
+ if (advanced_bio)
+ bio_advance(bio, PBLK_EXPOSED_PAGE_SIZE);
+ }
+
+#ifdef CONFIG_NVM_DEBUG
+ atomic_long_add(nr_secs, &pblk->inflight_reads);
+#endif
+}
+
+static int pblk_submit_read_io(struct pblk *pblk, struct nvm_rq *rqd)
+{
+ int err;
+
+ rqd->flags = pblk_set_read_mode(pblk);
+
+ err = pblk_submit_io(pblk, rqd);
+ if (err)
+ return NVM_IO_ERR;
+
+ return NVM_IO_OK;
+}
+
+static void pblk_end_io_read(struct nvm_rq *rqd)
+{
+ struct pblk *pblk = rqd->private;
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct pblk_r_ctx *r_ctx = nvm_rq_to_pdu(rqd);
+ struct bio *bio = rqd->bio;
+
+ if (rqd->error)
+ pblk_log_read_err(pblk, rqd);
+#ifdef CONFIG_NVM_DEBUG
+ else
+ WARN_ONCE(bio->bi_error, "pblk: corrupted read error\n");
+#endif
+
+ if (rqd->nr_ppas > 1)
+ nvm_dev_dma_free(dev->parent, rqd->ppa_list, rqd->dma_ppa_list);
+
+ bio_put(bio);
+ if (r_ctx->orig_bio) {
+#ifdef CONFIG_NVM_DEBUG
+ WARN_ONCE(r_ctx->orig_bio->bi_error,
+ "pblk: corrupted read bio\n");
+#endif
+ bio_endio(r_ctx->orig_bio);
+ bio_put(r_ctx->orig_bio);
+ }
+
+#ifdef CONFIG_NVM_DEBUG
+ atomic_long_add(rqd->nr_ppas, &pblk->sync_reads);
+ atomic_long_sub(rqd->nr_ppas, &pblk->inflight_reads);
+#endif
+
+ pblk_free_rqd(pblk, rqd, READ);
+}
+
+static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd,
+ unsigned int bio_init_idx,
+ unsigned long *read_bitmap)
+{
+ struct bio *new_bio, *bio = rqd->bio;
+ struct bio_vec src_bv, dst_bv;
+ void *ppa_ptr = NULL;
+ void *src_p, *dst_p;
+ dma_addr_t dma_ppa_list = 0;
+ int nr_secs = rqd->nr_ppas;
+ int nr_holes = nr_secs - bitmap_weight(read_bitmap, nr_secs);
+ int i, ret, hole;
+ DECLARE_COMPLETION_ONSTACK(wait);
+
+ new_bio = bio_alloc(GFP_KERNEL, nr_holes);
+ if (!new_bio) {
+ pr_err("pblk: could not alloc read bio\n");
+ return NVM_IO_ERR;
+ }
+
+ if (pblk_bio_add_pages(pblk, new_bio, GFP_KERNEL, nr_holes))
+ goto err;
+
+ if (nr_holes != new_bio->bi_vcnt) {
+ pr_err("pblk: malformed bio\n");
+ goto err;
+ }
+
+ new_bio->bi_iter.bi_sector = 0; /* internal bio */
+ bio_set_op_attrs(new_bio, REQ_OP_READ, 0);
+ new_bio->bi_private = &wait;
+ new_bio->bi_end_io = pblk_end_bio_sync;
+
+ rqd->bio = new_bio;
+ rqd->nr_ppas = nr_holes;
+ rqd->end_io = NULL;
+
+ if (unlikely(nr_secs > 1 && nr_holes == 1)) {
+ ppa_ptr = rqd->ppa_list;
+ dma_ppa_list = rqd->dma_ppa_list;
+ rqd->ppa_addr = rqd->ppa_list[0];
+ }
+
+ ret = pblk_submit_read_io(pblk, rqd);
+ if (ret) {
+ bio_put(rqd->bio);
+ pr_err("pblk: read IO submission failed\n");
+ goto err;
+ }
+
+ if (!wait_for_completion_io_timeout(&wait,
+ msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) {
+ pr_err("pblk: partial read I/O timed out\n");
+ }
+
+ if (rqd->error) {
+ atomic_long_inc(&pblk->read_failed);
+#ifdef CONFIG_NVM_DEBUG
+ pblk_print_failed_rqd(pblk, rqd, rqd->error);
+#endif
+ }
+
+ if (unlikely(nr_secs > 1 && nr_holes == 1)) {
+ rqd->ppa_list = ppa_ptr;
+ rqd->dma_ppa_list = dma_ppa_list;
+ }
+
+ /* Fill the holes in the original bio */
+ i = 0;
+ hole = find_first_zero_bit(read_bitmap, nr_secs);
+ do {
+ src_bv = new_bio->bi_io_vec[i++];
+ dst_bv = bio->bi_io_vec[bio_init_idx + hole];
+
+ src_p = kmap_atomic(src_bv.bv_page);
+ dst_p = kmap_atomic(dst_bv.bv_page);
+
+ memcpy(dst_p + dst_bv.bv_offset,
+ src_p + src_bv.bv_offset,
+ PBLK_EXPOSED_PAGE_SIZE);
+
+ kunmap_atomic(src_p);
+ kunmap_atomic(dst_p);
+
+ mempool_free(src_bv.bv_page, pblk->page_pool);
+
+ hole = find_next_zero_bit(read_bitmap, nr_secs, hole + 1);
+ } while (hole < nr_secs);
+
+ bio_put(new_bio);
+
+ /* Complete the original bio and associated request */
+ rqd->bio = bio;
+ rqd->nr_ppas = nr_secs;
+ rqd->private = pblk;
+
+ bio_endio(bio);
+ pblk_end_io_read(rqd);
+ return NVM_IO_OK;
+
+err:
+ /* Free allocated pages in new bio */
+ pblk_bio_free_pages(pblk, bio, 0, new_bio->bi_vcnt);
+ rqd->private = pblk;
+ pblk_end_io_read(rqd);
+ return NVM_IO_ERR;
+}
+
+static void pblk_read_rq(struct pblk *pblk, struct nvm_rq *rqd,
+ unsigned long *read_bitmap)
+{
+ struct bio *bio = rqd->bio;
+ struct ppa_addr ppa;
+ sector_t lba = pblk_get_lba(bio);
+
+ /* logic error: lba out-of-bounds. Ignore read request */
+ if (lba >= pblk->rl.nr_secs) {
+ WARN(1, "pblk: read lba out of bounds\n");
+ return;
+ }
+
+ pblk_lookup_l2p_seq(pblk, &ppa, lba, 1);
+
+#ifdef CONFIG_NVM_DEBUG
+ atomic_long_inc(&pblk->inflight_reads);
+#endif
+
+retry:
+ if (pblk_ppa_empty(ppa)) {
+ WARN_ON(test_and_set_bit(0, read_bitmap));
+ return;
+ }
+
+ /* Try to read from write buffer. The address is later checked on the
+ * write buffer to prevent retrieving overwritten data.
+ */
+ if (pblk_addr_in_cache(ppa)) {
+ if (!pblk_read_from_cache(pblk, bio, lba, ppa, 0)) {
+ pblk_lookup_l2p_seq(pblk, &ppa, lba, 1);
+ goto retry;
+ }
+ WARN_ON(test_and_set_bit(0, read_bitmap));
+ } else {
+ rqd->ppa_addr = ppa;
+ }
+}
+
+int pblk_submit_read(struct pblk *pblk, struct bio *bio)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ unsigned int nr_secs = pblk_get_secs(bio);
+ struct nvm_rq *rqd;
+ unsigned long read_bitmap; /* Max 64 ppas per request */
+ unsigned int bio_init_idx;
+ int ret = NVM_IO_ERR;
+
+ if (nr_secs > PBLK_MAX_REQ_ADDRS)
+ return NVM_IO_ERR;
+
+ bitmap_zero(&read_bitmap, nr_secs);
+
+ rqd = pblk_alloc_rqd(pblk, READ);
+ if (IS_ERR(rqd)) {
+ pr_err_ratelimited("pblk: not able to alloc rqd");
+ return NVM_IO_ERR;
+ }
+
+ rqd->opcode = NVM_OP_PREAD;
+ rqd->bio = bio;
+ rqd->nr_ppas = nr_secs;
+ rqd->private = pblk;
+ rqd->end_io = pblk_end_io_read;
+
+ /* Save the index for this bio's start. This is needed in case
+ * we need to fill a partial read.
+ */
+ bio_init_idx = pblk_get_bi_idx(bio);
+
+ if (nr_secs > 1) {
+ rqd->ppa_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL,
+ &rqd->dma_ppa_list);
+ if (!rqd->ppa_list) {
+ pr_err("pblk: not able to allocate ppa list\n");
+ goto fail_rqd_free;
+ }
+
+ pblk_read_ppalist_rq(pblk, rqd, &read_bitmap);
+ } else {
+ pblk_read_rq(pblk, rqd, &read_bitmap);
+ }
+
+ bio_get(bio);
+ if (bitmap_full(&read_bitmap, nr_secs)) {
+ bio_endio(bio);
+ pblk_end_io_read(rqd);
+ return NVM_IO_OK;
+ }
+
+ /* All sectors are to be read from the device */
+ if (bitmap_empty(&read_bitmap, rqd->nr_ppas)) {
+ struct bio *int_bio = NULL;
+ struct pblk_r_ctx *r_ctx = nvm_rq_to_pdu(rqd);
+
+ /* Clone read bio to deal with read errors internally */
+ int_bio = bio_clone_bioset(bio, GFP_KERNEL, fs_bio_set);
+ if (!int_bio) {
+ pr_err("pblk: could not clone read bio\n");
+ return NVM_IO_ERR;
+ }
+
+ rqd->bio = int_bio;
+ r_ctx->orig_bio = bio;
+
+ ret = pblk_submit_read_io(pblk, rqd);
+ if (ret) {
+ pr_err("pblk: read IO submission failed\n");
+ if (int_bio)
+ bio_put(int_bio);
+ return ret;
+ }
+
+ return NVM_IO_OK;
+ }
+
+ /* The read bio request could be partially filled by the write buffer,
+ * but there are some holes that need to be read from the drive.
+ */
+ ret = pblk_fill_partial_read_bio(pblk, rqd, bio_init_idx, &read_bitmap);
+ if (ret) {
+ pr_err("pblk: failed to perform partial read\n");
+ return ret;
+ }
+
+ return NVM_IO_OK;
+
+fail_rqd_free:
+ pblk_free_rqd(pblk, rqd, READ);
+ return ret;
+}
+
+static int read_ppalist_rq_gc(struct pblk *pblk, struct nvm_rq *rqd,
+ struct pblk_line *line, u64 *lba_list,
+ unsigned int nr_secs)
+{
+ struct ppa_addr ppas[PBLK_MAX_REQ_ADDRS];
+ int valid_secs = 0;
+ int i;
+
+ pblk_lookup_l2p_rand(pblk, ppas, lba_list, nr_secs);
+
+ for (i = 0; i < nr_secs; i++) {
+ if (pblk_addr_in_cache(ppas[i]) || ppas[i].g.blk != line->id ||
+ pblk_ppa_empty(ppas[i])) {
+ lba_list[i] = ADDR_EMPTY;
+ continue;
+ }
+
+ rqd->ppa_list[valid_secs++] = ppas[i];
+ }
+
+#ifdef CONFIG_NVM_DEBUG
+ atomic_long_add(valid_secs, &pblk->inflight_reads);
+#endif
+ return valid_secs;
+}
+
+static int read_rq_gc(struct pblk *pblk, struct nvm_rq *rqd,
+ struct pblk_line *line, sector_t lba)
+{
+ struct ppa_addr ppa;
+ int valid_secs = 0;
+
+ if (lba == ADDR_EMPTY)
+ goto out;
+
+ /* logic error: lba out-of-bounds */
+ if (lba >= pblk->rl.nr_secs) {
+ WARN(1, "pblk: read lba out of bounds\n");
+ goto out;
+ }
+
+ spin_lock(&pblk->trans_lock);
+ ppa = pblk_trans_map_get(pblk, lba);
+ spin_unlock(&pblk->trans_lock);
+
+ /* Ignore updated values until the moment */
+ if (pblk_addr_in_cache(ppa) || ppa.g.blk != line->id ||
+ pblk_ppa_empty(ppa))
+ goto out;
+
+ rqd->ppa_addr = ppa;
+ valid_secs = 1;
+
+#ifdef CONFIG_NVM_DEBUG
+ atomic_long_inc(&pblk->inflight_reads);
+#endif
+
+out:
+ return valid_secs;
+}
+
+int pblk_submit_read_gc(struct pblk *pblk, u64 *lba_list, void *data,
+ unsigned int nr_secs, unsigned int *secs_to_gc,
+ struct pblk_line *line)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct nvm_geo *geo = &dev->geo;
+ struct request_queue *q = dev->q;
+ struct bio *bio;
+ struct nvm_rq rqd;
+ int ret, data_len;
+ DECLARE_COMPLETION_ONSTACK(wait);
+
+ memset(&rqd, 0, sizeof(struct nvm_rq));
+
+ if (nr_secs > 1) {
+ rqd.ppa_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL,
+ &rqd.dma_ppa_list);
+ if (!rqd.ppa_list)
+ return NVM_IO_ERR;
+
+ *secs_to_gc = read_ppalist_rq_gc(pblk, &rqd, line, lba_list,
+ nr_secs);
+ if (*secs_to_gc == 1) {
+ struct ppa_addr ppa;
+
+ ppa = rqd.ppa_list[0];
+ nvm_dev_dma_free(dev->parent, rqd.ppa_list,
+ rqd.dma_ppa_list);
+ rqd.ppa_addr = ppa;
+ }
+ } else {
+ *secs_to_gc = read_rq_gc(pblk, &rqd, line, lba_list[0]);
+ }
+
+ if (!(*secs_to_gc))
+ goto out;
+
+ data_len = (*secs_to_gc) * geo->sec_size;
+ bio = bio_map_kern(q, data, data_len, GFP_KERNEL);
+ if (IS_ERR(bio)) {
+ pr_err("pblk: could not allocate GC bio (%lu)\n", PTR_ERR(bio));
+ goto err_free_dma;
+ }
+
+ bio->bi_iter.bi_sector = 0; /* internal bio */
+ bio_set_op_attrs(bio, REQ_OP_READ, 0);
+
+ rqd.opcode = NVM_OP_PREAD;
+ rqd.end_io = pblk_end_io_sync;
+ rqd.private = &wait;
+ rqd.nr_ppas = *secs_to_gc;
+ rqd.bio = bio;
+
+ ret = pblk_submit_read_io(pblk, &rqd);
+ if (ret) {
+ bio_endio(bio);
+ pr_err("pblk: GC read request failed\n");
+ goto err_free_dma;
+ }
+
+ if (!wait_for_completion_io_timeout(&wait,
+ msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) {
+ pr_err("pblk: GC read I/O timed out\n");
+ }
+
+ if (rqd.error) {
+ atomic_long_inc(&pblk->read_failed_gc);
+#ifdef CONFIG_NVM_DEBUG
+ pblk_print_failed_rqd(pblk, &rqd, rqd.error);
+#endif
+ }
+
+#ifdef CONFIG_NVM_DEBUG
+ atomic_long_add(*secs_to_gc, &pblk->sync_reads);
+ atomic_long_add(*secs_to_gc, &pblk->recov_gc_reads);
+ atomic_long_sub(*secs_to_gc, &pblk->inflight_reads);
+#endif
+
+out:
+ if (rqd.nr_ppas > 1)
+ nvm_dev_dma_free(dev->parent, rqd.ppa_list, rqd.dma_ppa_list);
+ return NVM_IO_OK;
+
+err_free_dma:
+ if (rqd.nr_ppas > 1)
+ nvm_dev_dma_free(dev->parent, rqd.ppa_list, rqd.dma_ppa_list);
+ return NVM_IO_ERR;
+}
diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c
new file mode 100644
index 000000000000..f8f85087cd3c
--- /dev/null
+++ b/drivers/lightnvm/pblk-recovery.c
@@ -0,0 +1,998 @@
+/*
+ * Copyright (C) 2016 CNEX Labs
+ * Initial: Javier Gonzalez <javier@cnexlabs.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * pblk-recovery.c - pblk's recovery path
+ */
+
+#include "pblk.h"
+
+void pblk_submit_rec(struct work_struct *work)
+{
+ struct pblk_rec_ctx *recovery =
+ container_of(work, struct pblk_rec_ctx, ws_rec);
+ struct pblk *pblk = recovery->pblk;
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct nvm_rq *rqd = recovery->rqd;
+ struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd);
+ int max_secs = nvm_max_phys_sects(dev);
+ struct bio *bio;
+ unsigned int nr_rec_secs;
+ unsigned int pgs_read;
+ int ret;
+
+ nr_rec_secs = bitmap_weight((unsigned long int *)&rqd->ppa_status,
+ max_secs);
+
+ bio = bio_alloc(GFP_KERNEL, nr_rec_secs);
+ if (!bio) {
+ pr_err("pblk: not able to create recovery bio\n");
+ return;
+ }
+
+ bio->bi_iter.bi_sector = 0;
+ bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
+ rqd->bio = bio;
+ rqd->nr_ppas = nr_rec_secs;
+
+ pgs_read = pblk_rb_read_to_bio_list(&pblk->rwb, bio, &recovery->failed,
+ nr_rec_secs);
+ if (pgs_read != nr_rec_secs) {
+ pr_err("pblk: could not read recovery entries\n");
+ goto err;
+ }
+
+ if (pblk_setup_w_rec_rq(pblk, rqd, c_ctx)) {
+ pr_err("pblk: could not setup recovery request\n");
+ goto err;
+ }
+
+#ifdef CONFIG_NVM_DEBUG
+ atomic_long_add(nr_rec_secs, &pblk->recov_writes);
+#endif
+
+ ret = pblk_submit_io(pblk, rqd);
+ if (ret) {
+ pr_err("pblk: I/O submission failed: %d\n", ret);
+ goto err;
+ }
+
+ mempool_free(recovery, pblk->rec_pool);
+ return;
+
+err:
+ bio_put(bio);
+ pblk_free_rqd(pblk, rqd, WRITE);
+}
+
+int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx,
+ struct pblk_rec_ctx *recovery, u64 *comp_bits,
+ unsigned int comp)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ int max_secs = nvm_max_phys_sects(dev);
+ struct nvm_rq *rec_rqd;
+ struct pblk_c_ctx *rec_ctx;
+ int nr_entries = c_ctx->nr_valid + c_ctx->nr_padded;
+
+ rec_rqd = pblk_alloc_rqd(pblk, WRITE);
+ if (IS_ERR(rec_rqd)) {
+ pr_err("pblk: could not create recovery req.\n");
+ return -ENOMEM;
+ }
+
+ rec_ctx = nvm_rq_to_pdu(rec_rqd);
+
+ /* Copy completion bitmap, but exclude the first X completed entries */
+ bitmap_shift_right((unsigned long int *)&rec_rqd->ppa_status,
+ (unsigned long int *)comp_bits,
+ comp, max_secs);
+
+ /* Save the context for the entries that need to be re-written and
+ * update current context with the completed entries.
+ */
+ rec_ctx->sentry = pblk_rb_wrap_pos(&pblk->rwb, c_ctx->sentry + comp);
+ if (comp >= c_ctx->nr_valid) {
+ rec_ctx->nr_valid = 0;
+ rec_ctx->nr_padded = nr_entries - comp;
+
+ c_ctx->nr_padded = comp - c_ctx->nr_valid;
+ } else {
+ rec_ctx->nr_valid = c_ctx->nr_valid - comp;
+ rec_ctx->nr_padded = c_ctx->nr_padded;
+
+ c_ctx->nr_valid = comp;
+ c_ctx->nr_padded = 0;
+ }
+
+ recovery->rqd = rec_rqd;
+ recovery->pblk = pblk;
+
+ return 0;
+}
+
+__le64 *pblk_recov_get_lba_list(struct pblk *pblk, struct line_emeta *emeta)
+{
+ u32 crc;
+
+ crc = pblk_calc_emeta_crc(pblk, emeta);
+ if (le32_to_cpu(emeta->crc) != crc)
+ return NULL;
+
+ if (le32_to_cpu(emeta->header.identifier) != PBLK_MAGIC)
+ return NULL;
+
+ return pblk_line_emeta_to_lbas(emeta);
+}
+
+static int pblk_recov_l2p_from_emeta(struct pblk *pblk, struct pblk_line *line)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct nvm_geo *geo = &dev->geo;
+ struct pblk_line_meta *lm = &pblk->lm;
+ struct line_emeta *emeta = line->emeta;
+ __le64 *lba_list;
+ int data_start;
+ int nr_data_lbas, nr_valid_lbas, nr_lbas = 0;
+ int i;
+
+ lba_list = pblk_recov_get_lba_list(pblk, emeta);
+ if (!lba_list)
+ return 1;
+
+ data_start = pblk_line_smeta_start(pblk, line) + lm->smeta_sec;
+ nr_data_lbas = lm->sec_per_line - lm->emeta_sec;
+ nr_valid_lbas = le64_to_cpu(emeta->nr_valid_lbas);
+
+ for (i = data_start; i < nr_data_lbas && nr_lbas < nr_valid_lbas; i++) {
+ struct ppa_addr ppa;
+ int pos;
+
+ ppa = addr_to_pblk_ppa(pblk, i, line->id);
+ pos = pblk_ppa_to_pos(geo, ppa);
+
+ /* Do not update bad blocks */
+ if (test_bit(pos, line->blk_bitmap))
+ continue;
+
+ if (le64_to_cpu(lba_list[i]) == ADDR_EMPTY) {
+ spin_lock(&line->lock);
+ if (test_and_set_bit(i, line->invalid_bitmap))
+ WARN_ONCE(1, "pblk: rec. double invalidate:\n");
+ else
+ line->vsc--;
+ spin_unlock(&line->lock);
+
+ continue;
+ }
+
+ pblk_update_map(pblk, le64_to_cpu(lba_list[i]), ppa);
+ nr_lbas++;
+ }
+
+ if (nr_valid_lbas != nr_lbas)
+ pr_err("pblk: line %d - inconsistent lba list(%llu/%d)\n",
+ line->id, line->emeta->nr_valid_lbas, nr_lbas);
+
+ line->left_msecs = 0;
+
+ return 0;
+}
+
+static int pblk_calc_sec_in_line(struct pblk *pblk, struct pblk_line *line)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct nvm_geo *geo = &dev->geo;
+ struct pblk_line_meta *lm = &pblk->lm;
+ int nr_bb = bitmap_weight(line->blk_bitmap, lm->blk_per_line);
+
+ return lm->sec_per_line - lm->smeta_sec - lm->emeta_sec -
+ nr_bb * geo->sec_per_blk;
+}
+
+struct pblk_recov_alloc {
+ struct ppa_addr *ppa_list;
+ struct pblk_sec_meta *meta_list;
+ struct nvm_rq *rqd;
+ void *data;
+ dma_addr_t dma_ppa_list;
+ dma_addr_t dma_meta_list;
+};
+
+static int pblk_recov_read_oob(struct pblk *pblk, struct pblk_line *line,
+ struct pblk_recov_alloc p, u64 r_ptr)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct nvm_geo *geo = &dev->geo;
+ struct ppa_addr *ppa_list;
+ struct pblk_sec_meta *meta_list;
+ struct nvm_rq *rqd;
+ struct bio *bio;
+ void *data;
+ dma_addr_t dma_ppa_list, dma_meta_list;
+ u64 r_ptr_int;
+ int left_ppas;
+ int rq_ppas, rq_len;
+ int i, j;
+ int ret = 0;
+ DECLARE_COMPLETION_ONSTACK(wait);
+
+ ppa_list = p.ppa_list;
+ meta_list = p.meta_list;
+ rqd = p.rqd;
+ data = p.data;
+ dma_ppa_list = p.dma_ppa_list;
+ dma_meta_list = p.dma_meta_list;
+
+ left_ppas = line->cur_sec - r_ptr;
+ if (!left_ppas)
+ return 0;
+
+ r_ptr_int = r_ptr;
+
+next_read_rq:
+ memset(rqd, 0, pblk_r_rq_size);
+
+ rq_ppas = pblk_calc_secs(pblk, left_ppas, 0);
+ if (!rq_ppas)
+ rq_ppas = pblk->min_write_pgs;
+ rq_len = rq_ppas * geo->sec_size;
+
+ bio = bio_map_kern(dev->q, data, rq_len, GFP_KERNEL);
+ if (IS_ERR(bio))
+ return PTR_ERR(bio);
+
+ bio->bi_iter.bi_sector = 0; /* internal bio */
+ bio_set_op_attrs(bio, REQ_OP_READ, 0);
+
+ rqd->bio = bio;
+ rqd->opcode = NVM_OP_PREAD;
+ rqd->flags = pblk_set_read_mode(pblk);
+ rqd->meta_list = meta_list;
+ rqd->nr_ppas = rq_ppas;
+ rqd->ppa_list = ppa_list;
+ rqd->dma_ppa_list = dma_ppa_list;
+ rqd->dma_meta_list = dma_meta_list;
+ rqd->end_io = pblk_end_io_sync;
+ rqd->private = &wait;
+
+ for (i = 0; i < rqd->nr_ppas; ) {
+ struct ppa_addr ppa;
+ int pos;
+
+ ppa = addr_to_gen_ppa(pblk, r_ptr_int, line->id);
+ pos = pblk_dev_ppa_to_pos(geo, ppa);
+
+ while (test_bit(pos, line->blk_bitmap)) {
+ r_ptr_int += pblk->min_write_pgs;
+ ppa = addr_to_gen_ppa(pblk, r_ptr_int, line->id);
+ pos = pblk_dev_ppa_to_pos(geo, ppa);
+ }
+
+ for (j = 0; j < pblk->min_write_pgs; j++, i++, r_ptr_int++)
+ rqd->ppa_list[i] =
+ addr_to_gen_ppa(pblk, r_ptr_int, line->id);
+ }
+
+ /* If read fails, more padding is needed */
+ ret = pblk_submit_io(pblk, rqd);
+ if (ret) {
+ pr_err("pblk: I/O submission failed: %d\n", ret);
+ return ret;
+ }
+
+ if (!wait_for_completion_io_timeout(&wait,
+ msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) {
+ pr_err("pblk: L2P recovery read timed out\n");
+ return -EINTR;
+ }
+
+ reinit_completion(&wait);
+
+ /* At this point, the read should not fail. If it does, it is a problem
+ * we cannot recover from here. Need FTL log.
+ */
+ if (rqd->error) {
+ pr_err("pblk: L2P recovery failed (%d)\n", rqd->error);
+ return -EINTR;
+ }
+
+ for (i = 0; i < rqd->nr_ppas; i++) {
+ u64 lba = le64_to_cpu(meta_list[i].lba);
+
+ if (lba == ADDR_EMPTY || lba > pblk->rl.nr_secs)
+ continue;
+
+ pblk_update_map(pblk, lba, rqd->ppa_list[i]);
+ }
+
+ left_ppas -= rq_ppas;
+ if (left_ppas > 0)
+ goto next_read_rq;
+
+ return 0;
+}
+
+static int pblk_recov_pad_oob(struct pblk *pblk, struct pblk_line *line,
+ struct pblk_recov_alloc p, int left_ppas)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct nvm_geo *geo = &dev->geo;
+ struct ppa_addr *ppa_list;
+ struct pblk_sec_meta *meta_list;
+ struct nvm_rq *rqd;
+ struct bio *bio;
+ void *data;
+ dma_addr_t dma_ppa_list, dma_meta_list;
+ __le64 *lba_list = pblk_line_emeta_to_lbas(line->emeta);
+ u64 w_ptr = line->cur_sec;
+ int left_line_ppas = line->left_msecs;
+ int rq_ppas, rq_len;
+ int i, j;
+ int ret = 0;
+ DECLARE_COMPLETION_ONSTACK(wait);
+
+ ppa_list = p.ppa_list;
+ meta_list = p.meta_list;
+ rqd = p.rqd;
+ data = p.data;
+ dma_ppa_list = p.dma_ppa_list;
+ dma_meta_list = p.dma_meta_list;
+
+next_pad_rq:
+ rq_ppas = pblk_calc_secs(pblk, left_ppas, 0);
+ if (!rq_ppas)
+ rq_ppas = pblk->min_write_pgs;
+ rq_len = rq_ppas * geo->sec_size;
+
+ bio = bio_map_kern(dev->q, data, rq_len, GFP_KERNEL);
+ if (IS_ERR(bio))
+ return PTR_ERR(bio);
+
+ bio->bi_iter.bi_sector = 0; /* internal bio */
+ bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
+
+ memset(rqd, 0, pblk_r_rq_size);
+
+ rqd->bio = bio;
+ rqd->opcode = NVM_OP_PWRITE;
+ rqd->flags = pblk_set_progr_mode(pblk, WRITE);
+ rqd->meta_list = meta_list;
+ rqd->nr_ppas = rq_ppas;
+ rqd->ppa_list = ppa_list;
+ rqd->dma_ppa_list = dma_ppa_list;
+ rqd->dma_meta_list = dma_meta_list;
+ rqd->end_io = pblk_end_io_sync;
+ rqd->private = &wait;
+
+ for (i = 0; i < rqd->nr_ppas; ) {
+ struct ppa_addr ppa;
+ int pos;
+
+ w_ptr = pblk_alloc_page(pblk, line, pblk->min_write_pgs);
+ ppa = addr_to_pblk_ppa(pblk, w_ptr, line->id);
+ pos = pblk_ppa_to_pos(geo, ppa);
+
+ while (test_bit(pos, line->blk_bitmap)) {
+ w_ptr += pblk->min_write_pgs;
+ ppa = addr_to_pblk_ppa(pblk, w_ptr, line->id);
+ pos = pblk_ppa_to_pos(geo, ppa);
+ }
+
+ for (j = 0; j < pblk->min_write_pgs; j++, i++, w_ptr++) {
+ struct ppa_addr dev_ppa;
+
+ dev_ppa = addr_to_gen_ppa(pblk, w_ptr, line->id);
+
+ pblk_map_invalidate(pblk, dev_ppa);
+ meta_list[i].lba = cpu_to_le64(ADDR_EMPTY);
+ lba_list[w_ptr] = cpu_to_le64(ADDR_EMPTY);
+ rqd->ppa_list[i] = dev_ppa;
+ }
+ }
+
+ ret = pblk_submit_io(pblk, rqd);
+ if (ret) {
+ pr_err("pblk: I/O submission failed: %d\n", ret);
+ return ret;
+ }
+
+ if (!wait_for_completion_io_timeout(&wait,
+ msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) {
+ pr_err("pblk: L2P recovery write timed out\n");
+ }
+ reinit_completion(&wait);
+
+ left_line_ppas -= rq_ppas;
+ left_ppas -= rq_ppas;
+ if (left_ppas > 0 && left_line_ppas)
+ goto next_pad_rq;
+
+ return 0;
+}
+
+/* When this function is called, it means that not all upper pages have been
+ * written in a page that contains valid data. In order to recover this data, we
+ * first find the write pointer on the device, then we pad all necessary
+ * sectors, and finally attempt to read the valid data
+ */
+static int pblk_recov_scan_all_oob(struct pblk *pblk, struct pblk_line *line,
+ struct pblk_recov_alloc p)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct nvm_geo *geo = &dev->geo;
+ struct ppa_addr *ppa_list;
+ struct pblk_sec_meta *meta_list;
+ struct nvm_rq *rqd;
+ struct bio *bio;
+ void *data;
+ dma_addr_t dma_ppa_list, dma_meta_list;
+ u64 w_ptr = 0, r_ptr;
+ int rq_ppas, rq_len;
+ int i, j;
+ int ret = 0;
+ int rec_round;
+ int left_ppas = pblk_calc_sec_in_line(pblk, line) - line->cur_sec;
+ DECLARE_COMPLETION_ONSTACK(wait);
+
+ ppa_list = p.ppa_list;
+ meta_list = p.meta_list;
+ rqd = p.rqd;
+ data = p.data;
+ dma_ppa_list = p.dma_ppa_list;
+ dma_meta_list = p.dma_meta_list;
+
+ /* we could recover up until the line write pointer */
+ r_ptr = line->cur_sec;
+ rec_round = 0;
+
+next_rq:
+ memset(rqd, 0, pblk_r_rq_size);
+
+ rq_ppas = pblk_calc_secs(pblk, left_ppas, 0);
+ if (!rq_ppas)
+ rq_ppas = pblk->min_write_pgs;
+ rq_len = rq_ppas * geo->sec_size;
+
+ bio = bio_map_kern(dev->q, data, rq_len, GFP_KERNEL);
+ if (IS_ERR(bio))
+ return PTR_ERR(bio);
+
+ bio->bi_iter.bi_sector = 0; /* internal bio */
+ bio_set_op_attrs(bio, REQ_OP_READ, 0);
+
+ rqd->bio = bio;
+ rqd->opcode = NVM_OP_PREAD;
+ rqd->flags = pblk_set_read_mode(pblk);
+ rqd->meta_list = meta_list;
+ rqd->nr_ppas = rq_ppas;
+ rqd->ppa_list = ppa_list;
+ rqd->dma_ppa_list = dma_ppa_list;
+ rqd->dma_meta_list = dma_meta_list;
+ rqd->end_io = pblk_end_io_sync;
+ rqd->private = &wait;
+
+ for (i = 0; i < rqd->nr_ppas; ) {
+ struct ppa_addr ppa;
+ int pos;
+
+ w_ptr = pblk_alloc_page(pblk, line, pblk->min_write_pgs);
+ ppa = addr_to_gen_ppa(pblk, w_ptr, line->id);
+ pos = pblk_dev_ppa_to_pos(geo, ppa);
+
+ while (test_bit(pos, line->blk_bitmap)) {
+ w_ptr += pblk->min_write_pgs;
+ ppa = addr_to_gen_ppa(pblk, w_ptr, line->id);
+ pos = pblk_dev_ppa_to_pos(geo, ppa);
+ }
+
+ for (j = 0; j < pblk->min_write_pgs; j++, i++, w_ptr++)
+ rqd->ppa_list[i] =
+ addr_to_gen_ppa(pblk, w_ptr, line->id);
+ }
+
+ ret = pblk_submit_io(pblk, rqd);
+ if (ret) {
+ pr_err("pblk: I/O submission failed: %d\n", ret);
+ return ret;
+ }
+
+ if (!wait_for_completion_io_timeout(&wait,
+ msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) {
+ pr_err("pblk: L2P recovery read timed out\n");
+ }
+ reinit_completion(&wait);
+
+ /* This should not happen since the read failed during normal recovery,
+ * but the media works funny sometimes...
+ */
+ if (!rec_round++ && !rqd->error) {
+ rec_round = 0;
+ for (i = 0; i < rqd->nr_ppas; i++, r_ptr++) {
+ u64 lba = le64_to_cpu(meta_list[i].lba);
+
+ if (lba == ADDR_EMPTY || lba > pblk->rl.nr_secs)
+ continue;
+
+ pblk_update_map(pblk, lba, rqd->ppa_list[i]);
+ }
+ }
+
+ /* Reached the end of the written line */
+ if (rqd->error == NVM_RSP_ERR_EMPTYPAGE) {
+ int pad_secs, nr_error_bits, bit;
+ int ret;
+
+ bit = find_first_bit((void *)&rqd->ppa_status, rqd->nr_ppas);
+ nr_error_bits = rqd->nr_ppas - bit;
+
+ /* Roll back failed sectors */
+ line->cur_sec -= nr_error_bits;
+ line->left_msecs += nr_error_bits;
+ bitmap_clear(line->map_bitmap, line->cur_sec, nr_error_bits);
+
+ pad_secs = pblk_pad_distance(pblk);
+ if (pad_secs > line->left_msecs)
+ pad_secs = line->left_msecs;
+
+ ret = pblk_recov_pad_oob(pblk, line, p, pad_secs);
+ if (ret)
+ pr_err("pblk: OOB padding failed (err:%d)\n", ret);
+
+ ret = pblk_recov_read_oob(pblk, line, p, r_ptr);
+ if (ret)
+ pr_err("pblk: OOB read failed (err:%d)\n", ret);
+
+ line->left_ssecs = line->left_msecs;
+ left_ppas = 0;
+ }
+
+ left_ppas -= rq_ppas;
+ if (left_ppas > 0)
+ goto next_rq;
+
+ return ret;
+}
+
+static int pblk_recov_scan_oob(struct pblk *pblk, struct pblk_line *line,
+ struct pblk_recov_alloc p, int *done)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct nvm_geo *geo = &dev->geo;
+ struct ppa_addr *ppa_list;
+ struct pblk_sec_meta *meta_list;
+ struct nvm_rq *rqd;
+ struct bio *bio;
+ void *data;
+ dma_addr_t dma_ppa_list, dma_meta_list;
+ u64 paddr;
+ int rq_ppas, rq_len;
+ int i, j;
+ int ret = 0;
+ int left_ppas = pblk_calc_sec_in_line(pblk, line);
+ DECLARE_COMPLETION_ONSTACK(wait);
+
+ ppa_list = p.ppa_list;
+ meta_list = p.meta_list;
+ rqd = p.rqd;
+ data = p.data;
+ dma_ppa_list = p.dma_ppa_list;
+ dma_meta_list = p.dma_meta_list;
+
+ *done = 1;
+
+next_rq:
+ memset(rqd, 0, pblk_r_rq_size);
+
+ rq_ppas = pblk_calc_secs(pblk, left_ppas, 0);
+ if (!rq_ppas)
+ rq_ppas = pblk->min_write_pgs;
+ rq_len = rq_ppas * geo->sec_size;
+
+ bio = bio_map_kern(dev->q, data, rq_len, GFP_KERNEL);
+ if (IS_ERR(bio))
+ return PTR_ERR(bio);
+
+ bio->bi_iter.bi_sector = 0; /* internal bio */
+ bio_set_op_attrs(bio, REQ_OP_READ, 0);
+
+ rqd->bio = bio;
+ rqd->opcode = NVM_OP_PREAD;
+ rqd->flags = pblk_set_read_mode(pblk);
+ rqd->meta_list = meta_list;
+ rqd->nr_ppas = rq_ppas;
+ rqd->ppa_list = ppa_list;
+ rqd->dma_ppa_list = dma_ppa_list;
+ rqd->dma_meta_list = dma_meta_list;
+ rqd->end_io = pblk_end_io_sync;
+ rqd->private = &wait;
+
+ for (i = 0; i < rqd->nr_ppas; ) {
+ struct ppa_addr ppa;
+ int pos;
+
+ paddr = pblk_alloc_page(pblk, line, pblk->min_write_pgs);
+ ppa = addr_to_gen_ppa(pblk, paddr, line->id);
+ pos = pblk_dev_ppa_to_pos(geo, ppa);
+
+ while (test_bit(pos, line->blk_bitmap)) {
+ paddr += pblk->min_write_pgs;
+ ppa = addr_to_gen_ppa(pblk, paddr, line->id);
+ pos = pblk_dev_ppa_to_pos(geo, ppa);
+ }
+
+ for (j = 0; j < pblk->min_write_pgs; j++, i++, paddr++)
+ rqd->ppa_list[i] =
+ addr_to_gen_ppa(pblk, paddr, line->id);
+ }
+
+ ret = pblk_submit_io(pblk, rqd);
+ if (ret) {
+ pr_err("pblk: I/O submission failed: %d\n", ret);
+ bio_put(bio);
+ return ret;
+ }
+
+ if (!wait_for_completion_io_timeout(&wait,
+ msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) {
+ pr_err("pblk: L2P recovery read timed out\n");
+ }
+ reinit_completion(&wait);
+
+ /* Reached the end of the written line */
+ if (rqd->error) {
+ int nr_error_bits, bit;
+
+ bit = find_first_bit((void *)&rqd->ppa_status, rqd->nr_ppas);
+ nr_error_bits = rqd->nr_ppas - bit;
+
+ /* Roll back failed sectors */
+ line->cur_sec -= nr_error_bits;
+ line->left_msecs += nr_error_bits;
+ line->left_ssecs = line->left_msecs;
+ bitmap_clear(line->map_bitmap, line->cur_sec, nr_error_bits);
+
+ left_ppas = 0;
+ rqd->nr_ppas = bit;
+
+ if (rqd->error != NVM_RSP_ERR_EMPTYPAGE)
+ *done = 0;
+ }
+
+ for (i = 0; i < rqd->nr_ppas; i++) {
+ u64 lba = le64_to_cpu(meta_list[i].lba);
+
+ if (lba == ADDR_EMPTY || lba > pblk->rl.nr_secs)
+ continue;
+
+ pblk_update_map(pblk, lba, rqd->ppa_list[i]);
+ }
+
+ left_ppas -= rq_ppas;
+ if (left_ppas > 0)
+ goto next_rq;
+
+ return ret;
+}
+
+/* Scan line for lbas on out of bound area */
+static int pblk_recov_l2p_from_oob(struct pblk *pblk, struct pblk_line *line)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct nvm_geo *geo = &dev->geo;
+ struct nvm_rq *rqd;
+ struct ppa_addr *ppa_list;
+ struct pblk_sec_meta *meta_list;
+ struct pblk_recov_alloc p;
+ void *data;
+ dma_addr_t dma_ppa_list, dma_meta_list;
+ int done, ret = 0;
+
+ rqd = pblk_alloc_rqd(pblk, READ);
+ if (IS_ERR(rqd))
+ return PTR_ERR(rqd);
+
+ meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, &dma_meta_list);
+ if (!meta_list) {
+ ret = -ENOMEM;
+ goto free_rqd;
+ }
+
+ ppa_list = (void *)(meta_list) + pblk_dma_meta_size;
+ dma_ppa_list = dma_meta_list + pblk_dma_meta_size;
+
+ data = kcalloc(pblk->max_write_pgs, geo->sec_size, GFP_KERNEL);
+ if (!data) {
+ ret = -ENOMEM;
+ goto free_meta_list;
+ }
+
+ p.ppa_list = ppa_list;
+ p.meta_list = meta_list;
+ p.rqd = rqd;
+ p.data = data;
+ p.dma_ppa_list = dma_ppa_list;
+ p.dma_meta_list = dma_meta_list;
+
+ ret = pblk_recov_scan_oob(pblk, line, p, &done);
+ if (ret) {
+ pr_err("pblk: could not recover L2P from OOB\n");
+ goto out;
+ }
+
+ if (!done) {
+ ret = pblk_recov_scan_all_oob(pblk, line, p);
+ if (ret) {
+ pr_err("pblk: could not recover L2P from OOB\n");
+ goto out;
+ }
+ }
+
+ if (pblk_line_is_full(line))
+ pblk_line_recov_close(pblk, line);
+
+out:
+ kfree(data);
+free_meta_list:
+ nvm_dev_dma_free(dev->parent, meta_list, dma_meta_list);
+free_rqd:
+ pblk_free_rqd(pblk, rqd, READ);
+
+ return ret;
+}
+
+/* Insert lines ordered by sequence number (seq_num) on list */
+static void pblk_recov_line_add_ordered(struct list_head *head,
+ struct pblk_line *line)
+{
+ struct pblk_line *t = NULL;
+
+ list_for_each_entry(t, head, list)
+ if (t->seq_nr > line->seq_nr)
+ break;
+
+ __list_add(&line->list, t->list.prev, &t->list);
+}
+
+struct pblk_line *pblk_recov_l2p(struct pblk *pblk)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct nvm_geo *geo = &dev->geo;
+ struct pblk_line_meta *lm = &pblk->lm;
+ struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+ struct pblk_line *line, *tline, *data_line = NULL;
+ struct line_smeta *smeta;
+ struct line_emeta *emeta;
+ int found_lines = 0, recovered_lines = 0, open_lines = 0;
+ int is_next = 0;
+ int meta_line;
+ int i, valid_uuid = 0;
+ LIST_HEAD(recov_list);
+
+ /* TODO: Implement FTL snapshot */
+
+ /* Scan recovery - takes place when FTL snapshot fails */
+ spin_lock(&l_mg->free_lock);
+ meta_line = find_first_zero_bit(&l_mg->meta_bitmap, PBLK_DATA_LINES);
+ set_bit(meta_line, &l_mg->meta_bitmap);
+ smeta = l_mg->sline_meta[meta_line].meta;
+ emeta = l_mg->eline_meta[meta_line].meta;
+ spin_unlock(&l_mg->free_lock);
+
+ /* Order data lines using their sequence number */
+ for (i = 0; i < l_mg->nr_lines; i++) {
+ u32 crc;
+
+ line = &pblk->lines[i];
+
+ memset(smeta, 0, lm->smeta_len);
+ line->smeta = smeta;
+ line->lun_bitmap = ((void *)(smeta)) +
+ sizeof(struct line_smeta);
+
+ /* Lines that cannot be read are assumed as not written here */
+ if (pblk_line_read_smeta(pblk, line))
+ continue;
+
+ crc = pblk_calc_smeta_crc(pblk, smeta);
+ if (le32_to_cpu(smeta->crc) != crc)
+ continue;
+
+ if (le32_to_cpu(smeta->header.identifier) != PBLK_MAGIC)
+ continue;
+
+ if (le16_to_cpu(smeta->header.version) != 1) {
+ pr_err("pblk: found incompatible line version %u\n",
+ smeta->header.version);
+ return ERR_PTR(-EINVAL);
+ }
+
+ /* The first valid instance uuid is used for initialization */
+ if (!valid_uuid) {
+ memcpy(pblk->instance_uuid, smeta->header.uuid, 16);
+ valid_uuid = 1;
+ }
+
+ if (memcmp(pblk->instance_uuid, smeta->header.uuid, 16)) {
+ pr_debug("pblk: ignore line %u due to uuid mismatch\n",
+ i);
+ continue;
+ }
+
+ /* Update line metadata */
+ spin_lock(&line->lock);
+ line->id = le32_to_cpu(line->smeta->header.id);
+ line->type = le16_to_cpu(line->smeta->header.type);
+ line->seq_nr = le64_to_cpu(line->smeta->seq_nr);
+ spin_unlock(&line->lock);
+
+ /* Update general metadata */
+ spin_lock(&l_mg->free_lock);
+ if (line->seq_nr >= l_mg->d_seq_nr)
+ l_mg->d_seq_nr = line->seq_nr + 1;
+ l_mg->nr_free_lines--;
+ spin_unlock(&l_mg->free_lock);
+
+ if (pblk_line_recov_alloc(pblk, line))
+ goto out;
+
+ pblk_recov_line_add_ordered(&recov_list, line);
+ found_lines++;
+ pr_debug("pblk: recovering data line %d, seq:%llu\n",
+ line->id, smeta->seq_nr);
+ }
+
+ if (!found_lines) {
+ pblk_setup_uuid(pblk);
+
+ spin_lock(&l_mg->free_lock);
+ WARN_ON_ONCE(!test_and_clear_bit(meta_line,
+ &l_mg->meta_bitmap));
+ spin_unlock(&l_mg->free_lock);
+
+ goto out;
+ }
+
+ /* Verify closed blocks and recover this portion of L2P table*/
+ list_for_each_entry_safe(line, tline, &recov_list, list) {
+ int off, nr_bb;
+
+ recovered_lines++;
+ /* Calculate where emeta starts based on the line bb */
+ off = lm->sec_per_line - lm->emeta_sec;
+ nr_bb = bitmap_weight(line->blk_bitmap, lm->blk_per_line);
+ off -= nr_bb * geo->sec_per_pl;
+
+ memset(emeta, 0, lm->emeta_len);
+ line->emeta = emeta;
+ line->emeta_ssec = off;
+
+ if (pblk_line_read_emeta(pblk, line)) {
+ pblk_recov_l2p_from_oob(pblk, line);
+ goto next;
+ }
+
+ if (pblk_recov_l2p_from_emeta(pblk, line))
+ pblk_recov_l2p_from_oob(pblk, line);
+
+next:
+ if (pblk_line_is_full(line)) {
+ struct list_head *move_list;
+
+ spin_lock(&line->lock);
+ line->state = PBLK_LINESTATE_CLOSED;
+ move_list = pblk_line_gc_list(pblk, line);
+ spin_unlock(&line->lock);
+
+ spin_lock(&l_mg->gc_lock);
+ list_move_tail(&line->list, move_list);
+ spin_unlock(&l_mg->gc_lock);
+
+ mempool_free(line->map_bitmap, pblk->line_meta_pool);
+ line->map_bitmap = NULL;
+ line->smeta = NULL;
+ line->emeta = NULL;
+ } else {
+ if (open_lines > 1)
+ pr_err("pblk: failed to recover L2P\n");
+
+ open_lines++;
+ line->meta_line = meta_line;
+ data_line = line;
+ }
+ }
+
+ spin_lock(&l_mg->free_lock);
+ if (!open_lines) {
+ WARN_ON_ONCE(!test_and_clear_bit(meta_line,
+ &l_mg->meta_bitmap));
+ pblk_line_replace_data(pblk);
+ } else {
+ /* Allocate next line for preparation */
+ l_mg->data_next = pblk_line_get(pblk);
+ if (l_mg->data_next) {
+ l_mg->data_next->seq_nr = l_mg->d_seq_nr++;
+ l_mg->data_next->type = PBLK_LINETYPE_DATA;
+ is_next = 1;
+ }
+ }
+ spin_unlock(&l_mg->free_lock);
+
+ if (is_next) {
+ pblk_line_erase(pblk, l_mg->data_next);
+ pblk_rl_free_lines_dec(&pblk->rl, l_mg->data_next);
+ }
+
+out:
+ if (found_lines != recovered_lines)
+ pr_err("pblk: failed to recover all found lines %d/%d\n",
+ found_lines, recovered_lines);
+
+ return data_line;
+}
+
+/*
+ * Pad until smeta can be read on current data line
+ */
+void pblk_recov_pad(struct pblk *pblk)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct nvm_geo *geo = &dev->geo;
+ struct pblk_line *line;
+ struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+ struct nvm_rq *rqd;
+ struct pblk_recov_alloc p;
+ struct ppa_addr *ppa_list;
+ struct pblk_sec_meta *meta_list;
+ void *data;
+ dma_addr_t dma_ppa_list, dma_meta_list;
+
+ spin_lock(&l_mg->free_lock);
+ line = l_mg->data_line;
+ spin_unlock(&l_mg->free_lock);
+
+ rqd = pblk_alloc_rqd(pblk, READ);
+ if (IS_ERR(rqd))
+ return;
+
+ meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, &dma_meta_list);
+ if (!meta_list)
+ goto free_rqd;
+
+ ppa_list = (void *)(meta_list) + pblk_dma_meta_size;
+ dma_ppa_list = dma_meta_list + pblk_dma_meta_size;
+
+ data = kcalloc(pblk->max_write_pgs, geo->sec_size, GFP_KERNEL);
+ if (!data)
+ goto free_meta_list;
+
+ p.ppa_list = ppa_list;
+ p.meta_list = meta_list;
+ p.rqd = rqd;
+ p.data = data;
+ p.dma_ppa_list = dma_ppa_list;
+ p.dma_meta_list = dma_meta_list;
+
+ if (pblk_recov_pad_oob(pblk, line, p, line->left_msecs)) {
+ pr_err("pblk: Tear down padding failed\n");
+ goto free_data;
+ }
+
+ pblk_line_close(pblk, line);
+
+free_data:
+ kfree(data);
+free_meta_list:
+ nvm_dev_dma_free(dev->parent, meta_list, dma_meta_list);
+free_rqd:
+ pblk_free_rqd(pblk, rqd, READ);
+}
diff --git a/drivers/lightnvm/pblk-rl.c b/drivers/lightnvm/pblk-rl.c
new file mode 100644
index 000000000000..ab7cbb144f3f
--- /dev/null
+++ b/drivers/lightnvm/pblk-rl.c
@@ -0,0 +1,184 @@
+/*
+ * Copyright (C) 2016 CNEX Labs
+ * Initial release: Javier Gonzalez <javier@cnexlabs.com>
+ * Matias Bjorling <matias@cnexlabs.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * pblk-rl.c - pblk's rate limiter for user I/O
+ *
+ */
+
+#include "pblk.h"
+
+static void pblk_rl_kick_u_timer(struct pblk_rl *rl)
+{
+ mod_timer(&rl->u_timer, jiffies + msecs_to_jiffies(5000));
+}
+
+int pblk_rl_user_may_insert(struct pblk_rl *rl, int nr_entries)
+{
+ int rb_user_cnt = atomic_read(&rl->rb_user_cnt);
+
+ return (!(rb_user_cnt + nr_entries > rl->rb_user_max));
+}
+
+int pblk_rl_gc_may_insert(struct pblk_rl *rl, int nr_entries)
+{
+ int rb_gc_cnt = atomic_read(&rl->rb_gc_cnt);
+ int rb_user_active;
+
+ /* If there is no user I/O let GC take over space on the write buffer */
+ rb_user_active = READ_ONCE(rl->rb_user_active);
+ return (!(rb_gc_cnt + nr_entries > rl->rb_gc_max && rb_user_active));
+}
+
+void pblk_rl_user_in(struct pblk_rl *rl, int nr_entries)
+{
+ atomic_add(nr_entries, &rl->rb_user_cnt);
+
+ /* Release user I/O state. Protect from GC */
+ smp_store_release(&rl->rb_user_active, 1);
+ pblk_rl_kick_u_timer(rl);
+}
+
+void pblk_rl_gc_in(struct pblk_rl *rl, int nr_entries)
+{
+ atomic_add(nr_entries, &rl->rb_gc_cnt);
+}
+
+void pblk_rl_out(struct pblk_rl *rl, int nr_user, int nr_gc)
+{
+ atomic_sub(nr_user, &rl->rb_user_cnt);
+ atomic_sub(nr_gc, &rl->rb_gc_cnt);
+}
+
+unsigned long pblk_rl_nr_free_blks(struct pblk_rl *rl)
+{
+ return atomic_read(&rl->free_blocks);
+}
+
+/*
+ * We check for (i) the number of free blocks in the current LUN and (ii) the
+ * total number of free blocks in the pblk instance. This is to even out the
+ * number of free blocks on each LUN when GC kicks in.
+ *
+ * Only the total number of free blocks is used to configure the rate limiter.
+ */
+static int pblk_rl_update_rates(struct pblk_rl *rl, unsigned long max)
+{
+ unsigned long free_blocks = pblk_rl_nr_free_blks(rl);
+
+ if (free_blocks >= rl->high) {
+ rl->rb_user_max = max - rl->rb_gc_rsv;
+ rl->rb_gc_max = rl->rb_gc_rsv;
+ rl->rb_state = PBLK_RL_HIGH;
+ } else if (free_blocks < rl->high) {
+ int shift = rl->high_pw - rl->rb_windows_pw;
+ int user_windows = free_blocks >> shift;
+ int user_max = user_windows << PBLK_MAX_REQ_ADDRS_PW;
+ int gc_max;
+
+ rl->rb_user_max = user_max;
+ gc_max = max - rl->rb_user_max;
+ rl->rb_gc_max = max(gc_max, rl->rb_gc_rsv);
+
+ if (free_blocks > rl->low)
+ rl->rb_state = PBLK_RL_MID;
+ else
+ rl->rb_state = PBLK_RL_LOW;
+ }
+
+ return rl->rb_state;
+}
+
+void pblk_rl_set_gc_rsc(struct pblk_rl *rl, int rsv)
+{
+ rl->rb_gc_rsv = rl->rb_gc_max = rsv;
+}
+
+void pblk_rl_free_lines_inc(struct pblk_rl *rl, struct pblk_line *line)
+{
+ struct pblk *pblk = container_of(rl, struct pblk, rl);
+ int blk_in_line = atomic_read(&line->blk_in_line);
+ int ret;
+
+ atomic_add(blk_in_line, &rl->free_blocks);
+ /* Rates will not change that often - no need to lock update */
+ ret = pblk_rl_update_rates(rl, rl->rb_budget);
+
+ if (ret == (PBLK_RL_MID | PBLK_RL_LOW))
+ pblk_gc_should_start(pblk);
+ else
+ pblk_gc_should_stop(pblk);
+}
+
+void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line)
+{
+ struct pblk *pblk = container_of(rl, struct pblk, rl);
+ int blk_in_line = atomic_read(&line->blk_in_line);
+ int ret;
+
+ atomic_sub(blk_in_line, &rl->free_blocks);
+
+ /* Rates will not change that often - no need to lock update */
+ ret = pblk_rl_update_rates(rl, rl->rb_budget);
+ if (ret == (PBLK_RL_MID | PBLK_RL_LOW))
+ pblk_gc_should_start(pblk);
+ else
+ pblk_gc_should_stop(pblk);
+}
+
+int pblk_rl_gc_thrs(struct pblk_rl *rl)
+{
+ return rl->high;
+}
+
+int pblk_rl_sysfs_rate_show(struct pblk_rl *rl)
+{
+ return rl->rb_user_max;
+}
+
+static void pblk_rl_u_timer(unsigned long data)
+{
+ struct pblk_rl *rl = (struct pblk_rl *)data;
+
+ /* Release user I/O state. Protect from GC */
+ smp_store_release(&rl->rb_user_active, 0);
+}
+
+void pblk_rl_free(struct pblk_rl *rl)
+{
+ del_timer(&rl->u_timer);
+}
+
+void pblk_rl_init(struct pblk_rl *rl, int budget)
+{
+ unsigned int rb_windows;
+
+ rl->high = rl->total_blocks / PBLK_USER_HIGH_THRS;
+ rl->low = rl->total_blocks / PBLK_USER_LOW_THRS;
+ rl->high_pw = get_count_order(rl->high);
+
+ /* This will always be a power-of-2 */
+ rb_windows = budget / PBLK_MAX_REQ_ADDRS;
+ rl->rb_windows_pw = get_count_order(rb_windows) + 1;
+
+ /* To start with, all buffer is available to user I/O writers */
+ rl->rb_budget = budget;
+ rl->rb_user_max = budget;
+ atomic_set(&rl->rb_user_cnt, 0);
+ rl->rb_gc_max = 0;
+ rl->rb_state = PBLK_RL_HIGH;
+ atomic_set(&rl->rb_gc_cnt, 0);
+
+ setup_timer(&rl->u_timer, pblk_rl_u_timer, (unsigned long)rl);
+ rl->rb_user_active = 0;
+}
diff --git a/drivers/lightnvm/pblk-sysfs.c b/drivers/lightnvm/pblk-sysfs.c
new file mode 100644
index 000000000000..f0af1d1ceeff
--- /dev/null
+++ b/drivers/lightnvm/pblk-sysfs.c
@@ -0,0 +1,507 @@
+/*
+ * Copyright (C) 2016 CNEX Labs
+ * Initial release: Javier Gonzalez <javier@cnexlabs.com>
+ * Matias Bjorling <matias@cnexlabs.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Implementation of a physical block-device target for Open-channel SSDs.
+ *
+ * pblk-sysfs.c - pblk's sysfs
+ *
+ */
+
+#include "pblk.h"
+
+static ssize_t pblk_sysfs_luns_show(struct pblk *pblk, char *page)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct nvm_geo *geo = &dev->geo;
+ struct pblk_lun *rlun;
+ ssize_t sz = 0;
+ int i;
+
+ for (i = 0; i < geo->nr_luns; i++) {
+ int active = 1;
+
+ rlun = &pblk->luns[i];
+ if (!down_trylock(&rlun->wr_sem)) {
+ active = 0;
+ up(&rlun->wr_sem);
+ }
+ sz += snprintf(page + sz, PAGE_SIZE - sz,
+ "pblk: pos:%d, ch:%d, lun:%d - %d\n",
+ i,
+ rlun->bppa.g.ch,
+ rlun->bppa.g.lun,
+ active);
+ }
+
+ return sz;
+}
+
+static ssize_t pblk_sysfs_rate_limiter(struct pblk *pblk, char *page)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct nvm_geo *geo = &dev->geo;
+ int free_blocks, total_blocks;
+ int rb_user_max, rb_user_cnt;
+ int rb_gc_max, rb_gc_rsv, rb_gc_cnt, rb_budget, rb_state;
+
+ free_blocks = atomic_read(&pblk->rl.free_blocks);
+ rb_user_max = pblk->rl.rb_user_max;
+ rb_user_cnt = atomic_read(&pblk->rl.rb_user_cnt);
+ rb_gc_max = pblk->rl.rb_gc_max;
+ rb_gc_rsv = pblk->rl.rb_gc_rsv;
+ rb_gc_cnt = atomic_read(&pblk->rl.rb_gc_cnt);
+ rb_budget = pblk->rl.rb_budget;
+ rb_state = pblk->rl.rb_state;
+
+ total_blocks = geo->blks_per_lun * geo->nr_luns;
+
+ return snprintf(page, PAGE_SIZE,
+ "u:%u/%u,gc:%u/%u/%u(%u/%u)(stop:<%u,full:>%u,free:%d/%d)-%d\n",
+ rb_user_cnt,
+ rb_user_max,
+ rb_gc_cnt,
+ rb_gc_max,
+ rb_gc_rsv,
+ rb_state,
+ rb_budget,
+ pblk->rl.low,
+ pblk->rl.high,
+ free_blocks,
+ total_blocks,
+ READ_ONCE(pblk->rl.rb_user_active));
+}
+
+static ssize_t pblk_sysfs_gc_state_show(struct pblk *pblk, char *page)
+{
+ int gc_enabled, gc_active;
+
+ pblk_gc_sysfs_state_show(pblk, &gc_enabled, &gc_active);
+ return snprintf(page, PAGE_SIZE, "gc_enabled=%d, gc_active=%d\n",
+ gc_enabled, gc_active);
+}
+
+static ssize_t pblk_sysfs_stats(struct pblk *pblk, char *page)
+{
+ ssize_t sz;
+
+ sz = snprintf(page, PAGE_SIZE,
+ "read_failed=%lu, read_high_ecc=%lu, read_empty=%lu, read_failed_gc=%lu, write_failed=%lu, erase_failed=%lu\n",
+ atomic_long_read(&pblk->read_failed),
+ atomic_long_read(&pblk->read_high_ecc),
+ atomic_long_read(&pblk->read_empty),
+ atomic_long_read(&pblk->read_failed_gc),
+ atomic_long_read(&pblk->write_failed),
+ atomic_long_read(&pblk->erase_failed));
+
+ return sz;
+}
+
+static ssize_t pblk_sysfs_write_buffer(struct pblk *pblk, char *page)
+{
+ return pblk_rb_sysfs(&pblk->rwb, page);
+}
+
+static ssize_t pblk_sysfs_ppaf(struct pblk *pblk, char *page)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct nvm_geo *geo = &dev->geo;
+ ssize_t sz = 0;
+
+ sz = snprintf(page, PAGE_SIZE - sz,
+ "g:(b:%d)blk:%d/%d,pg:%d/%d,lun:%d/%d,ch:%d/%d,pl:%d/%d,sec:%d/%d\n",
+ pblk->ppaf_bitsize,
+ pblk->ppaf.blk_offset, geo->ppaf.blk_len,
+ pblk->ppaf.pg_offset, geo->ppaf.pg_len,
+ pblk->ppaf.lun_offset, geo->ppaf.lun_len,
+ pblk->ppaf.ch_offset, geo->ppaf.ch_len,
+ pblk->ppaf.pln_offset, geo->ppaf.pln_len,
+ pblk->ppaf.sec_offset, geo->ppaf.sect_len);
+
+ sz += snprintf(page + sz, PAGE_SIZE - sz,
+ "d:blk:%d/%d,pg:%d/%d,lun:%d/%d,ch:%d/%d,pl:%d/%d,sec:%d/%d\n",
+ geo->ppaf.blk_offset, geo->ppaf.blk_len,
+ geo->ppaf.pg_offset, geo->ppaf.pg_len,
+ geo->ppaf.lun_offset, geo->ppaf.lun_len,
+ geo->ppaf.ch_offset, geo->ppaf.ch_len,
+ geo->ppaf.pln_offset, geo->ppaf.pln_len,
+ geo->ppaf.sect_offset, geo->ppaf.sect_len);
+
+ return sz;
+}
+
+static ssize_t pblk_sysfs_lines(struct pblk *pblk, char *page)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct nvm_geo *geo = &dev->geo;
+ struct pblk_line_meta *lm = &pblk->lm;
+ struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+ struct pblk_line *line;
+ ssize_t sz = 0;
+ int nr_free_lines;
+ int cur_data, cur_log;
+ int free_line_cnt = 0, closed_line_cnt = 0;
+ int d_line_cnt = 0, l_line_cnt = 0;
+ int gc_full = 0, gc_high = 0, gc_mid = 0, gc_low = 0, gc_empty = 0;
+ int free = 0, bad = 0, cor = 0;
+ int msecs = 0, ssecs = 0, cur_sec = 0, vsc = 0, sec_in_line = 0;
+ int map_weight = 0, meta_weight = 0;
+
+ spin_lock(&l_mg->free_lock);
+ cur_data = (l_mg->data_line) ? l_mg->data_line->id : -1;
+ cur_log = (l_mg->log_line) ? l_mg->log_line->id : -1;
+ nr_free_lines = l_mg->nr_free_lines;
+
+ list_for_each_entry(line, &l_mg->free_list, list)
+ free_line_cnt++;
+ spin_unlock(&l_mg->free_lock);
+
+ spin_lock(&l_mg->gc_lock);
+ list_for_each_entry(line, &l_mg->gc_full_list, list) {
+ if (line->type == PBLK_LINETYPE_DATA)
+ d_line_cnt++;
+ else if (line->type == PBLK_LINETYPE_LOG)
+ l_line_cnt++;
+ closed_line_cnt++;
+ gc_full++;
+ }
+
+ list_for_each_entry(line, &l_mg->gc_high_list, list) {
+ if (line->type == PBLK_LINETYPE_DATA)
+ d_line_cnt++;
+ else if (line->type == PBLK_LINETYPE_LOG)
+ l_line_cnt++;
+ closed_line_cnt++;
+ gc_high++;
+ }
+
+ list_for_each_entry(line, &l_mg->gc_mid_list, list) {
+ if (line->type == PBLK_LINETYPE_DATA)
+ d_line_cnt++;
+ else if (line->type == PBLK_LINETYPE_LOG)
+ l_line_cnt++;
+ closed_line_cnt++;
+ gc_mid++;
+ }
+
+ list_for_each_entry(line, &l_mg->gc_low_list, list) {
+ if (line->type == PBLK_LINETYPE_DATA)
+ d_line_cnt++;
+ else if (line->type == PBLK_LINETYPE_LOG)
+ l_line_cnt++;
+ closed_line_cnt++;
+ gc_low++;
+ }
+
+ list_for_each_entry(line, &l_mg->gc_empty_list, list) {
+ if (line->type == PBLK_LINETYPE_DATA)
+ d_line_cnt++;
+ else if (line->type == PBLK_LINETYPE_LOG)
+ l_line_cnt++;
+ closed_line_cnt++;
+ gc_empty++;
+ }
+
+ list_for_each_entry(line, &l_mg->free_list, list)
+ free++;
+ list_for_each_entry(line, &l_mg->bad_list, list)
+ bad++;
+ list_for_each_entry(line, &l_mg->corrupt_list, list)
+ cor++;
+ spin_unlock(&l_mg->gc_lock);
+
+ spin_lock(&l_mg->free_lock);
+ if (l_mg->data_line) {
+ cur_sec = l_mg->data_line->cur_sec;
+ msecs = l_mg->data_line->left_msecs;
+ ssecs = l_mg->data_line->left_ssecs;
+ vsc = l_mg->data_line->vsc;
+ sec_in_line = l_mg->data_line->sec_in_line;
+ meta_weight = bitmap_weight(&l_mg->meta_bitmap,
+ PBLK_DATA_LINES);
+ map_weight = bitmap_weight(l_mg->data_line->map_bitmap,
+ lm->sec_per_line);
+ }
+ spin_unlock(&l_mg->free_lock);
+
+ if (nr_free_lines != free_line_cnt)
+ pr_err("pblk: corrupted free line list\n");
+
+ sz = snprintf(page, PAGE_SIZE - sz,
+ "line: nluns:%d, nblks:%d, nsecs:%d\n",
+ geo->nr_luns, lm->blk_per_line, lm->sec_per_line);
+
+ sz += snprintf(page + sz, PAGE_SIZE - sz,
+ "lines:d:%d,l:%d-f:%d(%d),b:%d,co:%d,c:%d(d:%d,l:%d)t:%d\n",
+ cur_data, cur_log,
+ free, nr_free_lines, bad, cor,
+ closed_line_cnt,
+ d_line_cnt, l_line_cnt,
+ l_mg->nr_lines);
+
+ sz += snprintf(page + sz, PAGE_SIZE - sz,
+ "GC: full:%d, high:%d, mid:%d, low:%d, empty:%d, queue:%d\n",
+ gc_full, gc_high, gc_mid, gc_low, gc_empty,
+ atomic_read(&pblk->gc.inflight_gc));
+
+ sz += snprintf(page + sz, PAGE_SIZE - sz,
+ "data (%d) cur:%d, left:%d/%d, vsc:%d, s:%d, map:%d/%d (%d)\n",
+ cur_data, cur_sec, msecs, ssecs, vsc, sec_in_line,
+ map_weight, lm->sec_per_line, meta_weight);
+
+ return sz;
+}
+
+static ssize_t pblk_sysfs_lines_info(struct pblk *pblk, char *page)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct nvm_geo *geo = &dev->geo;
+ struct pblk_line_meta *lm = &pblk->lm;
+ ssize_t sz = 0;
+
+ sz = snprintf(page, PAGE_SIZE - sz,
+ "smeta - len:%d, secs:%d\n",
+ lm->smeta_len, lm->smeta_sec);
+ sz += snprintf(page + sz, PAGE_SIZE - sz,
+ "emeta - len:%d, sec:%d, bb_start:%d\n",
+ lm->emeta_len, lm->emeta_sec,
+ lm->emeta_bb);
+ sz += snprintf(page + sz, PAGE_SIZE - sz,
+ "bitmap lengths: sec:%d, blk:%d, lun:%d\n",
+ lm->sec_bitmap_len,
+ lm->blk_bitmap_len,
+ lm->lun_bitmap_len);
+ sz += snprintf(page + sz, PAGE_SIZE - sz,
+ "blk_line:%d, sec_line:%d, sec_blk:%d\n",
+ lm->blk_per_line,
+ lm->sec_per_line,
+ geo->sec_per_blk);
+
+ return sz;
+}
+
+#ifdef CONFIG_NVM_DEBUG
+static ssize_t pblk_sysfs_stats_debug(struct pblk *pblk, char *page)
+{
+ return snprintf(page, PAGE_SIZE,
+ "%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\n",
+ atomic_long_read(&pblk->inflight_writes),
+ atomic_long_read(&pblk->inflight_reads),
+ atomic_long_read(&pblk->req_writes),
+ atomic_long_read(&pblk->nr_flush),
+ atomic_long_read(&pblk->padded_writes),
+ atomic_long_read(&pblk->padded_wb),
+ atomic_long_read(&pblk->sub_writes),
+ atomic_long_read(&pblk->sync_writes),
+ atomic_long_read(&pblk->compl_writes),
+ atomic_long_read(&pblk->recov_writes),
+ atomic_long_read(&pblk->recov_gc_writes),
+ atomic_long_read(&pblk->recov_gc_reads),
+ atomic_long_read(&pblk->sync_reads));
+}
+#endif
+
+static ssize_t pblk_sysfs_rate_store(struct pblk *pblk, const char *page,
+ size_t len)
+{
+ struct pblk_gc *gc = &pblk->gc;
+ size_t c_len;
+ int value;
+
+ c_len = strcspn(page, "\n");
+ if (c_len >= len)
+ return -EINVAL;
+
+ if (kstrtouint(page, 0, &value))
+ return -EINVAL;
+
+ spin_lock(&gc->lock);
+ pblk_rl_set_gc_rsc(&pblk->rl, value);
+ spin_unlock(&gc->lock);
+
+ return len;
+}
+
+static ssize_t pblk_sysfs_gc_force(struct pblk *pblk, const char *page,
+ size_t len)
+{
+ size_t c_len;
+ int force;
+
+ c_len = strcspn(page, "\n");
+ if (c_len >= len)
+ return -EINVAL;
+
+ if (kstrtouint(page, 0, &force))
+ return -EINVAL;
+
+ if (force < 0 || force > 1)
+ return -EINVAL;
+
+ pblk_gc_sysfs_force(pblk, force);
+
+ return len;
+}
+
+static struct attribute sys_write_luns = {
+ .name = "write_luns",
+ .mode = 0444,
+};
+
+static struct attribute sys_rate_limiter_attr = {
+ .name = "rate_limiter",
+ .mode = 0444,
+};
+
+static struct attribute sys_gc_state = {
+ .name = "gc_state",
+ .mode = 0444,
+};
+
+static struct attribute sys_errors_attr = {
+ .name = "errors",
+ .mode = 0444,
+};
+
+static struct attribute sys_rb_attr = {
+ .name = "write_buffer",
+ .mode = 0444,
+};
+
+static struct attribute sys_stats_ppaf_attr = {
+ .name = "ppa_format",
+ .mode = 0444,
+};
+
+static struct attribute sys_lines_attr = {
+ .name = "lines",
+ .mode = 0444,
+};
+
+static struct attribute sys_lines_info_attr = {
+ .name = "lines_info",
+ .mode = 0444,
+};
+
+static struct attribute sys_gc_force = {
+ .name = "gc_force",
+ .mode = 0200,
+};
+
+static struct attribute sys_gc_rl_max = {
+ .name = "gc_rl_max",
+ .mode = 0200,
+};
+
+#ifdef CONFIG_NVM_DEBUG
+static struct attribute sys_stats_debug_attr = {
+ .name = "stats",
+ .mode = 0444,
+};
+#endif
+
+static struct attribute *pblk_attrs[] = {
+ &sys_write_luns,
+ &sys_rate_limiter_attr,
+ &sys_errors_attr,
+ &sys_gc_state,
+ &sys_gc_force,
+ &sys_gc_rl_max,
+ &sys_rb_attr,
+ &sys_stats_ppaf_attr,
+ &sys_lines_attr,
+ &sys_lines_info_attr,
+#ifdef CONFIG_NVM_DEBUG
+ &sys_stats_debug_attr,
+#endif
+ NULL,
+};
+
+static ssize_t pblk_sysfs_show(struct kobject *kobj, struct attribute *attr,
+ char *buf)
+{
+ struct pblk *pblk = container_of(kobj, struct pblk, kobj);
+
+ if (strcmp(attr->name, "rate_limiter") == 0)
+ return pblk_sysfs_rate_limiter(pblk, buf);
+ else if (strcmp(attr->name, "write_luns") == 0)
+ return pblk_sysfs_luns_show(pblk, buf);
+ else if (strcmp(attr->name, "gc_state") == 0)
+ return pblk_sysfs_gc_state_show(pblk, buf);
+ else if (strcmp(attr->name, "errors") == 0)
+ return pblk_sysfs_stats(pblk, buf);
+ else if (strcmp(attr->name, "write_buffer") == 0)
+ return pblk_sysfs_write_buffer(pblk, buf);
+ else if (strcmp(attr->name, "ppa_format") == 0)
+ return pblk_sysfs_ppaf(pblk, buf);
+ else if (strcmp(attr->name, "lines") == 0)
+ return pblk_sysfs_lines(pblk, buf);
+ else if (strcmp(attr->name, "lines_info") == 0)
+ return pblk_sysfs_lines_info(pblk, buf);
+#ifdef CONFIG_NVM_DEBUG
+ else if (strcmp(attr->name, "stats") == 0)
+ return pblk_sysfs_stats_debug(pblk, buf);
+#endif
+ return 0;
+}
+
+static ssize_t pblk_sysfs_store(struct kobject *kobj, struct attribute *attr,
+ const char *buf, size_t len)
+{
+ struct pblk *pblk = container_of(kobj, struct pblk, kobj);
+
+ if (strcmp(attr->name, "gc_rl_max") == 0)
+ return pblk_sysfs_rate_store(pblk, buf, len);
+ else if (strcmp(attr->name, "gc_force") == 0)
+ return pblk_sysfs_gc_force(pblk, buf, len);
+
+ return 0;
+}
+
+static const struct sysfs_ops pblk_sysfs_ops = {
+ .show = pblk_sysfs_show,
+ .store = pblk_sysfs_store,
+};
+
+static struct kobj_type pblk_ktype = {
+ .sysfs_ops = &pblk_sysfs_ops,
+ .default_attrs = pblk_attrs,
+};
+
+int pblk_sysfs_init(struct gendisk *tdisk)
+{
+ struct pblk *pblk = tdisk->private_data;
+ struct device *parent_dev = disk_to_dev(pblk->disk);
+ int ret;
+
+ ret = kobject_init_and_add(&pblk->kobj, &pblk_ktype,
+ kobject_get(&parent_dev->kobj),
+ "%s", "pblk");
+ if (ret) {
+ pr_err("pblk: could not register %s/pblk\n",
+ tdisk->disk_name);
+ return ret;
+ }
+
+ kobject_uevent(&pblk->kobj, KOBJ_ADD);
+ return 0;
+}
+
+void pblk_sysfs_exit(struct gendisk *tdisk)
+{
+ struct pblk *pblk = tdisk->private_data;
+
+ kobject_uevent(&pblk->kobj, KOBJ_REMOVE);
+ kobject_del(&pblk->kobj);
+ kobject_put(&pblk->kobj);
+}
diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c
new file mode 100644
index 000000000000..aef6fd7c4a0c
--- /dev/null
+++ b/drivers/lightnvm/pblk-write.c
@@ -0,0 +1,414 @@
+/*
+ * Copyright (C) 2016 CNEX Labs
+ * Initial release: Javier Gonzalez <javier@cnexlabs.com>
+ * Matias Bjorling <matias@cnexlabs.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * pblk-write.c - pblk's write path from write buffer to media
+ */
+
+#include "pblk.h"
+
+static void pblk_sync_line(struct pblk *pblk, struct pblk_line *line)
+{
+#ifdef CONFIG_NVM_DEBUG
+ atomic_long_inc(&pblk->sync_writes);
+#endif
+
+ /* Counter protected by rb sync lock */
+ line->left_ssecs--;
+ if (!line->left_ssecs)
+ pblk_line_run_ws(pblk, line, NULL, pblk_line_close_ws);
+}
+
+static unsigned long pblk_end_w_bio(struct pblk *pblk, struct nvm_rq *rqd,
+ struct pblk_c_ctx *c_ctx)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct bio *original_bio;
+ unsigned long ret;
+ int i;
+
+ for (i = 0; i < c_ctx->nr_valid; i++) {
+ struct pblk_w_ctx *w_ctx;
+ struct ppa_addr p;
+ struct pblk_line *line;
+
+ w_ctx = pblk_rb_w_ctx(&pblk->rwb, c_ctx->sentry + i);
+
+ p = rqd->ppa_list[i];
+ line = &pblk->lines[pblk_dev_ppa_to_line(p)];
+ pblk_sync_line(pblk, line);
+
+ while ((original_bio = bio_list_pop(&w_ctx->bios)))
+ bio_endio(original_bio);
+ }
+
+#ifdef CONFIG_NVM_DEBUG
+ atomic_long_add(c_ctx->nr_valid, &pblk->compl_writes);
+#endif
+
+ ret = pblk_rb_sync_advance(&pblk->rwb, c_ctx->nr_valid);
+
+ if (rqd->meta_list)
+ nvm_dev_dma_free(dev->parent, rqd->meta_list,
+ rqd->dma_meta_list);
+
+ bio_put(rqd->bio);
+ pblk_free_rqd(pblk, rqd, WRITE);
+
+ return ret;
+}
+
+static unsigned long pblk_end_queued_w_bio(struct pblk *pblk,
+ struct nvm_rq *rqd,
+ struct pblk_c_ctx *c_ctx)
+{
+ list_del(&c_ctx->list);
+ return pblk_end_w_bio(pblk, rqd, c_ctx);
+}
+
+static void pblk_complete_write(struct pblk *pblk, struct nvm_rq *rqd,
+ struct pblk_c_ctx *c_ctx)
+{
+ struct pblk_c_ctx *c, *r;
+ unsigned long flags;
+ unsigned long pos;
+
+#ifdef CONFIG_NVM_DEBUG
+ atomic_long_sub(c_ctx->nr_valid, &pblk->inflight_writes);
+#endif
+
+ pblk_up_rq(pblk, rqd->ppa_list, rqd->nr_ppas, c_ctx->lun_bitmap);
+
+ pos = pblk_rb_sync_init(&pblk->rwb, &flags);
+ if (pos == c_ctx->sentry) {
+ pos = pblk_end_w_bio(pblk, rqd, c_ctx);
+
+retry:
+ list_for_each_entry_safe(c, r, &pblk->compl_list, list) {
+ rqd = nvm_rq_from_c_ctx(c);
+ if (c->sentry == pos) {
+ pos = pblk_end_queued_w_bio(pblk, rqd, c);
+ goto retry;
+ }
+ }
+ } else {
+ WARN_ON(nvm_rq_from_c_ctx(c_ctx) != rqd);
+ list_add_tail(&c_ctx->list, &pblk->compl_list);
+ }
+ pblk_rb_sync_end(&pblk->rwb, &flags);
+}
+
+/* When a write fails, we are not sure whether the block has grown bad or a page
+ * range is more susceptible to write errors. If a high number of pages fail, we
+ * assume that the block is bad and we mark it accordingly. In all cases, we
+ * remap and resubmit the failed entries as fast as possible; if a flush is
+ * waiting on a completion, the whole stack would stall otherwise.
+ */
+static void pblk_end_w_fail(struct pblk *pblk, struct nvm_rq *rqd)
+{
+ void *comp_bits = &rqd->ppa_status;
+ struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd);
+ struct pblk_rec_ctx *recovery;
+ struct ppa_addr *ppa_list = rqd->ppa_list;
+ int nr_ppas = rqd->nr_ppas;
+ unsigned int c_entries;
+ int bit, ret;
+
+ if (unlikely(nr_ppas == 1))
+ ppa_list = &rqd->ppa_addr;
+
+ recovery = mempool_alloc(pblk->rec_pool, GFP_ATOMIC);
+ if (!recovery) {
+ pr_err("pblk: could not allocate recovery context\n");
+ return;
+ }
+ INIT_LIST_HEAD(&recovery->failed);
+
+ bit = -1;
+ while ((bit = find_next_bit(comp_bits, nr_ppas, bit + 1)) < nr_ppas) {
+ struct pblk_rb_entry *entry;
+ struct ppa_addr ppa;
+
+ /* Logic error */
+ if (bit > c_ctx->nr_valid) {
+ WARN_ONCE(1, "pblk: corrupted write request\n");
+ mempool_free(recovery, pblk->rec_pool);
+ goto out;
+ }
+
+ ppa = ppa_list[bit];
+ entry = pblk_rb_sync_scan_entry(&pblk->rwb, &ppa);
+ if (!entry) {
+ pr_err("pblk: could not scan entry on write failure\n");
+ mempool_free(recovery, pblk->rec_pool);
+ goto out;
+ }
+
+ /* The list is filled first and emptied afterwards. No need for
+ * protecting it with a lock
+ */
+ list_add_tail(&entry->index, &recovery->failed);
+ }
+
+ c_entries = find_first_bit(comp_bits, nr_ppas);
+ ret = pblk_recov_setup_rq(pblk, c_ctx, recovery, comp_bits, c_entries);
+ if (ret) {
+ pr_err("pblk: could not recover from write failure\n");
+ mempool_free(recovery, pblk->rec_pool);
+ goto out;
+ }
+
+ INIT_WORK(&recovery->ws_rec, pblk_submit_rec);
+ queue_work(pblk->kw_wq, &recovery->ws_rec);
+
+out:
+ pblk_complete_write(pblk, rqd, c_ctx);
+}
+
+static void pblk_end_io_write(struct nvm_rq *rqd)
+{
+ struct pblk *pblk = rqd->private;
+ struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd);
+
+ if (rqd->error) {
+ pblk_log_write_err(pblk, rqd);
+ return pblk_end_w_fail(pblk, rqd);
+ }
+#ifdef CONFIG_NVM_DEBUG
+ else
+ WARN_ONCE(rqd->bio->bi_error, "pblk: corrupted write error\n");
+#endif
+
+ pblk_complete_write(pblk, rqd, c_ctx);
+}
+
+static int pblk_alloc_w_rq(struct pblk *pblk, struct nvm_rq *rqd,
+ unsigned int nr_secs)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+
+ /* Setup write request */
+ rqd->opcode = NVM_OP_PWRITE;
+ rqd->nr_ppas = nr_secs;
+ rqd->flags = pblk_set_progr_mode(pblk, WRITE);
+ rqd->private = pblk;
+ rqd->end_io = pblk_end_io_write;
+
+ rqd->meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL,
+ &rqd->dma_meta_list);
+ if (!rqd->meta_list)
+ return -ENOMEM;
+
+ if (unlikely(nr_secs == 1))
+ return 0;
+
+ rqd->ppa_list = rqd->meta_list + pblk_dma_meta_size;
+ rqd->dma_ppa_list = rqd->dma_meta_list + pblk_dma_meta_size;
+
+ return 0;
+}
+
+static int pblk_setup_w_rq(struct pblk *pblk, struct nvm_rq *rqd,
+ struct pblk_c_ctx *c_ctx)
+{
+ struct pblk_line_meta *lm = &pblk->lm;
+ struct pblk_line *e_line = pblk_line_get_data_next(pblk);
+ struct ppa_addr erase_ppa;
+ unsigned int valid = c_ctx->nr_valid;
+ unsigned int padded = c_ctx->nr_padded;
+ unsigned int nr_secs = valid + padded;
+ unsigned long *lun_bitmap;
+ int ret = 0;
+
+ lun_bitmap = kzalloc(lm->lun_bitmap_len, GFP_KERNEL);
+ if (!lun_bitmap) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ c_ctx->lun_bitmap = lun_bitmap;
+
+ ret = pblk_alloc_w_rq(pblk, rqd, nr_secs);
+ if (ret) {
+ kfree(lun_bitmap);
+ goto out;
+ }
+
+ ppa_set_empty(&erase_ppa);
+ if (likely(!e_line || !atomic_read(&e_line->left_eblks)))
+ pblk_map_rq(pblk, rqd, c_ctx->sentry, lun_bitmap, valid, 0);
+ else
+ pblk_map_erase_rq(pblk, rqd, c_ctx->sentry, lun_bitmap,
+ valid, &erase_ppa);
+
+out:
+ if (unlikely(e_line && !ppa_empty(erase_ppa))) {
+ if (pblk_blk_erase_async(pblk, erase_ppa)) {
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct nvm_geo *geo = &dev->geo;
+ int bit;
+
+ atomic_inc(&e_line->left_eblks);
+ bit = erase_ppa.g.lun * geo->nr_chnls + erase_ppa.g.ch;
+ WARN_ON(!test_and_clear_bit(bit, e_line->erase_bitmap));
+ up(&pblk->erase_sem);
+ }
+ }
+
+ return ret;
+}
+
+int pblk_setup_w_rec_rq(struct pblk *pblk, struct nvm_rq *rqd,
+ struct pblk_c_ctx *c_ctx)
+{
+ struct pblk_line_meta *lm = &pblk->lm;
+ unsigned long *lun_bitmap;
+ int ret;
+
+ lun_bitmap = kzalloc(lm->lun_bitmap_len, GFP_KERNEL);
+ if (!lun_bitmap)
+ return -ENOMEM;
+
+ c_ctx->lun_bitmap = lun_bitmap;
+
+ ret = pblk_alloc_w_rq(pblk, rqd, rqd->nr_ppas);
+ if (ret)
+ return ret;
+
+ pblk_map_rq(pblk, rqd, c_ctx->sentry, lun_bitmap, c_ctx->nr_valid, 0);
+
+ rqd->ppa_status = (u64)0;
+ rqd->flags = pblk_set_progr_mode(pblk, WRITE);
+
+ return ret;
+}
+
+static int pblk_calc_secs_to_sync(struct pblk *pblk, unsigned int secs_avail,
+ unsigned int secs_to_flush)
+{
+ int secs_to_sync;
+
+ secs_to_sync = pblk_calc_secs(pblk, secs_avail, secs_to_flush);
+
+#ifdef CONFIG_NVM_DEBUG
+ if ((!secs_to_sync && secs_to_flush)
+ || (secs_to_sync < 0)
+ || (secs_to_sync > secs_avail && !secs_to_flush)) {
+ pr_err("pblk: bad sector calculation (a:%d,s:%d,f:%d)\n",
+ secs_avail, secs_to_sync, secs_to_flush);
+ }
+#endif
+
+ return secs_to_sync;
+}
+
+static int pblk_submit_write(struct pblk *pblk)
+{
+ struct bio *bio;
+ struct nvm_rq *rqd;
+ struct pblk_c_ctx *c_ctx;
+ unsigned int pgs_read;
+ unsigned int secs_avail, secs_to_sync, secs_to_com;
+ unsigned int secs_to_flush;
+ unsigned long pos;
+ int err;
+
+ /* If there are no sectors in the cache, flushes (bios without data)
+ * will be cleared on the cache threads
+ */
+ secs_avail = pblk_rb_read_count(&pblk->rwb);
+ if (!secs_avail)
+ return 1;
+
+ secs_to_flush = pblk_rb_sync_point_count(&pblk->rwb);
+ if (!secs_to_flush && secs_avail < pblk->min_write_pgs)
+ return 1;
+
+ rqd = pblk_alloc_rqd(pblk, WRITE);
+ if (IS_ERR(rqd)) {
+ pr_err("pblk: cannot allocate write req.\n");
+ return 1;
+ }
+ c_ctx = nvm_rq_to_pdu(rqd);
+
+ bio = bio_alloc(GFP_KERNEL, pblk->max_write_pgs);
+ if (!bio) {
+ pr_err("pblk: cannot allocate write bio\n");
+ goto fail_free_rqd;
+ }
+ bio->bi_iter.bi_sector = 0; /* internal bio */
+ bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
+ rqd->bio = bio;
+
+ secs_to_sync = pblk_calc_secs_to_sync(pblk, secs_avail, secs_to_flush);
+ if (secs_to_sync > pblk->max_write_pgs) {
+ pr_err("pblk: bad buffer sync calculation\n");
+ goto fail_put_bio;
+ }
+
+ secs_to_com = (secs_to_sync > secs_avail) ? secs_avail : secs_to_sync;
+ pos = pblk_rb_read_commit(&pblk->rwb, secs_to_com);
+
+ pgs_read = pblk_rb_read_to_bio(&pblk->rwb, bio, c_ctx, pos,
+ secs_to_sync, secs_avail);
+ if (!pgs_read) {
+ pr_err("pblk: corrupted write bio\n");
+ goto fail_put_bio;
+ }
+
+ if (c_ctx->nr_padded)
+ if (pblk_bio_add_pages(pblk, bio, GFP_KERNEL, c_ctx->nr_padded))
+ goto fail_put_bio;
+
+ /* Assign lbas to ppas and populate request structure */
+ err = pblk_setup_w_rq(pblk, rqd, c_ctx);
+ if (err) {
+ pr_err("pblk: could not setup write request\n");
+ goto fail_free_bio;
+ }
+
+ err = pblk_submit_io(pblk, rqd);
+ if (err) {
+ pr_err("pblk: I/O submission failed: %d\n", err);
+ goto fail_free_bio;
+ }
+
+#ifdef CONFIG_NVM_DEBUG
+ atomic_long_add(secs_to_sync, &pblk->sub_writes);
+#endif
+
+ return 0;
+
+fail_free_bio:
+ if (c_ctx->nr_padded)
+ pblk_bio_free_pages(pblk, bio, secs_to_sync, c_ctx->nr_padded);
+fail_put_bio:
+ bio_put(bio);
+fail_free_rqd:
+ pblk_free_rqd(pblk, rqd, WRITE);
+
+ return 1;
+}
+
+int pblk_write_ts(void *data)
+{
+ struct pblk *pblk = data;
+
+ while (!kthread_should_stop()) {
+ if (!pblk_submit_write(pblk))
+ continue;
+ set_current_state(TASK_INTERRUPTIBLE);
+ io_schedule();
+ }
+
+ return 0;
+}
diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h
new file mode 100644
index 000000000000..99f3186b5288
--- /dev/null
+++ b/drivers/lightnvm/pblk.h
@@ -0,0 +1,1121 @@
+/*
+ * Copyright (C) 2015 IT University of Copenhagen (rrpc.h)
+ * Copyright (C) 2016 CNEX Labs
+ * Initial release: Matias Bjorling <matias@cnexlabs.com>
+ * Write buffering: Javier Gonzalez <javier@cnexlabs.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Implementation of a Physical Block-device target for Open-channel SSDs.
+ *
+ */
+
+#ifndef PBLK_H_
+#define PBLK_H_
+
+#include <linux/blkdev.h>
+#include <linux/blk-mq.h>
+#include <linux/bio.h>
+#include <linux/module.h>
+#include <linux/kthread.h>
+#include <linux/vmalloc.h>
+#include <linux/crc32.h>
+#include <linux/uuid.h>
+
+#include <linux/lightnvm.h>
+
+/* Run only GC if less than 1/X blocks are free */
+#define GC_LIMIT_INVERSE 5
+#define GC_TIME_MSECS 1000
+
+#define PBLK_SECTOR (512)
+#define PBLK_EXPOSED_PAGE_SIZE (4096)
+#define PBLK_MAX_REQ_ADDRS (64)
+#define PBLK_MAX_REQ_ADDRS_PW (6)
+
+#define PBLK_CACHE_NAME_LEN (DISK_NAME_LEN + 16)
+
+#define PBLK_COMMAND_TIMEOUT_MS 30000
+
+/* Max 512 LUNs per device */
+#define PBLK_MAX_LUNS_BITMAP (4)
+
+#define NR_PHY_IN_LOG (PBLK_EXPOSED_PAGE_SIZE / PBLK_SECTOR)
+
+#define pblk_for_each_lun(pblk, rlun, i) \
+ for ((i) = 0, rlun = &(pblk)->luns[0]; \
+ (i) < (pblk)->nr_luns; (i)++, rlun = &(pblk)->luns[(i)])
+
+#define ERASE 2 /* READ = 0, WRITE = 1 */
+
+enum {
+ /* IO Types */
+ PBLK_IOTYPE_USER = 1 << 0,
+ PBLK_IOTYPE_GC = 1 << 1,
+
+ /* Write buffer flags */
+ PBLK_FLUSH_ENTRY = 1 << 2,
+ PBLK_WRITTEN_DATA = 1 << 3,
+ PBLK_SUBMITTED_ENTRY = 1 << 4,
+ PBLK_WRITABLE_ENTRY = 1 << 5,
+};
+
+enum {
+ PBLK_BLK_ST_OPEN = 0x1,
+ PBLK_BLK_ST_CLOSED = 0x2,
+};
+
+/* The number of GC lists and the rate-limiter states go together. This way the
+ * rate-limiter can dictate how much GC is needed based on resource utilization.
+ */
+#define PBLK_NR_GC_LISTS 3
+#define PBLK_MAX_GC_JOBS 32
+
+enum {
+ PBLK_RL_HIGH = 1,
+ PBLK_RL_MID = 2,
+ PBLK_RL_LOW = 3,
+};
+
+struct pblk_sec_meta {
+ u64 reserved;
+ __le64 lba;
+};
+
+#define pblk_dma_meta_size (sizeof(struct pblk_sec_meta) * PBLK_MAX_REQ_ADDRS)
+
+/* write completion context */
+struct pblk_c_ctx {
+ struct list_head list; /* Head for out-of-order completion */
+
+ unsigned long *lun_bitmap; /* Luns used on current request */
+ unsigned int sentry;
+ unsigned int nr_valid;
+ unsigned int nr_padded;
+};
+
+/* Read context */
+struct pblk_r_ctx {
+ struct bio *orig_bio;
+};
+
+/* Recovery context */
+struct pblk_rec_ctx {
+ struct pblk *pblk;
+ struct nvm_rq *rqd;
+ struct list_head failed;
+ struct work_struct ws_rec;
+};
+
+/* Write context */
+struct pblk_w_ctx {
+ struct bio_list bios; /* Original bios - used for completion
+ * in REQ_FUA, REQ_FLUSH case
+ */
+ u64 lba; /* Logic addr. associated with entry */
+ struct ppa_addr ppa; /* Physic addr. associated with entry */
+ int flags; /* Write context flags */
+};
+
+struct pblk_rb_entry {
+ struct ppa_addr cacheline; /* Cacheline for this entry */
+ void *data; /* Pointer to data on this entry */
+ struct pblk_w_ctx w_ctx; /* Context for this entry */
+ struct list_head index; /* List head to enable indexes */
+};
+
+#define EMPTY_ENTRY (~0U)
+
+struct pblk_rb_pages {
+ struct page *pages;
+ int order;
+ struct list_head list;
+};
+
+struct pblk_rb {
+ struct pblk_rb_entry *entries; /* Ring buffer entries */
+ unsigned int mem; /* Write offset - points to next
+ * writable entry in memory
+ */
+ unsigned int subm; /* Read offset - points to last entry
+ * that has been submitted to the media
+ * to be persisted
+ */
+ unsigned int sync; /* Synced - backpointer that signals
+ * the last submitted entry that has
+ * been successfully persisted to media
+ */
+ unsigned int sync_point; /* Sync point - last entry that must be
+ * flushed to the media. Used with
+ * REQ_FLUSH and REQ_FUA
+ */
+ unsigned int l2p_update; /* l2p update point - next entry for
+ * which l2p mapping will be updated to
+ * contain a device ppa address (instead
+ * of a cacheline
+ */
+ unsigned int nr_entries; /* Number of entries in write buffer -
+ * must be a power of two
+ */
+ unsigned int seg_size; /* Size of the data segments being
+ * stored on each entry. Typically this
+ * will be 4KB
+ */
+
+ struct list_head pages; /* List of data pages */
+
+ spinlock_t w_lock; /* Write lock */
+ spinlock_t s_lock; /* Sync lock */
+
+#ifdef CONFIG_NVM_DEBUG
+ atomic_t inflight_sync_point; /* Not served REQ_FLUSH | REQ_FUA */
+#endif
+};
+
+#define PBLK_RECOVERY_SECTORS 16
+
+struct pblk_lun {
+ struct ppa_addr bppa;
+
+ u8 *bb_list; /* Bad block list for LUN. Only used on
+ * bring up. Bad blocks are managed
+ * within lines on run-time.
+ */
+
+ struct semaphore wr_sem;
+};
+
+struct pblk_gc_rq {
+ struct pblk_line *line;
+ void *data;
+ u64 *lba_list;
+ int nr_secs;
+ int secs_to_gc;
+ struct list_head list;
+};
+
+struct pblk_gc {
+ int gc_active;
+ int gc_enabled;
+ int gc_forced;
+ int gc_jobs_active;
+ atomic_t inflight_gc;
+
+ struct task_struct *gc_ts;
+ struct task_struct *gc_writer_ts;
+ struct workqueue_struct *gc_reader_wq;
+ struct timer_list gc_timer;
+
+ int w_entries;
+ struct list_head w_list;
+
+ spinlock_t lock;
+ spinlock_t w_lock;
+};
+
+struct pblk_rl {
+ unsigned int high; /* Upper threshold for rate limiter (free run -
+ * user I/O rate limiter
+ */
+ unsigned int low; /* Lower threshold for rate limiter (user I/O
+ * rate limiter - stall)
+ */
+ unsigned int high_pw; /* High rounded up as a power of 2 */
+
+#define PBLK_USER_HIGH_THRS 2 /* Begin write limit at 50 percent
+ * available blks
+ */
+#define PBLK_USER_LOW_THRS 20 /* Aggressive GC at 5% available blocks */
+
+ int rb_windows_pw; /* Number of rate windows in the write buffer
+ * given as a power-of-2. This guarantees that
+ * when user I/O is being rate limited, there
+ * will be reserved enough space for the GC to
+ * place its payload. A window is of
+ * pblk->max_write_pgs size, which in NVMe is
+ * 64, i.e., 256kb.
+ */
+ int rb_budget; /* Total number of entries available for I/O */
+ int rb_user_max; /* Max buffer entries available for user I/O */
+ atomic_t rb_user_cnt; /* User I/O buffer counter */
+ int rb_gc_max; /* Max buffer entries available for GC I/O */
+ int rb_gc_rsv; /* Reserved buffer entries for GC I/O */
+ int rb_state; /* Rate-limiter current state */
+ atomic_t rb_gc_cnt; /* GC I/O buffer counter */
+
+ int rb_user_active;
+ struct timer_list u_timer;
+
+ unsigned long long nr_secs;
+ unsigned long total_blocks;
+ atomic_t free_blocks;
+};
+
+#define PBLK_LINE_NR_LUN_BITMAP 2
+#define PBLK_LINE_NR_SEC_BITMAP 2
+#define PBLK_LINE_EMPTY (~0U)
+
+enum {
+ /* Line Types */
+ PBLK_LINETYPE_FREE = 0,
+ PBLK_LINETYPE_LOG = 1,
+ PBLK_LINETYPE_DATA = 2,
+
+ /* Line state */
+ PBLK_LINESTATE_FREE = 10,
+ PBLK_LINESTATE_OPEN = 11,
+ PBLK_LINESTATE_CLOSED = 12,
+ PBLK_LINESTATE_GC = 13,
+ PBLK_LINESTATE_BAD = 14,
+ PBLK_LINESTATE_CORRUPT = 15,
+
+ /* GC group */
+ PBLK_LINEGC_NONE = 20,
+ PBLK_LINEGC_EMPTY = 21,
+ PBLK_LINEGC_LOW = 22,
+ PBLK_LINEGC_MID = 23,
+ PBLK_LINEGC_HIGH = 24,
+ PBLK_LINEGC_FULL = 25,
+};
+
+#define PBLK_MAGIC 0x70626c6b /*pblk*/
+
+struct line_header {
+ __le32 crc;
+ __le32 identifier; /* pblk identifier */
+ __u8 uuid[16]; /* instance uuid */
+ __le16 type; /* line type */
+ __le16 version; /* type version */
+ __le32 id; /* line id for current line */
+};
+
+struct line_smeta {
+ struct line_header header;
+
+ __le32 crc; /* Full structure including struct crc */
+ /* Previous line metadata */
+ __le32 prev_id; /* Line id for previous line */
+
+ /* Current line metadata */
+ __le64 seq_nr; /* Sequence number for current line */
+
+ /* Active writers */
+ __le32 window_wr_lun; /* Number of parallel LUNs to write */
+
+ __le32 rsvd[2];
+};
+
+/*
+ * Metadata Layout:
+ * 1. struct pblk_emeta
+ * 2. nr_lbas u64 forming lba list
+ * 3. nr_lines (all) u32 valid sector count (vsc) (~0U: non-alloc line)
+ * 4. nr_luns bits (u64 format) forming line bad block bitmap
+ *
+ * 3. and 4. will be part of FTL log
+ */
+struct line_emeta {
+ struct line_header header;
+
+ __le32 crc; /* Full structure including struct crc */
+
+ /* Previous line metadata */
+ __le32 prev_id; /* Line id for prev line */
+
+ /* Current line metadata */
+ __le64 seq_nr; /* Sequence number for current line */
+
+ /* Active writers */
+ __le32 window_wr_lun; /* Number of parallel LUNs to write */
+
+ /* Bookkeeping for recovery */
+ __le32 next_id; /* Line id for next line */
+ __le64 nr_lbas; /* Number of lbas mapped in line */
+ __le64 nr_valid_lbas; /* Number of valid lbas mapped in line */
+};
+
+struct pblk_line {
+ struct pblk *pblk;
+ unsigned int id; /* Line number corresponds to the
+ * block line
+ */
+ unsigned int seq_nr; /* Unique line sequence number */
+
+ int state; /* PBLK_LINESTATE_X */
+ int type; /* PBLK_LINETYPE_X */
+ int gc_group; /* PBLK_LINEGC_X */
+ struct list_head list; /* Free, GC lists */
+
+ unsigned long *lun_bitmap; /* Bitmap for LUNs mapped in line */
+
+ struct line_smeta *smeta; /* Start metadata */
+ struct line_emeta *emeta; /* End metadata */
+ int meta_line; /* Metadata line id */
+ u64 smeta_ssec; /* Sector where smeta starts */
+ u64 emeta_ssec; /* Sector where emeta starts */
+
+ unsigned int sec_in_line; /* Number of usable secs in line */
+
+ atomic_t blk_in_line; /* Number of good blocks in line */
+ unsigned long *blk_bitmap; /* Bitmap for valid/invalid blocks */
+ unsigned long *erase_bitmap; /* Bitmap for erased blocks */
+
+ unsigned long *map_bitmap; /* Bitmap for mapped sectors in line */
+ unsigned long *invalid_bitmap; /* Bitmap for invalid sectors in line */
+
+ atomic_t left_eblks; /* Blocks left for erasing */
+ atomic_t left_seblks; /* Blocks left for sync erasing */
+
+ int left_msecs; /* Sectors left for mapping */
+ int left_ssecs; /* Sectors left to sync */
+ unsigned int cur_sec; /* Sector map pointer */
+ unsigned int vsc; /* Valid sector count in line */
+
+ struct kref ref; /* Write buffer L2P references */
+
+ spinlock_t lock; /* Necessary for invalid_bitmap only */
+};
+
+#define PBLK_DATA_LINES 4
+
+enum{
+ PBLK_KMALLOC_META = 1,
+ PBLK_VMALLOC_META = 2,
+};
+
+struct pblk_line_metadata {
+ void *meta;
+};
+
+struct pblk_line_mgmt {
+ int nr_lines; /* Total number of full lines */
+ int nr_free_lines; /* Number of full lines in free list */
+
+ /* Free lists - use free_lock */
+ struct list_head free_list; /* Full lines ready to use */
+ struct list_head corrupt_list; /* Full lines corrupted */
+ struct list_head bad_list; /* Full lines bad */
+
+ /* GC lists - use gc_lock */
+ struct list_head *gc_lists[PBLK_NR_GC_LISTS];
+ struct list_head gc_high_list; /* Full lines ready to GC, high isc */
+ struct list_head gc_mid_list; /* Full lines ready to GC, mid isc */
+ struct list_head gc_low_list; /* Full lines ready to GC, low isc */
+
+ struct list_head gc_full_list; /* Full lines ready to GC, no valid */
+ struct list_head gc_empty_list; /* Full lines close, all valid */
+
+ struct pblk_line *log_line; /* Current FTL log line */
+ struct pblk_line *data_line; /* Current data line */
+ struct pblk_line *log_next; /* Next FTL log line */
+ struct pblk_line *data_next; /* Next data line */
+
+ /* Metadata allocation type: VMALLOC | KMALLOC */
+ int smeta_alloc_type;
+ int emeta_alloc_type;
+
+ /* Pre-allocated metadata for data lines */
+ struct pblk_line_metadata sline_meta[PBLK_DATA_LINES];
+ struct pblk_line_metadata eline_meta[PBLK_DATA_LINES];
+ unsigned long meta_bitmap;
+
+ /* Helpers for fast bitmap calculations */
+ unsigned long *bb_template;
+ unsigned long *bb_aux;
+
+ unsigned long d_seq_nr; /* Data line unique sequence number */
+ unsigned long l_seq_nr; /* Log line unique sequence number */
+
+ spinlock_t free_lock;
+ spinlock_t gc_lock;
+};
+
+struct pblk_line_meta {
+ unsigned int smeta_len; /* Total length for smeta */
+ unsigned int smeta_sec; /* Sectors needed for smeta*/
+ unsigned int emeta_len; /* Total length for emeta */
+ unsigned int emeta_sec; /* Sectors needed for emeta*/
+ unsigned int emeta_bb; /* Boundary for bb that affects emeta */
+ unsigned int sec_bitmap_len; /* Length for sector bitmap in line */
+ unsigned int blk_bitmap_len; /* Length for block bitmap in line */
+ unsigned int lun_bitmap_len; /* Length for lun bitmap in line */
+
+ unsigned int blk_per_line; /* Number of blocks in a full line */
+ unsigned int sec_per_line; /* Number of sectors in a line */
+ unsigned int min_blk_line; /* Min. number of good blocks in line */
+
+ unsigned int mid_thrs; /* Threshold for GC mid list */
+ unsigned int high_thrs; /* Threshold for GC high list */
+};
+
+struct pblk_addr_format {
+ u64 ch_mask;
+ u64 lun_mask;
+ u64 pln_mask;
+ u64 blk_mask;
+ u64 pg_mask;
+ u64 sec_mask;
+ u8 ch_offset;
+ u8 lun_offset;
+ u8 pln_offset;
+ u8 blk_offset;
+ u8 pg_offset;
+ u8 sec_offset;
+};
+
+struct pblk {
+ struct nvm_tgt_dev *dev;
+ struct gendisk *disk;
+
+ struct kobject kobj;
+
+ struct pblk_lun *luns;
+
+ struct pblk_line *lines; /* Line array */
+ struct pblk_line_mgmt l_mg; /* Line management */
+ struct pblk_line_meta lm; /* Line metadata */
+
+ int ppaf_bitsize;
+ struct pblk_addr_format ppaf;
+
+ struct pblk_rb rwb;
+
+ int min_write_pgs; /* Minimum amount of pages required by controller */
+ int max_write_pgs; /* Maximum amount of pages supported by controller */
+ int pgs_in_buffer; /* Number of pages that need to be held in buffer to
+ * guarantee successful reads.
+ */
+
+ sector_t capacity; /* Device capacity when bad blocks are subtracted */
+ int over_pct; /* Percentage of device used for over-provisioning */
+
+ /* pblk provisioning values. Used by rate limiter */
+ struct pblk_rl rl;
+
+ struct semaphore erase_sem;
+
+ unsigned char instance_uuid[16];
+#ifdef CONFIG_NVM_DEBUG
+ /* All debug counters apply to 4kb sector I/Os */
+ atomic_long_t inflight_writes; /* Inflight writes (user and gc) */
+ atomic_long_t padded_writes; /* Sectors padded due to flush/fua */
+ atomic_long_t padded_wb; /* Sectors padded in write buffer */
+ atomic_long_t nr_flush; /* Number of flush/fua I/O */
+ atomic_long_t req_writes; /* Sectors stored on write buffer */
+ atomic_long_t sub_writes; /* Sectors submitted from buffer */
+ atomic_long_t sync_writes; /* Sectors synced to media */
+ atomic_long_t compl_writes; /* Sectors completed in write bio */
+ atomic_long_t inflight_reads; /* Inflight sector read requests */
+ atomic_long_t sync_reads; /* Completed sector read requests */
+ atomic_long_t recov_writes; /* Sectors submitted from recovery */
+ atomic_long_t recov_gc_writes; /* Sectors submitted from write GC */
+ atomic_long_t recov_gc_reads; /* Sectors submitted from read GC */
+#endif
+
+ spinlock_t lock;
+
+ atomic_long_t read_failed;
+ atomic_long_t read_empty;
+ atomic_long_t read_high_ecc;
+ atomic_long_t read_failed_gc;
+ atomic_long_t write_failed;
+ atomic_long_t erase_failed;
+
+ struct task_struct *writer_ts;
+
+ /* Simple translation map of logical addresses to physical addresses.
+ * The logical addresses is known by the host system, while the physical
+ * addresses are used when writing to the disk block device.
+ */
+ unsigned char *trans_map;
+ spinlock_t trans_lock;
+
+ struct list_head compl_list;
+
+ mempool_t *page_pool;
+ mempool_t *line_ws_pool;
+ mempool_t *rec_pool;
+ mempool_t *r_rq_pool;
+ mempool_t *w_rq_pool;
+ mempool_t *line_meta_pool;
+
+ struct workqueue_struct *kw_wq;
+ struct timer_list wtimer;
+
+ struct pblk_gc gc;
+};
+
+struct pblk_line_ws {
+ struct pblk *pblk;
+ struct pblk_line *line;
+ void *priv;
+ struct work_struct ws;
+};
+
+#define pblk_r_rq_size (sizeof(struct nvm_rq) + sizeof(struct pblk_r_ctx))
+#define pblk_w_rq_size (sizeof(struct nvm_rq) + sizeof(struct pblk_c_ctx))
+
+/*
+ * pblk ring buffer operations
+ */
+int pblk_rb_init(struct pblk_rb *rb, struct pblk_rb_entry *rb_entry_base,
+ unsigned int power_size, unsigned int power_seg_sz);
+unsigned int pblk_rb_calculate_size(unsigned int nr_entries);
+void *pblk_rb_entries_ref(struct pblk_rb *rb);
+int pblk_rb_may_write_user(struct pblk_rb *rb, struct bio *bio,
+ unsigned int nr_entries, unsigned int *pos);
+int pblk_rb_may_write_gc(struct pblk_rb *rb, unsigned int nr_entries,
+ unsigned int *pos);
+void pblk_rb_write_entry_user(struct pblk_rb *rb, void *data,
+ struct pblk_w_ctx w_ctx, unsigned int pos);
+void pblk_rb_write_entry_gc(struct pblk_rb *rb, void *data,
+ struct pblk_w_ctx w_ctx, struct pblk_line *gc_line,
+ unsigned int pos);
+struct pblk_w_ctx *pblk_rb_w_ctx(struct pblk_rb *rb, unsigned int pos);
+
+void pblk_rb_sync_l2p(struct pblk_rb *rb);
+unsigned int pblk_rb_read_to_bio(struct pblk_rb *rb, struct bio *bio,
+ struct pblk_c_ctx *c_ctx,
+ unsigned int pos,
+ unsigned int nr_entries,
+ unsigned int count);
+unsigned int pblk_rb_read_to_bio_list(struct pblk_rb *rb, struct bio *bio,
+ struct list_head *list,
+ unsigned int max);
+int pblk_rb_copy_to_bio(struct pblk_rb *rb, struct bio *bio, sector_t lba,
+ u64 pos, int bio_iter);
+unsigned int pblk_rb_read_commit(struct pblk_rb *rb, unsigned int entries);
+
+unsigned int pblk_rb_sync_init(struct pblk_rb *rb, unsigned long *flags);
+unsigned int pblk_rb_sync_advance(struct pblk_rb *rb, unsigned int nr_entries);
+struct pblk_rb_entry *pblk_rb_sync_scan_entry(struct pblk_rb *rb,
+ struct ppa_addr *ppa);
+void pblk_rb_sync_end(struct pblk_rb *rb, unsigned long *flags);
+unsigned int pblk_rb_sync_point_count(struct pblk_rb *rb);
+
+unsigned int pblk_rb_read_count(struct pblk_rb *rb);
+unsigned int pblk_rb_wrap_pos(struct pblk_rb *rb, unsigned int pos);
+
+int pblk_rb_tear_down_check(struct pblk_rb *rb);
+int pblk_rb_pos_oob(struct pblk_rb *rb, u64 pos);
+void pblk_rb_data_free(struct pblk_rb *rb);
+ssize_t pblk_rb_sysfs(struct pblk_rb *rb, char *buf);
+
+/*
+ * pblk core
+ */
+struct nvm_rq *pblk_alloc_rqd(struct pblk *pblk, int rw);
+int pblk_setup_w_rec_rq(struct pblk *pblk, struct nvm_rq *rqd,
+ struct pblk_c_ctx *c_ctx);
+void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, int rw);
+void pblk_flush_writer(struct pblk *pblk);
+struct ppa_addr pblk_get_lba_map(struct pblk *pblk, sector_t lba);
+void pblk_discard(struct pblk *pblk, struct bio *bio);
+void pblk_log_write_err(struct pblk *pblk, struct nvm_rq *rqd);
+void pblk_log_read_err(struct pblk *pblk, struct nvm_rq *rqd);
+int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd);
+struct bio *pblk_bio_map_addr(struct pblk *pblk, void *data,
+ unsigned int nr_secs, unsigned int len,
+ gfp_t gfp_mask);
+struct pblk_line *pblk_line_get(struct pblk *pblk);
+struct pblk_line *pblk_line_get_first_data(struct pblk *pblk);
+struct pblk_line *pblk_line_replace_data(struct pblk *pblk);
+int pblk_line_recov_alloc(struct pblk *pblk, struct pblk_line *line);
+void pblk_line_recov_close(struct pblk *pblk, struct pblk_line *line);
+struct pblk_line *pblk_line_get_data(struct pblk *pblk);
+struct pblk_line *pblk_line_get_data_next(struct pblk *pblk);
+int pblk_line_erase(struct pblk *pblk, struct pblk_line *line);
+int pblk_line_is_full(struct pblk_line *line);
+void pblk_line_free(struct pblk *pblk, struct pblk_line *line);
+void pblk_line_close_ws(struct work_struct *work);
+void pblk_line_close(struct pblk *pblk, struct pblk_line *line);
+void pblk_line_mark_bb(struct work_struct *work);
+void pblk_line_run_ws(struct pblk *pblk, struct pblk_line *line, void *priv,
+ void (*work)(struct work_struct *));
+u64 pblk_line_smeta_start(struct pblk *pblk, struct pblk_line *line);
+int pblk_line_read_smeta(struct pblk *pblk, struct pblk_line *line);
+int pblk_line_read_emeta(struct pblk *pblk, struct pblk_line *line);
+int pblk_blk_erase_async(struct pblk *pblk, struct ppa_addr erase_ppa);
+void pblk_line_put(struct kref *ref);
+struct list_head *pblk_line_gc_list(struct pblk *pblk, struct pblk_line *line);
+u64 pblk_alloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs);
+int pblk_calc_secs(struct pblk *pblk, unsigned long secs_avail,
+ unsigned long secs_to_flush);
+void pblk_down_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas,
+ unsigned long *lun_bitmap);
+void pblk_up_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas,
+ unsigned long *lun_bitmap);
+void pblk_end_bio_sync(struct bio *bio);
+void pblk_end_io_sync(struct nvm_rq *rqd);
+int pblk_bio_add_pages(struct pblk *pblk, struct bio *bio, gfp_t flags,
+ int nr_pages);
+void pblk_map_pad_invalidate(struct pblk *pblk, struct pblk_line *line,
+ u64 paddr);
+void pblk_bio_free_pages(struct pblk *pblk, struct bio *bio, int off,
+ int nr_pages);
+void pblk_map_invalidate(struct pblk *pblk, struct ppa_addr ppa);
+void pblk_update_map(struct pblk *pblk, sector_t lba, struct ppa_addr ppa);
+void pblk_update_map_cache(struct pblk *pblk, sector_t lba,
+ struct ppa_addr ppa);
+void pblk_update_map_dev(struct pblk *pblk, sector_t lba,
+ struct ppa_addr ppa, struct ppa_addr entry_line);
+int pblk_update_map_gc(struct pblk *pblk, sector_t lba, struct ppa_addr ppa,
+ struct pblk_line *gc_line);
+void pblk_lookup_l2p_rand(struct pblk *pblk, struct ppa_addr *ppas,
+ u64 *lba_list, int nr_secs);
+void pblk_lookup_l2p_seq(struct pblk *pblk, struct ppa_addr *ppas,
+ sector_t blba, int nr_secs);
+
+/*
+ * pblk user I/O write path
+ */
+int pblk_write_to_cache(struct pblk *pblk, struct bio *bio,
+ unsigned long flags);
+int pblk_write_gc_to_cache(struct pblk *pblk, void *data, u64 *lba_list,
+ unsigned int nr_entries, unsigned int nr_rec_entries,
+ struct pblk_line *gc_line, unsigned long flags);
+
+/*
+ * pblk map
+ */
+void pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd,
+ unsigned int sentry, unsigned long *lun_bitmap,
+ unsigned int valid_secs, struct ppa_addr *erase_ppa);
+void pblk_map_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int sentry,
+ unsigned long *lun_bitmap, unsigned int valid_secs,
+ unsigned int off);
+
+/*
+ * pblk write thread
+ */
+int pblk_write_ts(void *data);
+void pblk_write_timer_fn(unsigned long data);
+void pblk_write_should_kick(struct pblk *pblk);
+
+/*
+ * pblk read path
+ */
+int pblk_submit_read(struct pblk *pblk, struct bio *bio);
+int pblk_submit_read_gc(struct pblk *pblk, u64 *lba_list, void *data,
+ unsigned int nr_secs, unsigned int *secs_to_gc,
+ struct pblk_line *line);
+/*
+ * pblk recovery
+ */
+void pblk_submit_rec(struct work_struct *work);
+struct pblk_line *pblk_recov_l2p(struct pblk *pblk);
+void pblk_recov_pad(struct pblk *pblk);
+__le64 *pblk_recov_get_lba_list(struct pblk *pblk, struct line_emeta *emeta);
+int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx,
+ struct pblk_rec_ctx *recovery, u64 *comp_bits,
+ unsigned int comp);
+
+/*
+ * pblk gc
+ */
+#define PBLK_GC_TRIES 3
+
+int pblk_gc_init(struct pblk *pblk);
+void pblk_gc_exit(struct pblk *pblk);
+void pblk_gc_should_start(struct pblk *pblk);
+void pblk_gc_should_stop(struct pblk *pblk);
+int pblk_gc_status(struct pblk *pblk);
+void pblk_gc_sysfs_state_show(struct pblk *pblk, int *gc_enabled,
+ int *gc_active);
+void pblk_gc_sysfs_force(struct pblk *pblk, int force);
+
+/*
+ * pblk rate limiter
+ */
+void pblk_rl_init(struct pblk_rl *rl, int budget);
+void pblk_rl_free(struct pblk_rl *rl);
+int pblk_rl_gc_thrs(struct pblk_rl *rl);
+unsigned long pblk_rl_nr_free_blks(struct pblk_rl *rl);
+int pblk_rl_user_may_insert(struct pblk_rl *rl, int nr_entries);
+void pblk_rl_user_in(struct pblk_rl *rl, int nr_entries);
+int pblk_rl_gc_may_insert(struct pblk_rl *rl, int nr_entries);
+void pblk_rl_gc_in(struct pblk_rl *rl, int nr_entries);
+void pblk_rl_out(struct pblk_rl *rl, int nr_user, int nr_gc);
+void pblk_rl_set_gc_rsc(struct pblk_rl *rl, int rsv);
+int pblk_rl_sysfs_rate_show(struct pblk_rl *rl);
+void pblk_rl_free_lines_inc(struct pblk_rl *rl, struct pblk_line *line);
+void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line);
+
+/*
+ * pblk sysfs
+ */
+int pblk_sysfs_init(struct gendisk *tdisk);
+void pblk_sysfs_exit(struct gendisk *tdisk);
+
+static inline void *pblk_malloc(size_t size, int type, gfp_t flags)
+{
+ if (type == PBLK_KMALLOC_META)
+ return kmalloc(size, flags);
+ return vmalloc(size);
+}
+
+static inline void pblk_mfree(void *ptr, int type)
+{
+ if (type == PBLK_KMALLOC_META)
+ kfree(ptr);
+ else
+ vfree(ptr);
+}
+
+static inline struct nvm_rq *nvm_rq_from_c_ctx(void *c_ctx)
+{
+ return c_ctx - sizeof(struct nvm_rq);
+}
+
+static inline void *pblk_line_emeta_to_lbas(struct line_emeta *emeta)
+{
+ return (emeta) + 1;
+}
+
+#define NVM_MEM_PAGE_WRITE (8)
+
+static inline int pblk_pad_distance(struct pblk *pblk)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct nvm_geo *geo = &dev->geo;
+
+ return NVM_MEM_PAGE_WRITE * geo->nr_luns * geo->sec_per_pl;
+}
+
+static inline int pblk_dev_ppa_to_line(struct ppa_addr p)
+{
+ return p.g.blk;
+}
+
+static inline int pblk_tgt_ppa_to_line(struct ppa_addr p)
+{
+ return p.g.blk;
+}
+
+static inline int pblk_ppa_to_pos(struct nvm_geo *geo, struct ppa_addr p)
+{
+ return p.g.lun * geo->nr_chnls + p.g.ch;
+}
+
+/* A block within a line corresponds to the lun */
+static inline int pblk_dev_ppa_to_pos(struct nvm_geo *geo, struct ppa_addr p)
+{
+ return p.g.lun * geo->nr_chnls + p.g.ch;
+}
+
+static inline struct ppa_addr pblk_ppa32_to_ppa64(struct pblk *pblk, u32 ppa32)
+{
+ struct ppa_addr ppa64;
+
+ ppa64.ppa = 0;
+
+ if (ppa32 == -1) {
+ ppa64.ppa = ADDR_EMPTY;
+ } else if (ppa32 & (1U << 31)) {
+ ppa64.c.line = ppa32 & ((~0U) >> 1);
+ ppa64.c.is_cached = 1;
+ } else {
+ ppa64.g.blk = (ppa32 & pblk->ppaf.blk_mask) >>
+ pblk->ppaf.blk_offset;
+ ppa64.g.pg = (ppa32 & pblk->ppaf.pg_mask) >>
+ pblk->ppaf.pg_offset;
+ ppa64.g.lun = (ppa32 & pblk->ppaf.lun_mask) >>
+ pblk->ppaf.lun_offset;
+ ppa64.g.ch = (ppa32 & pblk->ppaf.ch_mask) >>
+ pblk->ppaf.ch_offset;
+ ppa64.g.pl = (ppa32 & pblk->ppaf.pln_mask) >>
+ pblk->ppaf.pln_offset;
+ ppa64.g.sec = (ppa32 & pblk->ppaf.sec_mask) >>
+ pblk->ppaf.sec_offset;
+ }
+
+ return ppa64;
+}
+
+static inline struct ppa_addr pblk_trans_map_get(struct pblk *pblk,
+ sector_t lba)
+{
+ struct ppa_addr ppa;
+
+ if (pblk->ppaf_bitsize < 32) {
+ u32 *map = (u32 *)pblk->trans_map;
+
+ ppa = pblk_ppa32_to_ppa64(pblk, map[lba]);
+ } else {
+ struct ppa_addr *map = (struct ppa_addr *)pblk->trans_map;
+
+ ppa = map[lba];
+ }
+
+ return ppa;
+}
+
+static inline u32 pblk_ppa64_to_ppa32(struct pblk *pblk, struct ppa_addr ppa64)
+{
+ u32 ppa32 = 0;
+
+ if (ppa64.ppa == ADDR_EMPTY) {
+ ppa32 = ~0U;
+ } else if (ppa64.c.is_cached) {
+ ppa32 |= ppa64.c.line;
+ ppa32 |= 1U << 31;
+ } else {
+ ppa32 |= ppa64.g.blk << pblk->ppaf.blk_offset;
+ ppa32 |= ppa64.g.pg << pblk->ppaf.pg_offset;
+ ppa32 |= ppa64.g.lun << pblk->ppaf.lun_offset;
+ ppa32 |= ppa64.g.ch << pblk->ppaf.ch_offset;
+ ppa32 |= ppa64.g.pl << pblk->ppaf.pln_offset;
+ ppa32 |= ppa64.g.sec << pblk->ppaf.sec_offset;
+ }
+
+ return ppa32;
+}
+
+static inline void pblk_trans_map_set(struct pblk *pblk, sector_t lba,
+ struct ppa_addr ppa)
+{
+ if (pblk->ppaf_bitsize < 32) {
+ u32 *map = (u32 *)pblk->trans_map;
+
+ map[lba] = pblk_ppa64_to_ppa32(pblk, ppa);
+ } else {
+ u64 *map = (u64 *)pblk->trans_map;
+
+ map[lba] = ppa.ppa;
+ }
+}
+
+static inline u64 pblk_dev_ppa_to_line_addr(struct pblk *pblk,
+ struct ppa_addr p)
+{
+ u64 paddr;
+
+ paddr = 0;
+ paddr |= (u64)p.g.pg << pblk->ppaf.pg_offset;
+ paddr |= (u64)p.g.lun << pblk->ppaf.lun_offset;
+ paddr |= (u64)p.g.ch << pblk->ppaf.ch_offset;
+ paddr |= (u64)p.g.pl << pblk->ppaf.pln_offset;
+ paddr |= (u64)p.g.sec << pblk->ppaf.sec_offset;
+
+ return paddr;
+}
+
+static inline int pblk_ppa_empty(struct ppa_addr ppa_addr)
+{
+ return (ppa_addr.ppa == ADDR_EMPTY);
+}
+
+static inline void pblk_ppa_set_empty(struct ppa_addr *ppa_addr)
+{
+ ppa_addr->ppa = ADDR_EMPTY;
+}
+
+static inline int pblk_addr_in_cache(struct ppa_addr ppa)
+{
+ return (ppa.ppa != ADDR_EMPTY && ppa.c.is_cached);
+}
+
+static inline int pblk_addr_to_cacheline(struct ppa_addr ppa)
+{
+ return ppa.c.line;
+}
+
+static inline struct ppa_addr pblk_cacheline_to_addr(int addr)
+{
+ struct ppa_addr p;
+
+ p.c.line = addr;
+ p.c.is_cached = 1;
+
+ return p;
+}
+
+static inline struct ppa_addr addr_to_gen_ppa(struct pblk *pblk, u64 paddr,
+ u64 line_id)
+{
+ struct ppa_addr ppa;
+
+ ppa.ppa = 0;
+ ppa.g.blk = line_id;
+ ppa.g.pg = (paddr & pblk->ppaf.pg_mask) >> pblk->ppaf.pg_offset;
+ ppa.g.lun = (paddr & pblk->ppaf.lun_mask) >> pblk->ppaf.lun_offset;
+ ppa.g.ch = (paddr & pblk->ppaf.ch_mask) >> pblk->ppaf.ch_offset;
+ ppa.g.pl = (paddr & pblk->ppaf.pln_mask) >> pblk->ppaf.pln_offset;
+ ppa.g.sec = (paddr & pblk->ppaf.sec_mask) >> pblk->ppaf.sec_offset;
+
+ return ppa;
+}
+
+static inline struct ppa_addr addr_to_pblk_ppa(struct pblk *pblk, u64 paddr,
+ u64 line_id)
+{
+ struct ppa_addr ppa;
+
+ ppa = addr_to_gen_ppa(pblk, paddr, line_id);
+
+ return ppa;
+}
+
+static inline u32 pblk_calc_meta_header_crc(struct pblk *pblk,
+ struct line_smeta *smeta)
+{
+ u32 crc = ~(u32)0;
+
+ crc = crc32_le(crc, (unsigned char *)smeta + sizeof(crc),
+ sizeof(struct line_header) - sizeof(crc));
+
+ return crc;
+}
+
+static inline u32 pblk_calc_smeta_crc(struct pblk *pblk,
+ struct line_smeta *smeta)
+{
+ struct pblk_line_meta *lm = &pblk->lm;
+ u32 crc = ~(u32)0;
+
+ crc = crc32_le(crc, (unsigned char *)smeta +
+ sizeof(struct line_header) + sizeof(crc),
+ lm->smeta_len -
+ sizeof(struct line_header) - sizeof(crc));
+
+ return crc;
+}
+
+static inline u32 pblk_calc_emeta_crc(struct pblk *pblk,
+ struct line_emeta *emeta)
+{
+ struct pblk_line_meta *lm = &pblk->lm;
+ u32 crc = ~(u32)0;
+
+ crc = crc32_le(crc, (unsigned char *)emeta +
+ sizeof(struct line_header) + sizeof(crc),
+ lm->emeta_len -
+ sizeof(struct line_header) - sizeof(crc));
+
+ return crc;
+}
+
+static inline int pblk_set_progr_mode(struct pblk *pblk, int type)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct nvm_geo *geo = &dev->geo;
+ int flags;
+
+ flags = geo->plane_mode >> 1;
+
+ if (type == WRITE)
+ flags |= NVM_IO_SCRAMBLE_ENABLE;
+
+ return flags;
+}
+
+static inline int pblk_set_read_mode(struct pblk *pblk)
+{
+ return NVM_IO_SNGL_ACCESS | NVM_IO_SUSPEND | NVM_IO_SCRAMBLE_ENABLE;
+}
+
+#ifdef CONFIG_NVM_DEBUG
+static inline void print_ppa(struct ppa_addr *p, char *msg, int error)
+{
+ if (p->c.is_cached) {
+ pr_err("ppa: (%s: %x) cache line: %llu\n",
+ msg, error, (u64)p->c.line);
+ } else {
+ pr_err("ppa: (%s: %x):ch:%d,lun:%d,blk:%d,pg:%d,pl:%d,sec:%d\n",
+ msg, error,
+ p->g.ch, p->g.lun, p->g.blk,
+ p->g.pg, p->g.pl, p->g.sec);
+ }
+}
+
+static inline void pblk_print_failed_rqd(struct pblk *pblk, struct nvm_rq *rqd,
+ int error)
+{
+ int bit = -1;
+
+ if (rqd->nr_ppas == 1) {
+ print_ppa(&rqd->ppa_addr, "rqd", error);
+ return;
+ }
+
+ while ((bit = find_next_bit((void *)&rqd->ppa_status, rqd->nr_ppas,
+ bit + 1)) < rqd->nr_ppas) {
+ print_ppa(&rqd->ppa_list[bit], "rqd", error);
+ }
+
+ pr_err("error:%d, ppa_status:%llx\n", error, rqd->ppa_status);
+}
+#endif
+
+static inline int pblk_boundary_ppa_checks(struct nvm_tgt_dev *tgt_dev,
+ struct ppa_addr *ppas, int nr_ppas)
+{
+ struct nvm_geo *geo = &tgt_dev->geo;
+ struct ppa_addr *ppa;
+ int i;
+
+ for (i = 0; i < nr_ppas; i++) {
+ ppa = &ppas[i];
+
+ if (!ppa->c.is_cached &&
+ ppa->g.ch < geo->nr_chnls &&
+ ppa->g.lun < geo->luns_per_chnl &&
+ ppa->g.pl < geo->nr_planes &&
+ ppa->g.blk < geo->blks_per_lun &&
+ ppa->g.pg < geo->pgs_per_blk &&
+ ppa->g.sec < geo->sec_per_pg)
+ continue;
+
+#ifdef CONFIG_NVM_DEBUG
+ print_ppa(ppa, "boundary", i);
+#endif
+ return 1;
+ }
+ return 0;
+}
+
+static inline int pblk_boundary_paddr_checks(struct pblk *pblk, u64 paddr)
+{
+ struct pblk_line_meta *lm = &pblk->lm;
+
+ if (paddr > lm->sec_per_line)
+ return 1;
+
+ return 0;
+}
+
+static inline unsigned int pblk_get_bi_idx(struct bio *bio)
+{
+ return bio->bi_iter.bi_idx;
+}
+
+static inline sector_t pblk_get_lba(struct bio *bio)
+{
+ return bio->bi_iter.bi_sector / NR_PHY_IN_LOG;
+}
+
+static inline unsigned int pblk_get_secs(struct bio *bio)
+{
+ return bio->bi_iter.bi_size / PBLK_EXPOSED_PAGE_SIZE;
+}
+
+static inline sector_t pblk_get_sector(sector_t lba)
+{
+ return lba * NR_PHY_IN_LOG;
+}
+
+static inline void pblk_setup_uuid(struct pblk *pblk)
+{
+ uuid_le uuid;
+
+ uuid_le_gen(&uuid);
+ memcpy(pblk->instance_uuid, uuid.b, 16);
+}
+#endif /* PBLK_H_ */
diff --git a/drivers/lightnvm/rrpc.c b/drivers/lightnvm/rrpc.c
index e00b1d7b976f..cf0e28a0ff61 100644
--- a/drivers/lightnvm/rrpc.c
+++ b/drivers/lightnvm/rrpc.c
@@ -318,10 +318,6 @@ static int rrpc_move_valid_pages(struct rrpc *rrpc, struct rrpc_block *rblk)
}
page = mempool_alloc(rrpc->page_pool, GFP_NOIO);
- if (!page) {
- bio_put(bio);
- return -ENOMEM;
- }
while ((slot = find_first_zero_bit(rblk->invalid_pages,
nr_sec_per_blk)) < nr_sec_per_blk) {
@@ -414,7 +410,6 @@ static void rrpc_block_gc(struct work_struct *work)
struct rrpc *rrpc = gcb->rrpc;
struct rrpc_block *rblk = gcb->rblk;
struct rrpc_lun *rlun = rblk->rlun;
- struct nvm_tgt_dev *dev = rrpc->dev;
struct ppa_addr ppa;
mempool_free(gcb, rrpc->gcb_pool);
@@ -430,7 +425,7 @@ static void rrpc_block_gc(struct work_struct *work)
ppa.g.lun = rlun->bppa.g.lun;
ppa.g.blk = rblk->id;
- if (nvm_erase_blk(dev, &ppa, 0))
+ if (nvm_erase_sync(rrpc->dev, &ppa, 1))
goto put_back;
rrpc_put_blk(rrpc, rblk);
@@ -822,7 +817,7 @@ static int rrpc_read_ppalist_rq(struct rrpc *rrpc, struct bio *bio,
for (i = 0; i < npages; i++) {
/* We assume that mapping occurs at 4KB granularity */
- BUG_ON(!(laddr + i >= 0 && laddr + i < rrpc->nr_sects));
+ BUG_ON(!(laddr + i < rrpc->nr_sects));
gp = &rrpc->trans_map[laddr + i];
if (gp->rblk) {
@@ -851,7 +846,7 @@ static int rrpc_read_rq(struct rrpc *rrpc, struct bio *bio, struct nvm_rq *rqd,
if (!is_gc && rrpc_lock_rq(rrpc, bio, rqd))
return NVM_IO_REQUEUE;
- BUG_ON(!(laddr >= 0 && laddr < rrpc->nr_sects));
+ BUG_ON(!(laddr < rrpc->nr_sects));
gp = &rrpc->trans_map[laddr];
if (gp->rblk) {
@@ -1007,11 +1002,6 @@ static blk_qc_t rrpc_make_rq(struct request_queue *q, struct bio *bio)
}
rqd = mempool_alloc(rrpc->rq_pool, GFP_KERNEL);
- if (!rqd) {
- pr_err_ratelimited("rrpc: not able to queue bio.");
- bio_io_error(bio);
- return BLK_QC_T_NONE;
- }
memset(rqd, 0, sizeof(struct nvm_rq));
err = rrpc_submit_io(rrpc, bio, rqd, NVM_IOTYPE_NONE);
@@ -1275,8 +1265,10 @@ static int rrpc_bb_discovery(struct nvm_tgt_dev *dev, struct rrpc_lun *rlun)
}
nr_blks = nvm_bb_tbl_fold(dev->parent, blks, nr_blks);
- if (nr_blks < 0)
- return nr_blks;
+ if (nr_blks < 0) {
+ ret = nr_blks;
+ goto out;
+ }
for (i = 0; i < nr_blks; i++) {
if (blks[i] == NVM_BLK_T_FREE)
@@ -1514,7 +1506,8 @@ err:
static struct nvm_tgt_type tt_rrpc;
-static void *rrpc_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk)
+static void *rrpc_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk,
+ int flags)
{
struct request_queue *bqueue = dev->q;
struct request_queue *tqueue = tdisk->queue;
diff --git a/drivers/mailbox/Kconfig b/drivers/mailbox/Kconfig
index ceff415f201c..ee1a3d9147ef 100644
--- a/drivers/mailbox/Kconfig
+++ b/drivers/mailbox/Kconfig
@@ -144,12 +144,22 @@ config XGENE_SLIMPRO_MBOX
want to use the APM X-Gene SLIMpro IPCM support.
config BCM_PDC_MBOX
- tristate "Broadcom PDC Mailbox"
- depends on ARM64 || COMPILE_TEST
+ tristate "Broadcom FlexSparx DMA Mailbox"
+ depends on ARCH_BCM_IPROC || COMPILE_TEST
depends on HAS_DMA
+ help
+ Mailbox implementation for the Broadcom FlexSparx DMA ring manager,
+ which provides access to various offload engines on Broadcom
+ SoCs, including FA2/FA+ on Northstar Plus and PDC on Northstar 2.
+
+config BCM_FLEXRM_MBOX
+ tristate "Broadcom FlexRM Mailbox"
+ depends on ARM64
+ depends on HAS_DMA
+ select GENERIC_MSI_IRQ_DOMAIN
default ARCH_BCM_IPROC
help
- Mailbox implementation for the Broadcom PDC ring manager,
+ Mailbox implementation of the Broadcom FlexRM ring manager,
which provides access to various offload engines on Broadcom
- SoCs. Say Y here if you want to use the Broadcom PDC.
+ SoCs. Say Y here if you want to use the Broadcom FlexRM.
endif
diff --git a/drivers/mailbox/Makefile b/drivers/mailbox/Makefile
index 7dde4f609ae8..e2bcb03cd35b 100644
--- a/drivers/mailbox/Makefile
+++ b/drivers/mailbox/Makefile
@@ -30,4 +30,6 @@ obj-$(CONFIG_HI6220_MBOX) += hi6220-mailbox.o
obj-$(CONFIG_BCM_PDC_MBOX) += bcm-pdc-mailbox.o
+obj-$(CONFIG_BCM_FLEXRM_MBOX) += bcm-flexrm-mailbox.o
+
obj-$(CONFIG_TEGRA_HSP_MBOX) += tegra-hsp.o
diff --git a/drivers/mailbox/bcm-flexrm-mailbox.c b/drivers/mailbox/bcm-flexrm-mailbox.c
new file mode 100644
index 000000000000..da67882caa7b
--- /dev/null
+++ b/drivers/mailbox/bcm-flexrm-mailbox.c
@@ -0,0 +1,1595 @@
+/* Broadcom FlexRM Mailbox Driver
+ *
+ * Copyright (C) 2017 Broadcom
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Each Broadcom FlexSparx4 offload engine is implemented as an
+ * extension to Broadcom FlexRM ring manager. The FlexRM ring
+ * manager provides a set of rings which can be used to submit
+ * work to a FlexSparx4 offload engine.
+ *
+ * This driver creates a mailbox controller using a set of FlexRM
+ * rings where each mailbox channel represents a separate FlexRM ring.
+ */
+
+#include <asm/barrier.h>
+#include <asm/byteorder.h>
+#include <linux/delay.h>
+#include <linux/device.h>
+#include <linux/dma-mapping.h>
+#include <linux/dmapool.h>
+#include <linux/err.h>
+#include <linux/idr.h>
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/mailbox_controller.h>
+#include <linux/mailbox_client.h>
+#include <linux/mailbox/brcm-message.h>
+#include <linux/module.h>
+#include <linux/msi.h>
+#include <linux/of_address.h>
+#include <linux/of_irq.h>
+#include <linux/platform_device.h>
+#include <linux/spinlock.h>
+
+/* ====== FlexRM register defines ===== */
+
+/* FlexRM configuration */
+#define RING_REGS_SIZE 0x10000
+#define RING_DESC_SIZE 8
+#define RING_DESC_INDEX(offset) \
+ ((offset) / RING_DESC_SIZE)
+#define RING_DESC_OFFSET(index) \
+ ((index) * RING_DESC_SIZE)
+#define RING_MAX_REQ_COUNT 1024
+#define RING_BD_ALIGN_ORDER 12
+#define RING_BD_ALIGN_CHECK(addr) \
+ (!((addr) & ((0x1 << RING_BD_ALIGN_ORDER) - 1)))
+#define RING_BD_TOGGLE_INVALID(offset) \
+ (((offset) >> RING_BD_ALIGN_ORDER) & 0x1)
+#define RING_BD_TOGGLE_VALID(offset) \
+ (!RING_BD_TOGGLE_INVALID(offset))
+#define RING_BD_DESC_PER_REQ 32
+#define RING_BD_DESC_COUNT \
+ (RING_MAX_REQ_COUNT * RING_BD_DESC_PER_REQ)
+#define RING_BD_SIZE \
+ (RING_BD_DESC_COUNT * RING_DESC_SIZE)
+#define RING_CMPL_ALIGN_ORDER 13
+#define RING_CMPL_DESC_COUNT RING_MAX_REQ_COUNT
+#define RING_CMPL_SIZE \
+ (RING_CMPL_DESC_COUNT * RING_DESC_SIZE)
+#define RING_VER_MAGIC 0x76303031
+
+/* Per-Ring register offsets */
+#define RING_VER 0x000
+#define RING_BD_START_ADDR 0x004
+#define RING_BD_READ_PTR 0x008
+#define RING_BD_WRITE_PTR 0x00c
+#define RING_BD_READ_PTR_DDR_LS 0x010
+#define RING_BD_READ_PTR_DDR_MS 0x014
+#define RING_CMPL_START_ADDR 0x018
+#define RING_CMPL_WRITE_PTR 0x01c
+#define RING_NUM_REQ_RECV_LS 0x020
+#define RING_NUM_REQ_RECV_MS 0x024
+#define RING_NUM_REQ_TRANS_LS 0x028
+#define RING_NUM_REQ_TRANS_MS 0x02c
+#define RING_NUM_REQ_OUTSTAND 0x030
+#define RING_CONTROL 0x034
+#define RING_FLUSH_DONE 0x038
+#define RING_MSI_ADDR_LS 0x03c
+#define RING_MSI_ADDR_MS 0x040
+#define RING_MSI_CONTROL 0x048
+#define RING_BD_READ_PTR_DDR_CONTROL 0x04c
+#define RING_MSI_DATA_VALUE 0x064
+
+/* Register RING_BD_START_ADDR fields */
+#define BD_LAST_UPDATE_HW_SHIFT 28
+#define BD_LAST_UPDATE_HW_MASK 0x1
+#define BD_START_ADDR_VALUE(pa) \
+ ((u32)((((dma_addr_t)(pa)) >> RING_BD_ALIGN_ORDER) & 0x0fffffff))
+#define BD_START_ADDR_DECODE(val) \
+ ((dma_addr_t)((val) & 0x0fffffff) << RING_BD_ALIGN_ORDER)
+
+/* Register RING_CMPL_START_ADDR fields */
+#define CMPL_START_ADDR_VALUE(pa) \
+ ((u32)((((u64)(pa)) >> RING_CMPL_ALIGN_ORDER) & 0x03ffffff))
+
+/* Register RING_CONTROL fields */
+#define CONTROL_MASK_DISABLE_CONTROL 12
+#define CONTROL_FLUSH_SHIFT 5
+#define CONTROL_ACTIVE_SHIFT 4
+#define CONTROL_RATE_ADAPT_MASK 0xf
+#define CONTROL_RATE_DYNAMIC 0x0
+#define CONTROL_RATE_FAST 0x8
+#define CONTROL_RATE_MEDIUM 0x9
+#define CONTROL_RATE_SLOW 0xa
+#define CONTROL_RATE_IDLE 0xb
+
+/* Register RING_FLUSH_DONE fields */
+#define FLUSH_DONE_MASK 0x1
+
+/* Register RING_MSI_CONTROL fields */
+#define MSI_TIMER_VAL_SHIFT 16
+#define MSI_TIMER_VAL_MASK 0xffff
+#define MSI_ENABLE_SHIFT 15
+#define MSI_ENABLE_MASK 0x1
+#define MSI_COUNT_SHIFT 0
+#define MSI_COUNT_MASK 0x3ff
+
+/* Register RING_BD_READ_PTR_DDR_CONTROL fields */
+#define BD_READ_PTR_DDR_TIMER_VAL_SHIFT 16
+#define BD_READ_PTR_DDR_TIMER_VAL_MASK 0xffff
+#define BD_READ_PTR_DDR_ENABLE_SHIFT 15
+#define BD_READ_PTR_DDR_ENABLE_MASK 0x1
+
+/* ====== FlexRM ring descriptor defines ===== */
+
+/* Completion descriptor format */
+#define CMPL_OPAQUE_SHIFT 0
+#define CMPL_OPAQUE_MASK 0xffff
+#define CMPL_ENGINE_STATUS_SHIFT 16
+#define CMPL_ENGINE_STATUS_MASK 0xffff
+#define CMPL_DME_STATUS_SHIFT 32
+#define CMPL_DME_STATUS_MASK 0xffff
+#define CMPL_RM_STATUS_SHIFT 48
+#define CMPL_RM_STATUS_MASK 0xffff
+
+/* Completion DME status code */
+#define DME_STATUS_MEM_COR_ERR BIT(0)
+#define DME_STATUS_MEM_UCOR_ERR BIT(1)
+#define DME_STATUS_FIFO_UNDERFLOW BIT(2)
+#define DME_STATUS_FIFO_OVERFLOW BIT(3)
+#define DME_STATUS_RRESP_ERR BIT(4)
+#define DME_STATUS_BRESP_ERR BIT(5)
+#define DME_STATUS_ERROR_MASK (DME_STATUS_MEM_COR_ERR | \
+ DME_STATUS_MEM_UCOR_ERR | \
+ DME_STATUS_FIFO_UNDERFLOW | \
+ DME_STATUS_FIFO_OVERFLOW | \
+ DME_STATUS_RRESP_ERR | \
+ DME_STATUS_BRESP_ERR)
+
+/* Completion RM status code */
+#define RM_STATUS_CODE_SHIFT 0
+#define RM_STATUS_CODE_MASK 0x3ff
+#define RM_STATUS_CODE_GOOD 0x0
+#define RM_STATUS_CODE_AE_TIMEOUT 0x3ff
+
+/* General descriptor format */
+#define DESC_TYPE_SHIFT 60
+#define DESC_TYPE_MASK 0xf
+#define DESC_PAYLOAD_SHIFT 0
+#define DESC_PAYLOAD_MASK 0x0fffffffffffffff
+
+/* Null descriptor format */
+#define NULL_TYPE 0
+#define NULL_TOGGLE_SHIFT 58
+#define NULL_TOGGLE_MASK 0x1
+
+/* Header descriptor format */
+#define HEADER_TYPE 1
+#define HEADER_TOGGLE_SHIFT 58
+#define HEADER_TOGGLE_MASK 0x1
+#define HEADER_ENDPKT_SHIFT 57
+#define HEADER_ENDPKT_MASK 0x1
+#define HEADER_STARTPKT_SHIFT 56
+#define HEADER_STARTPKT_MASK 0x1
+#define HEADER_BDCOUNT_SHIFT 36
+#define HEADER_BDCOUNT_MASK 0x1f
+#define HEADER_BDCOUNT_MAX HEADER_BDCOUNT_MASK
+#define HEADER_FLAGS_SHIFT 16
+#define HEADER_FLAGS_MASK 0xffff
+#define HEADER_OPAQUE_SHIFT 0
+#define HEADER_OPAQUE_MASK 0xffff
+
+/* Source (SRC) descriptor format */
+#define SRC_TYPE 2
+#define SRC_LENGTH_SHIFT 44
+#define SRC_LENGTH_MASK 0xffff
+#define SRC_ADDR_SHIFT 0
+#define SRC_ADDR_MASK 0x00000fffffffffff
+
+/* Destination (DST) descriptor format */
+#define DST_TYPE 3
+#define DST_LENGTH_SHIFT 44
+#define DST_LENGTH_MASK 0xffff
+#define DST_ADDR_SHIFT 0
+#define DST_ADDR_MASK 0x00000fffffffffff
+
+/* Immediate (IMM) descriptor format */
+#define IMM_TYPE 4
+#define IMM_DATA_SHIFT 0
+#define IMM_DATA_MASK 0x0fffffffffffffff
+
+/* Next pointer (NPTR) descriptor format */
+#define NPTR_TYPE 5
+#define NPTR_TOGGLE_SHIFT 58
+#define NPTR_TOGGLE_MASK 0x1
+#define NPTR_ADDR_SHIFT 0
+#define NPTR_ADDR_MASK 0x00000fffffffffff
+
+/* Mega source (MSRC) descriptor format */
+#define MSRC_TYPE 6
+#define MSRC_LENGTH_SHIFT 44
+#define MSRC_LENGTH_MASK 0xffff
+#define MSRC_ADDR_SHIFT 0
+#define MSRC_ADDR_MASK 0x00000fffffffffff
+
+/* Mega destination (MDST) descriptor format */
+#define MDST_TYPE 7
+#define MDST_LENGTH_SHIFT 44
+#define MDST_LENGTH_MASK 0xffff
+#define MDST_ADDR_SHIFT 0
+#define MDST_ADDR_MASK 0x00000fffffffffff
+
+/* Source with tlast (SRCT) descriptor format */
+#define SRCT_TYPE 8
+#define SRCT_LENGTH_SHIFT 44
+#define SRCT_LENGTH_MASK 0xffff
+#define SRCT_ADDR_SHIFT 0
+#define SRCT_ADDR_MASK 0x00000fffffffffff
+
+/* Destination with tlast (DSTT) descriptor format */
+#define DSTT_TYPE 9
+#define DSTT_LENGTH_SHIFT 44
+#define DSTT_LENGTH_MASK 0xffff
+#define DSTT_ADDR_SHIFT 0
+#define DSTT_ADDR_MASK 0x00000fffffffffff
+
+/* Immediate with tlast (IMMT) descriptor format */
+#define IMMT_TYPE 10
+#define IMMT_DATA_SHIFT 0
+#define IMMT_DATA_MASK 0x0fffffffffffffff
+
+/* Descriptor helper macros */
+#define DESC_DEC(_d, _s, _m) (((_d) >> (_s)) & (_m))
+#define DESC_ENC(_d, _v, _s, _m) \
+ do { \
+ (_d) &= ~((u64)(_m) << (_s)); \
+ (_d) |= (((u64)(_v) & (_m)) << (_s)); \
+ } while (0)
+
+/* ====== FlexRM data structures ===== */
+
+struct flexrm_ring {
+ /* Unprotected members */
+ int num;
+ struct flexrm_mbox *mbox;
+ void __iomem *regs;
+ bool irq_requested;
+ unsigned int irq;
+ unsigned int msi_timer_val;
+ unsigned int msi_count_threshold;
+ struct ida requests_ida;
+ struct brcm_message *requests[RING_MAX_REQ_COUNT];
+ void *bd_base;
+ dma_addr_t bd_dma_base;
+ u32 bd_write_offset;
+ void *cmpl_base;
+ dma_addr_t cmpl_dma_base;
+ /* Protected members */
+ spinlock_t lock;
+ struct brcm_message *last_pending_msg;
+ u32 cmpl_read_offset;
+};
+
+struct flexrm_mbox {
+ struct device *dev;
+ void __iomem *regs;
+ u32 num_rings;
+ struct flexrm_ring *rings;
+ struct dma_pool *bd_pool;
+ struct dma_pool *cmpl_pool;
+ struct mbox_controller controller;
+};
+
+/* ====== FlexRM ring descriptor helper routines ===== */
+
+static u64 flexrm_read_desc(void *desc_ptr)
+{
+ return le64_to_cpu(*((u64 *)desc_ptr));
+}
+
+static void flexrm_write_desc(void *desc_ptr, u64 desc)
+{
+ *((u64 *)desc_ptr) = cpu_to_le64(desc);
+}
+
+static u32 flexrm_cmpl_desc_to_reqid(u64 cmpl_desc)
+{
+ return (u32)(cmpl_desc & CMPL_OPAQUE_MASK);
+}
+
+static int flexrm_cmpl_desc_to_error(u64 cmpl_desc)
+{
+ u32 status;
+
+ status = DESC_DEC(cmpl_desc, CMPL_DME_STATUS_SHIFT,
+ CMPL_DME_STATUS_MASK);
+ if (status & DME_STATUS_ERROR_MASK)
+ return -EIO;
+
+ status = DESC_DEC(cmpl_desc, CMPL_RM_STATUS_SHIFT,
+ CMPL_RM_STATUS_MASK);
+ status &= RM_STATUS_CODE_MASK;
+ if (status == RM_STATUS_CODE_AE_TIMEOUT)
+ return -ETIMEDOUT;
+
+ return 0;
+}
+
+static bool flexrm_is_next_table_desc(void *desc_ptr)
+{
+ u64 desc = flexrm_read_desc(desc_ptr);
+ u32 type = DESC_DEC(desc, DESC_TYPE_SHIFT, DESC_TYPE_MASK);
+
+ return (type == NPTR_TYPE) ? true : false;
+}
+
+static u64 flexrm_next_table_desc(u32 toggle, dma_addr_t next_addr)
+{
+ u64 desc = 0;
+
+ DESC_ENC(desc, NPTR_TYPE, DESC_TYPE_SHIFT, DESC_TYPE_MASK);
+ DESC_ENC(desc, toggle, NPTR_TOGGLE_SHIFT, NPTR_TOGGLE_MASK);
+ DESC_ENC(desc, next_addr, NPTR_ADDR_SHIFT, NPTR_ADDR_MASK);
+
+ return desc;
+}
+
+static u64 flexrm_null_desc(u32 toggle)
+{
+ u64 desc = 0;
+
+ DESC_ENC(desc, NULL_TYPE, DESC_TYPE_SHIFT, DESC_TYPE_MASK);
+ DESC_ENC(desc, toggle, NULL_TOGGLE_SHIFT, NULL_TOGGLE_MASK);
+
+ return desc;
+}
+
+static u32 flexrm_estimate_header_desc_count(u32 nhcnt)
+{
+ u32 hcnt = nhcnt / HEADER_BDCOUNT_MAX;
+
+ if (!(nhcnt % HEADER_BDCOUNT_MAX))
+ hcnt += 1;
+
+ return hcnt;
+}
+
+static void flexrm_flip_header_toogle(void *desc_ptr)
+{
+ u64 desc = flexrm_read_desc(desc_ptr);
+
+ if (desc & ((u64)0x1 << HEADER_TOGGLE_SHIFT))
+ desc &= ~((u64)0x1 << HEADER_TOGGLE_SHIFT);
+ else
+ desc |= ((u64)0x1 << HEADER_TOGGLE_SHIFT);
+
+ flexrm_write_desc(desc_ptr, desc);
+}
+
+static u64 flexrm_header_desc(u32 toggle, u32 startpkt, u32 endpkt,
+ u32 bdcount, u32 flags, u32 opaque)
+{
+ u64 desc = 0;
+
+ DESC_ENC(desc, HEADER_TYPE, DESC_TYPE_SHIFT, DESC_TYPE_MASK);
+ DESC_ENC(desc, toggle, HEADER_TOGGLE_SHIFT, HEADER_TOGGLE_MASK);
+ DESC_ENC(desc, startpkt, HEADER_STARTPKT_SHIFT, HEADER_STARTPKT_MASK);
+ DESC_ENC(desc, endpkt, HEADER_ENDPKT_SHIFT, HEADER_ENDPKT_MASK);
+ DESC_ENC(desc, bdcount, HEADER_BDCOUNT_SHIFT, HEADER_BDCOUNT_MASK);
+ DESC_ENC(desc, flags, HEADER_FLAGS_SHIFT, HEADER_FLAGS_MASK);
+ DESC_ENC(desc, opaque, HEADER_OPAQUE_SHIFT, HEADER_OPAQUE_MASK);
+
+ return desc;
+}
+
+static void flexrm_enqueue_desc(u32 nhpos, u32 nhcnt, u32 reqid,
+ u64 desc, void **desc_ptr, u32 *toggle,
+ void *start_desc, void *end_desc)
+{
+ u64 d;
+ u32 nhavail, _toggle, _startpkt, _endpkt, _bdcount;
+
+ /* Sanity check */
+ if (nhcnt <= nhpos)
+ return;
+
+ /*
+ * Each request or packet start with a HEADER descriptor followed
+ * by one or more non-HEADER descriptors (SRC, SRCT, MSRC, DST,
+ * DSTT, MDST, IMM, and IMMT). The number of non-HEADER descriptors
+ * following a HEADER descriptor is represented by BDCOUNT field
+ * of HEADER descriptor. The max value of BDCOUNT field is 31 which
+ * means we can only have 31 non-HEADER descriptors following one
+ * HEADER descriptor.
+ *
+ * In general use, number of non-HEADER descriptors can easily go
+ * beyond 31. To tackle this situation, we have packet (or request)
+ * extenstion bits (STARTPKT and ENDPKT) in the HEADER descriptor.
+ *
+ * To use packet extension, the first HEADER descriptor of request
+ * (or packet) will have STARTPKT=1 and ENDPKT=0. The intermediate
+ * HEADER descriptors will have STARTPKT=0 and ENDPKT=0. The last
+ * HEADER descriptor will have STARTPKT=0 and ENDPKT=1. Also, the
+ * TOGGLE bit of the first HEADER will be set to invalid state to
+ * ensure that FlexRM does not start fetching descriptors till all
+ * descriptors are enqueued. The user of this function will flip
+ * the TOGGLE bit of first HEADER after all descriptors are
+ * enqueued.
+ */
+
+ if ((nhpos % HEADER_BDCOUNT_MAX == 0) && (nhcnt - nhpos)) {
+ /* Prepare the header descriptor */
+ nhavail = (nhcnt - nhpos);
+ _toggle = (nhpos == 0) ? !(*toggle) : (*toggle);
+ _startpkt = (nhpos == 0) ? 0x1 : 0x0;
+ _endpkt = (nhavail <= HEADER_BDCOUNT_MAX) ? 0x1 : 0x0;
+ _bdcount = (nhavail <= HEADER_BDCOUNT_MAX) ?
+ nhavail : HEADER_BDCOUNT_MAX;
+ if (nhavail <= HEADER_BDCOUNT_MAX)
+ _bdcount = nhavail;
+ else
+ _bdcount = HEADER_BDCOUNT_MAX;
+ d = flexrm_header_desc(_toggle, _startpkt, _endpkt,
+ _bdcount, 0x0, reqid);
+
+ /* Write header descriptor */
+ flexrm_write_desc(*desc_ptr, d);
+
+ /* Point to next descriptor */
+ *desc_ptr += sizeof(desc);
+ if (*desc_ptr == end_desc)
+ *desc_ptr = start_desc;
+
+ /* Skip next pointer descriptors */
+ while (flexrm_is_next_table_desc(*desc_ptr)) {
+ *toggle = (*toggle) ? 0 : 1;
+ *desc_ptr += sizeof(desc);
+ if (*desc_ptr == end_desc)
+ *desc_ptr = start_desc;
+ }
+ }
+
+ /* Write desired descriptor */
+ flexrm_write_desc(*desc_ptr, desc);
+
+ /* Point to next descriptor */
+ *desc_ptr += sizeof(desc);
+ if (*desc_ptr == end_desc)
+ *desc_ptr = start_desc;
+
+ /* Skip next pointer descriptors */
+ while (flexrm_is_next_table_desc(*desc_ptr)) {
+ *toggle = (*toggle) ? 0 : 1;
+ *desc_ptr += sizeof(desc);
+ if (*desc_ptr == end_desc)
+ *desc_ptr = start_desc;
+ }
+}
+
+static u64 flexrm_src_desc(dma_addr_t addr, unsigned int length)
+{
+ u64 desc = 0;
+
+ DESC_ENC(desc, SRC_TYPE, DESC_TYPE_SHIFT, DESC_TYPE_MASK);
+ DESC_ENC(desc, length, SRC_LENGTH_SHIFT, SRC_LENGTH_MASK);
+ DESC_ENC(desc, addr, SRC_ADDR_SHIFT, SRC_ADDR_MASK);
+
+ return desc;
+}
+
+static u64 flexrm_msrc_desc(dma_addr_t addr, unsigned int length_div_16)
+{
+ u64 desc = 0;
+
+ DESC_ENC(desc, MSRC_TYPE, DESC_TYPE_SHIFT, DESC_TYPE_MASK);
+ DESC_ENC(desc, length_div_16, MSRC_LENGTH_SHIFT, MSRC_LENGTH_MASK);
+ DESC_ENC(desc, addr, MSRC_ADDR_SHIFT, MSRC_ADDR_MASK);
+
+ return desc;
+}
+
+static u64 flexrm_dst_desc(dma_addr_t addr, unsigned int length)
+{
+ u64 desc = 0;
+
+ DESC_ENC(desc, DST_TYPE, DESC_TYPE_SHIFT, DESC_TYPE_MASK);
+ DESC_ENC(desc, length, DST_LENGTH_SHIFT, DST_LENGTH_MASK);
+ DESC_ENC(desc, addr, DST_ADDR_SHIFT, DST_ADDR_MASK);
+
+ return desc;
+}
+
+static u64 flexrm_mdst_desc(dma_addr_t addr, unsigned int length_div_16)
+{
+ u64 desc = 0;
+
+ DESC_ENC(desc, MDST_TYPE, DESC_TYPE_SHIFT, DESC_TYPE_MASK);
+ DESC_ENC(desc, length_div_16, MDST_LENGTH_SHIFT, MDST_LENGTH_MASK);
+ DESC_ENC(desc, addr, MDST_ADDR_SHIFT, MDST_ADDR_MASK);
+
+ return desc;
+}
+
+static u64 flexrm_imm_desc(u64 data)
+{
+ u64 desc = 0;
+
+ DESC_ENC(desc, IMM_TYPE, DESC_TYPE_SHIFT, DESC_TYPE_MASK);
+ DESC_ENC(desc, data, IMM_DATA_SHIFT, IMM_DATA_MASK);
+
+ return desc;
+}
+
+static u64 flexrm_srct_desc(dma_addr_t addr, unsigned int length)
+{
+ u64 desc = 0;
+
+ DESC_ENC(desc, SRCT_TYPE, DESC_TYPE_SHIFT, DESC_TYPE_MASK);
+ DESC_ENC(desc, length, SRCT_LENGTH_SHIFT, SRCT_LENGTH_MASK);
+ DESC_ENC(desc, addr, SRCT_ADDR_SHIFT, SRCT_ADDR_MASK);
+
+ return desc;
+}
+
+static u64 flexrm_dstt_desc(dma_addr_t addr, unsigned int length)
+{
+ u64 desc = 0;
+
+ DESC_ENC(desc, DSTT_TYPE, DESC_TYPE_SHIFT, DESC_TYPE_MASK);
+ DESC_ENC(desc, length, DSTT_LENGTH_SHIFT, DSTT_LENGTH_MASK);
+ DESC_ENC(desc, addr, DSTT_ADDR_SHIFT, DSTT_ADDR_MASK);
+
+ return desc;
+}
+
+static u64 flexrm_immt_desc(u64 data)
+{
+ u64 desc = 0;
+
+ DESC_ENC(desc, IMMT_TYPE, DESC_TYPE_SHIFT, DESC_TYPE_MASK);
+ DESC_ENC(desc, data, IMMT_DATA_SHIFT, IMMT_DATA_MASK);
+
+ return desc;
+}
+
+static bool flexrm_spu_sanity_check(struct brcm_message *msg)
+{
+ struct scatterlist *sg;
+
+ if (!msg->spu.src || !msg->spu.dst)
+ return false;
+ for (sg = msg->spu.src; sg; sg = sg_next(sg)) {
+ if (sg->length & 0xf) {
+ if (sg->length > SRC_LENGTH_MASK)
+ return false;
+ } else {
+ if (sg->length > (MSRC_LENGTH_MASK * 16))
+ return false;
+ }
+ }
+ for (sg = msg->spu.dst; sg; sg = sg_next(sg)) {
+ if (sg->length & 0xf) {
+ if (sg->length > DST_LENGTH_MASK)
+ return false;
+ } else {
+ if (sg->length > (MDST_LENGTH_MASK * 16))
+ return false;
+ }
+ }
+
+ return true;
+}
+
+static u32 flexrm_spu_estimate_nonheader_desc_count(struct brcm_message *msg)
+{
+ u32 cnt = 0;
+ unsigned int dst_target = 0;
+ struct scatterlist *src_sg = msg->spu.src, *dst_sg = msg->spu.dst;
+
+ while (src_sg || dst_sg) {
+ if (src_sg) {
+ cnt++;
+ dst_target = src_sg->length;
+ src_sg = sg_next(src_sg);
+ } else
+ dst_target = UINT_MAX;
+
+ while (dst_target && dst_sg) {
+ cnt++;
+ if (dst_sg->length < dst_target)
+ dst_target -= dst_sg->length;
+ else
+ dst_target = 0;
+ dst_sg = sg_next(dst_sg);
+ }
+ }
+
+ return cnt;
+}
+
+static int flexrm_spu_dma_map(struct device *dev, struct brcm_message *msg)
+{
+ int rc;
+
+ rc = dma_map_sg(dev, msg->spu.src, sg_nents(msg->spu.src),
+ DMA_TO_DEVICE);
+ if (rc < 0)
+ return rc;
+
+ rc = dma_map_sg(dev, msg->spu.dst, sg_nents(msg->spu.dst),
+ DMA_FROM_DEVICE);
+ if (rc < 0) {
+ dma_unmap_sg(dev, msg->spu.src, sg_nents(msg->spu.src),
+ DMA_TO_DEVICE);
+ return rc;
+ }
+
+ return 0;
+}
+
+static void flexrm_spu_dma_unmap(struct device *dev, struct brcm_message *msg)
+{
+ dma_unmap_sg(dev, msg->spu.dst, sg_nents(msg->spu.dst),
+ DMA_FROM_DEVICE);
+ dma_unmap_sg(dev, msg->spu.src, sg_nents(msg->spu.src),
+ DMA_TO_DEVICE);
+}
+
+static void *flexrm_spu_write_descs(struct brcm_message *msg, u32 nhcnt,
+ u32 reqid, void *desc_ptr, u32 toggle,
+ void *start_desc, void *end_desc)
+{
+ u64 d;
+ u32 nhpos = 0;
+ void *orig_desc_ptr = desc_ptr;
+ unsigned int dst_target = 0;
+ struct scatterlist *src_sg = msg->spu.src, *dst_sg = msg->spu.dst;
+
+ while (src_sg || dst_sg) {
+ if (src_sg) {
+ if (sg_dma_len(src_sg) & 0xf)
+ d = flexrm_src_desc(sg_dma_address(src_sg),
+ sg_dma_len(src_sg));
+ else
+ d = flexrm_msrc_desc(sg_dma_address(src_sg),
+ sg_dma_len(src_sg)/16);
+ flexrm_enqueue_desc(nhpos, nhcnt, reqid,
+ d, &desc_ptr, &toggle,
+ start_desc, end_desc);
+ nhpos++;
+ dst_target = sg_dma_len(src_sg);
+ src_sg = sg_next(src_sg);
+ } else
+ dst_target = UINT_MAX;
+
+ while (dst_target && dst_sg) {
+ if (sg_dma_len(dst_sg) & 0xf)
+ d = flexrm_dst_desc(sg_dma_address(dst_sg),
+ sg_dma_len(dst_sg));
+ else
+ d = flexrm_mdst_desc(sg_dma_address(dst_sg),
+ sg_dma_len(dst_sg)/16);
+ flexrm_enqueue_desc(nhpos, nhcnt, reqid,
+ d, &desc_ptr, &toggle,
+ start_desc, end_desc);
+ nhpos++;
+ if (sg_dma_len(dst_sg) < dst_target)
+ dst_target -= sg_dma_len(dst_sg);
+ else
+ dst_target = 0;
+ dst_sg = sg_next(dst_sg);
+ }
+ }
+
+ /* Null descriptor with invalid toggle bit */
+ flexrm_write_desc(desc_ptr, flexrm_null_desc(!toggle));
+
+ /* Ensure that descriptors have been written to memory */
+ wmb();
+
+ /* Flip toggle bit in header */
+ flexrm_flip_header_toogle(orig_desc_ptr);
+
+ return desc_ptr;
+}
+
+static bool flexrm_sba_sanity_check(struct brcm_message *msg)
+{
+ u32 i;
+
+ if (!msg->sba.cmds || !msg->sba.cmds_count)
+ return false;
+
+ for (i = 0; i < msg->sba.cmds_count; i++) {
+ if (((msg->sba.cmds[i].flags & BRCM_SBA_CMD_TYPE_B) ||
+ (msg->sba.cmds[i].flags & BRCM_SBA_CMD_TYPE_C)) &&
+ (msg->sba.cmds[i].flags & BRCM_SBA_CMD_HAS_OUTPUT))
+ return false;
+ if ((msg->sba.cmds[i].flags & BRCM_SBA_CMD_TYPE_B) &&
+ (msg->sba.cmds[i].data_len > SRCT_LENGTH_MASK))
+ return false;
+ if ((msg->sba.cmds[i].flags & BRCM_SBA_CMD_TYPE_C) &&
+ (msg->sba.cmds[i].data_len > SRCT_LENGTH_MASK))
+ return false;
+ if ((msg->sba.cmds[i].flags & BRCM_SBA_CMD_HAS_RESP) &&
+ (msg->sba.cmds[i].resp_len > DSTT_LENGTH_MASK))
+ return false;
+ if ((msg->sba.cmds[i].flags & BRCM_SBA_CMD_HAS_OUTPUT) &&
+ (msg->sba.cmds[i].data_len > DSTT_LENGTH_MASK))
+ return false;
+ }
+
+ return true;
+}
+
+static u32 flexrm_sba_estimate_nonheader_desc_count(struct brcm_message *msg)
+{
+ u32 i, cnt;
+
+ cnt = 0;
+ for (i = 0; i < msg->sba.cmds_count; i++) {
+ cnt++;
+
+ if ((msg->sba.cmds[i].flags & BRCM_SBA_CMD_TYPE_B) ||
+ (msg->sba.cmds[i].flags & BRCM_SBA_CMD_TYPE_C))
+ cnt++;
+
+ if (msg->sba.cmds[i].flags & BRCM_SBA_CMD_HAS_RESP)
+ cnt++;
+
+ if (msg->sba.cmds[i].flags & BRCM_SBA_CMD_HAS_OUTPUT)
+ cnt++;
+ }
+
+ return cnt;
+}
+
+static void *flexrm_sba_write_descs(struct brcm_message *msg, u32 nhcnt,
+ u32 reqid, void *desc_ptr, u32 toggle,
+ void *start_desc, void *end_desc)
+{
+ u64 d;
+ u32 i, nhpos = 0;
+ struct brcm_sba_command *c;
+ void *orig_desc_ptr = desc_ptr;
+
+ /* Convert SBA commands into descriptors */
+ for (i = 0; i < msg->sba.cmds_count; i++) {
+ c = &msg->sba.cmds[i];
+
+ if ((c->flags & BRCM_SBA_CMD_HAS_RESP) &&
+ (c->flags & BRCM_SBA_CMD_HAS_OUTPUT)) {
+ /* Destination response descriptor */
+ d = flexrm_dst_desc(c->resp, c->resp_len);
+ flexrm_enqueue_desc(nhpos, nhcnt, reqid,
+ d, &desc_ptr, &toggle,
+ start_desc, end_desc);
+ nhpos++;
+ } else if (c->flags & BRCM_SBA_CMD_HAS_RESP) {
+ /* Destination response with tlast descriptor */
+ d = flexrm_dstt_desc(c->resp, c->resp_len);
+ flexrm_enqueue_desc(nhpos, nhcnt, reqid,
+ d, &desc_ptr, &toggle,
+ start_desc, end_desc);
+ nhpos++;
+ }
+
+ if (c->flags & BRCM_SBA_CMD_HAS_OUTPUT) {
+ /* Destination with tlast descriptor */
+ d = flexrm_dstt_desc(c->data, c->data_len);
+ flexrm_enqueue_desc(nhpos, nhcnt, reqid,
+ d, &desc_ptr, &toggle,
+ start_desc, end_desc);
+ nhpos++;
+ }
+
+ if (c->flags & BRCM_SBA_CMD_TYPE_B) {
+ /* Command as immediate descriptor */
+ d = flexrm_imm_desc(c->cmd);
+ flexrm_enqueue_desc(nhpos, nhcnt, reqid,
+ d, &desc_ptr, &toggle,
+ start_desc, end_desc);
+ nhpos++;
+ } else {
+ /* Command as immediate descriptor with tlast */
+ d = flexrm_immt_desc(c->cmd);
+ flexrm_enqueue_desc(nhpos, nhcnt, reqid,
+ d, &desc_ptr, &toggle,
+ start_desc, end_desc);
+ nhpos++;
+ }
+
+ if ((c->flags & BRCM_SBA_CMD_TYPE_B) ||
+ (c->flags & BRCM_SBA_CMD_TYPE_C)) {
+ /* Source with tlast descriptor */
+ d = flexrm_srct_desc(c->data, c->data_len);
+ flexrm_enqueue_desc(nhpos, nhcnt, reqid,
+ d, &desc_ptr, &toggle,
+ start_desc, end_desc);
+ nhpos++;
+ }
+ }
+
+ /* Null descriptor with invalid toggle bit */
+ flexrm_write_desc(desc_ptr, flexrm_null_desc(!toggle));
+
+ /* Ensure that descriptors have been written to memory */
+ wmb();
+
+ /* Flip toggle bit in header */
+ flexrm_flip_header_toogle(orig_desc_ptr);
+
+ return desc_ptr;
+}
+
+static bool flexrm_sanity_check(struct brcm_message *msg)
+{
+ if (!msg)
+ return false;
+
+ switch (msg->type) {
+ case BRCM_MESSAGE_SPU:
+ return flexrm_spu_sanity_check(msg);
+ case BRCM_MESSAGE_SBA:
+ return flexrm_sba_sanity_check(msg);
+ default:
+ return false;
+ };
+}
+
+static u32 flexrm_estimate_nonheader_desc_count(struct brcm_message *msg)
+{
+ if (!msg)
+ return 0;
+
+ switch (msg->type) {
+ case BRCM_MESSAGE_SPU:
+ return flexrm_spu_estimate_nonheader_desc_count(msg);
+ case BRCM_MESSAGE_SBA:
+ return flexrm_sba_estimate_nonheader_desc_count(msg);
+ default:
+ return 0;
+ };
+}
+
+static int flexrm_dma_map(struct device *dev, struct brcm_message *msg)
+{
+ if (!dev || !msg)
+ return -EINVAL;
+
+ switch (msg->type) {
+ case BRCM_MESSAGE_SPU:
+ return flexrm_spu_dma_map(dev, msg);
+ default:
+ break;
+ }
+
+ return 0;
+}
+
+static void flexrm_dma_unmap(struct device *dev, struct brcm_message *msg)
+{
+ if (!dev || !msg)
+ return;
+
+ switch (msg->type) {
+ case BRCM_MESSAGE_SPU:
+ flexrm_spu_dma_unmap(dev, msg);
+ break;
+ default:
+ break;
+ }
+}
+
+static void *flexrm_write_descs(struct brcm_message *msg, u32 nhcnt,
+ u32 reqid, void *desc_ptr, u32 toggle,
+ void *start_desc, void *end_desc)
+{
+ if (!msg || !desc_ptr || !start_desc || !end_desc)
+ return ERR_PTR(-ENOTSUPP);
+
+ if ((desc_ptr < start_desc) || (end_desc <= desc_ptr))
+ return ERR_PTR(-ERANGE);
+
+ switch (msg->type) {
+ case BRCM_MESSAGE_SPU:
+ return flexrm_spu_write_descs(msg, nhcnt, reqid,
+ desc_ptr, toggle,
+ start_desc, end_desc);
+ case BRCM_MESSAGE_SBA:
+ return flexrm_sba_write_descs(msg, nhcnt, reqid,
+ desc_ptr, toggle,
+ start_desc, end_desc);
+ default:
+ return ERR_PTR(-ENOTSUPP);
+ };
+}
+
+/* ====== FlexRM driver helper routines ===== */
+
+static int flexrm_new_request(struct flexrm_ring *ring,
+ struct brcm_message *batch_msg,
+ struct brcm_message *msg)
+{
+ void *next;
+ unsigned long flags;
+ u32 val, count, nhcnt;
+ u32 read_offset, write_offset;
+ bool exit_cleanup = false;
+ int ret = 0, reqid;
+
+ /* Do sanity check on message */
+ if (!flexrm_sanity_check(msg))
+ return -EIO;
+ msg->error = 0;
+
+ /* If no requests possible then save data pointer and goto done. */
+ reqid = ida_simple_get(&ring->requests_ida, 0,
+ RING_MAX_REQ_COUNT, GFP_KERNEL);
+ if (reqid < 0) {
+ spin_lock_irqsave(&ring->lock, flags);
+ if (batch_msg)
+ ring->last_pending_msg = batch_msg;
+ else
+ ring->last_pending_msg = msg;
+ spin_unlock_irqrestore(&ring->lock, flags);
+ return 0;
+ }
+ ring->requests[reqid] = msg;
+
+ /* Do DMA mappings for the message */
+ ret = flexrm_dma_map(ring->mbox->dev, msg);
+ if (ret < 0) {
+ ring->requests[reqid] = NULL;
+ ida_simple_remove(&ring->requests_ida, reqid);
+ return ret;
+ }
+
+ /* If last_pending_msg is already set then goto done with error */
+ spin_lock_irqsave(&ring->lock, flags);
+ if (ring->last_pending_msg)
+ ret = -ENOSPC;
+ spin_unlock_irqrestore(&ring->lock, flags);
+ if (ret < 0) {
+ dev_warn(ring->mbox->dev, "no space in ring %d\n", ring->num);
+ exit_cleanup = true;
+ goto exit;
+ }
+
+ /* Determine current HW BD read offset */
+ read_offset = readl_relaxed(ring->regs + RING_BD_READ_PTR);
+ val = readl_relaxed(ring->regs + RING_BD_START_ADDR);
+ read_offset *= RING_DESC_SIZE;
+ read_offset += (u32)(BD_START_ADDR_DECODE(val) - ring->bd_dma_base);
+
+ /*
+ * Number required descriptors = number of non-header descriptors +
+ * number of header descriptors +
+ * 1x null descriptor
+ */
+ nhcnt = flexrm_estimate_nonheader_desc_count(msg);
+ count = flexrm_estimate_header_desc_count(nhcnt) + nhcnt + 1;
+
+ /* Check for available descriptor space. */
+ write_offset = ring->bd_write_offset;
+ while (count) {
+ if (!flexrm_is_next_table_desc(ring->bd_base + write_offset))
+ count--;
+ write_offset += RING_DESC_SIZE;
+ if (write_offset == RING_BD_SIZE)
+ write_offset = 0x0;
+ if (write_offset == read_offset)
+ break;
+ }
+ if (count) {
+ spin_lock_irqsave(&ring->lock, flags);
+ if (batch_msg)
+ ring->last_pending_msg = batch_msg;
+ else
+ ring->last_pending_msg = msg;
+ spin_unlock_irqrestore(&ring->lock, flags);
+ ret = 0;
+ exit_cleanup = true;
+ goto exit;
+ }
+
+ /* Write descriptors to ring */
+ next = flexrm_write_descs(msg, nhcnt, reqid,
+ ring->bd_base + ring->bd_write_offset,
+ RING_BD_TOGGLE_VALID(ring->bd_write_offset),
+ ring->bd_base, ring->bd_base + RING_BD_SIZE);
+ if (IS_ERR(next)) {
+ ret = PTR_ERR(next);
+ exit_cleanup = true;
+ goto exit;
+ }
+
+ /* Save ring BD write offset */
+ ring->bd_write_offset = (unsigned long)(next - ring->bd_base);
+
+exit:
+ /* Update error status in message */
+ msg->error = ret;
+
+ /* Cleanup if we failed */
+ if (exit_cleanup) {
+ flexrm_dma_unmap(ring->mbox->dev, msg);
+ ring->requests[reqid] = NULL;
+ ida_simple_remove(&ring->requests_ida, reqid);
+ }
+
+ return ret;
+}
+
+static int flexrm_process_completions(struct flexrm_ring *ring)
+{
+ u64 desc;
+ int err, count = 0;
+ unsigned long flags;
+ struct brcm_message *msg = NULL;
+ u32 reqid, cmpl_read_offset, cmpl_write_offset;
+ struct mbox_chan *chan = &ring->mbox->controller.chans[ring->num];
+
+ spin_lock_irqsave(&ring->lock, flags);
+
+ /* Check last_pending_msg */
+ if (ring->last_pending_msg) {
+ msg = ring->last_pending_msg;
+ ring->last_pending_msg = NULL;
+ }
+
+ /*
+ * Get current completion read and write offset
+ *
+ * Note: We should read completion write pointer atleast once
+ * after we get a MSI interrupt because HW maintains internal
+ * MSI status which will allow next MSI interrupt only after
+ * completion write pointer is read.
+ */
+ cmpl_write_offset = readl_relaxed(ring->regs + RING_CMPL_WRITE_PTR);
+ cmpl_write_offset *= RING_DESC_SIZE;
+ cmpl_read_offset = ring->cmpl_read_offset;
+ ring->cmpl_read_offset = cmpl_write_offset;
+
+ spin_unlock_irqrestore(&ring->lock, flags);
+
+ /* If last_pending_msg was set then queue it back */
+ if (msg)
+ mbox_send_message(chan, msg);
+
+ /* For each completed request notify mailbox clients */
+ reqid = 0;
+ while (cmpl_read_offset != cmpl_write_offset) {
+ /* Dequeue next completion descriptor */
+ desc = *((u64 *)(ring->cmpl_base + cmpl_read_offset));
+
+ /* Next read offset */
+ cmpl_read_offset += RING_DESC_SIZE;
+ if (cmpl_read_offset == RING_CMPL_SIZE)
+ cmpl_read_offset = 0;
+
+ /* Decode error from completion descriptor */
+ err = flexrm_cmpl_desc_to_error(desc);
+ if (err < 0) {
+ dev_warn(ring->mbox->dev,
+ "got completion desc=0x%lx with error %d",
+ (unsigned long)desc, err);
+ }
+
+ /* Determine request id from completion descriptor */
+ reqid = flexrm_cmpl_desc_to_reqid(desc);
+
+ /* Determine message pointer based on reqid */
+ msg = ring->requests[reqid];
+ if (!msg) {
+ dev_warn(ring->mbox->dev,
+ "null msg pointer for completion desc=0x%lx",
+ (unsigned long)desc);
+ continue;
+ }
+
+ /* Release reqid for recycling */
+ ring->requests[reqid] = NULL;
+ ida_simple_remove(&ring->requests_ida, reqid);
+
+ /* Unmap DMA mappings */
+ flexrm_dma_unmap(ring->mbox->dev, msg);
+
+ /* Give-back message to mailbox client */
+ msg->error = err;
+ mbox_chan_received_data(chan, msg);
+
+ /* Increment number of completions processed */
+ count++;
+ }
+
+ return count;
+}
+
+/* ====== FlexRM interrupt handler ===== */
+
+static irqreturn_t flexrm_irq_event(int irq, void *dev_id)
+{
+ /* We only have MSI for completions so just wakeup IRQ thread */
+ /* Ring related errors will be informed via completion descriptors */
+
+ return IRQ_WAKE_THREAD;
+}
+
+static irqreturn_t flexrm_irq_thread(int irq, void *dev_id)
+{
+ flexrm_process_completions(dev_id);
+
+ return IRQ_HANDLED;
+}
+
+/* ====== FlexRM mailbox callbacks ===== */
+
+static int flexrm_send_data(struct mbox_chan *chan, void *data)
+{
+ int i, rc;
+ struct flexrm_ring *ring = chan->con_priv;
+ struct brcm_message *msg = data;
+
+ if (msg->type == BRCM_MESSAGE_BATCH) {
+ for (i = msg->batch.msgs_queued;
+ i < msg->batch.msgs_count; i++) {
+ rc = flexrm_new_request(ring, msg,
+ &msg->batch.msgs[i]);
+ if (rc) {
+ msg->error = rc;
+ return rc;
+ }
+ msg->batch.msgs_queued++;
+ }
+ return 0;
+ }
+
+ return flexrm_new_request(ring, NULL, data);
+}
+
+static bool flexrm_peek_data(struct mbox_chan *chan)
+{
+ int cnt = flexrm_process_completions(chan->con_priv);
+
+ return (cnt > 0) ? true : false;
+}
+
+static int flexrm_startup(struct mbox_chan *chan)
+{
+ u64 d;
+ u32 val, off;
+ int ret = 0;
+ dma_addr_t next_addr;
+ struct flexrm_ring *ring = chan->con_priv;
+
+ /* Allocate BD memory */
+ ring->bd_base = dma_pool_alloc(ring->mbox->bd_pool,
+ GFP_KERNEL, &ring->bd_dma_base);
+ if (!ring->bd_base) {
+ dev_err(ring->mbox->dev, "can't allocate BD memory\n");
+ ret = -ENOMEM;
+ goto fail;
+ }
+
+ /* Configure next table pointer entries in BD memory */
+ for (off = 0; off < RING_BD_SIZE; off += RING_DESC_SIZE) {
+ next_addr = off + RING_DESC_SIZE;
+ if (next_addr == RING_BD_SIZE)
+ next_addr = 0;
+ next_addr += ring->bd_dma_base;
+ if (RING_BD_ALIGN_CHECK(next_addr))
+ d = flexrm_next_table_desc(RING_BD_TOGGLE_VALID(off),
+ next_addr);
+ else
+ d = flexrm_null_desc(RING_BD_TOGGLE_INVALID(off));
+ flexrm_write_desc(ring->bd_base + off, d);
+ }
+
+ /* Allocate completion memory */
+ ring->cmpl_base = dma_pool_alloc(ring->mbox->cmpl_pool,
+ GFP_KERNEL, &ring->cmpl_dma_base);
+ if (!ring->cmpl_base) {
+ dev_err(ring->mbox->dev, "can't allocate completion memory\n");
+ ret = -ENOMEM;
+ goto fail_free_bd_memory;
+ }
+ memset(ring->cmpl_base, 0, RING_CMPL_SIZE);
+
+ /* Request IRQ */
+ if (ring->irq == UINT_MAX) {
+ dev_err(ring->mbox->dev, "ring IRQ not available\n");
+ ret = -ENODEV;
+ goto fail_free_cmpl_memory;
+ }
+ ret = request_threaded_irq(ring->irq,
+ flexrm_irq_event,
+ flexrm_irq_thread,
+ 0, dev_name(ring->mbox->dev), ring);
+ if (ret) {
+ dev_err(ring->mbox->dev, "failed to request ring IRQ\n");
+ goto fail_free_cmpl_memory;
+ }
+ ring->irq_requested = true;
+
+ /* Disable/inactivate ring */
+ writel_relaxed(0x0, ring->regs + RING_CONTROL);
+
+ /* Program BD start address */
+ val = BD_START_ADDR_VALUE(ring->bd_dma_base);
+ writel_relaxed(val, ring->regs + RING_BD_START_ADDR);
+
+ /* BD write pointer will be same as HW write pointer */
+ ring->bd_write_offset =
+ readl_relaxed(ring->regs + RING_BD_WRITE_PTR);
+ ring->bd_write_offset *= RING_DESC_SIZE;
+
+ /* Program completion start address */
+ val = CMPL_START_ADDR_VALUE(ring->cmpl_dma_base);
+ writel_relaxed(val, ring->regs + RING_CMPL_START_ADDR);
+
+ /* Ensure last pending message is cleared */
+ ring->last_pending_msg = NULL;
+
+ /* Completion read pointer will be same as HW write pointer */
+ ring->cmpl_read_offset =
+ readl_relaxed(ring->regs + RING_CMPL_WRITE_PTR);
+ ring->cmpl_read_offset *= RING_DESC_SIZE;
+
+ /* Read ring Tx, Rx, and Outstanding counts to clear */
+ readl_relaxed(ring->regs + RING_NUM_REQ_RECV_LS);
+ readl_relaxed(ring->regs + RING_NUM_REQ_RECV_MS);
+ readl_relaxed(ring->regs + RING_NUM_REQ_TRANS_LS);
+ readl_relaxed(ring->regs + RING_NUM_REQ_TRANS_MS);
+ readl_relaxed(ring->regs + RING_NUM_REQ_OUTSTAND);
+
+ /* Configure RING_MSI_CONTROL */
+ val = 0;
+ val |= (ring->msi_timer_val << MSI_TIMER_VAL_SHIFT);
+ val |= BIT(MSI_ENABLE_SHIFT);
+ val |= (ring->msi_count_threshold & MSI_COUNT_MASK) << MSI_COUNT_SHIFT;
+ writel_relaxed(val, ring->regs + RING_MSI_CONTROL);
+
+ /* Enable/activate ring */
+ val = BIT(CONTROL_ACTIVE_SHIFT);
+ writel_relaxed(val, ring->regs + RING_CONTROL);
+
+ return 0;
+
+fail_free_cmpl_memory:
+ dma_pool_free(ring->mbox->cmpl_pool,
+ ring->cmpl_base, ring->cmpl_dma_base);
+ ring->cmpl_base = NULL;
+fail_free_bd_memory:
+ dma_pool_free(ring->mbox->bd_pool,
+ ring->bd_base, ring->bd_dma_base);
+ ring->bd_base = NULL;
+fail:
+ return ret;
+}
+
+static void flexrm_shutdown(struct mbox_chan *chan)
+{
+ u32 reqid;
+ unsigned int timeout;
+ struct brcm_message *msg;
+ struct flexrm_ring *ring = chan->con_priv;
+
+ /* Disable/inactivate ring */
+ writel_relaxed(0x0, ring->regs + RING_CONTROL);
+
+ /* Flush ring with timeout of 1s */
+ timeout = 1000;
+ writel_relaxed(BIT(CONTROL_FLUSH_SHIFT),
+ ring->regs + RING_CONTROL);
+ do {
+ if (readl_relaxed(ring->regs + RING_FLUSH_DONE) &
+ FLUSH_DONE_MASK)
+ break;
+ mdelay(1);
+ } while (timeout--);
+
+ /* Abort all in-flight requests */
+ for (reqid = 0; reqid < RING_MAX_REQ_COUNT; reqid++) {
+ msg = ring->requests[reqid];
+ if (!msg)
+ continue;
+
+ /* Release reqid for recycling */
+ ring->requests[reqid] = NULL;
+ ida_simple_remove(&ring->requests_ida, reqid);
+
+ /* Unmap DMA mappings */
+ flexrm_dma_unmap(ring->mbox->dev, msg);
+
+ /* Give-back message to mailbox client */
+ msg->error = -EIO;
+ mbox_chan_received_data(chan, msg);
+ }
+
+ /* Release IRQ */
+ if (ring->irq_requested) {
+ free_irq(ring->irq, ring);
+ ring->irq_requested = false;
+ }
+
+ /* Free-up completion descriptor ring */
+ if (ring->cmpl_base) {
+ dma_pool_free(ring->mbox->cmpl_pool,
+ ring->cmpl_base, ring->cmpl_dma_base);
+ ring->cmpl_base = NULL;
+ }
+
+ /* Free-up BD descriptor ring */
+ if (ring->bd_base) {
+ dma_pool_free(ring->mbox->bd_pool,
+ ring->bd_base, ring->bd_dma_base);
+ ring->bd_base = NULL;
+ }
+}
+
+static bool flexrm_last_tx_done(struct mbox_chan *chan)
+{
+ bool ret;
+ unsigned long flags;
+ struct flexrm_ring *ring = chan->con_priv;
+
+ spin_lock_irqsave(&ring->lock, flags);
+ ret = (ring->last_pending_msg) ? false : true;
+ spin_unlock_irqrestore(&ring->lock, flags);
+
+ return ret;
+}
+
+static const struct mbox_chan_ops flexrm_mbox_chan_ops = {
+ .send_data = flexrm_send_data,
+ .startup = flexrm_startup,
+ .shutdown = flexrm_shutdown,
+ .last_tx_done = flexrm_last_tx_done,
+ .peek_data = flexrm_peek_data,
+};
+
+static struct mbox_chan *flexrm_mbox_of_xlate(struct mbox_controller *cntlr,
+ const struct of_phandle_args *pa)
+{
+ struct mbox_chan *chan;
+ struct flexrm_ring *ring;
+
+ if (pa->args_count < 3)
+ return ERR_PTR(-EINVAL);
+
+ if (pa->args[0] >= cntlr->num_chans)
+ return ERR_PTR(-ENOENT);
+
+ if (pa->args[1] > MSI_COUNT_MASK)
+ return ERR_PTR(-EINVAL);
+
+ if (pa->args[2] > MSI_TIMER_VAL_MASK)
+ return ERR_PTR(-EINVAL);
+
+ chan = &cntlr->chans[pa->args[0]];
+ ring = chan->con_priv;
+ ring->msi_count_threshold = pa->args[1];
+ ring->msi_timer_val = pa->args[2];
+
+ return chan;
+}
+
+/* ====== FlexRM platform driver ===== */
+
+static void flexrm_mbox_msi_write(struct msi_desc *desc, struct msi_msg *msg)
+{
+ struct device *dev = msi_desc_to_dev(desc);
+ struct flexrm_mbox *mbox = dev_get_drvdata(dev);
+ struct flexrm_ring *ring = &mbox->rings[desc->platform.msi_index];
+
+ /* Configure per-Ring MSI registers */
+ writel_relaxed(msg->address_lo, ring->regs + RING_MSI_ADDR_LS);
+ writel_relaxed(msg->address_hi, ring->regs + RING_MSI_ADDR_MS);
+ writel_relaxed(msg->data, ring->regs + RING_MSI_DATA_VALUE);
+}
+
+static int flexrm_mbox_probe(struct platform_device *pdev)
+{
+ int index, ret = 0;
+ void __iomem *regs;
+ void __iomem *regs_end;
+ struct msi_desc *desc;
+ struct resource *iomem;
+ struct flexrm_ring *ring;
+ struct flexrm_mbox *mbox;
+ struct device *dev = &pdev->dev;
+
+ /* Allocate driver mailbox struct */
+ mbox = devm_kzalloc(dev, sizeof(*mbox), GFP_KERNEL);
+ if (!mbox) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+ mbox->dev = dev;
+ platform_set_drvdata(pdev, mbox);
+
+ /* Get resource for registers */
+ iomem = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+ if (!iomem || (resource_size(iomem) < RING_REGS_SIZE)) {
+ ret = -ENODEV;
+ goto fail;
+ }
+
+ /* Map registers of all rings */
+ mbox->regs = devm_ioremap_resource(&pdev->dev, iomem);
+ if (IS_ERR(mbox->regs)) {
+ ret = PTR_ERR(mbox->regs);
+ dev_err(&pdev->dev, "Failed to remap mailbox regs: %d\n", ret);
+ goto fail;
+ }
+ regs_end = mbox->regs + resource_size(iomem);
+
+ /* Scan and count available rings */
+ mbox->num_rings = 0;
+ for (regs = mbox->regs; regs < regs_end; regs += RING_REGS_SIZE) {
+ if (readl_relaxed(regs + RING_VER) == RING_VER_MAGIC)
+ mbox->num_rings++;
+ }
+ if (!mbox->num_rings) {
+ ret = -ENODEV;
+ goto fail;
+ }
+
+ /* Allocate driver ring structs */
+ ring = devm_kcalloc(dev, mbox->num_rings, sizeof(*ring), GFP_KERNEL);
+ if (!ring) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+ mbox->rings = ring;
+
+ /* Initialize members of driver ring structs */
+ regs = mbox->regs;
+ for (index = 0; index < mbox->num_rings; index++) {
+ ring = &mbox->rings[index];
+ ring->num = index;
+ ring->mbox = mbox;
+ while ((regs < regs_end) &&
+ (readl_relaxed(regs + RING_VER) != RING_VER_MAGIC))
+ regs += RING_REGS_SIZE;
+ if (regs_end <= regs) {
+ ret = -ENODEV;
+ goto fail;
+ }
+ ring->regs = regs;
+ regs += RING_REGS_SIZE;
+ ring->irq = UINT_MAX;
+ ring->irq_requested = false;
+ ring->msi_timer_val = MSI_TIMER_VAL_MASK;
+ ring->msi_count_threshold = 0x1;
+ ida_init(&ring->requests_ida);
+ memset(ring->requests, 0, sizeof(ring->requests));
+ ring->bd_base = NULL;
+ ring->bd_dma_base = 0;
+ ring->cmpl_base = NULL;
+ ring->cmpl_dma_base = 0;
+ spin_lock_init(&ring->lock);
+ ring->last_pending_msg = NULL;
+ ring->cmpl_read_offset = 0;
+ }
+
+ /* FlexRM is capable of 40-bit physical addresses only */
+ ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(40));
+ if (ret) {
+ ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(32));
+ if (ret)
+ goto fail;
+ }
+
+ /* Create DMA pool for ring BD memory */
+ mbox->bd_pool = dma_pool_create("bd", dev, RING_BD_SIZE,
+ 1 << RING_BD_ALIGN_ORDER, 0);
+ if (!mbox->bd_pool) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+
+ /* Create DMA pool for ring completion memory */
+ mbox->cmpl_pool = dma_pool_create("cmpl", dev, RING_CMPL_SIZE,
+ 1 << RING_CMPL_ALIGN_ORDER, 0);
+ if (!mbox->cmpl_pool) {
+ ret = -ENOMEM;
+ goto fail_destroy_bd_pool;
+ }
+
+ /* Allocate platform MSIs for each ring */
+ ret = platform_msi_domain_alloc_irqs(dev, mbox->num_rings,
+ flexrm_mbox_msi_write);
+ if (ret)
+ goto fail_destroy_cmpl_pool;
+
+ /* Save alloced IRQ numbers for each ring */
+ for_each_msi_entry(desc, dev) {
+ ring = &mbox->rings[desc->platform.msi_index];
+ ring->irq = desc->irq;
+ }
+
+ /* Initialize mailbox controller */
+ mbox->controller.txdone_irq = false;
+ mbox->controller.txdone_poll = true;
+ mbox->controller.txpoll_period = 1;
+ mbox->controller.ops = &flexrm_mbox_chan_ops;
+ mbox->controller.dev = dev;
+ mbox->controller.num_chans = mbox->num_rings;
+ mbox->controller.of_xlate = flexrm_mbox_of_xlate;
+ mbox->controller.chans = devm_kcalloc(dev, mbox->num_rings,
+ sizeof(*mbox->controller.chans), GFP_KERNEL);
+ if (!mbox->controller.chans) {
+ ret = -ENOMEM;
+ goto fail_free_msis;
+ }
+ for (index = 0; index < mbox->num_rings; index++)
+ mbox->controller.chans[index].con_priv = &mbox->rings[index];
+
+ /* Register mailbox controller */
+ ret = mbox_controller_register(&mbox->controller);
+ if (ret)
+ goto fail_free_msis;
+
+ dev_info(dev, "registered flexrm mailbox with %d channels\n",
+ mbox->controller.num_chans);
+
+ return 0;
+
+fail_free_msis:
+ platform_msi_domain_free_irqs(dev);
+fail_destroy_cmpl_pool:
+ dma_pool_destroy(mbox->cmpl_pool);
+fail_destroy_bd_pool:
+ dma_pool_destroy(mbox->bd_pool);
+fail:
+ return ret;
+}
+
+static int flexrm_mbox_remove(struct platform_device *pdev)
+{
+ int index;
+ struct device *dev = &pdev->dev;
+ struct flexrm_ring *ring;
+ struct flexrm_mbox *mbox = platform_get_drvdata(pdev);
+
+ mbox_controller_unregister(&mbox->controller);
+
+ platform_msi_domain_free_irqs(dev);
+
+ dma_pool_destroy(mbox->cmpl_pool);
+ dma_pool_destroy(mbox->bd_pool);
+
+ for (index = 0; index < mbox->num_rings; index++) {
+ ring = &mbox->rings[index];
+ ida_destroy(&ring->requests_ida);
+ }
+
+ return 0;
+}
+
+static const struct of_device_id flexrm_mbox_of_match[] = {
+ { .compatible = "brcm,iproc-flexrm-mbox", },
+ {},
+};
+MODULE_DEVICE_TABLE(of, flexrm_mbox_of_match);
+
+static struct platform_driver flexrm_mbox_driver = {
+ .driver = {
+ .name = "brcm-flexrm-mbox",
+ .of_match_table = flexrm_mbox_of_match,
+ },
+ .probe = flexrm_mbox_probe,
+ .remove = flexrm_mbox_remove,
+};
+module_platform_driver(flexrm_mbox_driver);
+
+MODULE_AUTHOR("Anup Patel <anup.patel@broadcom.com>");
+MODULE_DESCRIPTION("Broadcom FlexRM mailbox driver");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/mailbox/bcm-pdc-mailbox.c b/drivers/mailbox/bcm-pdc-mailbox.c
index 2aeb034d5fb9..4fe7be0bdd11 100644
--- a/drivers/mailbox/bcm-pdc-mailbox.c
+++ b/drivers/mailbox/bcm-pdc-mailbox.c
@@ -18,7 +18,8 @@
* Broadcom PDC Mailbox Driver
* The PDC provides a ring based programming interface to one or more hardware
* offload engines. For example, the PDC driver works with both SPU-M and SPU2
- * cryptographic offload hardware. In some chips the PDC is referred to as MDE.
+ * cryptographic offload hardware. In some chips the PDC is referred to as MDE,
+ * and in others the FA2/FA+ hardware is used with this PDC driver.
*
* The PDC driver registers with the Linux mailbox framework as a mailbox
* controller, once for each PDC instance. Ring 0 for each PDC is registered as
@@ -108,6 +109,7 @@
#define PDC_INTMASK_OFFSET 0x24
#define PDC_INTSTATUS_OFFSET 0x20
#define PDC_RCVLAZY0_OFFSET (0x30 + 4 * PDC_RINGSET)
+#define FA_RCVLAZY0_OFFSET 0x100
/*
* For SPU2, configure MDE_CKSUM_CONTROL to write 17 bytes of metadata
@@ -162,6 +164,11 @@
/* Maximum size buffer the DMA engine can handle */
#define PDC_DMA_BUF_MAX 16384
+enum pdc_hw {
+ FA_HW, /* FA2/FA+ hardware (i.e. Northstar Plus) */
+ PDC_HW /* PDC/MDE hardware (i.e. Northstar 2, Pegasus) */
+};
+
struct pdc_dma_map {
void *ctx; /* opaque context associated with frame */
};
@@ -211,13 +218,13 @@ struct pdc_regs {
u32 gptimer; /* 0x028 */
u32 PAD;
- u32 intrcvlazy_0; /* 0x030 */
- u32 intrcvlazy_1; /* 0x034 */
- u32 intrcvlazy_2; /* 0x038 */
- u32 intrcvlazy_3; /* 0x03c */
+ u32 intrcvlazy_0; /* 0x030 (Only in PDC, not FA2) */
+ u32 intrcvlazy_1; /* 0x034 (Only in PDC, not FA2) */
+ u32 intrcvlazy_2; /* 0x038 (Only in PDC, not FA2) */
+ u32 intrcvlazy_3; /* 0x03c (Only in PDC, not FA2) */
u32 PAD[48];
- u32 removed_intrecvlazy; /* 0x100 */
+ u32 fa_intrecvlazy; /* 0x100 (Only in FA2, not PDC) */
u32 flowctlthresh; /* 0x104 */
u32 wrrthresh; /* 0x108 */
u32 gmac_idle_cnt_thresh; /* 0x10c */
@@ -243,7 +250,7 @@ struct pdc_regs {
u32 serdes_status1; /* 0x1b0 */
u32 PAD[11]; /* 0x1b4-1dc */
u32 clk_ctl_st; /* 0x1e0 */
- u32 hw_war; /* 0x1e4 */
+ u32 hw_war; /* 0x1e4 (Only in PDC, not FA2) */
u32 pwrctl; /* 0x1e8 */
u32 PAD[5];
@@ -410,6 +417,9 @@ struct pdc_state {
u32 txnobuf; /* unable to create tx descriptor */
u32 rxnobuf; /* unable to create rx descriptor */
u32 rx_oflow; /* count of rx overflows */
+
+ /* hardware type - FA2 or PDC/MDE */
+ enum pdc_hw hw_type;
};
/* Global variables */
@@ -1396,7 +1406,13 @@ static int pdc_interrupts_init(struct pdc_state *pdcs)
/* interrupt configuration */
iowrite32(PDC_INTMASK, pdcs->pdc_reg_vbase + PDC_INTMASK_OFFSET);
- iowrite32(PDC_LAZY_INT, pdcs->pdc_reg_vbase + PDC_RCVLAZY0_OFFSET);
+
+ if (pdcs->hw_type == FA_HW)
+ iowrite32(PDC_LAZY_INT, pdcs->pdc_reg_vbase +
+ FA_RCVLAZY0_OFFSET);
+ else
+ iowrite32(PDC_LAZY_INT, pdcs->pdc_reg_vbase +
+ PDC_RCVLAZY0_OFFSET);
/* read irq from device tree */
pdcs->pdc_irq = irq_of_parse_and_map(dn, 0);
@@ -1465,6 +1481,17 @@ static int pdc_mb_init(struct pdc_state *pdcs)
return 0;
}
+/* Device tree API */
+static const int pdc_hw = PDC_HW;
+static const int fa_hw = FA_HW;
+
+static const struct of_device_id pdc_mbox_of_match[] = {
+ {.compatible = "brcm,iproc-pdc-mbox", .data = &pdc_hw},
+ {.compatible = "brcm,iproc-fa2-mbox", .data = &fa_hw},
+ { /* sentinel */ }
+};
+MODULE_DEVICE_TABLE(of, pdc_mbox_of_match);
+
/**
* pdc_dt_read() - Read application-specific data from device tree.
* @pdev: Platform device
@@ -1481,6 +1508,8 @@ static int pdc_dt_read(struct platform_device *pdev, struct pdc_state *pdcs)
{
struct device *dev = &pdev->dev;
struct device_node *dn = pdev->dev.of_node;
+ const struct of_device_id *match;
+ const int *hw_type;
int err;
err = of_property_read_u32(dn, "brcm,rx-status-len",
@@ -1492,6 +1521,14 @@ static int pdc_dt_read(struct platform_device *pdev, struct pdc_state *pdcs)
pdcs->use_bcm_hdr = of_property_read_bool(dn, "brcm,use-bcm-hdr");
+ pdcs->hw_type = PDC_HW;
+
+ match = of_match_device(of_match_ptr(pdc_mbox_of_match), dev);
+ if (match != NULL) {
+ hw_type = match->data;
+ pdcs->hw_type = *hw_type;
+ }
+
return 0;
}
@@ -1525,7 +1562,7 @@ static int pdc_probe(struct platform_device *pdev)
pdcs->pdc_idx = pdcg.num_spu;
pdcg.num_spu++;
- err = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(32));
+ err = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(39));
if (err) {
dev_warn(dev, "PDC device cannot perform DMA. Error %d.", err);
goto cleanup;
@@ -1611,12 +1648,6 @@ static int pdc_remove(struct platform_device *pdev)
return 0;
}
-static const struct of_device_id pdc_mbox_of_match[] = {
- {.compatible = "brcm,iproc-pdc-mbox"},
- { /* sentinel */ }
-};
-MODULE_DEVICE_TABLE(of, pdc_mbox_of_match);
-
static struct platform_driver pdc_mbox_driver = {
.probe = pdc_probe,
.remove = pdc_remove,
diff --git a/drivers/mailbox/hi6220-mailbox.c b/drivers/mailbox/hi6220-mailbox.c
index 613722db5daf..519376d3534c 100644
--- a/drivers/mailbox/hi6220-mailbox.c
+++ b/drivers/mailbox/hi6220-mailbox.c
@@ -221,7 +221,7 @@ static void hi6220_mbox_shutdown(struct mbox_chan *chan)
mbox->irq_map_chan[mchan->ack_irq] = NULL;
}
-static struct mbox_chan_ops hi6220_mbox_ops = {
+static const struct mbox_chan_ops hi6220_mbox_ops = {
.send_data = hi6220_mbox_send_data,
.startup = hi6220_mbox_startup,
.shutdown = hi6220_mbox_shutdown,
diff --git a/drivers/mailbox/mailbox-xgene-slimpro.c b/drivers/mailbox/mailbox-xgene-slimpro.c
index dd2afbca51c9..a7040163dd43 100644
--- a/drivers/mailbox/mailbox-xgene-slimpro.c
+++ b/drivers/mailbox/mailbox-xgene-slimpro.c
@@ -174,7 +174,7 @@ static void slimpro_mbox_shutdown(struct mbox_chan *chan)
devm_free_irq(mb_chan->dev, mb_chan->irq, mb_chan);
}
-static struct mbox_chan_ops slimpro_mbox_ops = {
+static const struct mbox_chan_ops slimpro_mbox_ops = {
.send_data = slimpro_mbox_send_data,
.startup = slimpro_mbox_startup,
.shutdown = slimpro_mbox_shutdown,
diff --git a/drivers/mailbox/mailbox.c b/drivers/mailbox/mailbox.c
index 4671f8a12872..9dfbf7ea10a2 100644
--- a/drivers/mailbox/mailbox.c
+++ b/drivers/mailbox/mailbox.c
@@ -103,11 +103,14 @@ static void tx_tick(struct mbox_chan *chan, int r)
/* Submit next message */
msg_submit(chan);
+ if (!mssg)
+ return;
+
/* Notify the client */
- if (mssg && chan->cl->tx_done)
+ if (chan->cl->tx_done)
chan->cl->tx_done(chan->cl, mssg, r);
- if (chan->cl->tx_block)
+ if (r != -ETIME && chan->cl->tx_block)
complete(&chan->tx_complete);
}
@@ -260,7 +263,7 @@ int mbox_send_message(struct mbox_chan *chan, void *mssg)
msg_submit(chan);
- if (chan->cl->tx_block && chan->active_req) {
+ if (chan->cl->tx_block) {
unsigned long wait;
int ret;
@@ -271,8 +274,8 @@ int mbox_send_message(struct mbox_chan *chan, void *mssg)
ret = wait_for_completion_timeout(&chan->tx_complete, wait);
if (ret == 0) {
- t = -EIO;
- tx_tick(chan, -EIO);
+ t = -ETIME;
+ tx_tick(chan, t);
}
}
@@ -453,6 +456,12 @@ int mbox_controller_register(struct mbox_controller *mbox)
txdone = TXDONE_BY_ACK;
if (txdone == TXDONE_BY_POLL) {
+
+ if (!mbox->ops->last_tx_done) {
+ dev_err(mbox->dev, "last_tx_done method is absent\n");
+ return -EINVAL;
+ }
+
hrtimer_init(&mbox->poll_hrt, CLOCK_MONOTONIC,
HRTIMER_MODE_REL);
mbox->poll_hrt.function = txdone_hrtimer;
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 9c689b34e6e7..975922c8f231 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -2773,7 +2773,6 @@ static int cache_create(struct cache_args *ca, struct cache **result)
ti->num_discard_bios = 1;
ti->discards_supported = true;
- ti->discard_zeroes_data_unsupported = true;
ti->split_discard_bios = false;
cache->features = ca->features;
diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h
index 136fda3ff9e5..fea5bd52ada8 100644
--- a/drivers/md/dm-core.h
+++ b/drivers/md/dm-core.h
@@ -132,6 +132,7 @@ void dm_init_md_queue(struct mapped_device *md);
void dm_init_normal_md_queue(struct mapped_device *md);
int md_in_flight(struct mapped_device *md);
void disable_write_same(struct mapped_device *md);
+void disable_write_zeroes(struct mapped_device *md);
static inline struct completion *dm_get_completion_from_kobject(struct kobject *kobj)
{
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 389a3637ffcc..ef1d836bd81b 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -2030,7 +2030,6 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
wake_up_process(cc->write_thread);
ti->num_flush_bios = 1;
- ti->discard_zeroes_data_unsupported = true;
return 0;
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 03940bf36f6c..3702e502466d 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -312,9 +312,12 @@ static void do_region(int op, int op_flags, unsigned region,
*/
if (op == REQ_OP_DISCARD)
special_cmd_max_sectors = q->limits.max_discard_sectors;
+ else if (op == REQ_OP_WRITE_ZEROES)
+ special_cmd_max_sectors = q->limits.max_write_zeroes_sectors;
else if (op == REQ_OP_WRITE_SAME)
special_cmd_max_sectors = q->limits.max_write_same_sectors;
- if ((op == REQ_OP_DISCARD || op == REQ_OP_WRITE_SAME) &&
+ if ((op == REQ_OP_DISCARD || op == REQ_OP_WRITE_ZEROES ||
+ op == REQ_OP_WRITE_SAME) &&
special_cmd_max_sectors == 0) {
dec_count(io, region, -EOPNOTSUPP);
return;
@@ -328,11 +331,18 @@ static void do_region(int op, int op_flags, unsigned region,
/*
* Allocate a suitably sized-bio.
*/
- if ((op == REQ_OP_DISCARD) || (op == REQ_OP_WRITE_SAME))
+ switch (op) {
+ case REQ_OP_DISCARD:
+ case REQ_OP_WRITE_ZEROES:
+ num_bvecs = 0;
+ break;
+ case REQ_OP_WRITE_SAME:
num_bvecs = 1;
- else
+ break;
+ default:
num_bvecs = min_t(int, BIO_MAX_PAGES,
dm_sector_div_up(remaining, (PAGE_SIZE >> SECTOR_SHIFT)));
+ }
bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios);
bio->bi_iter.bi_sector = where->sector + (where->count - remaining);
@@ -341,7 +351,7 @@ static void do_region(int op, int op_flags, unsigned region,
bio_set_op_attrs(bio, op, op_flags);
store_io_and_region_in_bio(bio, io, region);
- if (op == REQ_OP_DISCARD) {
+ if (op == REQ_OP_DISCARD || op == REQ_OP_WRITE_ZEROES) {
num_sectors = min_t(sector_t, special_cmd_max_sectors, remaining);
bio->bi_iter.bi_size = num_sectors << SECTOR_SHIFT;
remaining -= num_sectors;
diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index 9e9d04cb7d51..f85846741d50 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -733,11 +733,11 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
job->pages = &zero_page_list;
/*
- * Use WRITE SAME to optimize zeroing if all dests support it.
+ * Use WRITE ZEROES to optimize zeroing if all dests support it.
*/
- job->rw = REQ_OP_WRITE_SAME;
+ job->rw = REQ_OP_WRITE_ZEROES;
for (i = 0; i < job->num_dests; i++)
- if (!bdev_write_same(job->dests[i].bdev)) {
+ if (!bdev_write_zeroes_sectors(job->dests[i].bdev)) {
job->rw = WRITE;
break;
}
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 4788b0b989a9..e17fd44ceef5 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -59,6 +59,7 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
ti->num_flush_bios = 1;
ti->num_discard_bios = 1;
ti->num_write_same_bios = 1;
+ ti->num_write_zeroes_bios = 1;
ti->private = lc;
return 0;
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 7f223dbed49f..2950b145443d 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -1103,6 +1103,7 @@ static int multipath_ctr(struct dm_target *ti, unsigned argc, char **argv)
ti->num_flush_bios = 1;
ti->num_discard_bios = 1;
ti->num_write_same_bios = 1;
+ ti->num_write_zeroes_bios = 1;
if (m->queue_mode == DM_TYPE_BIO_BASED)
ti->per_io_data_size = multipath_per_bio_data_size();
else
@@ -1491,7 +1492,7 @@ static int do_end_io(struct multipath *m, struct request *clone,
*/
int r = DM_ENDIO_REQUEUE;
- if (!error && !clone->errors)
+ if (!error)
return 0; /* I/O complete */
if (noretry_error(error))
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 1e217ba84d09..2dae3e5b851c 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -2813,7 +2813,9 @@ static void configure_discard_support(struct raid_set *rs)
/* Assume discards not supported until after checks below. */
ti->discards_supported = false;
- /* RAID level 4,5,6 require discard_zeroes_data for data integrity! */
+ /*
+ * XXX: RAID level 4,5,6 require zeroing for safety.
+ */
raid456 = (rs->md.level == 4 || rs->md.level == 5 || rs->md.level == 6);
for (i = 0; i < rs->raid_disks; i++) {
@@ -2827,8 +2829,6 @@ static void configure_discard_support(struct raid_set *rs)
return;
if (raid456) {
- if (!q->limits.discard_zeroes_data)
- return;
if (!devices_handle_discard_safely) {
DMERR("raid456 discard support disabled due to discard_zeroes_data uncertainty.");
DMERR("Set dm-raid.devices_handle_discard_safely=Y to override.");
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 2ddc2d20e62d..a95cbb80fb34 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -1124,7 +1124,6 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
ti->num_flush_bios = 1;
ti->num_discard_bios = 1;
ti->per_io_data_size = sizeof(struct dm_raid1_bio_record);
- ti->discard_zeroes_data_unsupported = true;
ms->kmirrord_wq = alloc_workqueue("kmirrord", WQ_MEM_RECLAIM, 0);
if (!ms->kmirrord_wq) {
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index 0b081d170087..bff7e3bdb4ed 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -298,9 +298,14 @@ static void dm_done(struct request *clone, int error, bool mapped)
r = rq_end_io(tio->ti, clone, error, &tio->info);
}
- if (unlikely(r == -EREMOTEIO && (req_op(clone) == REQ_OP_WRITE_SAME) &&
- !clone->q->limits.max_write_same_sectors))
- disable_write_same(tio->md);
+ if (unlikely(r == -EREMOTEIO)) {
+ if (req_op(clone) == REQ_OP_WRITE_SAME &&
+ !clone->q->limits.max_write_same_sectors)
+ disable_write_same(tio->md);
+ if (req_op(clone) == REQ_OP_WRITE_ZEROES &&
+ !clone->q->limits.max_write_zeroes_sectors)
+ disable_write_zeroes(tio->md);
+ }
if (r <= 0)
/* The target wants to complete the I/O */
@@ -358,7 +363,7 @@ static void dm_complete_request(struct request *rq, int error)
if (!rq->q->mq_ops)
blk_complete_request(rq);
else
- blk_mq_complete_request(rq, error);
+ blk_mq_complete_request(rq);
}
/*
@@ -762,7 +767,7 @@ static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
return BLK_MQ_RQ_QUEUE_OK;
}
-static struct blk_mq_ops dm_mq_ops = {
+static const struct blk_mq_ops dm_mq_ops = {
.queue_rq = dm_mq_queue_rq,
.complete = dm_softirq_done,
.init_request = dm_mq_init_request,
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index 28193a57bf47..5ef49c121d99 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -169,6 +169,7 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
ti->num_flush_bios = stripes;
ti->num_discard_bios = stripes;
ti->num_write_same_bios = stripes;
+ ti->num_write_zeroes_bios = stripes;
sc->chunk_size = chunk_size;
if (chunk_size & (chunk_size - 1))
@@ -293,6 +294,7 @@ static int stripe_map(struct dm_target *ti, struct bio *bio)
return DM_MAPIO_REMAPPED;
}
if (unlikely(bio_op(bio) == REQ_OP_DISCARD) ||
+ unlikely(bio_op(bio) == REQ_OP_WRITE_ZEROES) ||
unlikely(bio_op(bio) == REQ_OP_WRITE_SAME)) {
target_bio_nr = dm_bio_get_target_bio_nr(bio);
BUG_ON(target_bio_nr >= sc->stripes);
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 3ad16d9c9d5a..958275aca008 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1449,22 +1449,6 @@ static bool dm_table_supports_flush(struct dm_table *t, unsigned long flush)
return false;
}
-static bool dm_table_discard_zeroes_data(struct dm_table *t)
-{
- struct dm_target *ti;
- unsigned i = 0;
-
- /* Ensure that all targets supports discard_zeroes_data. */
- while (i < dm_table_get_num_targets(t)) {
- ti = dm_table_get_target(t, i++);
-
- if (ti->discard_zeroes_data_unsupported)
- return false;
- }
-
- return true;
-}
-
static int device_is_nonrot(struct dm_target *ti, struct dm_dev *dev,
sector_t start, sector_t len, void *data)
{
@@ -1533,6 +1517,34 @@ static bool dm_table_supports_write_same(struct dm_table *t)
return true;
}
+static int device_not_write_zeroes_capable(struct dm_target *ti, struct dm_dev *dev,
+ sector_t start, sector_t len, void *data)
+{
+ struct request_queue *q = bdev_get_queue(dev->bdev);
+
+ return q && !q->limits.max_write_zeroes_sectors;
+}
+
+static bool dm_table_supports_write_zeroes(struct dm_table *t)
+{
+ struct dm_target *ti;
+ unsigned i = 0;
+
+ while (i < dm_table_get_num_targets(t)) {
+ ti = dm_table_get_target(t, i++);
+
+ if (!ti->num_write_zeroes_bios)
+ return false;
+
+ if (!ti->type->iterate_devices ||
+ ti->type->iterate_devices(ti, device_not_write_zeroes_capable, NULL))
+ return false;
+ }
+
+ return true;
+}
+
+
static int device_discard_capable(struct dm_target *ti, struct dm_dev *dev,
sector_t start, sector_t len, void *data)
{
@@ -1592,9 +1604,6 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
}
blk_queue_write_cache(q, wc, fua);
- if (!dm_table_discard_zeroes_data(t))
- q->limits.discard_zeroes_data = 0;
-
/* Ensure that all underlying devices are non-rotational. */
if (dm_table_all_devices_attribute(t, device_is_nonrot))
queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
@@ -1603,6 +1612,8 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
if (!dm_table_supports_write_same(t))
q->limits.max_write_same_sectors = 0;
+ if (!dm_table_supports_write_zeroes(t))
+ q->limits.max_write_zeroes_sectors = 0;
if (dm_table_all_devices_attribute(t, queue_supports_sg_merge))
queue_flag_clear_unlocked(QUEUE_FLAG_NO_SG_MERGE, q);
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 2b266a2b5035..a5f1916f621a 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -3263,7 +3263,6 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
* them down to the data device. The thin device's discard
* processing will cause mappings to be removed from the btree.
*/
- ti->discard_zeroes_data_unsupported = true;
if (pf.discard_enabled && pf.discard_passdown) {
ti->num_discard_bios = 1;
@@ -4119,7 +4118,6 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
ti->per_io_data_size = sizeof(struct dm_thin_endio_hook);
/* In case the pool supports discards, pass them on. */
- ti->discard_zeroes_data_unsupported = true;
if (tc->pool->pf.discard_enabled) {
ti->discards_supported = true;
ti->num_discard_bios = 1;
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index dfb75979e455..8bf397729bbd 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -810,7 +810,6 @@ static void dec_pending(struct dm_io *io, int error)
queue_io(md, bio);
} else {
/* done with normal IO or empty flush */
- trace_block_bio_complete(md->queue, bio, io_error);
bio->bi_error = io_error;
bio_endio(bio);
}
@@ -825,6 +824,14 @@ void disable_write_same(struct mapped_device *md)
limits->max_write_same_sectors = 0;
}
+void disable_write_zeroes(struct mapped_device *md)
+{
+ struct queue_limits *limits = dm_get_queue_limits(md);
+
+ /* device doesn't really support WRITE ZEROES, disable it */
+ limits->max_write_zeroes_sectors = 0;
+}
+
static void clone_endio(struct bio *bio)
{
int error = bio->bi_error;
@@ -851,9 +858,14 @@ static void clone_endio(struct bio *bio)
}
}
- if (unlikely(r == -EREMOTEIO && (bio_op(bio) == REQ_OP_WRITE_SAME) &&
- !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors))
- disable_write_same(md);
+ if (unlikely(r == -EREMOTEIO)) {
+ if (bio_op(bio) == REQ_OP_WRITE_SAME &&
+ !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors)
+ disable_write_same(md);
+ if (bio_op(bio) == REQ_OP_WRITE_ZEROES &&
+ !bdev_get_queue(bio->bi_bdev)->limits.max_write_zeroes_sectors)
+ disable_write_zeroes(md);
+ }
free_tio(tio);
dec_pending(io, error);
@@ -1202,6 +1214,11 @@ static unsigned get_num_write_same_bios(struct dm_target *ti)
return ti->num_write_same_bios;
}
+static unsigned get_num_write_zeroes_bios(struct dm_target *ti)
+{
+ return ti->num_write_zeroes_bios;
+}
+
typedef bool (*is_split_required_fn)(struct dm_target *ti);
static bool is_split_required_for_discard(struct dm_target *ti)
@@ -1256,6 +1273,11 @@ static int __send_write_same(struct clone_info *ci)
return __send_changing_extent_only(ci, get_num_write_same_bios, NULL);
}
+static int __send_write_zeroes(struct clone_info *ci)
+{
+ return __send_changing_extent_only(ci, get_num_write_zeroes_bios, NULL);
+}
+
/*
* Select the correct strategy for processing a non-flush bio.
*/
@@ -1270,6 +1292,8 @@ static int __split_and_process_non_flush(struct clone_info *ci)
return __send_discard(ci);
else if (unlikely(bio_op(bio) == REQ_OP_WRITE_SAME))
return __send_write_same(ci);
+ else if (unlikely(bio_op(bio) == REQ_OP_WRITE_ZEROES))
+ return __send_write_zeroes(ci);
ti = dm_table_find_target(ci->map, ci->sector);
if (!dm_target_is_valid(ti))
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 3e38e0207a3e..377a8a3672e3 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -293,6 +293,7 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio)
split, disk_devt(mddev->gendisk),
bio_sector);
mddev_check_writesame(mddev, split);
+ mddev_check_write_zeroes(mddev, split);
generic_make_request(split);
}
} while (split != bio);
diff --git a/drivers/md/md.h b/drivers/md/md.h
index dde8ecb760c8..1e76d64ce180 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -709,4 +709,11 @@ static inline void mddev_check_writesame(struct mddev *mddev, struct bio *bio)
!bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors)
mddev->queue->limits.max_write_same_sectors = 0;
}
+
+static inline void mddev_check_write_zeroes(struct mddev *mddev, struct bio *bio)
+{
+ if (bio_op(bio) == REQ_OP_WRITE_ZEROES &&
+ !bdev_get_queue(bio->bi_bdev)->limits.max_write_zeroes_sectors)
+ mddev->queue->limits.max_write_zeroes_sectors = 0;
+}
#endif /* _MD_MD_H */
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 79a12b59250b..e95d521d93e9 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -139,6 +139,7 @@ static void multipath_make_request(struct mddev *mddev, struct bio * bio)
mp_bh->bio.bi_end_io = multipath_end_request;
mp_bh->bio.bi_private = mp_bh;
mddev_check_writesame(mddev, &mp_bh->bio);
+ mddev_check_write_zeroes(mddev, &mp_bh->bio);
generic_make_request(&mp_bh->bio);
return;
}
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 93347ca7c7a6..ce7a6a56cf73 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -383,6 +383,7 @@ static int raid0_run(struct mddev *mddev)
blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors);
blk_queue_max_write_same_sectors(mddev->queue, mddev->chunk_sectors);
+ blk_queue_max_write_zeroes_sectors(mddev->queue, mddev->chunk_sectors);
blk_queue_max_discard_sectors(mddev->queue, mddev->chunk_sectors);
blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
@@ -504,6 +505,7 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
split, disk_devt(mddev->gendisk),
bio_sector);
mddev_check_writesame(mddev, split);
+ mddev_check_write_zeroes(mddev, split);
generic_make_request(split);
}
} while (split != bio);
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index a34f58772022..b59cc100320a 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -3177,8 +3177,10 @@ static int raid1_run(struct mddev *mddev)
if (IS_ERR(conf))
return PTR_ERR(conf);
- if (mddev->queue)
+ if (mddev->queue) {
blk_queue_max_write_same_sectors(mddev->queue, 0);
+ blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
+ }
rdev_for_each(rdev, mddev) {
if (!mddev->gendisk)
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index e89a8d78a9ed..28ec3a93acee 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -3749,6 +3749,7 @@ static int raid10_run(struct mddev *mddev)
blk_queue_max_discard_sectors(mddev->queue,
mddev->chunk_sectors);
blk_queue_max_write_same_sectors(mddev->queue, 0);
+ blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
blk_queue_io_min(mddev->queue, chunk_size);
if (conf->geo.raid_disks % conf->geo.near_copies)
blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks);
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index ed5cd705b985..2efdb0d67460 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5031,8 +5031,6 @@ static void raid5_align_endio(struct bio *bi)
rdev_dec_pending(rdev, conf->mddev);
if (!error) {
- trace_block_bio_complete(bdev_get_queue(raid_bi->bi_bdev),
- raid_bi, 0);
bio_endio(raid_bi);
if (atomic_dec_and_test(&conf->active_aligned_reads))
wake_up(&conf->wait_for_quiescent);
@@ -7229,7 +7227,6 @@ static int raid5_run(struct mddev *mddev)
if (mddev->queue) {
int chunk_size;
- bool discard_supported = true;
/* read-ahead size must cover two whole stripes, which
* is 2 * (datadisks) * chunksize where 'n' is the
* number of raid devices
@@ -7265,48 +7262,32 @@ static int raid5_run(struct mddev *mddev)
blk_queue_max_discard_sectors(mddev->queue,
0xfffe * STRIPE_SECTORS);
- /*
- * unaligned part of discard request will be ignored, so can't
- * guarantee discard_zeroes_data
- */
- mddev->queue->limits.discard_zeroes_data = 0;
-
blk_queue_max_write_same_sectors(mddev->queue, 0);
+ blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
rdev_for_each(rdev, mddev) {
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->new_data_offset << 9);
- /*
- * discard_zeroes_data is required, otherwise data
- * could be lost. Consider a scenario: discard a stripe
- * (the stripe could be inconsistent if
- * discard_zeroes_data is 0); write one disk of the
- * stripe (the stripe could be inconsistent again
- * depending on which disks are used to calculate
- * parity); the disk is broken; The stripe data of this
- * disk is lost.
- */
- if (!blk_queue_discard(bdev_get_queue(rdev->bdev)) ||
- !bdev_get_queue(rdev->bdev)->
- limits.discard_zeroes_data)
- discard_supported = false;
- /* Unfortunately, discard_zeroes_data is not currently
- * a guarantee - just a hint. So we only allow DISCARD
- * if the sysadmin has confirmed that only safe devices
- * are in use by setting a module parameter.
- */
- if (!devices_handle_discard_safely) {
- if (discard_supported) {
- pr_info("md/raid456: discard support disabled due to uncertainty.\n");
- pr_info("Set raid456.devices_handle_discard_safely=Y to override.\n");
- }
- discard_supported = false;
- }
}
- if (discard_supported &&
+ /*
+ * zeroing is required, otherwise data
+ * could be lost. Consider a scenario: discard a stripe
+ * (the stripe could be inconsistent if
+ * discard_zeroes_data is 0); write one disk of the
+ * stripe (the stripe could be inconsistent again
+ * depending on which disks are used to calculate
+ * parity); the disk is broken; The stripe data of this
+ * disk is lost.
+ *
+ * We only allow DISCARD if the sysadmin has confirmed that
+ * only safe devices are in use by setting a module parameter.
+ * A better idea might be to turn DISCARD into WRITE_ZEROES
+ * requests, as that is required to be safe.
+ */
+ if (devices_handle_discard_safely &&
mddev->queue->limits.max_discard_sectors >= (stripe >> 9) &&
mddev->queue->limits.discard_granularity >= stripe)
queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
diff --git a/drivers/mmc/core/queue.c b/drivers/mmc/core/queue.c
index 493eb10ce580..4c54ad34e17a 100644
--- a/drivers/mmc/core/queue.c
+++ b/drivers/mmc/core/queue.c
@@ -167,8 +167,6 @@ static void mmc_queue_setup_discard(struct request_queue *q,
queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
blk_queue_max_discard_sectors(q, max_discard);
- if (card->erased_byte == 0 && !mmc_can_discard(card))
- q->limits.discard_zeroes_data = 1;
q->limits.discard_granularity = card->pref_erase << 9;
/* granularity must not be greater than max. discard */
if (card->pref_erase > max_discard)
diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c
index 66a9dedd1062..1517da3ddd7d 100644
--- a/drivers/mtd/mtdcore.c
+++ b/drivers/mtd/mtdcore.c
@@ -46,7 +46,7 @@
#include "mtdcore.h"
-static struct backing_dev_info *mtd_bdi;
+struct backing_dev_info *mtd_bdi;
#ifdef CONFIG_PM_SLEEP
@@ -496,11 +496,9 @@ int add_mtd_device(struct mtd_info *mtd)
* mtd_device_parse_register() multiple times on the same master MTD,
* especially with CONFIG_MTD_PARTITIONED_MASTER=y.
*/
- if (WARN_ONCE(mtd->backing_dev_info, "MTD already registered\n"))
+ if (WARN_ONCE(mtd->dev.type, "MTD already registered\n"))
return -EEXIST;
- mtd->backing_dev_info = mtd_bdi;
-
BUG_ON(mtd->writesize == 0);
mutex_lock(&mtd_table_mutex);
@@ -1775,13 +1773,18 @@ static struct backing_dev_info * __init mtd_bdi_init(char *name)
struct backing_dev_info *bdi;
int ret;
- bdi = kzalloc(sizeof(*bdi), GFP_KERNEL);
+ bdi = bdi_alloc(GFP_KERNEL);
if (!bdi)
return ERR_PTR(-ENOMEM);
- ret = bdi_setup_and_register(bdi, name);
+ bdi->name = name;
+ /*
+ * We put '-0' suffix to the name to get the same name format as we
+ * used to get. Since this is called only once, we get a unique name.
+ */
+ ret = bdi_register(bdi, "%.28s-0", name);
if (ret)
- kfree(bdi);
+ bdi_put(bdi);
return ret ? ERR_PTR(ret) : bdi;
}
@@ -1813,8 +1816,7 @@ static int __init init_mtd(void)
out_procfs:
if (proc_mtd)
remove_proc_entry("mtd", NULL);
- bdi_destroy(mtd_bdi);
- kfree(mtd_bdi);
+ bdi_put(mtd_bdi);
err_bdi:
class_unregister(&mtd_class);
err_reg:
@@ -1828,8 +1830,7 @@ static void __exit cleanup_mtd(void)
if (proc_mtd)
remove_proc_entry("mtd", NULL);
class_unregister(&mtd_class);
- bdi_destroy(mtd_bdi);
- kfree(mtd_bdi);
+ bdi_put(mtd_bdi);
idr_destroy(&mtd_idr);
}
diff --git a/drivers/mtd/mtdsuper.c b/drivers/mtd/mtdsuper.c
index 20c02a3b7417..e43fea896d1e 100644
--- a/drivers/mtd/mtdsuper.c
+++ b/drivers/mtd/mtdsuper.c
@@ -18,6 +18,7 @@
#include <linux/ctype.h>
#include <linux/slab.h>
#include <linux/major.h>
+#include <linux/backing-dev.h>
/*
* compare superblocks to see if they're equivalent
@@ -38,6 +39,8 @@ static int get_sb_mtd_compare(struct super_block *sb, void *_mtd)
return 0;
}
+extern struct backing_dev_info *mtd_bdi;
+
/*
* mark the superblock by the MTD device it is using
* - set the device number to be the correct MTD block device for pesuperstence
@@ -49,7 +52,8 @@ static int get_sb_mtd_set(struct super_block *sb, void *_mtd)
sb->s_mtd = mtd;
sb->s_dev = MKDEV(MTD_BLOCK_MAJOR, mtd->index);
- sb->s_bdi = mtd->backing_dev_info;
+ sb->s_bdi = bdi_get(mtd_bdi);
+
return 0;
}
diff --git a/drivers/mtd/ubi/block.c b/drivers/mtd/ubi/block.c
index c80869e60909..51f2be8889b5 100644
--- a/drivers/mtd/ubi/block.c
+++ b/drivers/mtd/ubi/block.c
@@ -347,7 +347,7 @@ static int ubiblock_init_request(void *data, struct request *req,
return 0;
}
-static struct blk_mq_ops ubiblock_mq_ops = {
+static const struct blk_mq_ops ubiblock_mq_ops = {
.queue_rq = ubiblock_queue_rq,
.init_request = ubiblock_init_request,
};
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 8a4ba8b88e52..34481c9be1d1 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -1104,11 +1104,11 @@ static void bond_compute_features(struct bonding *bond)
gso_max_size = min(gso_max_size, slave->dev->gso_max_size);
gso_max_segs = min(gso_max_segs, slave->dev->gso_max_segs);
}
+ bond_dev->hard_header_len = max_hard_header_len;
done:
bond_dev->vlan_features = vlan_features;
bond_dev->hw_enc_features = enc_features | NETIF_F_GSO_ENCAP_ALL;
- bond_dev->hard_header_len = max_hard_header_len;
bond_dev->gso_max_segs = gso_max_segs;
netif_set_gso_max_size(bond_dev, gso_max_size);
diff --git a/drivers/net/can/usb/Kconfig b/drivers/net/can/usb/Kconfig
index 8483a40e7e9e..5f9e0e6301d0 100644
--- a/drivers/net/can/usb/Kconfig
+++ b/drivers/net/can/usb/Kconfig
@@ -72,6 +72,8 @@ config CAN_PEAK_USB
PCAN-USB Pro dual CAN 2.0b channels USB adapter
PCAN-USB FD single CAN-FD channel USB adapter
PCAN-USB Pro FD dual CAN-FD channels USB adapter
+ PCAN-Chip USB CAN-FD to USB stamp module
+ PCAN-USB X6 6 CAN-FD channels USB adapter
(see also http://www.peak-system.com).
diff --git a/drivers/net/can/usb/gs_usb.c b/drivers/net/can/usb/gs_usb.c
index 300349fe8dc0..eecee7f8dfb7 100644
--- a/drivers/net/can/usb/gs_usb.c
+++ b/drivers/net/can/usb/gs_usb.c
@@ -739,13 +739,18 @@ static const struct net_device_ops gs_usb_netdev_ops = {
static int gs_usb_set_identify(struct net_device *netdev, bool do_identify)
{
struct gs_can *dev = netdev_priv(netdev);
- struct gs_identify_mode imode;
+ struct gs_identify_mode *imode;
int rc;
+ imode = kmalloc(sizeof(*imode), GFP_KERNEL);
+
+ if (!imode)
+ return -ENOMEM;
+
if (do_identify)
- imode.mode = GS_CAN_IDENTIFY_ON;
+ imode->mode = GS_CAN_IDENTIFY_ON;
else
- imode.mode = GS_CAN_IDENTIFY_OFF;
+ imode->mode = GS_CAN_IDENTIFY_OFF;
rc = usb_control_msg(interface_to_usbdev(dev->iface),
usb_sndctrlpipe(interface_to_usbdev(dev->iface),
@@ -755,10 +760,12 @@ static int gs_usb_set_identify(struct net_device *netdev, bool do_identify)
USB_RECIP_INTERFACE,
dev->channel,
0,
- &imode,
- sizeof(imode),
+ imode,
+ sizeof(*imode),
100);
+ kfree(imode);
+
return (rc > 0) ? 0 : rc;
}
diff --git a/drivers/net/can/usb/peak_usb/pcan_usb_core.c b/drivers/net/can/usb/peak_usb/pcan_usb_core.c
index 0b0302af3bd2..57913dbbae0a 100644
--- a/drivers/net/can/usb/peak_usb/pcan_usb_core.c
+++ b/drivers/net/can/usb/peak_usb/pcan_usb_core.c
@@ -39,6 +39,7 @@ static struct usb_device_id peak_usb_table[] = {
{USB_DEVICE(PCAN_USB_VENDOR_ID, PCAN_USBPRO_PRODUCT_ID)},
{USB_DEVICE(PCAN_USB_VENDOR_ID, PCAN_USBFD_PRODUCT_ID)},
{USB_DEVICE(PCAN_USB_VENDOR_ID, PCAN_USBPROFD_PRODUCT_ID)},
+ {USB_DEVICE(PCAN_USB_VENDOR_ID, PCAN_USBCHIP_PRODUCT_ID)},
{USB_DEVICE(PCAN_USB_VENDOR_ID, PCAN_USBX6_PRODUCT_ID)},
{} /* Terminating entry */
};
@@ -51,6 +52,7 @@ static const struct peak_usb_adapter *const peak_usb_adapters_list[] = {
&pcan_usb_pro,
&pcan_usb_fd,
&pcan_usb_pro_fd,
+ &pcan_usb_chip,
&pcan_usb_x6,
};
diff --git a/drivers/net/can/usb/peak_usb/pcan_usb_core.h b/drivers/net/can/usb/peak_usb/pcan_usb_core.h
index 3cbfb069893d..c01316cac354 100644
--- a/drivers/net/can/usb/peak_usb/pcan_usb_core.h
+++ b/drivers/net/can/usb/peak_usb/pcan_usb_core.h
@@ -27,6 +27,7 @@
#define PCAN_USBPRO_PRODUCT_ID 0x000d
#define PCAN_USBPROFD_PRODUCT_ID 0x0011
#define PCAN_USBFD_PRODUCT_ID 0x0012
+#define PCAN_USBCHIP_PRODUCT_ID 0x0013
#define PCAN_USBX6_PRODUCT_ID 0x0014
#define PCAN_USB_DRIVER_NAME "peak_usb"
@@ -90,6 +91,7 @@ struct peak_usb_adapter {
extern const struct peak_usb_adapter pcan_usb;
extern const struct peak_usb_adapter pcan_usb_pro;
extern const struct peak_usb_adapter pcan_usb_fd;
+extern const struct peak_usb_adapter pcan_usb_chip;
extern const struct peak_usb_adapter pcan_usb_pro_fd;
extern const struct peak_usb_adapter pcan_usb_x6;
diff --git a/drivers/net/can/usb/peak_usb/pcan_usb_fd.c b/drivers/net/can/usb/peak_usb/pcan_usb_fd.c
index 304732550f0a..528d3bb4917f 100644
--- a/drivers/net/can/usb/peak_usb/pcan_usb_fd.c
+++ b/drivers/net/can/usb/peak_usb/pcan_usb_fd.c
@@ -1061,6 +1061,78 @@ const struct peak_usb_adapter pcan_usb_fd = {
.do_get_berr_counter = pcan_usb_fd_get_berr_counter,
};
+/* describes the PCAN-CHIP USB */
+static const struct can_bittiming_const pcan_usb_chip_const = {
+ .name = "pcan_chip_usb",
+ .tseg1_min = 1,
+ .tseg1_max = (1 << PUCAN_TSLOW_TSGEG1_BITS),
+ .tseg2_min = 1,
+ .tseg2_max = (1 << PUCAN_TSLOW_TSGEG2_BITS),
+ .sjw_max = (1 << PUCAN_TSLOW_SJW_BITS),
+ .brp_min = 1,
+ .brp_max = (1 << PUCAN_TSLOW_BRP_BITS),
+ .brp_inc = 1,
+};
+
+static const struct can_bittiming_const pcan_usb_chip_data_const = {
+ .name = "pcan_chip_usb",
+ .tseg1_min = 1,
+ .tseg1_max = (1 << PUCAN_TFAST_TSGEG1_BITS),
+ .tseg2_min = 1,
+ .tseg2_max = (1 << PUCAN_TFAST_TSGEG2_BITS),
+ .sjw_max = (1 << PUCAN_TFAST_SJW_BITS),
+ .brp_min = 1,
+ .brp_max = (1 << PUCAN_TFAST_BRP_BITS),
+ .brp_inc = 1,
+};
+
+const struct peak_usb_adapter pcan_usb_chip = {
+ .name = "PCAN-Chip USB",
+ .device_id = PCAN_USBCHIP_PRODUCT_ID,
+ .ctrl_count = PCAN_USBFD_CHANNEL_COUNT,
+ .ctrlmode_supported = CAN_CTRLMODE_FD |
+ CAN_CTRLMODE_3_SAMPLES | CAN_CTRLMODE_LISTENONLY,
+ .clock = {
+ .freq = PCAN_UFD_CRYSTAL_HZ,
+ },
+ .bittiming_const = &pcan_usb_chip_const,
+ .data_bittiming_const = &pcan_usb_chip_data_const,
+
+ /* size of device private data */
+ .sizeof_dev_private = sizeof(struct pcan_usb_fd_device),
+
+ /* timestamps usage */
+ .ts_used_bits = 32,
+ .ts_period = 1000000, /* calibration period in ts. */
+ .us_per_ts_scale = 1, /* us = (ts * scale) >> shift */
+ .us_per_ts_shift = 0,
+
+ /* give here messages in/out endpoints */
+ .ep_msg_in = PCAN_USBPRO_EP_MSGIN,
+ .ep_msg_out = {PCAN_USBPRO_EP_MSGOUT_0},
+
+ /* size of rx/tx usb buffers */
+ .rx_buffer_size = PCAN_UFD_RX_BUFFER_SIZE,
+ .tx_buffer_size = PCAN_UFD_TX_BUFFER_SIZE,
+
+ /* device callbacks */
+ .intf_probe = pcan_usb_pro_probe, /* same as PCAN-USB Pro */
+ .dev_init = pcan_usb_fd_init,
+
+ .dev_exit = pcan_usb_fd_exit,
+ .dev_free = pcan_usb_fd_free,
+ .dev_set_bus = pcan_usb_fd_set_bus,
+ .dev_set_bittiming = pcan_usb_fd_set_bittiming_slow,
+ .dev_set_data_bittiming = pcan_usb_fd_set_bittiming_fast,
+ .dev_decode_buf = pcan_usb_fd_decode_buf,
+ .dev_start = pcan_usb_fd_start,
+ .dev_stop = pcan_usb_fd_stop,
+ .dev_restart_async = pcan_usb_fd_restart_async,
+ .dev_encode_msg = pcan_usb_fd_encode_msg,
+
+ .do_get_berr_counter = pcan_usb_fd_get_berr_counter,
+};
+
/* describes the PCAN-USB Pro FD adapter */
static const struct can_bittiming_const pcan_usb_pro_fd_const = {
.name = "pcan_usb_pro_fd",
diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c
index 8cf4801994e8..fa0eece21eef 100644
--- a/drivers/net/dsa/b53/b53_common.c
+++ b/drivers/net/dsa/b53/b53_common.c
@@ -326,6 +326,7 @@ static void b53_get_vlan_entry(struct b53_device *dev, u16 vid,
static void b53_set_forwarding(struct b53_device *dev, int enable)
{
+ struct dsa_switch *ds = dev->ds;
u8 mgmt;
b53_read8(dev, B53_CTRL_PAGE, B53_SWITCH_MODE, &mgmt);
@@ -336,6 +337,15 @@ static void b53_set_forwarding(struct b53_device *dev, int enable)
mgmt &= ~SM_SW_FWD_EN;
b53_write8(dev, B53_CTRL_PAGE, B53_SWITCH_MODE, mgmt);
+
+ /* Include IMP port in dumb forwarding mode when no tagging protocol is
+ * set
+ */
+ if (ds->ops->get_tag_protocol(ds) == DSA_TAG_PROTO_NONE) {
+ b53_read8(dev, B53_CTRL_PAGE, B53_SWITCH_CTRL, &mgmt);
+ mgmt |= B53_MII_DUMB_FWDG_EN;
+ b53_write8(dev, B53_CTRL_PAGE, B53_SWITCH_CTRL, mgmt);
+ }
}
static void b53_enable_vlan(struct b53_device *dev, bool enable)
@@ -598,7 +608,8 @@ static void b53_switch_reset_gpio(struct b53_device *dev)
static int b53_switch_reset(struct b53_device *dev)
{
- u8 mgmt;
+ unsigned int timeout = 1000;
+ u8 mgmt, reg;
b53_switch_reset_gpio(dev);
@@ -607,6 +618,28 @@ static int b53_switch_reset(struct b53_device *dev)
b53_write8(dev, B53_CTRL_PAGE, B53_SOFTRESET, 0x00);
}
+ /* This is specific to 58xx devices here, do not use is58xx() which
+ * covers the larger Starfigther 2 family, including 7445/7278 which
+ * still use this driver as a library and need to perform the reset
+ * earlier.
+ */
+ if (dev->chip_id == BCM58XX_DEVICE_ID) {
+ b53_read8(dev, B53_CTRL_PAGE, B53_SOFTRESET, &reg);
+ reg |= SW_RST | EN_SW_RST | EN_CH_RST;
+ b53_write8(dev, B53_CTRL_PAGE, B53_SOFTRESET, reg);
+
+ do {
+ b53_read8(dev, B53_CTRL_PAGE, B53_SOFTRESET, &reg);
+ if (!(reg & SW_RST))
+ break;
+
+ usleep_range(1000, 2000);
+ } while (timeout-- > 0);
+
+ if (timeout == 0)
+ return -ETIMEDOUT;
+ }
+
b53_read8(dev, B53_CTRL_PAGE, B53_SWITCH_MODE, &mgmt);
if (!(mgmt & SM_SW_FWD_EN)) {
@@ -1731,7 +1764,7 @@ static const struct b53_chip_data b53_switch_chips[] = {
.vlans = 4096,
.enabled_ports = 0x1ff,
.arl_entries = 4,
- .cpu_port = B53_CPU_PORT_25,
+ .cpu_port = B53_CPU_PORT,
.vta_regs = B53_VTA_REGS,
.duplex_reg = B53_DUPLEX_STAT_GE,
.jumbo_pm_reg = B53_JUMBO_PORT_MASK,
diff --git a/drivers/net/dsa/b53/b53_regs.h b/drivers/net/dsa/b53/b53_regs.h
index 9fd24c418fa4..e5c86d44667a 100644
--- a/drivers/net/dsa/b53/b53_regs.h
+++ b/drivers/net/dsa/b53/b53_regs.h
@@ -104,6 +104,10 @@
#define B53_UC_FWD_EN BIT(6)
#define B53_MC_FWD_EN BIT(7)
+/* Switch control (8 bit) */
+#define B53_SWITCH_CTRL 0x22
+#define B53_MII_DUMB_FWDG_EN BIT(6)
+
/* (16 bit) */
#define B53_UC_FLOOD_MASK 0x32
#define B53_MC_FLOOD_MASK 0x34
@@ -139,6 +143,7 @@
/* Software reset register (8 bit) */
#define B53_SOFTRESET 0x79
#define SW_RST BIT(7)
+#define EN_CH_RST BIT(6)
#define EN_SW_RST BIT(4)
/* Fast Aging Control register (8 bit) */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index dc52053128bc..3d9490cd2db1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -90,7 +90,7 @@
#define MLX5E_VALID_NUM_MTTS(num_mtts) (MLX5_MTT_OCTW(num_mtts) - 1 <= U16_MAX)
#define MLX5_UMR_ALIGN (2048)
-#define MLX5_MPWRQ_SMALL_PACKET_THRESHOLD (128)
+#define MLX5_MPWRQ_SMALL_PACKET_THRESHOLD (256)
#define MLX5E_PARAMS_DEFAULT_LRO_WQE_SZ (64 * 1024)
#define MLX5E_DEFAULT_LRO_TIMEOUT 32
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c
index d55fff0ba388..26fc77e80f7b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c
@@ -564,6 +564,7 @@ int mlx5e_ethtool_get_all_flows(struct mlx5e_priv *priv, struct ethtool_rxnfc *i
int idx = 0;
int err = 0;
+ info->data = MAX_NUM_OF_ETHTOOL_RULES;
while ((!err || err == -ENOENT) && idx < info->rule_cnt) {
err = mlx5e_ethtool_get_flow(priv, info, location);
if (!err)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 66c133757a5e..15cc7b469d2e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -174,7 +174,7 @@ unlock:
static void mlx5e_update_sw_counters(struct mlx5e_priv *priv)
{
- struct mlx5e_sw_stats *s = &priv->stats.sw;
+ struct mlx5e_sw_stats temp, *s = &temp;
struct mlx5e_rq_stats *rq_stats;
struct mlx5e_sq_stats *sq_stats;
u64 tx_offload_none = 0;
@@ -229,6 +229,7 @@ static void mlx5e_update_sw_counters(struct mlx5e_priv *priv)
s->link_down_events_phy = MLX5_GET(ppcnt_reg,
priv->stats.pport.phy_counters,
counter_set.phys_layer_cntrs.link_down_events);
+ memcpy(&priv->stats.sw, s, sizeof(*s));
}
static void mlx5e_update_vport_counters(struct mlx5e_priv *priv)
@@ -243,7 +244,6 @@ static void mlx5e_update_vport_counters(struct mlx5e_priv *priv)
MLX5_SET(query_vport_counter_in, in, op_mod, 0);
MLX5_SET(query_vport_counter_in, in, other_vport, 0);
- memset(out, 0, outlen);
mlx5_cmd_exec(mdev, in, sizeof(in), out, outlen);
}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index fade7233dac5..5436866798f4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -639,7 +639,8 @@ static int parse_cls_flower(struct mlx5e_priv *priv,
if (!err && (flow->flags & MLX5E_TC_FLOW_ESWITCH) &&
rep->vport != FDB_UPLINK_VPORT) {
- if (min_inline > esw->offloads.inline_mode) {
+ if (esw->offloads.inline_mode != MLX5_INLINE_MODE_NONE &&
+ esw->offloads.inline_mode < min_inline) {
netdev_warn(priv->netdev,
"Flow is not offloaded due to min inline setting, required %d actual %d\n",
min_inline, esw->offloads.inline_mode);
@@ -785,16 +786,15 @@ static int mlx5e_route_lookup_ipv6(struct mlx5e_priv *priv,
return 0;
}
-static int gen_vxlan_header_ipv4(struct net_device *out_dev,
- char buf[],
- unsigned char h_dest[ETH_ALEN],
- int ttl,
- __be32 daddr,
- __be32 saddr,
- __be16 udp_dst_port,
- __be32 vx_vni)
+static void gen_vxlan_header_ipv4(struct net_device *out_dev,
+ char buf[], int encap_size,
+ unsigned char h_dest[ETH_ALEN],
+ int ttl,
+ __be32 daddr,
+ __be32 saddr,
+ __be16 udp_dst_port,
+ __be32 vx_vni)
{
- int encap_size = VXLAN_HLEN + sizeof(struct iphdr) + ETH_HLEN;
struct ethhdr *eth = (struct ethhdr *)buf;
struct iphdr *ip = (struct iphdr *)((char *)eth + sizeof(struct ethhdr));
struct udphdr *udp = (struct udphdr *)((char *)ip + sizeof(struct iphdr));
@@ -817,20 +817,17 @@ static int gen_vxlan_header_ipv4(struct net_device *out_dev,
udp->dest = udp_dst_port;
vxh->vx_flags = VXLAN_HF_VNI;
vxh->vx_vni = vxlan_vni_field(vx_vni);
-
- return encap_size;
}
-static int gen_vxlan_header_ipv6(struct net_device *out_dev,
- char buf[],
- unsigned char h_dest[ETH_ALEN],
- int ttl,
- struct in6_addr *daddr,
- struct in6_addr *saddr,
- __be16 udp_dst_port,
- __be32 vx_vni)
+static void gen_vxlan_header_ipv6(struct net_device *out_dev,
+ char buf[], int encap_size,
+ unsigned char h_dest[ETH_ALEN],
+ int ttl,
+ struct in6_addr *daddr,
+ struct in6_addr *saddr,
+ __be16 udp_dst_port,
+ __be32 vx_vni)
{
- int encap_size = VXLAN_HLEN + sizeof(struct ipv6hdr) + ETH_HLEN;
struct ethhdr *eth = (struct ethhdr *)buf;
struct ipv6hdr *ip6h = (struct ipv6hdr *)((char *)eth + sizeof(struct ethhdr));
struct udphdr *udp = (struct udphdr *)((char *)ip6h + sizeof(struct ipv6hdr));
@@ -852,8 +849,6 @@ static int gen_vxlan_header_ipv6(struct net_device *out_dev,
udp->dest = udp_dst_port;
vxh->vx_flags = VXLAN_HF_VNI;
vxh->vx_vni = vxlan_vni_field(vx_vni);
-
- return encap_size;
}
static int mlx5e_create_encap_header_ipv4(struct mlx5e_priv *priv,
@@ -862,13 +857,20 @@ static int mlx5e_create_encap_header_ipv4(struct mlx5e_priv *priv,
struct net_device **out_dev)
{
int max_encap_size = MLX5_CAP_ESW(priv->mdev, max_encap_header_size);
+ int ipv4_encap_size = ETH_HLEN + sizeof(struct iphdr) + VXLAN_HLEN;
struct ip_tunnel_key *tun_key = &e->tun_info.key;
- int encap_size, ttl, err;
struct neighbour *n = NULL;
struct flowi4 fl4 = {};
char *encap_header;
+ int ttl, err;
+
+ if (max_encap_size < ipv4_encap_size) {
+ mlx5_core_warn(priv->mdev, "encap size %d too big, max supported is %d\n",
+ ipv4_encap_size, max_encap_size);
+ return -EOPNOTSUPP;
+ }
- encap_header = kzalloc(max_encap_size, GFP_KERNEL);
+ encap_header = kzalloc(ipv4_encap_size, GFP_KERNEL);
if (!encap_header)
return -ENOMEM;
@@ -903,11 +905,11 @@ static int mlx5e_create_encap_header_ipv4(struct mlx5e_priv *priv,
switch (e->tunnel_type) {
case MLX5_HEADER_TYPE_VXLAN:
- encap_size = gen_vxlan_header_ipv4(*out_dev, encap_header,
- e->h_dest, ttl,
- fl4.daddr,
- fl4.saddr, tun_key->tp_dst,
- tunnel_id_to_key32(tun_key->tun_id));
+ gen_vxlan_header_ipv4(*out_dev, encap_header,
+ ipv4_encap_size, e->h_dest, ttl,
+ fl4.daddr,
+ fl4.saddr, tun_key->tp_dst,
+ tunnel_id_to_key32(tun_key->tun_id));
break;
default:
err = -EOPNOTSUPP;
@@ -915,7 +917,7 @@ static int mlx5e_create_encap_header_ipv4(struct mlx5e_priv *priv,
}
err = mlx5_encap_alloc(priv->mdev, e->tunnel_type,
- encap_size, encap_header, &e->encap_id);
+ ipv4_encap_size, encap_header, &e->encap_id);
out:
if (err && n)
neigh_release(n);
@@ -930,13 +932,20 @@ static int mlx5e_create_encap_header_ipv6(struct mlx5e_priv *priv,
{
int max_encap_size = MLX5_CAP_ESW(priv->mdev, max_encap_header_size);
+ int ipv6_encap_size = ETH_HLEN + sizeof(struct ipv6hdr) + VXLAN_HLEN;
struct ip_tunnel_key *tun_key = &e->tun_info.key;
- int encap_size, err, ttl = 0;
struct neighbour *n = NULL;
struct flowi6 fl6 = {};
char *encap_header;
+ int err, ttl = 0;
+
+ if (max_encap_size < ipv6_encap_size) {
+ mlx5_core_warn(priv->mdev, "encap size %d too big, max supported is %d\n",
+ ipv6_encap_size, max_encap_size);
+ return -EOPNOTSUPP;
+ }
- encap_header = kzalloc(max_encap_size, GFP_KERNEL);
+ encap_header = kzalloc(ipv6_encap_size, GFP_KERNEL);
if (!encap_header)
return -ENOMEM;
@@ -972,11 +981,11 @@ static int mlx5e_create_encap_header_ipv6(struct mlx5e_priv *priv,
switch (e->tunnel_type) {
case MLX5_HEADER_TYPE_VXLAN:
- encap_size = gen_vxlan_header_ipv6(*out_dev, encap_header,
- e->h_dest, ttl,
- &fl6.daddr,
- &fl6.saddr, tun_key->tp_dst,
- tunnel_id_to_key32(tun_key->tun_id));
+ gen_vxlan_header_ipv6(*out_dev, encap_header,
+ ipv6_encap_size, e->h_dest, ttl,
+ &fl6.daddr,
+ &fl6.saddr, tun_key->tp_dst,
+ tunnel_id_to_key32(tun_key->tun_id));
break;
default:
err = -EOPNOTSUPP;
@@ -984,7 +993,7 @@ static int mlx5e_create_encap_header_ipv6(struct mlx5e_priv *priv,
}
err = mlx5_encap_alloc(priv->mdev, e->tunnel_type,
- encap_size, encap_header, &e->encap_id);
+ ipv6_encap_size, encap_header, &e->encap_id);
out:
if (err && n)
neigh_release(n);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index 307ec6c5fd3b..d111cebca9f1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -911,8 +911,7 @@ int mlx5_devlink_eswitch_inline_mode_set(struct devlink *devlink, u8 mode)
struct mlx5_core_dev *dev = devlink_priv(devlink);
struct mlx5_eswitch *esw = dev->priv.eswitch;
int num_vports = esw->enabled_vports;
- int err;
- int vport;
+ int err, vport;
u8 mlx5_mode;
if (!MLX5_CAP_GEN(dev, vport_group_manager))
@@ -921,9 +920,17 @@ int mlx5_devlink_eswitch_inline_mode_set(struct devlink *devlink, u8 mode)
if (esw->mode == SRIOV_NONE)
return -EOPNOTSUPP;
- if (MLX5_CAP_ETH(dev, wqe_inline_mode) !=
- MLX5_CAP_INLINE_MODE_VPORT_CONTEXT)
+ switch (MLX5_CAP_ETH(dev, wqe_inline_mode)) {
+ case MLX5_CAP_INLINE_MODE_NOT_REQUIRED:
+ if (mode == DEVLINK_ESWITCH_INLINE_MODE_NONE)
+ return 0;
+ /* fall through */
+ case MLX5_CAP_INLINE_MODE_L2:
+ esw_warn(dev, "Inline mode can't be set\n");
return -EOPNOTSUPP;
+ case MLX5_CAP_INLINE_MODE_VPORT_CONTEXT:
+ break;
+ }
if (esw->offloads.num_flows > 0) {
esw_warn(dev, "Can't set inline mode when flows are configured\n");
@@ -966,18 +973,14 @@ int mlx5_devlink_eswitch_inline_mode_get(struct devlink *devlink, u8 *mode)
if (esw->mode == SRIOV_NONE)
return -EOPNOTSUPP;
- if (MLX5_CAP_ETH(dev, wqe_inline_mode) !=
- MLX5_CAP_INLINE_MODE_VPORT_CONTEXT)
- return -EOPNOTSUPP;
-
return esw_inline_mode_to_devlink(esw->offloads.inline_mode, mode);
}
int mlx5_eswitch_inline_mode_get(struct mlx5_eswitch *esw, int nvfs, u8 *mode)
{
+ u8 prev_mlx5_mode, mlx5_mode = MLX5_INLINE_MODE_L2;
struct mlx5_core_dev *dev = esw->dev;
int vport;
- u8 prev_mlx5_mode, mlx5_mode = MLX5_INLINE_MODE_L2;
if (!MLX5_CAP_GEN(dev, vport_group_manager))
return -EOPNOTSUPP;
@@ -985,10 +988,18 @@ int mlx5_eswitch_inline_mode_get(struct mlx5_eswitch *esw, int nvfs, u8 *mode)
if (esw->mode == SRIOV_NONE)
return -EOPNOTSUPP;
- if (MLX5_CAP_ETH(dev, wqe_inline_mode) !=
- MLX5_CAP_INLINE_MODE_VPORT_CONTEXT)
- return -EOPNOTSUPP;
+ switch (MLX5_CAP_ETH(dev, wqe_inline_mode)) {
+ case MLX5_CAP_INLINE_MODE_NOT_REQUIRED:
+ mlx5_mode = MLX5_INLINE_MODE_NONE;
+ goto out;
+ case MLX5_CAP_INLINE_MODE_L2:
+ mlx5_mode = MLX5_INLINE_MODE_L2;
+ goto out;
+ case MLX5_CAP_INLINE_MODE_VPORT_CONTEXT:
+ goto query_vports;
+ }
+query_vports:
for (vport = 1; vport <= nvfs; vport++) {
mlx5_query_nic_vport_min_inline(dev, vport, &mlx5_mode);
if (vport > 1 && prev_mlx5_mode != mlx5_mode)
@@ -996,6 +1007,7 @@ int mlx5_eswitch_inline_mode_get(struct mlx5_eswitch *esw, int nvfs, u8 *mode)
prev_mlx5_mode = mlx5_mode;
}
+out:
*mode = mlx5_mode;
return 0;
}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index 60154a175bd3..0ad66324247f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -1029,7 +1029,7 @@ static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv,
if (err) {
dev_err(&dev->pdev->dev, "Firmware over %d MS in initializing state, aborting\n",
FW_INIT_TIMEOUT_MILI);
- goto out_err;
+ goto err_cmd_cleanup;
}
err = mlx5_core_enable_hca(dev, 0);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/uar.c b/drivers/net/ethernet/mellanox/mlx5/core/uar.c
index 2e6b0f290ddc..222b25908d01 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/uar.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/uar.c
@@ -87,6 +87,7 @@ static void up_rel_func(struct kref *kref)
struct mlx5_uars_page *up = container_of(kref, struct mlx5_uars_page, ref_count);
list_del(&up->list);
+ iounmap(up->map);
if (mlx5_cmd_free_uar(up->mdev, up->index))
mlx5_core_warn(up->mdev, "failed to free uar index %d\n", up->index);
kfree(up->reg_bitmap);
diff --git a/drivers/net/ethernet/qlogic/qed/qed_dcbx.c b/drivers/net/ethernet/qlogic/qed/qed_dcbx.c
index a6e2bbe629bd..cfdadb658ade 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_dcbx.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_dcbx.c
@@ -64,11 +64,11 @@
((u32)(prio_tc_tbl >> ((7 - prio) * 4)) & 0x7)
static const struct qed_dcbx_app_metadata qed_dcbx_app_update[] = {
- {DCBX_PROTOCOL_ISCSI, "ISCSI", QED_PCI_DEFAULT},
- {DCBX_PROTOCOL_FCOE, "FCOE", QED_PCI_DEFAULT},
- {DCBX_PROTOCOL_ROCE, "ROCE", QED_PCI_DEFAULT},
- {DCBX_PROTOCOL_ROCE_V2, "ROCE_V2", QED_PCI_DEFAULT},
- {DCBX_PROTOCOL_ETH, "ETH", QED_PCI_ETH}
+ {DCBX_PROTOCOL_ISCSI, "ISCSI", QED_PCI_ISCSI},
+ {DCBX_PROTOCOL_FCOE, "FCOE", QED_PCI_FCOE},
+ {DCBX_PROTOCOL_ROCE, "ROCE", QED_PCI_ETH_ROCE},
+ {DCBX_PROTOCOL_ROCE_V2, "ROCE_V2", QED_PCI_ETH_ROCE},
+ {DCBX_PROTOCOL_ETH, "ETH", QED_PCI_ETH},
};
static bool qed_dcbx_app_ethtype(u32 app_info_bitmap)
diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c
index 8cfc4a54f2dc..3cd7989c007d 100644
--- a/drivers/net/ethernet/renesas/ravb_main.c
+++ b/drivers/net/ethernet/renesas/ravb_main.c
@@ -1516,11 +1516,12 @@ static netdev_tx_t ravb_start_xmit(struct sk_buff *skb, struct net_device *ndev)
spin_unlock_irqrestore(&priv->lock, flags);
return NETDEV_TX_BUSY;
}
- entry = priv->cur_tx[q] % (priv->num_tx_ring[q] * NUM_TX_DESC);
- priv->tx_skb[q][entry / NUM_TX_DESC] = skb;
if (skb_put_padto(skb, ETH_ZLEN))
- goto drop;
+ goto exit;
+
+ entry = priv->cur_tx[q] % (priv->num_tx_ring[q] * NUM_TX_DESC);
+ priv->tx_skb[q][entry / NUM_TX_DESC] = skb;
buffer = PTR_ALIGN(priv->tx_align[q], DPTR_ALIGN) +
entry / NUM_TX_DESC * DPTR_ALIGN;
diff --git a/drivers/net/ethernet/sfc/efx.h b/drivers/net/ethernet/sfc/efx.h
index ee14662415c5..a0c52e328102 100644
--- a/drivers/net/ethernet/sfc/efx.h
+++ b/drivers/net/ethernet/sfc/efx.h
@@ -74,7 +74,10 @@ void efx_schedule_slow_fill(struct efx_rx_queue *rx_queue);
#define EFX_RXQ_MIN_ENT 128U
#define EFX_TXQ_MIN_ENT(efx) (2 * efx_tx_max_skb_descs(efx))
-#define EFX_TXQ_MAX_ENT(efx) (EFX_WORKAROUND_35388(efx) ? \
+/* All EF10 architecture NICs steal one bit of the DMAQ size for various
+ * other purposes when counting TxQ entries, so we halve the queue size.
+ */
+#define EFX_TXQ_MAX_ENT(efx) (EFX_WORKAROUND_EF10(efx) ? \
EFX_MAX_DMAQ_SIZE / 2 : EFX_MAX_DMAQ_SIZE)
static inline bool efx_rss_enabled(struct efx_nic *efx)
diff --git a/drivers/net/ethernet/sfc/workarounds.h b/drivers/net/ethernet/sfc/workarounds.h
index 103f827a1623..c67fa18b8121 100644
--- a/drivers/net/ethernet/sfc/workarounds.h
+++ b/drivers/net/ethernet/sfc/workarounds.h
@@ -16,6 +16,7 @@
*/
#define EFX_WORKAROUND_SIENA(efx) (efx_nic_rev(efx) == EFX_REV_SIENA_A0)
+#define EFX_WORKAROUND_EF10(efx) (efx_nic_rev(efx) >= EFX_REV_HUNT_A0)
#define EFX_WORKAROUND_10G(efx) 1
/* Bit-bashed I2C reads cause performance drop */
diff --git a/drivers/net/ethernet/ti/Kconfig b/drivers/net/ethernet/ti/Kconfig
index 9e631952b86f..48a541eb0af2 100644
--- a/drivers/net/ethernet/ti/Kconfig
+++ b/drivers/net/ethernet/ti/Kconfig
@@ -76,7 +76,7 @@ config TI_CPSW
config TI_CPTS
bool "TI Common Platform Time Sync (CPTS) Support"
depends on TI_CPSW || TI_KEYSTONE_NETCP
- depends on PTP_1588_CLOCK
+ depends on POSIX_TIMERS
---help---
This driver supports the Common Platform Time Sync unit of
the CPSW Ethernet Switch and Keystone 2 1g/10g Switch Subsystem.
@@ -87,6 +87,8 @@ config TI_CPTS_MOD
tristate
depends on TI_CPTS
default y if TI_CPSW=y || TI_KEYSTONE_NETCP=y
+ select NET_PTP_CLASSIFY
+ imply PTP_1588_CLOCK
default m
config TI_KEYSTONE_NETCP
diff --git a/drivers/net/ethernet/toshiba/tc35815.c b/drivers/net/ethernet/toshiba/tc35815.c
index a45f98fa4aa7..3dadee1080b9 100644
--- a/drivers/net/ethernet/toshiba/tc35815.c
+++ b/drivers/net/ethernet/toshiba/tc35815.c
@@ -1017,8 +1017,8 @@ tc35815_free_queues(struct net_device *dev)
BUG_ON(lp->tx_skbs[i].skb != skb);
#endif
if (skb) {
- dev_kfree_skb(skb);
pci_unmap_single(lp->pci_dev, lp->tx_skbs[i].skb_dma, skb->len, PCI_DMA_TODEVICE);
+ dev_kfree_skb(skb);
lp->tx_skbs[i].skb = NULL;
lp->tx_skbs[i].skb_dma = 0;
}
diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h
index f9f3dba7a588..db23cb36ae5c 100644
--- a/drivers/net/hyperv/hyperv_net.h
+++ b/drivers/net/hyperv/hyperv_net.h
@@ -751,7 +751,6 @@ struct netvsc_device {
u32 send_section_cnt;
u32 send_section_size;
unsigned long *send_section_map;
- int map_words;
/* Used for NetVSP initialization protocol */
struct completion channel_init_wait;
diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
index 8dd0b8770328..15ef713d96c0 100644
--- a/drivers/net/hyperv/netvsc.c
+++ b/drivers/net/hyperv/netvsc.c
@@ -236,6 +236,7 @@ static int netvsc_init_buf(struct hv_device *device)
struct netvsc_device *net_device;
struct nvsp_message *init_packet;
struct net_device *ndev;
+ size_t map_words;
int node;
net_device = get_outbound_net_device(device);
@@ -401,11 +402,9 @@ static int netvsc_init_buf(struct hv_device *device)
net_device->send_section_size, net_device->send_section_cnt);
/* Setup state for managing the send buffer. */
- net_device->map_words = DIV_ROUND_UP(net_device->send_section_cnt,
- BITS_PER_LONG);
+ map_words = DIV_ROUND_UP(net_device->send_section_cnt, BITS_PER_LONG);
- net_device->send_section_map = kcalloc(net_device->map_words,
- sizeof(ulong), GFP_KERNEL);
+ net_device->send_section_map = kcalloc(map_words, sizeof(ulong), GFP_KERNEL);
if (net_device->send_section_map == NULL) {
ret = -ENOMEM;
goto cleanup;
@@ -683,7 +682,7 @@ static u32 netvsc_get_next_send_section(struct netvsc_device *net_device)
unsigned long *map_addr = net_device->send_section_map;
unsigned int i;
- for_each_clear_bit(i, map_addr, net_device->map_words) {
+ for_each_clear_bit(i, map_addr, net_device->send_section_cnt) {
if (sync_test_and_set_bit(i, map_addr) == 0)
return i;
}
diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c
index ff0a5ed3ca80..49ce4e9f4a0f 100644
--- a/drivers/net/macsec.c
+++ b/drivers/net/macsec.c
@@ -617,7 +617,8 @@ static void macsec_encrypt_done(struct crypto_async_request *base, int err)
static struct aead_request *macsec_alloc_req(struct crypto_aead *tfm,
unsigned char **iv,
- struct scatterlist **sg)
+ struct scatterlist **sg,
+ int num_frags)
{
size_t size, iv_offset, sg_offset;
struct aead_request *req;
@@ -629,7 +630,7 @@ static struct aead_request *macsec_alloc_req(struct crypto_aead *tfm,
size = ALIGN(size, __alignof__(struct scatterlist));
sg_offset = size;
- size += sizeof(struct scatterlist) * (MAX_SKB_FRAGS + 1);
+ size += sizeof(struct scatterlist) * num_frags;
tmp = kmalloc(size, GFP_ATOMIC);
if (!tmp)
@@ -649,6 +650,7 @@ static struct sk_buff *macsec_encrypt(struct sk_buff *skb,
{
int ret;
struct scatterlist *sg;
+ struct sk_buff *trailer;
unsigned char *iv;
struct ethhdr *eth;
struct macsec_eth_header *hh;
@@ -723,7 +725,14 @@ static struct sk_buff *macsec_encrypt(struct sk_buff *skb,
return ERR_PTR(-EINVAL);
}
- req = macsec_alloc_req(tx_sa->key.tfm, &iv, &sg);
+ ret = skb_cow_data(skb, 0, &trailer);
+ if (unlikely(ret < 0)) {
+ macsec_txsa_put(tx_sa);
+ kfree_skb(skb);
+ return ERR_PTR(ret);
+ }
+
+ req = macsec_alloc_req(tx_sa->key.tfm, &iv, &sg, ret);
if (!req) {
macsec_txsa_put(tx_sa);
kfree_skb(skb);
@@ -732,7 +741,7 @@ static struct sk_buff *macsec_encrypt(struct sk_buff *skb,
macsec_fill_iv(iv, secy->sci, pn);
- sg_init_table(sg, MAX_SKB_FRAGS + 1);
+ sg_init_table(sg, ret);
skb_to_sgvec(skb, sg, 0, skb->len);
if (tx_sc->encrypt) {
@@ -917,6 +926,7 @@ static struct sk_buff *macsec_decrypt(struct sk_buff *skb,
{
int ret;
struct scatterlist *sg;
+ struct sk_buff *trailer;
unsigned char *iv;
struct aead_request *req;
struct macsec_eth_header *hdr;
@@ -927,7 +937,12 @@ static struct sk_buff *macsec_decrypt(struct sk_buff *skb,
if (!skb)
return ERR_PTR(-ENOMEM);
- req = macsec_alloc_req(rx_sa->key.tfm, &iv, &sg);
+ ret = skb_cow_data(skb, 0, &trailer);
+ if (unlikely(ret < 0)) {
+ kfree_skb(skb);
+ return ERR_PTR(ret);
+ }
+ req = macsec_alloc_req(rx_sa->key.tfm, &iv, &sg, ret);
if (!req) {
kfree_skb(skb);
return ERR_PTR(-ENOMEM);
@@ -936,7 +951,7 @@ static struct sk_buff *macsec_decrypt(struct sk_buff *skb,
hdr = (struct macsec_eth_header *)skb->data;
macsec_fill_iv(iv, sci, ntohl(hdr->packet_number));
- sg_init_table(sg, MAX_SKB_FRAGS + 1);
+ sg_init_table(sg, ret);
skb_to_sgvec(skb, sg, 0, skb->len);
if (hdr->tci_an & MACSEC_TCI_E) {
diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index 9261722960a7..b34eaaae03fd 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -1139,6 +1139,7 @@ static int macvlan_port_create(struct net_device *dev)
static void macvlan_port_destroy(struct net_device *dev)
{
struct macvlan_port *port = macvlan_port_get_rtnl(dev);
+ struct sk_buff *skb;
dev->priv_flags &= ~IFF_MACVLAN_PORT;
netdev_rx_handler_unregister(dev);
@@ -1147,7 +1148,15 @@ static void macvlan_port_destroy(struct net_device *dev)
* but we need to cancel it and purge left skbs if any.
*/
cancel_work_sync(&port->bc_work);
- __skb_queue_purge(&port->bc_queue);
+
+ while ((skb = __skb_dequeue(&port->bc_queue))) {
+ const struct macvlan_dev *src = MACVLAN_SKB_CB(skb)->src;
+
+ if (src)
+ dev_put(src->dev);
+
+ kfree_skb(skb);
+ }
kfree(port);
}
diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c
index 1326d99771c1..da5b39268370 100644
--- a/drivers/net/phy/micrel.c
+++ b/drivers/net/phy/micrel.c
@@ -297,17 +297,6 @@ static int kszphy_config_init(struct phy_device *phydev)
if (priv->led_mode >= 0)
kszphy_setup_led(phydev, type->led_mode_reg, priv->led_mode);
- if (phy_interrupt_is_valid(phydev)) {
- int ctl = phy_read(phydev, MII_BMCR);
-
- if (ctl < 0)
- return ctl;
-
- ret = phy_write(phydev, MII_BMCR, ctl & ~BMCR_ANENABLE);
- if (ret < 0)
- return ret;
- }
-
return 0;
}
diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index a2bfc82e95d7..97ff1278167b 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -591,16 +591,18 @@ int phy_mii_ioctl(struct phy_device *phydev, struct ifreq *ifr, int cmd)
EXPORT_SYMBOL(phy_mii_ioctl);
/**
- * phy_start_aneg - start auto-negotiation for this PHY device
+ * phy_start_aneg_priv - start auto-negotiation for this PHY device
* @phydev: the phy_device struct
+ * @sync: indicate whether we should wait for the workqueue cancelation
*
* Description: Sanitizes the settings (if we're not autonegotiating
* them), and then calls the driver's config_aneg function.
* If the PHYCONTROL Layer is operating, we change the state to
* reflect the beginning of Auto-negotiation or forcing.
*/
-int phy_start_aneg(struct phy_device *phydev)
+static int phy_start_aneg_priv(struct phy_device *phydev, bool sync)
{
+ bool trigger = 0;
int err;
if (!phydev->drv)
@@ -628,10 +630,40 @@ int phy_start_aneg(struct phy_device *phydev)
}
}
+ /* Re-schedule a PHY state machine to check PHY status because
+ * negotiation may already be done and aneg interrupt may not be
+ * generated.
+ */
+ if (phy_interrupt_is_valid(phydev) && (phydev->state == PHY_AN)) {
+ err = phy_aneg_done(phydev);
+ if (err > 0) {
+ trigger = true;
+ err = 0;
+ }
+ }
+
out_unlock:
mutex_unlock(&phydev->lock);
+
+ if (trigger)
+ phy_trigger_machine(phydev, sync);
+
return err;
}
+
+/**
+ * phy_start_aneg - start auto-negotiation for this PHY device
+ * @phydev: the phy_device struct
+ *
+ * Description: Sanitizes the settings (if we're not autonegotiating
+ * them), and then calls the driver's config_aneg function.
+ * If the PHYCONTROL Layer is operating, we change the state to
+ * reflect the beginning of Auto-negotiation or forcing.
+ */
+int phy_start_aneg(struct phy_device *phydev)
+{
+ return phy_start_aneg_priv(phydev, true);
+}
EXPORT_SYMBOL(phy_start_aneg);
/**
@@ -659,7 +691,7 @@ void phy_start_machine(struct phy_device *phydev)
* state machine runs.
*/
-static void phy_trigger_machine(struct phy_device *phydev, bool sync)
+void phy_trigger_machine(struct phy_device *phydev, bool sync)
{
if (sync)
cancel_delayed_work_sync(&phydev->state_queue);
@@ -1154,7 +1186,7 @@ void phy_state_machine(struct work_struct *work)
mutex_unlock(&phydev->lock);
if (needs_aneg)
- err = phy_start_aneg(phydev);
+ err = phy_start_aneg_priv(phydev, false);
else if (do_suspend)
phy_suspend(phydev);
diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c
index f8c81f12d988..85c01247f2e3 100644
--- a/drivers/net/team/team.c
+++ b/drivers/net/team/team.c
@@ -2361,8 +2361,10 @@ start_again:
hdr = genlmsg_put(skb, portid, seq, &team_nl_family, flags | NLM_F_MULTI,
TEAM_CMD_OPTIONS_GET);
- if (!hdr)
+ if (!hdr) {
+ nlmsg_free(skb);
return -EMSGSIZE;
+ }
if (nla_put_u32(skb, TEAM_ATTR_TEAM_IFINDEX, team->dev->ifindex))
goto nla_put_failure;
@@ -2634,8 +2636,10 @@ start_again:
hdr = genlmsg_put(skb, portid, seq, &team_nl_family, flags | NLM_F_MULTI,
TEAM_CMD_PORT_LIST_GET);
- if (!hdr)
+ if (!hdr) {
+ nlmsg_free(skb);
return -EMSGSIZE;
+ }
if (nla_put_u32(skb, TEAM_ATTR_TEAM_IFINDEX, team->dev->ifindex))
goto nla_put_failure;
diff --git a/drivers/net/usb/Kconfig b/drivers/net/usb/Kconfig
index 3dd490f53e48..f28bd74ac275 100644
--- a/drivers/net/usb/Kconfig
+++ b/drivers/net/usb/Kconfig
@@ -369,7 +369,7 @@ config USB_NET_NET1080
optionally with LEDs that indicate traffic
config USB_NET_PLUSB
- tristate "Prolific PL-2301/2302/25A1 based cables"
+ tristate "Prolific PL-2301/2302/25A1/27A1 based cables"
# if the handshake/init/reset problems, from original 'plusb',
# are ever resolved ... then remove "experimental"
depends on USB_USBNET
diff --git a/drivers/net/usb/hso.c b/drivers/net/usb/hso.c
index 4f2e8141dbe2..00067a0c51ca 100644
--- a/drivers/net/usb/hso.c
+++ b/drivers/net/usb/hso.c
@@ -2534,13 +2534,6 @@ static struct hso_device *hso_create_net_device(struct usb_interface *interface,
SET_NETDEV_DEV(net, &interface->dev);
SET_NETDEV_DEVTYPE(net, &hso_type);
- /* registering our net device */
- result = register_netdev(net);
- if (result) {
- dev_err(&interface->dev, "Failed to register device\n");
- goto exit;
- }
-
/* start allocating */
for (i = 0; i < MUX_BULK_RX_BUF_COUNT; i++) {
hso_net->mux_bulk_rx_urb_pool[i] = usb_alloc_urb(0, GFP_KERNEL);
@@ -2560,6 +2553,13 @@ static struct hso_device *hso_create_net_device(struct usb_interface *interface,
add_net_device(hso_dev);
+ /* registering our net device */
+ result = register_netdev(net);
+ if (result) {
+ dev_err(&interface->dev, "Failed to register device\n");
+ goto exit;
+ }
+
hso_log_port(hso_dev);
hso_create_rfkill(hso_dev, interface);
@@ -3279,9 +3279,9 @@ static void __exit hso_exit(void)
pr_info("unloaded\n");
tty_unregister_driver(tty_drv);
- put_tty_driver(tty_drv);
/* deregister the usb driver */
usb_deregister(&hso_driver);
+ put_tty_driver(tty_drv);
}
/* Module definitions */
diff --git a/drivers/net/usb/plusb.c b/drivers/net/usb/plusb.c
index 22e1a9a99a7d..6fe59373cba9 100644
--- a/drivers/net/usb/plusb.c
+++ b/drivers/net/usb/plusb.c
@@ -102,7 +102,7 @@ static int pl_reset(struct usbnet *dev)
}
static const struct driver_info prolific_info = {
- .description = "Prolific PL-2301/PL-2302/PL-25A1",
+ .description = "Prolific PL-2301/PL-2302/PL-25A1/PL-27A1",
.flags = FLAG_POINTTOPOINT | FLAG_NO_SETINT,
/* some PL-2302 versions seem to fail usb_set_interface() */
.reset = pl_reset,
@@ -139,6 +139,17 @@ static const struct usb_device_id products [] = {
* Host-to-Host Cable
*/
.driver_info = (unsigned long) &prolific_info,
+
+},
+
+/* super speed cables */
+{
+ USB_DEVICE(0x067b, 0x27a1), /* PL-27A1, no eeprom
+ * also: goobay Active USB 3.0
+ * Data Link,
+ * Unitek Y-3501
+ */
+ .driver_info = (unsigned long) &prolific_info,
},
{ }, // END
@@ -158,5 +169,5 @@ static struct usb_driver plusb_driver = {
module_usb_driver(plusb_driver);
MODULE_AUTHOR("David Brownell");
-MODULE_DESCRIPTION("Prolific PL-2301/2302/25A1 USB Host to Host Link Driver");
+MODULE_DESCRIPTION("Prolific PL-2301/2302/25A1/27A1 USB Host to Host Link Driver");
MODULE_LICENSE("GPL");
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index eeb409c287b8..d5e0906262ea 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -49,10 +49,9 @@ unsigned char shutdown_timeout = 5;
module_param(shutdown_timeout, byte, 0644);
MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown");
-unsigned int nvme_max_retries = 5;
-module_param_named(max_retries, nvme_max_retries, uint, 0644);
+static u8 nvme_max_retries = 5;
+module_param_named(max_retries, nvme_max_retries, byte, 0644);
MODULE_PARM_DESC(max_retries, "max number of retries a command may have");
-EXPORT_SYMBOL_GPL(nvme_max_retries);
static int nvme_char_major;
module_param(nvme_char_major, int, 0);
@@ -62,11 +61,66 @@ module_param(default_ps_max_latency_us, ulong, 0644);
MODULE_PARM_DESC(default_ps_max_latency_us,
"max power saving latency for new devices; use PM QOS to change per device");
+static bool force_apst;
+module_param(force_apst, bool, 0644);
+MODULE_PARM_DESC(force_apst, "allow APST for newly enumerated devices even if quirked off");
+
static LIST_HEAD(nvme_ctrl_list);
static DEFINE_SPINLOCK(dev_list_lock);
static struct class *nvme_class;
+static int nvme_error_status(struct request *req)
+{
+ switch (nvme_req(req)->status & 0x7ff) {
+ case NVME_SC_SUCCESS:
+ return 0;
+ case NVME_SC_CAP_EXCEEDED:
+ return -ENOSPC;
+ default:
+ return -EIO;
+
+ /*
+ * XXX: these errors are a nasty side-band protocol to
+ * drivers/md/dm-mpath.c:noretry_error() that aren't documented
+ * anywhere..
+ */
+ case NVME_SC_CMD_SEQ_ERROR:
+ return -EILSEQ;
+ case NVME_SC_ONCS_NOT_SUPPORTED:
+ return -EOPNOTSUPP;
+ case NVME_SC_WRITE_FAULT:
+ case NVME_SC_READ_ERROR:
+ case NVME_SC_UNWRITTEN_BLOCK:
+ return -ENODATA;
+ }
+}
+
+static inline bool nvme_req_needs_retry(struct request *req)
+{
+ if (blk_noretry_request(req))
+ return false;
+ if (nvme_req(req)->status & NVME_SC_DNR)
+ return false;
+ if (jiffies - req->start_time >= req->timeout)
+ return false;
+ if (nvme_req(req)->retries >= nvme_max_retries)
+ return false;
+ return true;
+}
+
+void nvme_complete_rq(struct request *req)
+{
+ if (unlikely(nvme_req(req)->status && nvme_req_needs_retry(req))) {
+ nvme_req(req)->retries++;
+ blk_mq_requeue_request(req, !blk_mq_queue_stopped(req->q));
+ return;
+ }
+
+ blk_mq_end_request(req, nvme_error_status(req));
+}
+EXPORT_SYMBOL_GPL(nvme_complete_rq);
+
void nvme_cancel_request(struct request *req, void *data, bool reserved)
{
int status;
@@ -80,7 +134,9 @@ void nvme_cancel_request(struct request *req, void *data, bool reserved)
status = NVME_SC_ABORT_REQ;
if (blk_queue_dying(req->q))
status |= NVME_SC_DNR;
- blk_mq_complete_request(req, status);
+ nvme_req(req)->status = status;
+ blk_mq_complete_request(req);
+
}
EXPORT_SYMBOL_GPL(nvme_cancel_request);
@@ -205,12 +261,6 @@ fail:
return NULL;
}
-void nvme_requeue_req(struct request *req)
-{
- blk_mq_requeue_request(req, !blk_mq_queue_stopped(req->q));
-}
-EXPORT_SYMBOL_GPL(nvme_requeue_req);
-
struct request *nvme_alloc_request(struct request_queue *q,
struct nvme_command *cmd, unsigned int flags, int qid)
{
@@ -327,6 +377,12 @@ int nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
{
int ret = BLK_MQ_RQ_QUEUE_OK;
+ if (!(req->rq_flags & RQF_DONTPREP)) {
+ nvme_req(req)->retries = 0;
+ nvme_req(req)->flags = 0;
+ req->rq_flags |= RQF_DONTPREP;
+ }
+
switch (req_op(req)) {
case REQ_OP_DRV_IN:
case REQ_OP_DRV_OUT:
@@ -335,6 +391,8 @@ int nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
case REQ_OP_FLUSH:
nvme_setup_flush(ns, cmd);
break;
+ case REQ_OP_WRITE_ZEROES:
+ /* currently only aliased to deallocate for a few ctrls: */
case REQ_OP_DISCARD:
ret = nvme_setup_discard(ns, req, cmd);
break;
@@ -378,7 +436,10 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
blk_execute_rq(req->q, NULL, req, at_head);
if (result)
*result = nvme_req(req)->result;
- ret = req->errors;
+ if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
+ ret = -EINTR;
+ else
+ ret = nvme_req(req)->status;
out:
blk_mq_free_request(req);
return ret;
@@ -463,7 +524,10 @@ int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd,
}
submit:
blk_execute_rq(req->q, disk, req, 0);
- ret = req->errors;
+ if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
+ ret = -EINTR;
+ else
+ ret = nvme_req(req)->status;
if (result)
*result = le32_to_cpu(nvme_req(req)->result.u32);
if (meta && !ret && !write) {
@@ -900,16 +964,14 @@ static void nvme_config_discard(struct nvme_ns *ns)
BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) <
NVME_DSM_MAX_RANGES);
- if (ctrl->quirks & NVME_QUIRK_DISCARD_ZEROES)
- ns->queue->limits.discard_zeroes_data = 1;
- else
- ns->queue->limits.discard_zeroes_data = 0;
-
ns->queue->limits.discard_alignment = logical_block_size;
ns->queue->limits.discard_granularity = logical_block_size;
blk_queue_max_discard_sectors(ns->queue, UINT_MAX);
blk_queue_max_discard_segments(ns->queue, NVME_DSM_MAX_RANGES);
queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue);
+
+ if (ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
+ blk_queue_max_write_zeroes_sectors(ns->queue, UINT_MAX);
}
static int nvme_revalidate_ns(struct nvme_ns *ns, struct nvme_id_ns **id)
@@ -1267,7 +1329,7 @@ static void nvme_configure_apst(struct nvme_ctrl *ctrl)
* heuristic: we are willing to spend at most 2% of the time
* transitioning between power states. Therefore, when running
* in any given state, we will enter the next lower-power
- * non-operational state after waiting 100 * (enlat + exlat)
+ * non-operational state after waiting 50 * (enlat + exlat)
* microseconds, as long as that state's total latency is under
* the requested maximum latency.
*
@@ -1278,6 +1340,8 @@ static void nvme_configure_apst(struct nvme_ctrl *ctrl)
unsigned apste;
struct nvme_feat_auto_pst *table;
+ u64 max_lat_us = 0;
+ int max_ps = -1;
int ret;
/*
@@ -1299,6 +1363,7 @@ static void nvme_configure_apst(struct nvme_ctrl *ctrl)
if (ctrl->ps_max_latency_us == 0) {
/* Turn off APST. */
apste = 0;
+ dev_dbg(ctrl->device, "APST disabled\n");
} else {
__le64 target = cpu_to_le64(0);
int state;
@@ -1348,9 +1413,22 @@ static void nvme_configure_apst(struct nvme_ctrl *ctrl)
target = cpu_to_le64((state << 3) |
(transition_ms << 8));
+
+ if (max_ps == -1)
+ max_ps = state;
+
+ if (total_latency_us > max_lat_us)
+ max_lat_us = total_latency_us;
}
apste = 1;
+
+ if (max_ps == -1) {
+ dev_dbg(ctrl->device, "APST enabled but no non-operational states are available\n");
+ } else {
+ dev_dbg(ctrl->device, "APST enabled: max PS = %d, max round-trip latency = %lluus, table = %*phN\n",
+ max_ps, max_lat_us, (int)sizeof(*table), table);
+ }
}
ret = nvme_set_features(ctrl, NVME_FEAT_AUTO_PST, apste,
@@ -1488,6 +1566,11 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
}
}
+ if (force_apst && (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS)) {
+ dev_warn(ctrl->dev, "forcibly allowing all power states due to nvme_core.force_apst -- use at your own risk\n");
+ ctrl->quirks &= ~NVME_QUIRK_NO_DEEPEST_PS;
+ }
+
ctrl->oacs = le16_to_cpu(id->oacs);
ctrl->vid = le16_to_cpu(id->vid);
ctrl->oncs = le16_to_cpup(&id->oncs);
@@ -1510,7 +1593,16 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
ctrl->npss = id->npss;
prev_apsta = ctrl->apsta;
- ctrl->apsta = (ctrl->quirks & NVME_QUIRK_NO_APST) ? 0 : id->apsta;
+ if (ctrl->quirks & NVME_QUIRK_NO_APST) {
+ if (force_apst && id->apsta) {
+ dev_warn(ctrl->dev, "forcibly allowing APST due to nvme_core.force_apst -- use at your own risk\n");
+ ctrl->apsta = 1;
+ } else {
+ ctrl->apsta = 0;
+ }
+ } else {
+ ctrl->apsta = id->apsta;
+ }
memcpy(ctrl->psd, id->psd, sizeof(ctrl->psd));
if (ctrl->ops->is_fabrics) {
@@ -2393,7 +2485,7 @@ void nvme_start_freeze(struct nvme_ctrl *ctrl)
mutex_lock(&ctrl->namespaces_mutex);
list_for_each_entry(ns, &ctrl->namespaces, list)
- blk_mq_freeze_queue_start(ns->queue);
+ blk_freeze_queue_start(ns->queue);
mutex_unlock(&ctrl->namespaces_mutex);
}
EXPORT_SYMBOL_GPL(nvme_start_freeze);
diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index 5b7386f69f4d..990e6fb32a63 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -471,6 +471,16 @@ int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid)
}
EXPORT_SYMBOL_GPL(nvmf_connect_io_queue);
+bool nvmf_should_reconnect(struct nvme_ctrl *ctrl)
+{
+ if (ctrl->opts->max_reconnects != -1 &&
+ ctrl->opts->nr_reconnects < ctrl->opts->max_reconnects)
+ return true;
+
+ return false;
+}
+EXPORT_SYMBOL_GPL(nvmf_should_reconnect);
+
/**
* nvmf_register_transport() - NVMe Fabrics Library registration function.
* @ops: Transport ops instance to be registered to the
@@ -533,6 +543,7 @@ static const match_table_t opt_tokens = {
{ NVMF_OPT_QUEUE_SIZE, "queue_size=%d" },
{ NVMF_OPT_NR_IO_QUEUES, "nr_io_queues=%d" },
{ NVMF_OPT_RECONNECT_DELAY, "reconnect_delay=%d" },
+ { NVMF_OPT_CTRL_LOSS_TMO, "ctrl_loss_tmo=%d" },
{ NVMF_OPT_KATO, "keep_alive_tmo=%d" },
{ NVMF_OPT_HOSTNQN, "hostnqn=%s" },
{ NVMF_OPT_HOST_TRADDR, "host_traddr=%s" },
@@ -546,6 +557,7 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
char *options, *o, *p;
int token, ret = 0;
size_t nqnlen = 0;
+ int ctrl_loss_tmo = NVMF_DEF_CTRL_LOSS_TMO;
/* Set defaults */
opts->queue_size = NVMF_DEF_QUEUE_SIZE;
@@ -655,6 +667,16 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
}
opts->kato = token;
break;
+ case NVMF_OPT_CTRL_LOSS_TMO:
+ if (match_int(args, &token)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (token < 0)
+ pr_warn("ctrl_loss_tmo < 0 will reconnect forever\n");
+ ctrl_loss_tmo = token;
+ break;
case NVMF_OPT_HOSTNQN:
if (opts->host) {
pr_err("hostnqn already user-assigned: %s\n",
@@ -710,6 +732,12 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
}
}
+ if (ctrl_loss_tmo < 0)
+ opts->max_reconnects = -1;
+ else
+ opts->max_reconnects = DIV_ROUND_UP(ctrl_loss_tmo,
+ opts->reconnect_delay);
+
if (!opts->host) {
kref_get(&nvmf_default_host->ref);
opts->host = nvmf_default_host;
diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h
index 156018182ce4..f5a9c1fb186f 100644
--- a/drivers/nvme/host/fabrics.h
+++ b/drivers/nvme/host/fabrics.h
@@ -21,6 +21,8 @@
#define NVMF_MAX_QUEUE_SIZE 1024
#define NVMF_DEF_QUEUE_SIZE 128
#define NVMF_DEF_RECONNECT_DELAY 10
+/* default to 600 seconds of reconnect attempts before giving up */
+#define NVMF_DEF_CTRL_LOSS_TMO 600
/*
* Define a host as seen by the target. We allocate one at boot, but also
@@ -53,6 +55,7 @@ enum {
NVMF_OPT_HOSTNQN = 1 << 8,
NVMF_OPT_RECONNECT_DELAY = 1 << 9,
NVMF_OPT_HOST_TRADDR = 1 << 10,
+ NVMF_OPT_CTRL_LOSS_TMO = 1 << 11,
};
/**
@@ -77,6 +80,10 @@ enum {
* @discovery_nqn: indicates if the subsysnqn is the well-known discovery NQN.
* @kato: Keep-alive timeout.
* @host: Virtual NVMe host, contains the NQN and Host ID.
+ * @nr_reconnects: number of reconnect attempted since the last ctrl failure
+ * @max_reconnects: maximum number of allowed reconnect attempts before removing
+ * the controller, (-1) means reconnect forever, zero means remove
+ * immediately;
*/
struct nvmf_ctrl_options {
unsigned mask;
@@ -91,6 +98,8 @@ struct nvmf_ctrl_options {
bool discovery_nqn;
unsigned int kato;
struct nvmf_host *host;
+ int nr_reconnects;
+ int max_reconnects;
};
/*
@@ -133,5 +142,6 @@ void nvmf_unregister_transport(struct nvmf_transport_ops *ops);
void nvmf_free_options(struct nvmf_ctrl_options *opts);
const char *nvmf_get_subsysnqn(struct nvme_ctrl *ctrl);
int nvmf_get_address(struct nvme_ctrl *ctrl, char *buf, int size);
+bool nvmf_should_reconnect(struct nvme_ctrl *ctrl);
#endif /* _NVME_FABRICS_H */
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index d996ca73d3be..4976db56e351 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -19,6 +19,7 @@
#include <linux/parser.h>
#include <uapi/scsi/fc/fc_fs.h>
#include <uapi/scsi/fc/fc_els.h>
+#include <linux/delay.h>
#include "nvme.h"
#include "fabrics.h"
@@ -44,6 +45,8 @@ enum nvme_fc_queue_flags {
#define NVMEFC_QUEUE_DELAY 3 /* ms units */
+#define NVME_FC_MAX_CONNECT_ATTEMPTS 1
+
struct nvme_fc_queue {
struct nvme_fc_ctrl *ctrl;
struct device *dev;
@@ -61,16 +64,24 @@ struct nvme_fc_queue {
unsigned long flags;
} __aligned(sizeof(u64)); /* alignment for other things alloc'd with */
+enum nvme_fcop_flags {
+ FCOP_FLAGS_TERMIO = (1 << 0),
+ FCOP_FLAGS_RELEASED = (1 << 1),
+ FCOP_FLAGS_COMPLETE = (1 << 2),
+ FCOP_FLAGS_AEN = (1 << 3),
+};
+
struct nvmefc_ls_req_op {
struct nvmefc_ls_req ls_req;
- struct nvme_fc_ctrl *ctrl;
+ struct nvme_fc_rport *rport;
struct nvme_fc_queue *queue;
struct request *rq;
+ u32 flags;
int ls_error;
struct completion ls_done;
- struct list_head lsreq_list; /* ctrl->ls_req_list */
+ struct list_head lsreq_list; /* rport->ls_req_list */
bool req_queued;
};
@@ -79,6 +90,7 @@ enum nvme_fcpop_state {
FCPOP_STATE_IDLE = 1,
FCPOP_STATE_ACTIVE = 2,
FCPOP_STATE_ABORTED = 3,
+ FCPOP_STATE_COMPLETE = 4,
};
struct nvme_fc_fcp_op {
@@ -97,6 +109,7 @@ struct nvme_fc_fcp_op {
struct request *rq;
atomic_t state;
+ u32 flags;
u32 rqno;
u32 nents;
@@ -120,23 +133,24 @@ struct nvme_fc_rport {
struct list_head endp_list; /* for lport->endp_list */
struct list_head ctrl_list;
+ struct list_head ls_req_list;
+ struct device *dev; /* physical device for dma */
+ struct nvme_fc_lport *lport;
spinlock_t lock;
struct kref ref;
} __aligned(sizeof(u64)); /* alignment for other things alloc'd with */
-enum nvme_fcctrl_state {
- FCCTRL_INIT = 0,
- FCCTRL_ACTIVE = 1,
+enum nvme_fcctrl_flags {
+ FCCTRL_TERMIO = (1 << 0),
};
struct nvme_fc_ctrl {
spinlock_t lock;
struct nvme_fc_queue *queues;
- u32 queue_count;
-
struct device *dev;
struct nvme_fc_lport *lport;
struct nvme_fc_rport *rport;
+ u32 queue_count;
u32 cnum;
u64 association_id;
@@ -144,14 +158,19 @@ struct nvme_fc_ctrl {
u64 cap;
struct list_head ctrl_list; /* rport->ctrl_list */
- struct list_head ls_req_list;
struct blk_mq_tag_set admin_tag_set;
struct blk_mq_tag_set tag_set;
struct work_struct delete_work;
+ struct work_struct reset_work;
+ struct delayed_work connect_work;
+ int reconnect_delay;
+ int connect_attempts;
+
struct kref ref;
- int state;
+ u32 flags;
+ u32 iocnt;
struct nvme_fc_fcp_op aen_ops[NVME_FC_NR_AEN_COMMANDS];
@@ -419,9 +438,12 @@ nvme_fc_register_remoteport(struct nvme_fc_local_port *localport,
INIT_LIST_HEAD(&newrec->endp_list);
INIT_LIST_HEAD(&newrec->ctrl_list);
+ INIT_LIST_HEAD(&newrec->ls_req_list);
kref_init(&newrec->ref);
spin_lock_init(&newrec->lock);
newrec->remoteport.localport = &lport->localport;
+ newrec->dev = lport->dev;
+ newrec->lport = lport;
newrec->remoteport.private = &newrec[1];
newrec->remoteport.port_role = pinfo->port_role;
newrec->remoteport.node_name = pinfo->node_name;
@@ -444,7 +466,6 @@ out_kfree_rport:
out_reghost_failed:
*portptr = NULL;
return ret;
-
}
EXPORT_SYMBOL_GPL(nvme_fc_register_remoteport);
@@ -487,6 +508,30 @@ nvme_fc_rport_get(struct nvme_fc_rport *rport)
return kref_get_unless_zero(&rport->ref);
}
+static int
+nvme_fc_abort_lsops(struct nvme_fc_rport *rport)
+{
+ struct nvmefc_ls_req_op *lsop;
+ unsigned long flags;
+
+restart:
+ spin_lock_irqsave(&rport->lock, flags);
+
+ list_for_each_entry(lsop, &rport->ls_req_list, lsreq_list) {
+ if (!(lsop->flags & FCOP_FLAGS_TERMIO)) {
+ lsop->flags |= FCOP_FLAGS_TERMIO;
+ spin_unlock_irqrestore(&rport->lock, flags);
+ rport->lport->ops->ls_abort(&rport->lport->localport,
+ &rport->remoteport,
+ &lsop->ls_req);
+ goto restart;
+ }
+ }
+ spin_unlock_irqrestore(&rport->lock, flags);
+
+ return 0;
+}
+
/**
* nvme_fc_unregister_remoteport - transport entry point called by an
* LLDD to deregister/remove a previously
@@ -522,6 +567,8 @@ nvme_fc_unregister_remoteport(struct nvme_fc_remote_port *portptr)
spin_unlock_irqrestore(&rport->lock, flags);
+ nvme_fc_abort_lsops(rport);
+
nvme_fc_rport_put(rport);
return 0;
}
@@ -624,16 +671,16 @@ static int nvme_fc_ctrl_get(struct nvme_fc_ctrl *);
static void
-__nvme_fc_finish_ls_req(struct nvme_fc_ctrl *ctrl,
- struct nvmefc_ls_req_op *lsop)
+__nvme_fc_finish_ls_req(struct nvmefc_ls_req_op *lsop)
{
+ struct nvme_fc_rport *rport = lsop->rport;
struct nvmefc_ls_req *lsreq = &lsop->ls_req;
unsigned long flags;
- spin_lock_irqsave(&ctrl->lock, flags);
+ spin_lock_irqsave(&rport->lock, flags);
if (!lsop->req_queued) {
- spin_unlock_irqrestore(&ctrl->lock, flags);
+ spin_unlock_irqrestore(&rport->lock, flags);
return;
}
@@ -641,56 +688,71 @@ __nvme_fc_finish_ls_req(struct nvme_fc_ctrl *ctrl,
lsop->req_queued = false;
- spin_unlock_irqrestore(&ctrl->lock, flags);
+ spin_unlock_irqrestore(&rport->lock, flags);
- fc_dma_unmap_single(ctrl->dev, lsreq->rqstdma,
+ fc_dma_unmap_single(rport->dev, lsreq->rqstdma,
(lsreq->rqstlen + lsreq->rsplen),
DMA_BIDIRECTIONAL);
- nvme_fc_ctrl_put(ctrl);
+ nvme_fc_rport_put(rport);
}
static int
-__nvme_fc_send_ls_req(struct nvme_fc_ctrl *ctrl,
+__nvme_fc_send_ls_req(struct nvme_fc_rport *rport,
struct nvmefc_ls_req_op *lsop,
void (*done)(struct nvmefc_ls_req *req, int status))
{
struct nvmefc_ls_req *lsreq = &lsop->ls_req;
unsigned long flags;
- int ret;
+ int ret = 0;
- if (!nvme_fc_ctrl_get(ctrl))
+ if (rport->remoteport.port_state != FC_OBJSTATE_ONLINE)
+ return -ECONNREFUSED;
+
+ if (!nvme_fc_rport_get(rport))
return -ESHUTDOWN;
lsreq->done = done;
- lsop->ctrl = ctrl;
+ lsop->rport = rport;
lsop->req_queued = false;
INIT_LIST_HEAD(&lsop->lsreq_list);
init_completion(&lsop->ls_done);
- lsreq->rqstdma = fc_dma_map_single(ctrl->dev, lsreq->rqstaddr,
+ lsreq->rqstdma = fc_dma_map_single(rport->dev, lsreq->rqstaddr,
lsreq->rqstlen + lsreq->rsplen,
DMA_BIDIRECTIONAL);
- if (fc_dma_mapping_error(ctrl->dev, lsreq->rqstdma)) {
- nvme_fc_ctrl_put(ctrl);
- dev_err(ctrl->dev,
- "els request command failed EFAULT.\n");
- return -EFAULT;
+ if (fc_dma_mapping_error(rport->dev, lsreq->rqstdma)) {
+ ret = -EFAULT;
+ goto out_putrport;
}
lsreq->rspdma = lsreq->rqstdma + lsreq->rqstlen;
- spin_lock_irqsave(&ctrl->lock, flags);
+ spin_lock_irqsave(&rport->lock, flags);
- list_add_tail(&lsop->lsreq_list, &ctrl->ls_req_list);
+ list_add_tail(&lsop->lsreq_list, &rport->ls_req_list);
lsop->req_queued = true;
- spin_unlock_irqrestore(&ctrl->lock, flags);
+ spin_unlock_irqrestore(&rport->lock, flags);
- ret = ctrl->lport->ops->ls_req(&ctrl->lport->localport,
- &ctrl->rport->remoteport, lsreq);
+ ret = rport->lport->ops->ls_req(&rport->lport->localport,
+ &rport->remoteport, lsreq);
if (ret)
- lsop->ls_error = ret;
+ goto out_unlink;
+
+ return 0;
+
+out_unlink:
+ lsop->ls_error = ret;
+ spin_lock_irqsave(&rport->lock, flags);
+ lsop->req_queued = false;
+ list_del(&lsop->lsreq_list);
+ spin_unlock_irqrestore(&rport->lock, flags);
+ fc_dma_unmap_single(rport->dev, lsreq->rqstdma,
+ (lsreq->rqstlen + lsreq->rsplen),
+ DMA_BIDIRECTIONAL);
+out_putrport:
+ nvme_fc_rport_put(rport);
return ret;
}
@@ -705,15 +767,15 @@ nvme_fc_send_ls_req_done(struct nvmefc_ls_req *lsreq, int status)
}
static int
-nvme_fc_send_ls_req(struct nvme_fc_ctrl *ctrl, struct nvmefc_ls_req_op *lsop)
+nvme_fc_send_ls_req(struct nvme_fc_rport *rport, struct nvmefc_ls_req_op *lsop)
{
struct nvmefc_ls_req *lsreq = &lsop->ls_req;
struct fcnvme_ls_rjt *rjt = lsreq->rspaddr;
int ret;
- ret = __nvme_fc_send_ls_req(ctrl, lsop, nvme_fc_send_ls_req_done);
+ ret = __nvme_fc_send_ls_req(rport, lsop, nvme_fc_send_ls_req_done);
- if (!ret)
+ if (!ret) {
/*
* No timeout/not interruptible as we need the struct
* to exist until the lldd calls us back. Thus mandate
@@ -722,14 +784,14 @@ nvme_fc_send_ls_req(struct nvme_fc_ctrl *ctrl, struct nvmefc_ls_req_op *lsop)
*/
wait_for_completion(&lsop->ls_done);
- __nvme_fc_finish_ls_req(ctrl, lsop);
+ __nvme_fc_finish_ls_req(lsop);
- if (ret) {
- dev_err(ctrl->dev,
- "ls request command failed (%d).\n", ret);
- return ret;
+ ret = lsop->ls_error;
}
+ if (ret)
+ return ret;
+
/* ACC or RJT payload ? */
if (rjt->w0.ls_cmd == FCNVME_LS_RJT)
return -ENXIO;
@@ -737,19 +799,14 @@ nvme_fc_send_ls_req(struct nvme_fc_ctrl *ctrl, struct nvmefc_ls_req_op *lsop)
return 0;
}
-static void
-nvme_fc_send_ls_req_async(struct nvme_fc_ctrl *ctrl,
+static int
+nvme_fc_send_ls_req_async(struct nvme_fc_rport *rport,
struct nvmefc_ls_req_op *lsop,
void (*done)(struct nvmefc_ls_req *req, int status))
{
- int ret;
-
- ret = __nvme_fc_send_ls_req(ctrl, lsop, done);
-
/* don't wait for completion */
- if (ret)
- done(&lsop->ls_req, ret);
+ return __nvme_fc_send_ls_req(rport, lsop, done);
}
/* Validation Error indexes into the string table below */
@@ -839,7 +896,7 @@ nvme_fc_connect_admin_queue(struct nvme_fc_ctrl *ctrl,
lsreq->rsplen = sizeof(*assoc_acc);
lsreq->timeout = NVME_FC_CONNECT_TIMEOUT_SEC;
- ret = nvme_fc_send_ls_req(ctrl, lsop);
+ ret = nvme_fc_send_ls_req(ctrl->rport, lsop);
if (ret)
goto out_free_buffer;
@@ -848,11 +905,12 @@ nvme_fc_connect_admin_queue(struct nvme_fc_ctrl *ctrl,
/* validate the ACC response */
if (assoc_acc->hdr.w0.ls_cmd != FCNVME_LS_ACC)
fcret = VERR_LSACC;
- if (assoc_acc->hdr.desc_list_len !=
+ else if (assoc_acc->hdr.desc_list_len !=
fcnvme_lsdesc_len(
sizeof(struct fcnvme_ls_cr_assoc_acc)))
fcret = VERR_CR_ASSOC_ACC_LEN;
- if (assoc_acc->hdr.rqst.desc_tag != cpu_to_be32(FCNVME_LSDESC_RQST))
+ else if (assoc_acc->hdr.rqst.desc_tag !=
+ cpu_to_be32(FCNVME_LSDESC_RQST))
fcret = VERR_LSDESC_RQST;
else if (assoc_acc->hdr.rqst.desc_len !=
fcnvme_lsdesc_len(sizeof(struct fcnvme_lsdesc_rqst)))
@@ -946,7 +1004,7 @@ nvme_fc_connect_queue(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue,
lsreq->rsplen = sizeof(*conn_acc);
lsreq->timeout = NVME_FC_CONNECT_TIMEOUT_SEC;
- ret = nvme_fc_send_ls_req(ctrl, lsop);
+ ret = nvme_fc_send_ls_req(ctrl->rport, lsop);
if (ret)
goto out_free_buffer;
@@ -955,10 +1013,10 @@ nvme_fc_connect_queue(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue,
/* validate the ACC response */
if (conn_acc->hdr.w0.ls_cmd != FCNVME_LS_ACC)
fcret = VERR_LSACC;
- if (conn_acc->hdr.desc_list_len !=
+ else if (conn_acc->hdr.desc_list_len !=
fcnvme_lsdesc_len(sizeof(struct fcnvme_ls_cr_conn_acc)))
fcret = VERR_CR_CONN_ACC_LEN;
- if (conn_acc->hdr.rqst.desc_tag != cpu_to_be32(FCNVME_LSDESC_RQST))
+ else if (conn_acc->hdr.rqst.desc_tag != cpu_to_be32(FCNVME_LSDESC_RQST))
fcret = VERR_LSDESC_RQST;
else if (conn_acc->hdr.rqst.desc_len !=
fcnvme_lsdesc_len(sizeof(struct fcnvme_lsdesc_rqst)))
@@ -997,14 +1055,8 @@ static void
nvme_fc_disconnect_assoc_done(struct nvmefc_ls_req *lsreq, int status)
{
struct nvmefc_ls_req_op *lsop = ls_req_to_lsop(lsreq);
- struct nvme_fc_ctrl *ctrl = lsop->ctrl;
- __nvme_fc_finish_ls_req(ctrl, lsop);
-
- if (status)
- dev_err(ctrl->dev,
- "disconnect assoc ls request command failed (%d).\n",
- status);
+ __nvme_fc_finish_ls_req(lsop);
/* fc-nvme iniator doesn't care about success or failure of cmd */
@@ -1035,6 +1087,7 @@ nvme_fc_xmt_disconnect_assoc(struct nvme_fc_ctrl *ctrl)
struct fcnvme_ls_disconnect_acc *discon_acc;
struct nvmefc_ls_req_op *lsop;
struct nvmefc_ls_req *lsreq;
+ int ret;
lsop = kzalloc((sizeof(*lsop) +
ctrl->lport->ops->lsrqst_priv_sz +
@@ -1077,7 +1130,10 @@ nvme_fc_xmt_disconnect_assoc(struct nvme_fc_ctrl *ctrl)
lsreq->rsplen = sizeof(*discon_acc);
lsreq->timeout = NVME_FC_CONNECT_TIMEOUT_SEC;
- nvme_fc_send_ls_req_async(ctrl, lsop, nvme_fc_disconnect_assoc_done);
+ ret = nvme_fc_send_ls_req_async(ctrl->rport, lsop,
+ nvme_fc_disconnect_assoc_done);
+ if (ret)
+ kfree(lsop);
/* only meaningful part to terminating the association */
ctrl->association_id = 0;
@@ -1086,6 +1142,7 @@ nvme_fc_xmt_disconnect_assoc(struct nvme_fc_ctrl *ctrl)
/* *********************** NVME Ctrl Routines **************************** */
+static void __nvme_fc_final_op_cleanup(struct request *rq);
static int
nvme_fc_reinit_request(void *data, struct request *rq)
@@ -1123,21 +1180,84 @@ nvme_fc_exit_request(void *data, struct request *rq,
return __nvme_fc_exit_request(data, op);
}
+static int
+__nvme_fc_abort_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_fcp_op *op)
+{
+ int state;
+
+ state = atomic_xchg(&op->state, FCPOP_STATE_ABORTED);
+ if (state != FCPOP_STATE_ACTIVE) {
+ atomic_set(&op->state, state);
+ return -ECANCELED;
+ }
+
+ ctrl->lport->ops->fcp_abort(&ctrl->lport->localport,
+ &ctrl->rport->remoteport,
+ op->queue->lldd_handle,
+ &op->fcp_req);
+
+ return 0;
+}
+
static void
-nvme_fc_exit_aen_ops(struct nvme_fc_ctrl *ctrl)
+nvme_fc_abort_aen_ops(struct nvme_fc_ctrl *ctrl)
{
struct nvme_fc_fcp_op *aen_op = ctrl->aen_ops;
- int i;
+ unsigned long flags;
+ int i, ret;
for (i = 0; i < NVME_FC_NR_AEN_COMMANDS; i++, aen_op++) {
- if (atomic_read(&aen_op->state) == FCPOP_STATE_UNINIT)
+ if (atomic_read(&aen_op->state) != FCPOP_STATE_ACTIVE)
continue;
- __nvme_fc_exit_request(ctrl, aen_op);
- nvme_fc_ctrl_put(ctrl);
+
+ spin_lock_irqsave(&ctrl->lock, flags);
+ if (ctrl->flags & FCCTRL_TERMIO) {
+ ctrl->iocnt++;
+ aen_op->flags |= FCOP_FLAGS_TERMIO;
+ }
+ spin_unlock_irqrestore(&ctrl->lock, flags);
+
+ ret = __nvme_fc_abort_op(ctrl, aen_op);
+ if (ret) {
+ /*
+ * if __nvme_fc_abort_op failed the io wasn't
+ * active. Thus this call path is running in
+ * parallel to the io complete. Treat as non-error.
+ */
+
+ /* back out the flags/counters */
+ spin_lock_irqsave(&ctrl->lock, flags);
+ if (ctrl->flags & FCCTRL_TERMIO)
+ ctrl->iocnt--;
+ aen_op->flags &= ~FCOP_FLAGS_TERMIO;
+ spin_unlock_irqrestore(&ctrl->lock, flags);
+ return;
+ }
}
}
-void
+static inline int
+__nvme_fc_fcpop_chk_teardowns(struct nvme_fc_ctrl *ctrl,
+ struct nvme_fc_fcp_op *op)
+{
+ unsigned long flags;
+ bool complete_rq = false;
+
+ spin_lock_irqsave(&ctrl->lock, flags);
+ if (unlikely(op->flags & FCOP_FLAGS_TERMIO)) {
+ if (ctrl->flags & FCCTRL_TERMIO)
+ ctrl->iocnt--;
+ }
+ if (op->flags & FCOP_FLAGS_RELEASED)
+ complete_rq = true;
+ else
+ op->flags |= FCOP_FLAGS_COMPLETE;
+ spin_unlock_irqrestore(&ctrl->lock, flags);
+
+ return complete_rq;
+}
+
+static void
nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
{
struct nvme_fc_fcp_op *op = fcp_req_to_fcp_op(req);
@@ -1146,7 +1266,10 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
struct nvme_fc_ctrl *ctrl = op->ctrl;
struct nvme_fc_queue *queue = op->queue;
struct nvme_completion *cqe = &op->rsp_iu.cqe;
- u16 status;
+ struct nvme_command *sqe = &op->cmd_iu.sqe;
+ __le16 status = cpu_to_le16(NVME_SC_SUCCESS << 1);
+ union nvme_result result;
+ bool complete_rq;
/*
* WARNING:
@@ -1181,9 +1304,9 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
sizeof(op->rsp_iu), DMA_FROM_DEVICE);
if (atomic_read(&op->state) == FCPOP_STATE_ABORTED)
- status = NVME_SC_ABORT_REQ | NVME_SC_DNR;
- else
- status = freq->status;
+ status = cpu_to_le16((NVME_SC_ABORT_REQ | NVME_SC_DNR) << 1);
+ else if (freq->status)
+ status = cpu_to_le16(NVME_SC_FC_TRANSPORT_ERROR << 1);
/*
* For the linux implementation, if we have an unsuccesful
@@ -1211,10 +1334,10 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
*/
if (freq->transferred_length !=
be32_to_cpu(op->cmd_iu.data_len)) {
- status = -EIO;
+ status = cpu_to_le16(NVME_SC_FC_TRANSPORT_ERROR << 1);
goto done;
}
- op->nreq.result.u64 = 0;
+ result.u64 = 0;
break;
case sizeof(struct nvme_fc_ersp_iu):
@@ -1226,28 +1349,40 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
(freq->rcv_rsplen / 4) ||
be32_to_cpu(op->rsp_iu.xfrd_len) !=
freq->transferred_length ||
- op->rqno != le16_to_cpu(cqe->command_id))) {
- status = -EIO;
+ op->rsp_iu.status_code ||
+ sqe->common.command_id != cqe->command_id)) {
+ status = cpu_to_le16(NVME_SC_FC_TRANSPORT_ERROR << 1);
goto done;
}
- op->nreq.result = cqe->result;
- status = le16_to_cpu(cqe->status) >> 1;
+ result = cqe->result;
+ status = cqe->status;
break;
default:
- status = -EIO;
+ status = cpu_to_le16(NVME_SC_FC_TRANSPORT_ERROR << 1);
goto done;
}
done:
- if (!queue->qnum && op->rqno >= AEN_CMDID_BASE) {
- nvme_complete_async_event(&queue->ctrl->ctrl, status,
- &op->nreq.result);
+ if (op->flags & FCOP_FLAGS_AEN) {
+ nvme_complete_async_event(&queue->ctrl->ctrl, status, &result);
+ complete_rq = __nvme_fc_fcpop_chk_teardowns(ctrl, op);
+ atomic_set(&op->state, FCPOP_STATE_IDLE);
+ op->flags = FCOP_FLAGS_AEN; /* clear other flags */
nvme_fc_ctrl_put(ctrl);
return;
}
- blk_mq_complete_request(rq, status);
+ complete_rq = __nvme_fc_fcpop_chk_teardowns(ctrl, op);
+ if (!complete_rq) {
+ if (unlikely(op->flags & FCOP_FLAGS_TERMIO)) {
+ status = cpu_to_le16(NVME_SC_ABORT_REQ);
+ if (blk_queue_dying(rq->q))
+ status |= cpu_to_le16(NVME_SC_DNR);
+ }
+ nvme_end_request(rq, status, result);
+ } else
+ __nvme_fc_final_op_cleanup(rq);
}
static int
@@ -1328,25 +1463,55 @@ nvme_fc_init_aen_ops(struct nvme_fc_ctrl *ctrl)
struct nvme_fc_fcp_op *aen_op;
struct nvme_fc_cmd_iu *cmdiu;
struct nvme_command *sqe;
+ void *private;
int i, ret;
aen_op = ctrl->aen_ops;
for (i = 0; i < NVME_FC_NR_AEN_COMMANDS; i++, aen_op++) {
+ private = kzalloc(ctrl->lport->ops->fcprqst_priv_sz,
+ GFP_KERNEL);
+ if (!private)
+ return -ENOMEM;
+
cmdiu = &aen_op->cmd_iu;
sqe = &cmdiu->sqe;
ret = __nvme_fc_init_request(ctrl, &ctrl->queues[0],
aen_op, (struct request *)NULL,
(AEN_CMDID_BASE + i));
- if (ret)
+ if (ret) {
+ kfree(private);
return ret;
+ }
+
+ aen_op->flags = FCOP_FLAGS_AEN;
+ aen_op->fcp_req.first_sgl = NULL; /* no sg list */
+ aen_op->fcp_req.private = private;
memset(sqe, 0, sizeof(*sqe));
sqe->common.opcode = nvme_admin_async_event;
+ /* Note: core layer may overwrite the sqe.command_id value */
sqe->common.command_id = AEN_CMDID_BASE + i;
}
return 0;
}
+static void
+nvme_fc_term_aen_ops(struct nvme_fc_ctrl *ctrl)
+{
+ struct nvme_fc_fcp_op *aen_op;
+ int i;
+
+ aen_op = ctrl->aen_ops;
+ for (i = 0; i < NVME_FC_NR_AEN_COMMANDS; i++, aen_op++) {
+ if (!aen_op->fcp_req.private)
+ continue;
+
+ __nvme_fc_exit_request(ctrl, aen_op);
+
+ kfree(aen_op->fcp_req.private);
+ aen_op->fcp_req.private = NULL;
+ }
+}
static inline void
__nvme_fc_init_hctx(struct blk_mq_hw_ctx *hctx, struct nvme_fc_ctrl *ctrl,
@@ -1446,15 +1611,6 @@ __nvme_fc_delete_hw_queue(struct nvme_fc_ctrl *ctrl,
}
static void
-nvme_fc_destroy_admin_queue(struct nvme_fc_ctrl *ctrl)
-{
- __nvme_fc_delete_hw_queue(ctrl, &ctrl->queues[0], 0);
- blk_cleanup_queue(ctrl->ctrl.admin_q);
- blk_mq_free_tag_set(&ctrl->admin_tag_set);
- nvme_fc_free_queue(&ctrl->queues[0]);
-}
-
-static void
nvme_fc_free_io_queues(struct nvme_fc_ctrl *ctrl)
{
int i;
@@ -1541,19 +1697,27 @@ nvme_fc_ctrl_free(struct kref *ref)
container_of(ref, struct nvme_fc_ctrl, ref);
unsigned long flags;
- if (ctrl->state != FCCTRL_INIT) {
- /* remove from rport list */
- spin_lock_irqsave(&ctrl->rport->lock, flags);
- list_del(&ctrl->ctrl_list);
- spin_unlock_irqrestore(&ctrl->rport->lock, flags);
+ if (ctrl->ctrl.tagset) {
+ blk_cleanup_queue(ctrl->ctrl.connect_q);
+ blk_mq_free_tag_set(&ctrl->tag_set);
}
+ /* remove from rport list */
+ spin_lock_irqsave(&ctrl->rport->lock, flags);
+ list_del(&ctrl->ctrl_list);
+ spin_unlock_irqrestore(&ctrl->rport->lock, flags);
+
+ blk_cleanup_queue(ctrl->ctrl.admin_q);
+ blk_mq_free_tag_set(&ctrl->admin_tag_set);
+
+ kfree(ctrl->queues);
+
put_device(ctrl->dev);
nvme_fc_rport_put(ctrl->rport);
- kfree(ctrl->queues);
ida_simple_remove(&nvme_fc_ctrl_cnt, ctrl->cnum);
- nvmf_free_options(ctrl->ctrl.opts);
+ if (ctrl->ctrl.opts)
+ nvmf_free_options(ctrl->ctrl.opts);
kfree(ctrl);
}
@@ -1574,57 +1738,38 @@ nvme_fc_ctrl_get(struct nvme_fc_ctrl *ctrl)
* controller. Called after last nvme_put_ctrl() call
*/
static void
-nvme_fc_free_nvme_ctrl(struct nvme_ctrl *nctrl)
+nvme_fc_nvme_ctrl_freed(struct nvme_ctrl *nctrl)
{
struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl);
WARN_ON(nctrl != &ctrl->ctrl);
- /*
- * Tear down the association, which will generate link
- * traffic to terminate connections
- */
-
- if (ctrl->state != FCCTRL_INIT) {
- /* send a Disconnect(association) LS to fc-nvme target */
- nvme_fc_xmt_disconnect_assoc(ctrl);
-
- if (ctrl->ctrl.tagset) {
- blk_cleanup_queue(ctrl->ctrl.connect_q);
- blk_mq_free_tag_set(&ctrl->tag_set);
- nvme_fc_delete_hw_io_queues(ctrl);
- nvme_fc_free_io_queues(ctrl);
- }
-
- nvme_fc_exit_aen_ops(ctrl);
-
- nvme_fc_destroy_admin_queue(ctrl);
- }
-
nvme_fc_ctrl_put(ctrl);
}
-
-static int
-__nvme_fc_abort_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_fcp_op *op)
+static void
+nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg)
{
- int state;
+ dev_warn(ctrl->ctrl.device,
+ "NVME-FC{%d}: transport association error detected: %s\n",
+ ctrl->cnum, errmsg);
+ dev_info(ctrl->ctrl.device,
+ "NVME-FC{%d}: resetting controller\n", ctrl->cnum);
- state = atomic_xchg(&op->state, FCPOP_STATE_ABORTED);
- if (state != FCPOP_STATE_ACTIVE) {
- atomic_set(&op->state, state);
- return -ECANCELED; /* fail */
+ if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RECONNECTING)) {
+ dev_err(ctrl->ctrl.device,
+ "NVME-FC{%d}: error_recovery: Couldn't change state "
+ "to RECONNECTING\n", ctrl->cnum);
+ return;
}
- ctrl->lport->ops->fcp_abort(&ctrl->lport->localport,
- &ctrl->rport->remoteport,
- op->queue->lldd_handle,
- &op->fcp_req);
-
- return 0;
+ if (!queue_work(nvme_fc_wq, &ctrl->reset_work))
+ dev_err(ctrl->ctrl.device,
+ "NVME-FC{%d}: error_recovery: Failed to schedule "
+ "reset work\n", ctrl->cnum);
}
-enum blk_eh_timer_return
+static enum blk_eh_timer_return
nvme_fc_timeout(struct request *rq, bool reserved)
{
struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(rq);
@@ -1640,11 +1785,13 @@ nvme_fc_timeout(struct request *rq, bool reserved)
return BLK_EH_HANDLED;
/*
- * TODO: force a controller reset
- * when that happens, queues will be torn down and outstanding
- * ios will be terminated, and the above abort, on a single io
- * will no longer be needed.
+ * we can't individually ABTS an io without affecting the queue,
+ * thus killing the queue, adn thus the association.
+ * So resolve by performing a controller reset, which will stop
+ * the host/io stack, terminate the association on the link,
+ * and recreate an association on the link.
*/
+ nvme_fc_error_recovery(ctrl, "io timeout error");
return BLK_EH_HANDLED;
}
@@ -1738,6 +1885,13 @@ nvme_fc_start_fcp_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue,
u32 csn;
int ret;
+ /*
+ * before attempting to send the io, check to see if we believe
+ * the target device is present
+ */
+ if (ctrl->rport->remoteport.port_state != FC_OBJSTATE_ONLINE)
+ return BLK_MQ_RQ_QUEUE_ERROR;
+
if (!nvme_fc_ctrl_get(ctrl))
return BLK_MQ_RQ_QUEUE_ERROR;
@@ -1761,7 +1915,7 @@ nvme_fc_start_fcp_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue,
op->fcp_req.io_dir = io_dir;
op->fcp_req.transferred_length = 0;
op->fcp_req.rcv_rsplen = 0;
- op->fcp_req.status = 0;
+ op->fcp_req.status = NVME_SC_SUCCESS;
op->fcp_req.sqid = cpu_to_le16(queue->qnum);
/*
@@ -1782,14 +1936,9 @@ nvme_fc_start_fcp_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue,
sqe->rw.dptr.sgl.length = cpu_to_le32(data_len);
sqe->rw.dptr.sgl.addr = 0;
- /* odd that we set the command_id - should come from nvme-fabrics */
- WARN_ON_ONCE(sqe->common.command_id != cpu_to_le16(op->rqno));
-
- if (op->rq) { /* skipped on aens */
+ if (!(op->flags & FCOP_FLAGS_AEN)) {
ret = nvme_fc_map_data(ctrl, op->rq, op);
if (ret < 0) {
- dev_err(queue->ctrl->ctrl.device,
- "Failed to map data (%d)\n", ret);
nvme_cleanup_cmd(op->rq);
nvme_fc_ctrl_put(ctrl);
return (ret == -ENOMEM || ret == -EAGAIN) ?
@@ -1802,7 +1951,7 @@ nvme_fc_start_fcp_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue,
atomic_set(&op->state, FCPOP_STATE_ACTIVE);
- if (op->rq)
+ if (!(op->flags & FCOP_FLAGS_AEN))
blk_mq_start_request(op->rq);
ret = ctrl->lport->ops->fcp_io(&ctrl->lport->localport,
@@ -1810,9 +1959,6 @@ nvme_fc_start_fcp_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue,
queue->lldd_handle, &op->fcp_req);
if (ret) {
- dev_err(ctrl->dev,
- "Send nvme command failed - lldd returned %d.\n", ret);
-
if (op->rq) { /* normal request */
nvme_fc_unmap_data(ctrl, op->rq, op);
nvme_cleanup_cmd(op->rq);
@@ -1882,12 +2028,8 @@ nvme_fc_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
struct nvme_fc_fcp_op *op;
req = blk_mq_tag_to_rq(nvme_fc_tagset(queue), tag);
- if (!req) {
- dev_err(queue->ctrl->ctrl.device,
- "tag 0x%x on QNum %#x not found\n",
- tag, queue->qnum);
+ if (!req)
return 0;
- }
op = blk_mq_rq_to_pdu(req);
@@ -1904,11 +2046,21 @@ nvme_fc_submit_async_event(struct nvme_ctrl *arg, int aer_idx)
{
struct nvme_fc_ctrl *ctrl = to_fc_ctrl(arg);
struct nvme_fc_fcp_op *aen_op;
+ unsigned long flags;
+ bool terminating = false;
int ret;
if (aer_idx > NVME_FC_NR_AEN_COMMANDS)
return;
+ spin_lock_irqsave(&ctrl->lock, flags);
+ if (ctrl->flags & FCCTRL_TERMIO)
+ terminating = true;
+ spin_unlock_irqrestore(&ctrl->lock, flags);
+
+ if (terminating)
+ return;
+
aen_op = &ctrl->aen_ops[aer_idx];
ret = nvme_fc_start_fcp_op(ctrl, aen_op->queue, aen_op, 0,
@@ -1919,36 +2071,101 @@ nvme_fc_submit_async_event(struct nvme_ctrl *arg, int aer_idx)
}
static void
-nvme_fc_complete_rq(struct request *rq)
+__nvme_fc_final_op_cleanup(struct request *rq)
{
struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(rq);
struct nvme_fc_ctrl *ctrl = op->ctrl;
- int error = 0, state;
- state = atomic_xchg(&op->state, FCPOP_STATE_IDLE);
+ atomic_set(&op->state, FCPOP_STATE_IDLE);
+ op->flags &= ~(FCOP_FLAGS_TERMIO | FCOP_FLAGS_RELEASED |
+ FCOP_FLAGS_COMPLETE);
nvme_cleanup_cmd(rq);
-
nvme_fc_unmap_data(ctrl, rq, op);
+ nvme_complete_rq(rq);
+ nvme_fc_ctrl_put(ctrl);
- if (unlikely(rq->errors)) {
- if (nvme_req_needs_retry(rq, rq->errors)) {
- nvme_requeue_req(rq);
- return;
- }
+}
- if (blk_rq_is_passthrough(rq))
- error = rq->errors;
- else
- error = nvme_error_status(rq->errors);
+static void
+nvme_fc_complete_rq(struct request *rq)
+{
+ struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(rq);
+ struct nvme_fc_ctrl *ctrl = op->ctrl;
+ unsigned long flags;
+ bool completed = false;
+
+ /*
+ * the core layer, on controller resets after calling
+ * nvme_shutdown_ctrl(), calls complete_rq without our
+ * calling blk_mq_complete_request(), thus there may still
+ * be live i/o outstanding with the LLDD. Means transport has
+ * to track complete calls vs fcpio_done calls to know what
+ * path to take on completes and dones.
+ */
+ spin_lock_irqsave(&ctrl->lock, flags);
+ if (op->flags & FCOP_FLAGS_COMPLETE)
+ completed = true;
+ else
+ op->flags |= FCOP_FLAGS_RELEASED;
+ spin_unlock_irqrestore(&ctrl->lock, flags);
+
+ if (completed)
+ __nvme_fc_final_op_cleanup(rq);
+}
+
+/*
+ * This routine is used by the transport when it needs to find active
+ * io on a queue that is to be terminated. The transport uses
+ * blk_mq_tagset_busy_itr() to find the busy requests, which then invoke
+ * this routine to kill them on a 1 by 1 basis.
+ *
+ * As FC allocates FC exchange for each io, the transport must contact
+ * the LLDD to terminate the exchange, thus releasing the FC exchange.
+ * After terminating the exchange the LLDD will call the transport's
+ * normal io done path for the request, but it will have an aborted
+ * status. The done path will return the io request back to the block
+ * layer with an error status.
+ */
+static void
+nvme_fc_terminate_exchange(struct request *req, void *data, bool reserved)
+{
+ struct nvme_ctrl *nctrl = data;
+ struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl);
+ struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(req);
+ unsigned long flags;
+ int status;
+
+ if (!blk_mq_request_started(req))
+ return;
+
+ spin_lock_irqsave(&ctrl->lock, flags);
+ if (ctrl->flags & FCCTRL_TERMIO) {
+ ctrl->iocnt++;
+ op->flags |= FCOP_FLAGS_TERMIO;
}
+ spin_unlock_irqrestore(&ctrl->lock, flags);
- nvme_fc_ctrl_put(ctrl);
+ status = __nvme_fc_abort_op(ctrl, op);
+ if (status) {
+ /*
+ * if __nvme_fc_abort_op failed the io wasn't
+ * active. Thus this call path is running in
+ * parallel to the io complete. Treat as non-error.
+ */
- blk_mq_end_request(rq, error);
+ /* back out the flags/counters */
+ spin_lock_irqsave(&ctrl->lock, flags);
+ if (ctrl->flags & FCCTRL_TERMIO)
+ ctrl->iocnt--;
+ op->flags &= ~FCOP_FLAGS_TERMIO;
+ spin_unlock_irqrestore(&ctrl->lock, flags);
+ return;
+ }
}
-static struct blk_mq_ops nvme_fc_mq_ops = {
+
+static const struct blk_mq_ops nvme_fc_mq_ops = {
.queue_rq = nvme_fc_queue_rq,
.complete = nvme_fc_complete_rq,
.init_request = nvme_fc_init_request,
@@ -1959,145 +2176,275 @@ static struct blk_mq_ops nvme_fc_mq_ops = {
.timeout = nvme_fc_timeout,
};
-static struct blk_mq_ops nvme_fc_admin_mq_ops = {
- .queue_rq = nvme_fc_queue_rq,
- .complete = nvme_fc_complete_rq,
- .init_request = nvme_fc_init_admin_request,
- .exit_request = nvme_fc_exit_request,
- .reinit_request = nvme_fc_reinit_request,
- .init_hctx = nvme_fc_init_admin_hctx,
- .timeout = nvme_fc_timeout,
-};
-
static int
-nvme_fc_configure_admin_queue(struct nvme_fc_ctrl *ctrl)
+nvme_fc_create_io_queues(struct nvme_fc_ctrl *ctrl)
{
- u32 segs;
- int error;
+ struct nvmf_ctrl_options *opts = ctrl->ctrl.opts;
+ int ret;
- nvme_fc_init_queue(ctrl, 0, NVME_FC_AQ_BLKMQ_DEPTH);
+ ret = nvme_set_queue_count(&ctrl->ctrl, &opts->nr_io_queues);
+ if (ret) {
+ dev_info(ctrl->ctrl.device,
+ "set_queue_count failed: %d\n", ret);
+ return ret;
+ }
- error = nvme_fc_connect_admin_queue(ctrl, &ctrl->queues[0],
- NVME_FC_AQ_BLKMQ_DEPTH,
- (NVME_FC_AQ_BLKMQ_DEPTH / 4));
- if (error)
- return error;
+ ctrl->queue_count = opts->nr_io_queues + 1;
+ if (!opts->nr_io_queues)
+ return 0;
- memset(&ctrl->admin_tag_set, 0, sizeof(ctrl->admin_tag_set));
- ctrl->admin_tag_set.ops = &nvme_fc_admin_mq_ops;
- ctrl->admin_tag_set.queue_depth = NVME_FC_AQ_BLKMQ_DEPTH;
- ctrl->admin_tag_set.reserved_tags = 2; /* fabric connect + Keep-Alive */
- ctrl->admin_tag_set.numa_node = NUMA_NO_NODE;
- ctrl->admin_tag_set.cmd_size = sizeof(struct nvme_fc_fcp_op) +
+ dev_info(ctrl->ctrl.device, "creating %d I/O queues.\n",
+ opts->nr_io_queues);
+
+ nvme_fc_init_io_queues(ctrl);
+
+ memset(&ctrl->tag_set, 0, sizeof(ctrl->tag_set));
+ ctrl->tag_set.ops = &nvme_fc_mq_ops;
+ ctrl->tag_set.queue_depth = ctrl->ctrl.opts->queue_size;
+ ctrl->tag_set.reserved_tags = 1; /* fabric connect */
+ ctrl->tag_set.numa_node = NUMA_NO_NODE;
+ ctrl->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
+ ctrl->tag_set.cmd_size = sizeof(struct nvme_fc_fcp_op) +
(SG_CHUNK_SIZE *
sizeof(struct scatterlist)) +
ctrl->lport->ops->fcprqst_priv_sz;
- ctrl->admin_tag_set.driver_data = ctrl;
- ctrl->admin_tag_set.nr_hw_queues = 1;
- ctrl->admin_tag_set.timeout = ADMIN_TIMEOUT;
+ ctrl->tag_set.driver_data = ctrl;
+ ctrl->tag_set.nr_hw_queues = ctrl->queue_count - 1;
+ ctrl->tag_set.timeout = NVME_IO_TIMEOUT;
- error = blk_mq_alloc_tag_set(&ctrl->admin_tag_set);
- if (error)
- goto out_free_queue;
+ ret = blk_mq_alloc_tag_set(&ctrl->tag_set);
+ if (ret)
+ return ret;
- ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set);
- if (IS_ERR(ctrl->ctrl.admin_q)) {
- error = PTR_ERR(ctrl->ctrl.admin_q);
- goto out_free_tagset;
+ ctrl->ctrl.tagset = &ctrl->tag_set;
+
+ ctrl->ctrl.connect_q = blk_mq_init_queue(&ctrl->tag_set);
+ if (IS_ERR(ctrl->ctrl.connect_q)) {
+ ret = PTR_ERR(ctrl->ctrl.connect_q);
+ goto out_free_tag_set;
+ }
+
+ ret = nvme_fc_create_hw_io_queues(ctrl, ctrl->ctrl.opts->queue_size);
+ if (ret)
+ goto out_cleanup_blk_queue;
+
+ ret = nvme_fc_connect_io_queues(ctrl, ctrl->ctrl.opts->queue_size);
+ if (ret)
+ goto out_delete_hw_queues;
+
+ return 0;
+
+out_delete_hw_queues:
+ nvme_fc_delete_hw_io_queues(ctrl);
+out_cleanup_blk_queue:
+ nvme_stop_keep_alive(&ctrl->ctrl);
+ blk_cleanup_queue(ctrl->ctrl.connect_q);
+out_free_tag_set:
+ blk_mq_free_tag_set(&ctrl->tag_set);
+ nvme_fc_free_io_queues(ctrl);
+
+ /* force put free routine to ignore io queues */
+ ctrl->ctrl.tagset = NULL;
+
+ return ret;
+}
+
+static int
+nvme_fc_reinit_io_queues(struct nvme_fc_ctrl *ctrl)
+{
+ struct nvmf_ctrl_options *opts = ctrl->ctrl.opts;
+ int ret;
+
+ ret = nvme_set_queue_count(&ctrl->ctrl, &opts->nr_io_queues);
+ if (ret) {
+ dev_info(ctrl->ctrl.device,
+ "set_queue_count failed: %d\n", ret);
+ return ret;
}
- error = __nvme_fc_create_hw_queue(ctrl, &ctrl->queues[0], 0,
+ /* check for io queues existing */
+ if (ctrl->queue_count == 1)
+ return 0;
+
+ dev_info(ctrl->ctrl.device, "Recreating %d I/O queues.\n",
+ opts->nr_io_queues);
+
+ nvme_fc_init_io_queues(ctrl);
+
+ ret = blk_mq_reinit_tagset(&ctrl->tag_set);
+ if (ret)
+ goto out_free_io_queues;
+
+ ret = nvme_fc_create_hw_io_queues(ctrl, ctrl->ctrl.opts->queue_size);
+ if (ret)
+ goto out_free_io_queues;
+
+ ret = nvme_fc_connect_io_queues(ctrl, ctrl->ctrl.opts->queue_size);
+ if (ret)
+ goto out_delete_hw_queues;
+
+ return 0;
+
+out_delete_hw_queues:
+ nvme_fc_delete_hw_io_queues(ctrl);
+out_free_io_queues:
+ nvme_fc_free_io_queues(ctrl);
+ return ret;
+}
+
+/*
+ * This routine restarts the controller on the host side, and
+ * on the link side, recreates the controller association.
+ */
+static int
+nvme_fc_create_association(struct nvme_fc_ctrl *ctrl)
+{
+ struct nvmf_ctrl_options *opts = ctrl->ctrl.opts;
+ u32 segs;
+ int ret;
+ bool changed;
+
+ ctrl->connect_attempts++;
+
+ /*
+ * Create the admin queue
+ */
+
+ nvme_fc_init_queue(ctrl, 0, NVME_FC_AQ_BLKMQ_DEPTH);
+
+ ret = __nvme_fc_create_hw_queue(ctrl, &ctrl->queues[0], 0,
NVME_FC_AQ_BLKMQ_DEPTH);
- if (error)
- goto out_cleanup_queue;
+ if (ret)
+ goto out_free_queue;
- error = nvmf_connect_admin_queue(&ctrl->ctrl);
- if (error)
+ ret = nvme_fc_connect_admin_queue(ctrl, &ctrl->queues[0],
+ NVME_FC_AQ_BLKMQ_DEPTH,
+ (NVME_FC_AQ_BLKMQ_DEPTH / 4));
+ if (ret)
goto out_delete_hw_queue;
- error = nvmf_reg_read64(&ctrl->ctrl, NVME_REG_CAP, &ctrl->cap);
- if (error) {
+ if (ctrl->ctrl.state != NVME_CTRL_NEW)
+ blk_mq_start_stopped_hw_queues(ctrl->ctrl.admin_q, true);
+
+ ret = nvmf_connect_admin_queue(&ctrl->ctrl);
+ if (ret)
+ goto out_disconnect_admin_queue;
+
+ /*
+ * Check controller capabilities
+ *
+ * todo:- add code to check if ctrl attributes changed from
+ * prior connection values
+ */
+
+ ret = nvmf_reg_read64(&ctrl->ctrl, NVME_REG_CAP, &ctrl->cap);
+ if (ret) {
dev_err(ctrl->ctrl.device,
"prop_get NVME_REG_CAP failed\n");
- goto out_delete_hw_queue;
+ goto out_disconnect_admin_queue;
}
ctrl->ctrl.sqsize =
- min_t(int, NVME_CAP_MQES(ctrl->cap), ctrl->ctrl.sqsize);
+ min_t(int, NVME_CAP_MQES(ctrl->cap) + 1, ctrl->ctrl.sqsize);
- error = nvme_enable_ctrl(&ctrl->ctrl, ctrl->cap);
- if (error)
- goto out_delete_hw_queue;
+ ret = nvme_enable_ctrl(&ctrl->ctrl, ctrl->cap);
+ if (ret)
+ goto out_disconnect_admin_queue;
segs = min_t(u32, NVME_FC_MAX_SEGMENTS,
ctrl->lport->ops->max_sgl_segments);
ctrl->ctrl.max_hw_sectors = (segs - 1) << (PAGE_SHIFT - 9);
- error = nvme_init_identify(&ctrl->ctrl);
- if (error)
- goto out_delete_hw_queue;
+ ret = nvme_init_identify(&ctrl->ctrl);
+ if (ret)
+ goto out_disconnect_admin_queue;
+
+ /* sanity checks */
+
+ /* FC-NVME does not have other data in the capsule */
+ if (ctrl->ctrl.icdoff) {
+ dev_err(ctrl->ctrl.device, "icdoff %d is not supported!\n",
+ ctrl->ctrl.icdoff);
+ goto out_disconnect_admin_queue;
+ }
nvme_start_keep_alive(&ctrl->ctrl);
- return 0;
+ /* FC-NVME supports normal SGL Data Block Descriptors */
+ if (opts->queue_size > ctrl->ctrl.maxcmd) {
+ /* warn if maxcmd is lower than queue_size */
+ dev_warn(ctrl->ctrl.device,
+ "queue_size %zu > ctrl maxcmd %u, reducing "
+ "to queue_size\n",
+ opts->queue_size, ctrl->ctrl.maxcmd);
+ opts->queue_size = ctrl->ctrl.maxcmd;
+ }
+
+ ret = nvme_fc_init_aen_ops(ctrl);
+ if (ret)
+ goto out_term_aen_ops;
+
+ /*
+ * Create the io queues
+ */
+
+ if (ctrl->queue_count > 1) {
+ if (ctrl->ctrl.state == NVME_CTRL_NEW)
+ ret = nvme_fc_create_io_queues(ctrl);
+ else
+ ret = nvme_fc_reinit_io_queues(ctrl);
+ if (ret)
+ goto out_term_aen_ops;
+ }
+
+ changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
+ WARN_ON_ONCE(!changed);
+
+ ctrl->connect_attempts = 0;
+
+ kref_get(&ctrl->ctrl.kref);
+
+ if (ctrl->queue_count > 1) {
+ nvme_start_queues(&ctrl->ctrl);
+ nvme_queue_scan(&ctrl->ctrl);
+ nvme_queue_async_events(&ctrl->ctrl);
+ }
+
+ return 0; /* Success */
+
+out_term_aen_ops:
+ nvme_fc_term_aen_ops(ctrl);
+ nvme_stop_keep_alive(&ctrl->ctrl);
+out_disconnect_admin_queue:
+ /* send a Disconnect(association) LS to fc-nvme target */
+ nvme_fc_xmt_disconnect_assoc(ctrl);
out_delete_hw_queue:
__nvme_fc_delete_hw_queue(ctrl, &ctrl->queues[0], 0);
-out_cleanup_queue:
- blk_cleanup_queue(ctrl->ctrl.admin_q);
-out_free_tagset:
- blk_mq_free_tag_set(&ctrl->admin_tag_set);
out_free_queue:
nvme_fc_free_queue(&ctrl->queues[0]);
- return error;
+
+ return ret;
}
/*
- * This routine is used by the transport when it needs to find active
- * io on a queue that is to be terminated. The transport uses
- * blk_mq_tagset_busy_itr() to find the busy requests, which then invoke
- * this routine to kill them on a 1 by 1 basis.
- *
- * As FC allocates FC exchange for each io, the transport must contact
- * the LLDD to terminate the exchange, thus releasing the FC exchange.
- * After terminating the exchange the LLDD will call the transport's
- * normal io done path for the request, but it will have an aborted
- * status. The done path will return the io request back to the block
- * layer with an error status.
+ * This routine stops operation of the controller on the host side.
+ * On the host os stack side: Admin and IO queues are stopped,
+ * outstanding ios on them terminated via FC ABTS.
+ * On the link side: the association is terminated.
*/
static void
-nvme_fc_terminate_exchange(struct request *req, void *data, bool reserved)
+nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl)
{
- struct nvme_ctrl *nctrl = data;
- struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl);
- struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(req);
-int status;
-
- if (!blk_mq_request_started(req))
- return;
+ unsigned long flags;
- /* this performs an ABTS-LS on the FC exchange for the io */
- status = __nvme_fc_abort_op(ctrl, op);
- /*
- * if __nvme_fc_abort_op failed: io wasn't active to abort
- * consider it done. Assume completion path already completing
- * in parallel
- */
- if (status)
- /* io wasn't active to abort consider it done */
- /* assume completion path already completing in parallel */
- return;
-}
+ nvme_stop_keep_alive(&ctrl->ctrl);
+ spin_lock_irqsave(&ctrl->lock, flags);
+ ctrl->flags |= FCCTRL_TERMIO;
+ ctrl->iocnt = 0;
+ spin_unlock_irqrestore(&ctrl->lock, flags);
-/*
- * This routine stops operation of the controller. Admin and IO queues
- * are stopped, outstanding ios on them terminated, and the nvme ctrl
- * is shutdown.
- */
-static void
-nvme_fc_shutdown_ctrl(struct nvme_fc_ctrl *ctrl)
-{
/*
* If io queues are present, stop them and terminate all outstanding
* ios on them. As FC allocates FC exchange for each io, the
@@ -2116,35 +2463,79 @@ nvme_fc_shutdown_ctrl(struct nvme_fc_ctrl *ctrl)
nvme_fc_terminate_exchange, &ctrl->ctrl);
}
- if (ctrl->ctrl.state == NVME_CTRL_LIVE)
- nvme_shutdown_ctrl(&ctrl->ctrl);
+ /*
+ * Other transports, which don't have link-level contexts bound
+ * to sqe's, would try to gracefully shutdown the controller by
+ * writing the registers for shutdown and polling (call
+ * nvme_shutdown_ctrl()). Given a bunch of i/o was potentially
+ * just aborted and we will wait on those contexts, and given
+ * there was no indication of how live the controlelr is on the
+ * link, don't send more io to create more contexts for the
+ * shutdown. Let the controller fail via keepalive failure if
+ * its still present.
+ */
/*
- * now clean up the admin queue. Same thing as above.
+ * clean up the admin queue. Same thing as above.
* use blk_mq_tagset_busy_itr() and the transport routine to
* terminate the exchanges.
*/
blk_mq_stop_hw_queues(ctrl->ctrl.admin_q);
blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
nvme_fc_terminate_exchange, &ctrl->ctrl);
+
+ /* kill the aens as they are a separate path */
+ nvme_fc_abort_aen_ops(ctrl);
+
+ /* wait for all io that had to be aborted */
+ spin_lock_irqsave(&ctrl->lock, flags);
+ while (ctrl->iocnt) {
+ spin_unlock_irqrestore(&ctrl->lock, flags);
+ msleep(1000);
+ spin_lock_irqsave(&ctrl->lock, flags);
+ }
+ ctrl->flags &= ~FCCTRL_TERMIO;
+ spin_unlock_irqrestore(&ctrl->lock, flags);
+
+ nvme_fc_term_aen_ops(ctrl);
+
+ /*
+ * send a Disconnect(association) LS to fc-nvme target
+ * Note: could have been sent at top of process, but
+ * cleaner on link traffic if after the aborts complete.
+ * Note: if association doesn't exist, association_id will be 0
+ */
+ if (ctrl->association_id)
+ nvme_fc_xmt_disconnect_assoc(ctrl);
+
+ if (ctrl->ctrl.tagset) {
+ nvme_fc_delete_hw_io_queues(ctrl);
+ nvme_fc_free_io_queues(ctrl);
+ }
+
+ __nvme_fc_delete_hw_queue(ctrl, &ctrl->queues[0], 0);
+ nvme_fc_free_queue(&ctrl->queues[0]);
}
-/*
- * Called to teardown an association.
- * May be called with association fully in place or partially in place.
- */
static void
-__nvme_fc_remove_ctrl(struct nvme_fc_ctrl *ctrl)
+nvme_fc_delete_ctrl_work(struct work_struct *work)
{
- nvme_stop_keep_alive(&ctrl->ctrl);
+ struct nvme_fc_ctrl *ctrl =
+ container_of(work, struct nvme_fc_ctrl, delete_work);
- /* stop and terminate ios on admin and io queues */
- nvme_fc_shutdown_ctrl(ctrl);
+ cancel_work_sync(&ctrl->reset_work);
+ cancel_delayed_work_sync(&ctrl->connect_work);
+
+ /*
+ * kill the association on the link side. this will block
+ * waiting for io to terminate
+ */
+ nvme_fc_delete_association(ctrl);
/*
* tear down the controller
* This will result in the last reference on the nvme ctrl to
- * expire, calling the transport nvme_fc_free_nvme_ctrl() callback.
+ * expire, calling the transport nvme_fc_nvme_ctrl_freed() callback.
* From there, the transport will tear down it's logical queues and
* association.
*/
@@ -2153,15 +2544,6 @@ __nvme_fc_remove_ctrl(struct nvme_fc_ctrl *ctrl)
nvme_put_ctrl(&ctrl->ctrl);
}
-static void
-nvme_fc_del_ctrl_work(struct work_struct *work)
-{
- struct nvme_fc_ctrl *ctrl =
- container_of(work, struct nvme_fc_ctrl, delete_work);
-
- __nvme_fc_remove_ctrl(ctrl);
-}
-
static int
__nvme_fc_del_ctrl(struct nvme_fc_ctrl *ctrl)
{
@@ -2181,25 +2563,85 @@ static int
nvme_fc_del_nvme_ctrl(struct nvme_ctrl *nctrl)
{
struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl);
- struct nvme_fc_rport *rport = ctrl->rport;
- unsigned long flags;
int ret;
- spin_lock_irqsave(&rport->lock, flags);
+ if (!kref_get_unless_zero(&ctrl->ctrl.kref))
+ return -EBUSY;
+
ret = __nvme_fc_del_ctrl(ctrl);
- spin_unlock_irqrestore(&rport->lock, flags);
- if (ret)
- return ret;
- flush_work(&ctrl->delete_work);
+ if (!ret)
+ flush_workqueue(nvme_fc_wq);
- return 0;
+ nvme_put_ctrl(&ctrl->ctrl);
+
+ return ret;
+}
+
+static void
+nvme_fc_reset_ctrl_work(struct work_struct *work)
+{
+ struct nvme_fc_ctrl *ctrl =
+ container_of(work, struct nvme_fc_ctrl, reset_work);
+ int ret;
+
+ /* will block will waiting for io to terminate */
+ nvme_fc_delete_association(ctrl);
+
+ ret = nvme_fc_create_association(ctrl);
+ if (ret) {
+ dev_warn(ctrl->ctrl.device,
+ "NVME-FC{%d}: reset: Reconnect attempt failed (%d)\n",
+ ctrl->cnum, ret);
+ if (ctrl->connect_attempts >= NVME_FC_MAX_CONNECT_ATTEMPTS) {
+ dev_warn(ctrl->ctrl.device,
+ "NVME-FC{%d}: Max reconnect attempts (%d) "
+ "reached. Removing controller\n",
+ ctrl->cnum, ctrl->connect_attempts);
+
+ if (!nvme_change_ctrl_state(&ctrl->ctrl,
+ NVME_CTRL_DELETING)) {
+ dev_err(ctrl->ctrl.device,
+ "NVME-FC{%d}: failed to change state "
+ "to DELETING\n", ctrl->cnum);
+ return;
+ }
+
+ WARN_ON(!queue_work(nvme_fc_wq, &ctrl->delete_work));
+ return;
+ }
+
+ dev_warn(ctrl->ctrl.device,
+ "NVME-FC{%d}: Reconnect attempt in %d seconds.\n",
+ ctrl->cnum, ctrl->reconnect_delay);
+ queue_delayed_work(nvme_fc_wq, &ctrl->connect_work,
+ ctrl->reconnect_delay * HZ);
+ } else
+ dev_info(ctrl->ctrl.device,
+ "NVME-FC{%d}: controller reset complete\n", ctrl->cnum);
}
+/*
+ * called by the nvme core layer, for sysfs interface that requests
+ * a reset of the nvme controller
+ */
static int
nvme_fc_reset_nvme_ctrl(struct nvme_ctrl *nctrl)
{
- return -EIO;
+ struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl);
+
+ dev_warn(ctrl->ctrl.device,
+ "NVME-FC{%d}: admin requested controller reset\n", ctrl->cnum);
+
+ if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING))
+ return -EBUSY;
+
+ if (!queue_work(nvme_fc_wq, &ctrl->reset_work))
+ return -EBUSY;
+
+ flush_work(&ctrl->reset_work);
+
+ return 0;
}
static const struct nvme_ctrl_ops nvme_fc_ctrl_ops = {
@@ -2210,95 +2652,75 @@ static const struct nvme_ctrl_ops nvme_fc_ctrl_ops = {
.reg_read64 = nvmf_reg_read64,
.reg_write32 = nvmf_reg_write32,
.reset_ctrl = nvme_fc_reset_nvme_ctrl,
- .free_ctrl = nvme_fc_free_nvme_ctrl,
+ .free_ctrl = nvme_fc_nvme_ctrl_freed,
.submit_async_event = nvme_fc_submit_async_event,
.delete_ctrl = nvme_fc_del_nvme_ctrl,
.get_subsysnqn = nvmf_get_subsysnqn,
.get_address = nvmf_get_address,
};
-static int
-nvme_fc_create_io_queues(struct nvme_fc_ctrl *ctrl)
+static void
+nvme_fc_connect_ctrl_work(struct work_struct *work)
{
- struct nvmf_ctrl_options *opts = ctrl->ctrl.opts;
int ret;
- ret = nvme_set_queue_count(&ctrl->ctrl, &opts->nr_io_queues);
- if (ret) {
- dev_info(ctrl->ctrl.device,
- "set_queue_count failed: %d\n", ret);
- return ret;
- }
-
- ctrl->queue_count = opts->nr_io_queues + 1;
- if (!opts->nr_io_queues)
- return 0;
-
- dev_info(ctrl->ctrl.device, "creating %d I/O queues.\n",
- opts->nr_io_queues);
-
- nvme_fc_init_io_queues(ctrl);
-
- memset(&ctrl->tag_set, 0, sizeof(ctrl->tag_set));
- ctrl->tag_set.ops = &nvme_fc_mq_ops;
- ctrl->tag_set.queue_depth = ctrl->ctrl.opts->queue_size;
- ctrl->tag_set.reserved_tags = 1; /* fabric connect */
- ctrl->tag_set.numa_node = NUMA_NO_NODE;
- ctrl->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
- ctrl->tag_set.cmd_size = sizeof(struct nvme_fc_fcp_op) +
- (SG_CHUNK_SIZE *
- sizeof(struct scatterlist)) +
- ctrl->lport->ops->fcprqst_priv_sz;
- ctrl->tag_set.driver_data = ctrl;
- ctrl->tag_set.nr_hw_queues = ctrl->queue_count - 1;
- ctrl->tag_set.timeout = NVME_IO_TIMEOUT;
-
- ret = blk_mq_alloc_tag_set(&ctrl->tag_set);
- if (ret)
- return ret;
-
- ctrl->ctrl.tagset = &ctrl->tag_set;
-
- ctrl->ctrl.connect_q = blk_mq_init_queue(&ctrl->tag_set);
- if (IS_ERR(ctrl->ctrl.connect_q)) {
- ret = PTR_ERR(ctrl->ctrl.connect_q);
- goto out_free_tag_set;
- }
-
- ret = nvme_fc_create_hw_io_queues(ctrl, ctrl->ctrl.opts->queue_size);
- if (ret)
- goto out_cleanup_blk_queue;
+ struct nvme_fc_ctrl *ctrl =
+ container_of(to_delayed_work(work),
+ struct nvme_fc_ctrl, connect_work);
- ret = nvme_fc_connect_io_queues(ctrl, ctrl->ctrl.opts->queue_size);
- if (ret)
- goto out_delete_hw_queues;
+ ret = nvme_fc_create_association(ctrl);
+ if (ret) {
+ dev_warn(ctrl->ctrl.device,
+ "NVME-FC{%d}: Reconnect attempt failed (%d)\n",
+ ctrl->cnum, ret);
+ if (ctrl->connect_attempts >= NVME_FC_MAX_CONNECT_ATTEMPTS) {
+ dev_warn(ctrl->ctrl.device,
+ "NVME-FC{%d}: Max reconnect attempts (%d) "
+ "reached. Removing controller\n",
+ ctrl->cnum, ctrl->connect_attempts);
+
+ if (!nvme_change_ctrl_state(&ctrl->ctrl,
+ NVME_CTRL_DELETING)) {
+ dev_err(ctrl->ctrl.device,
+ "NVME-FC{%d}: failed to change state "
+ "to DELETING\n", ctrl->cnum);
+ return;
+ }
- return 0;
+ WARN_ON(!queue_work(nvme_fc_wq, &ctrl->delete_work));
+ return;
+ }
-out_delete_hw_queues:
- nvme_fc_delete_hw_io_queues(ctrl);
-out_cleanup_blk_queue:
- nvme_stop_keep_alive(&ctrl->ctrl);
- blk_cleanup_queue(ctrl->ctrl.connect_q);
-out_free_tag_set:
- blk_mq_free_tag_set(&ctrl->tag_set);
- nvme_fc_free_io_queues(ctrl);
+ dev_warn(ctrl->ctrl.device,
+ "NVME-FC{%d}: Reconnect attempt in %d seconds.\n",
+ ctrl->cnum, ctrl->reconnect_delay);
+ queue_delayed_work(nvme_fc_wq, &ctrl->connect_work,
+ ctrl->reconnect_delay * HZ);
+ } else
+ dev_info(ctrl->ctrl.device,
+ "NVME-FC{%d}: controller reconnect complete\n",
+ ctrl->cnum);
+}
- /* force put free routine to ignore io queues */
- ctrl->ctrl.tagset = NULL;
- return ret;
-}
+static const struct blk_mq_ops nvme_fc_admin_mq_ops = {
+ .queue_rq = nvme_fc_queue_rq,
+ .complete = nvme_fc_complete_rq,
+ .init_request = nvme_fc_init_admin_request,
+ .exit_request = nvme_fc_exit_request,
+ .reinit_request = nvme_fc_reinit_request,
+ .init_hctx = nvme_fc_init_admin_hctx,
+ .timeout = nvme_fc_timeout,
+};
static struct nvme_ctrl *
-__nvme_fc_create_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
+nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
struct nvme_fc_lport *lport, struct nvme_fc_rport *rport)
{
struct nvme_fc_ctrl *ctrl;
unsigned long flags;
int ret, idx;
- bool changed;
ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
if (!ctrl) {
@@ -2314,21 +2736,18 @@ __nvme_fc_create_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
ctrl->ctrl.opts = opts;
INIT_LIST_HEAD(&ctrl->ctrl_list);
- INIT_LIST_HEAD(&ctrl->ls_req_list);
ctrl->lport = lport;
ctrl->rport = rport;
ctrl->dev = lport->dev;
- ctrl->state = FCCTRL_INIT;
ctrl->cnum = idx;
- ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_fc_ctrl_ops, 0);
- if (ret)
- goto out_free_ida;
-
get_device(ctrl->dev);
kref_init(&ctrl->ref);
- INIT_WORK(&ctrl->delete_work, nvme_fc_del_ctrl_work);
+ INIT_WORK(&ctrl->delete_work, nvme_fc_delete_ctrl_work);
+ INIT_WORK(&ctrl->reset_work, nvme_fc_reset_ctrl_work);
+ INIT_DELAYED_WORK(&ctrl->connect_work, nvme_fc_connect_ctrl_work);
+ ctrl->reconnect_delay = opts->reconnect_delay;
spin_lock_init(&ctrl->lock);
/* io queue count */
@@ -2345,87 +2764,87 @@ __nvme_fc_create_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
ctrl->queues = kcalloc(ctrl->queue_count, sizeof(struct nvme_fc_queue),
GFP_KERNEL);
if (!ctrl->queues)
- goto out_uninit_ctrl;
-
- ret = nvme_fc_configure_admin_queue(ctrl);
- if (ret)
- goto out_uninit_ctrl;
-
- /* sanity checks */
-
- /* FC-NVME does not have other data in the capsule */
- if (ctrl->ctrl.icdoff) {
- dev_err(ctrl->ctrl.device, "icdoff %d is not supported!\n",
- ctrl->ctrl.icdoff);
- goto out_remove_admin_queue;
- }
-
- /* FC-NVME supports normal SGL Data Block Descriptors */
+ goto out_free_ida;
- if (opts->queue_size > ctrl->ctrl.maxcmd) {
- /* warn if maxcmd is lower than queue_size */
- dev_warn(ctrl->ctrl.device,
- "queue_size %zu > ctrl maxcmd %u, reducing "
- "to queue_size\n",
- opts->queue_size, ctrl->ctrl.maxcmd);
- opts->queue_size = ctrl->ctrl.maxcmd;
- }
+ memset(&ctrl->admin_tag_set, 0, sizeof(ctrl->admin_tag_set));
+ ctrl->admin_tag_set.ops = &nvme_fc_admin_mq_ops;
+ ctrl->admin_tag_set.queue_depth = NVME_FC_AQ_BLKMQ_DEPTH;
+ ctrl->admin_tag_set.reserved_tags = 2; /* fabric connect + Keep-Alive */
+ ctrl->admin_tag_set.numa_node = NUMA_NO_NODE;
+ ctrl->admin_tag_set.cmd_size = sizeof(struct nvme_fc_fcp_op) +
+ (SG_CHUNK_SIZE *
+ sizeof(struct scatterlist)) +
+ ctrl->lport->ops->fcprqst_priv_sz;
+ ctrl->admin_tag_set.driver_data = ctrl;
+ ctrl->admin_tag_set.nr_hw_queues = 1;
+ ctrl->admin_tag_set.timeout = ADMIN_TIMEOUT;
- ret = nvme_fc_init_aen_ops(ctrl);
+ ret = blk_mq_alloc_tag_set(&ctrl->admin_tag_set);
if (ret)
- goto out_exit_aen_ops;
+ goto out_free_queues;
- if (ctrl->queue_count > 1) {
- ret = nvme_fc_create_io_queues(ctrl);
- if (ret)
- goto out_exit_aen_ops;
+ ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set);
+ if (IS_ERR(ctrl->ctrl.admin_q)) {
+ ret = PTR_ERR(ctrl->ctrl.admin_q);
+ goto out_free_admin_tag_set;
}
- spin_lock_irqsave(&ctrl->lock, flags);
- ctrl->state = FCCTRL_ACTIVE;
- spin_unlock_irqrestore(&ctrl->lock, flags);
-
- changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
- WARN_ON_ONCE(!changed);
+ /*
+ * Would have been nice to init io queues tag set as well.
+ * However, we require interaction from the controller
+ * for max io queue count before we can do so.
+ * Defer this to the connect path.
+ */
- dev_info(ctrl->ctrl.device,
- "NVME-FC{%d}: new ctrl: NQN \"%s\"\n",
- ctrl->cnum, ctrl->ctrl.opts->subsysnqn);
+ ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_fc_ctrl_ops, 0);
+ if (ret)
+ goto out_cleanup_admin_q;
- kref_get(&ctrl->ctrl.kref);
+ /* at this point, teardown path changes to ref counting on nvme ctrl */
spin_lock_irqsave(&rport->lock, flags);
list_add_tail(&ctrl->ctrl_list, &rport->ctrl_list);
spin_unlock_irqrestore(&rport->lock, flags);
- if (opts->nr_io_queues) {
- nvme_queue_scan(&ctrl->ctrl);
- nvme_queue_async_events(&ctrl->ctrl);
+ ret = nvme_fc_create_association(ctrl);
+ if (ret) {
+ ctrl->ctrl.opts = NULL;
+ /* initiate nvme ctrl ref counting teardown */
+ nvme_uninit_ctrl(&ctrl->ctrl);
+ nvme_put_ctrl(&ctrl->ctrl);
+
+ /* as we're past the point where we transition to the ref
+ * counting teardown path, if we return a bad pointer here,
+ * the calling routine, thinking it's prior to the
+ * transition, will do an rport put. Since the teardown
+ * path also does a rport put, we do an extra get here to
+ * so proper order/teardown happens.
+ */
+ nvme_fc_rport_get(rport);
+
+ if (ret > 0)
+ ret = -EIO;
+ return ERR_PTR(ret);
}
- return &ctrl->ctrl;
+ dev_info(ctrl->ctrl.device,
+ "NVME-FC{%d}: new ctrl: NQN \"%s\"\n",
+ ctrl->cnum, ctrl->ctrl.opts->subsysnqn);
-out_exit_aen_ops:
- nvme_fc_exit_aen_ops(ctrl);
-out_remove_admin_queue:
- /* send a Disconnect(association) LS to fc-nvme target */
- nvme_fc_xmt_disconnect_assoc(ctrl);
- nvme_stop_keep_alive(&ctrl->ctrl);
- nvme_fc_destroy_admin_queue(ctrl);
-out_uninit_ctrl:
- nvme_uninit_ctrl(&ctrl->ctrl);
- nvme_put_ctrl(&ctrl->ctrl);
- if (ret > 0)
- ret = -EIO;
- /* exit via here will follow ctlr ref point callbacks to free */
- return ERR_PTR(ret);
+ return &ctrl->ctrl;
+out_cleanup_admin_q:
+ blk_cleanup_queue(ctrl->ctrl.admin_q);
+out_free_admin_tag_set:
+ blk_mq_free_tag_set(&ctrl->admin_tag_set);
+out_free_queues:
+ kfree(ctrl->queues);
out_free_ida:
+ put_device(ctrl->dev);
ida_simple_remove(&nvme_fc_ctrl_cnt, ctrl->cnum);
out_free_ctrl:
kfree(ctrl);
out_fail:
- nvme_fc_rport_put(rport);
/* exit via here doesn't follow ctlr ref points */
return ERR_PTR(ret);
}
@@ -2497,6 +2916,7 @@ nvme_fc_create_ctrl(struct device *dev, struct nvmf_ctrl_options *opts)
{
struct nvme_fc_lport *lport;
struct nvme_fc_rport *rport;
+ struct nvme_ctrl *ctrl;
struct nvmet_fc_traddr laddr = { 0L, 0L };
struct nvmet_fc_traddr raddr = { 0L, 0L };
unsigned long flags;
@@ -2528,7 +2948,10 @@ nvme_fc_create_ctrl(struct device *dev, struct nvmf_ctrl_options *opts)
spin_unlock_irqrestore(&nvme_fc_lock, flags);
- return __nvme_fc_create_ctrl(dev, opts, lport, rport);
+ ctrl = nvme_fc_init_ctrl(dev, opts, lport, rport);
+ if (IS_ERR(ctrl))
+ nvme_fc_rport_put(rport);
+ return ctrl;
}
}
spin_unlock_irqrestore(&nvme_fc_lock, flags);
@@ -2546,11 +2969,20 @@ static struct nvmf_transport_ops nvme_fc_transport = {
static int __init nvme_fc_init_module(void)
{
+ int ret;
+
nvme_fc_wq = create_workqueue("nvme_fc_wq");
if (!nvme_fc_wq)
return -ENOMEM;
- return nvmf_register_transport(&nvme_fc_transport);
+ ret = nvmf_register_transport(&nvme_fc_transport);
+ if (ret)
+ goto err;
+
+ return 0;
+err:
+ destroy_workqueue(nvme_fc_wq);
+ return ret;
}
static void __exit nvme_fc_exit_module(void)
diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c
index 21cac8523bd8..e4e4e60b1224 100644
--- a/drivers/nvme/host/lightnvm.c
+++ b/drivers/nvme/host/lightnvm.c
@@ -241,9 +241,9 @@ static inline void _nvme_nvm_check_size(void)
BUILD_BUG_ON(sizeof(struct nvme_nvm_l2ptbl) != 64);
BUILD_BUG_ON(sizeof(struct nvme_nvm_erase_blk) != 64);
BUILD_BUG_ON(sizeof(struct nvme_nvm_id_group) != 960);
- BUILD_BUG_ON(sizeof(struct nvme_nvm_addr_format) != 128);
+ BUILD_BUG_ON(sizeof(struct nvme_nvm_addr_format) != 16);
BUILD_BUG_ON(sizeof(struct nvme_nvm_id) != 4096);
- BUILD_BUG_ON(sizeof(struct nvme_nvm_bb_tbl) != 512);
+ BUILD_BUG_ON(sizeof(struct nvme_nvm_bb_tbl) != 64);
}
static int init_grps(struct nvm_id *nvm_id, struct nvme_nvm_id *nvme_nvm_id)
@@ -324,7 +324,7 @@ static int nvme_nvm_identity(struct nvm_dev *nvmdev, struct nvm_id *nvm_id)
nvm_id->cap = le32_to_cpu(nvme_nvm_id->cap);
nvm_id->dom = le32_to_cpu(nvme_nvm_id->dom);
memcpy(&nvm_id->ppaf, &nvme_nvm_id->ppaf,
- sizeof(struct nvme_nvm_addr_format));
+ sizeof(struct nvm_addr_format));
ret = init_grps(nvm_id, nvme_nvm_id);
out:
@@ -483,8 +483,8 @@ static void nvme_nvm_end_io(struct request *rq, int error)
{
struct nvm_rq *rqd = rq->end_io_data;
- rqd->ppa_status = nvme_req(rq)->result.u64;
- rqd->error = error;
+ rqd->ppa_status = le64_to_cpu(nvme_req(rq)->result.u64);
+ rqd->error = nvme_req(rq)->status;
nvm_end_io(rqd);
kfree(nvme_req(rq)->cmd);
@@ -510,12 +510,12 @@ static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd)
}
rq->cmd_flags &= ~REQ_FAILFAST_DRIVER;
- rq->ioprio = bio_prio(bio);
- if (bio_has_data(bio))
- rq->nr_phys_segments = bio_phys_segments(q, bio);
-
- rq->__data_len = bio->bi_iter.bi_size;
- rq->bio = rq->biotail = bio;
+ if (bio) {
+ blk_init_request_from_bio(rq, bio);
+ } else {
+ rq->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM);
+ rq->__data_len = 0;
+ }
nvme_nvm_rqtocmd(rq, rqd, ns, cmd);
@@ -526,21 +526,6 @@ static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd)
return 0;
}
-static int nvme_nvm_erase_block(struct nvm_dev *dev, struct nvm_rq *rqd)
-{
- struct request_queue *q = dev->q;
- struct nvme_ns *ns = q->queuedata;
- struct nvme_nvm_command c = {};
-
- c.erase.opcode = NVM_OP_ERASE;
- c.erase.nsid = cpu_to_le32(ns->ns_id);
- c.erase.spba = cpu_to_le64(rqd->ppa_addr.ppa);
- c.erase.length = cpu_to_le16(rqd->nr_ppas - 1);
- c.erase.control = cpu_to_le16(rqd->flags);
-
- return nvme_submit_sync_cmd(q, (struct nvme_command *)&c, NULL, 0);
-}
-
static void *nvme_nvm_create_dma_pool(struct nvm_dev *nvmdev, char *name)
{
struct nvme_ns *ns = nvmdev->q->queuedata;
@@ -576,7 +561,6 @@ static struct nvm_dev_ops nvme_nvm_dev_ops = {
.set_bb_tbl = nvme_nvm_set_bb_tbl,
.submit_io = nvme_nvm_submit_io,
- .erase_block = nvme_nvm_erase_block,
.create_dma_pool = nvme_nvm_create_dma_pool,
.destroy_dma_pool = nvme_nvm_destroy_dma_pool,
@@ -611,7 +595,7 @@ static int nvme_nvm_submit_user_cmd(struct request_queue *q,
__le64 *metadata = NULL;
dma_addr_t metadata_dma;
DECLARE_COMPLETION_ONSTACK(wait);
- int ret;
+ int ret = 0;
rq = nvme_alloc_request(q, (struct nvme_command *)vcmd, 0,
NVME_QID_ANY);
@@ -681,9 +665,12 @@ submit:
wait_for_completion_io(&wait);
- ret = nvme_error_status(rq->errors);
+ if (nvme_req(rq)->flags & NVME_REQ_CANCELLED)
+ ret = -EINTR;
+ else if (nvme_req(rq)->status & 0x7ff)
+ ret = -EIO;
if (result)
- *result = rq->errors & 0x7ff;
+ *result = nvme_req(rq)->status & 0x7ff;
if (status)
*status = le64_to_cpu(nvme_req(rq)->result.u64);
@@ -766,7 +753,7 @@ static int nvme_nvm_user_vcmd(struct nvme_ns *ns, int admin,
c.common.cdw2[1] = cpu_to_le32(vcmd.cdw3);
/* cdw11-12 */
c.ph_rw.length = cpu_to_le16(vcmd.nppas);
- c.ph_rw.control = cpu_to_le32(vcmd.control);
+ c.ph_rw.control = cpu_to_le16(vcmd.control);
c.common.cdw10[3] = cpu_to_le32(vcmd.cdw13);
c.common.cdw10[4] = cpu_to_le32(vcmd.cdw14);
c.common.cdw10[5] = cpu_to_le32(vcmd.cdw15);
@@ -809,6 +796,8 @@ int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node)
struct request_queue *q = ns->queue;
struct nvm_dev *dev;
+ _nvme_nvm_check_size();
+
dev = nvm_alloc_dev(node);
if (!dev)
return -ENOMEM;
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index ab2d6ec7eb5c..29c708ca9621 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -21,16 +21,6 @@
#include <linux/lightnvm.h>
#include <linux/sed-opal.h>
-enum {
- /*
- * Driver internal status code for commands that were cancelled due
- * to timeouts or controller shutdown. The value is negative so
- * that it a) doesn't overlap with the unsigned hardware error codes,
- * and b) can easily be tested for.
- */
- NVME_SC_CANCELLED = -EINTR,
-};
-
extern unsigned char nvme_io_timeout;
#define NVME_IO_TIMEOUT (nvme_io_timeout * HZ)
@@ -43,8 +33,6 @@ extern unsigned char shutdown_timeout;
#define NVME_DEFAULT_KATO 5
#define NVME_KATO_GRACE 10
-extern unsigned int nvme_max_retries;
-
enum {
NVME_NS_LBA = 0,
NVME_NS_LIGHTNVM = 1,
@@ -68,10 +56,10 @@ enum nvme_quirks {
NVME_QUIRK_IDENTIFY_CNS = (1 << 1),
/*
- * The controller deterministically returns O's on reads to discarded
- * logical blocks.
+ * The controller deterministically returns O's on reads to
+ * logical blocks that deallocate was called on.
*/
- NVME_QUIRK_DISCARD_ZEROES = (1 << 2),
+ NVME_QUIRK_DEALLOCATE_ZEROES = (1 << 2),
/*
* The controller needs a delay before starts checking the device
@@ -97,6 +85,13 @@ enum nvme_quirks {
struct nvme_request {
struct nvme_command *cmd;
union nvme_result result;
+ u8 retries;
+ u8 flags;
+ u16 status;
+};
+
+enum {
+ NVME_REQ_CANCELLED = (1 << 0),
};
static inline struct nvme_request *nvme_req(struct request *req)
@@ -254,25 +249,17 @@ static inline void nvme_cleanup_cmd(struct request *req)
}
}
-static inline int nvme_error_status(u16 status)
+static inline void nvme_end_request(struct request *req, __le16 status,
+ union nvme_result result)
{
- switch (status & 0x7ff) {
- case NVME_SC_SUCCESS:
- return 0;
- case NVME_SC_CAP_EXCEEDED:
- return -ENOSPC;
- default:
- return -EIO;
- }
-}
+ struct nvme_request *rq = nvme_req(req);
-static inline bool nvme_req_needs_retry(struct request *req, u16 status)
-{
- return !(status & NVME_SC_DNR || blk_noretry_request(req)) &&
- (jiffies - req->start_time) < req->timeout &&
- req->retries < nvme_max_retries;
+ rq->status = le16_to_cpu(status) >> 1;
+ rq->result = result;
+ blk_mq_complete_request(req);
}
+void nvme_complete_rq(struct request *req);
void nvme_cancel_request(struct request *req, void *data, bool reserved);
bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
enum nvme_ctrl_state new_state);
@@ -307,7 +294,6 @@ void nvme_start_freeze(struct nvme_ctrl *ctrl);
#define NVME_QID_ANY -1
struct request *nvme_alloc_request(struct request_queue *q,
struct nvme_command *cmd, unsigned int flags, int qid);
-void nvme_requeue_req(struct request *req);
int nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
struct nvme_command *cmd);
int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 5d309535abbd..c8541c3dcd19 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -104,8 +104,22 @@ struct nvme_dev {
u32 cmbloc;
struct nvme_ctrl ctrl;
struct completion ioq_wait;
+ u32 *dbbuf_dbs;
+ dma_addr_t dbbuf_dbs_dma_addr;
+ u32 *dbbuf_eis;
+ dma_addr_t dbbuf_eis_dma_addr;
};
+static inline unsigned int sq_idx(unsigned int qid, u32 stride)
+{
+ return qid * 2 * stride;
+}
+
+static inline unsigned int cq_idx(unsigned int qid, u32 stride)
+{
+ return (qid * 2 + 1) * stride;
+}
+
static inline struct nvme_dev *to_nvme_dev(struct nvme_ctrl *ctrl)
{
return container_of(ctrl, struct nvme_dev, ctrl);
@@ -134,6 +148,10 @@ struct nvme_queue {
u16 qid;
u8 cq_phase;
u8 cqe_seen;
+ u32 *dbbuf_sq_db;
+ u32 *dbbuf_cq_db;
+ u32 *dbbuf_sq_ei;
+ u32 *dbbuf_cq_ei;
};
/*
@@ -172,6 +190,112 @@ static inline void _nvme_check_size(void)
BUILD_BUG_ON(sizeof(struct nvme_id_ns) != 4096);
BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
+ BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64);
+}
+
+static inline unsigned int nvme_dbbuf_size(u32 stride)
+{
+ return ((num_possible_cpus() + 1) * 8 * stride);
+}
+
+static int nvme_dbbuf_dma_alloc(struct nvme_dev *dev)
+{
+ unsigned int mem_size = nvme_dbbuf_size(dev->db_stride);
+
+ if (dev->dbbuf_dbs)
+ return 0;
+
+ dev->dbbuf_dbs = dma_alloc_coherent(dev->dev, mem_size,
+ &dev->dbbuf_dbs_dma_addr,
+ GFP_KERNEL);
+ if (!dev->dbbuf_dbs)
+ return -ENOMEM;
+ dev->dbbuf_eis = dma_alloc_coherent(dev->dev, mem_size,
+ &dev->dbbuf_eis_dma_addr,
+ GFP_KERNEL);
+ if (!dev->dbbuf_eis) {
+ dma_free_coherent(dev->dev, mem_size,
+ dev->dbbuf_dbs, dev->dbbuf_dbs_dma_addr);
+ dev->dbbuf_dbs = NULL;
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static void nvme_dbbuf_dma_free(struct nvme_dev *dev)
+{
+ unsigned int mem_size = nvme_dbbuf_size(dev->db_stride);
+
+ if (dev->dbbuf_dbs) {
+ dma_free_coherent(dev->dev, mem_size,
+ dev->dbbuf_dbs, dev->dbbuf_dbs_dma_addr);
+ dev->dbbuf_dbs = NULL;
+ }
+ if (dev->dbbuf_eis) {
+ dma_free_coherent(dev->dev, mem_size,
+ dev->dbbuf_eis, dev->dbbuf_eis_dma_addr);
+ dev->dbbuf_eis = NULL;
+ }
+}
+
+static void nvme_dbbuf_init(struct nvme_dev *dev,
+ struct nvme_queue *nvmeq, int qid)
+{
+ if (!dev->dbbuf_dbs || !qid)
+ return;
+
+ nvmeq->dbbuf_sq_db = &dev->dbbuf_dbs[sq_idx(qid, dev->db_stride)];
+ nvmeq->dbbuf_cq_db = &dev->dbbuf_dbs[cq_idx(qid, dev->db_stride)];
+ nvmeq->dbbuf_sq_ei = &dev->dbbuf_eis[sq_idx(qid, dev->db_stride)];
+ nvmeq->dbbuf_cq_ei = &dev->dbbuf_eis[cq_idx(qid, dev->db_stride)];
+}
+
+static void nvme_dbbuf_set(struct nvme_dev *dev)
+{
+ struct nvme_command c;
+
+ if (!dev->dbbuf_dbs)
+ return;
+
+ memset(&c, 0, sizeof(c));
+ c.dbbuf.opcode = nvme_admin_dbbuf;
+ c.dbbuf.prp1 = cpu_to_le64(dev->dbbuf_dbs_dma_addr);
+ c.dbbuf.prp2 = cpu_to_le64(dev->dbbuf_eis_dma_addr);
+
+ if (nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0)) {
+ dev_warn(dev->dev, "unable to set dbbuf\n");
+ /* Free memory and continue on */
+ nvme_dbbuf_dma_free(dev);
+ }
+}
+
+static inline int nvme_dbbuf_need_event(u16 event_idx, u16 new_idx, u16 old)
+{
+ return (u16)(new_idx - event_idx - 1) < (u16)(new_idx - old);
+}
+
+/* Update dbbuf and return true if an MMIO is required */
+static bool nvme_dbbuf_update_and_check_event(u16 value, u32 *dbbuf_db,
+ volatile u32 *dbbuf_ei)
+{
+ if (dbbuf_db) {
+ u16 old_value;
+
+ /*
+ * Ensure that the queue is written before updating
+ * the doorbell in memory
+ */
+ wmb();
+
+ old_value = *dbbuf_db;
+ *dbbuf_db = value;
+
+ if (!nvme_dbbuf_need_event(*dbbuf_ei, value, old_value))
+ return false;
+ }
+
+ return true;
}
/*
@@ -298,7 +422,9 @@ static void __nvme_submit_cmd(struct nvme_queue *nvmeq,
if (++tail == nvmeq->q_depth)
tail = 0;
- writel(tail, nvmeq->q_db);
+ if (nvme_dbbuf_update_and_check_event(tail, nvmeq->dbbuf_sq_db,
+ nvmeq->dbbuf_sq_ei))
+ writel(tail, nvmeq->q_db);
nvmeq->sq_tail = tail;
}
@@ -327,10 +453,6 @@ static int nvme_init_iod(struct request *rq, struct nvme_dev *dev)
iod->nents = 0;
iod->length = size;
- if (!(rq->rq_flags & RQF_DONTPREP)) {
- rq->retries = 0;
- rq->rq_flags |= RQF_DONTPREP;
- }
return BLK_MQ_RQ_QUEUE_OK;
}
@@ -629,34 +751,12 @@ out_free_cmd:
return ret;
}
-static void nvme_complete_rq(struct request *req)
+static void nvme_pci_complete_rq(struct request *req)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
- struct nvme_dev *dev = iod->nvmeq->dev;
- int error = 0;
-
- nvme_unmap_data(dev, req);
-
- if (unlikely(req->errors)) {
- if (nvme_req_needs_retry(req, req->errors)) {
- req->retries++;
- nvme_requeue_req(req);
- return;
- }
-
- if (blk_rq_is_passthrough(req))
- error = req->errors;
- else
- error = nvme_error_status(req->errors);
- }
-
- if (unlikely(iod->aborted)) {
- dev_warn(dev->ctrl.device,
- "completing aborted command with status: %04x\n",
- req->errors);
- }
- blk_mq_end_request(req, error);
+ nvme_unmap_data(iod->nvmeq->dev, req);
+ nvme_complete_rq(req);
}
/* We read the CQE phase first to check if the rest of the entry is valid */
@@ -706,15 +806,16 @@ static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag)
}
req = blk_mq_tag_to_rq(*nvmeq->tags, cqe.command_id);
- nvme_req(req)->result = cqe.result;
- blk_mq_complete_request(req, le16_to_cpu(cqe.status) >> 1);
+ nvme_end_request(req, cqe.status, cqe.result);
}
if (head == nvmeq->cq_head && phase == nvmeq->cq_phase)
return;
if (likely(nvmeq->cq_vector >= 0))
- writel(head, nvmeq->q_db + nvmeq->dev->db_stride);
+ if (nvme_dbbuf_update_and_check_event(head, nvmeq->dbbuf_cq_db,
+ nvmeq->dbbuf_cq_ei))
+ writel(head, nvmeq->q_db + nvmeq->dev->db_stride);
nvmeq->cq_head = head;
nvmeq->cq_phase = phase;
@@ -746,10 +847,8 @@ static irqreturn_t nvme_irq_check(int irq, void *data)
return IRQ_NONE;
}
-static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
+static int __nvme_poll(struct nvme_queue *nvmeq, unsigned int tag)
{
- struct nvme_queue *nvmeq = hctx->driver_data;
-
if (nvme_cqe_valid(nvmeq, nvmeq->cq_head, nvmeq->cq_phase)) {
spin_lock_irq(&nvmeq->q_lock);
__nvme_process_cq(nvmeq, &tag);
@@ -762,6 +861,13 @@ static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
return 0;
}
+static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
+{
+ struct nvme_queue *nvmeq = hctx->driver_data;
+
+ return __nvme_poll(nvmeq, tag);
+}
+
static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl, int aer_idx)
{
struct nvme_dev *dev = to_nvme_dev(ctrl);
@@ -813,7 +919,7 @@ static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
struct nvme_queue *nvmeq)
{
struct nvme_command c;
- int flags = NVME_QUEUE_PHYS_CONTIG | NVME_SQ_PRIO_MEDIUM;
+ int flags = NVME_QUEUE_PHYS_CONTIG;
/*
* Note: we (ab)use the fact the the prp fields survive if no data
@@ -844,9 +950,9 @@ static void abort_endio(struct request *req, int error)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
struct nvme_queue *nvmeq = iod->nvmeq;
- u16 status = req->errors;
- dev_warn(nvmeq->dev->ctrl.device, "Abort status: 0x%x", status);
+ dev_warn(nvmeq->dev->ctrl.device,
+ "Abort status: 0x%x", nvme_req(req)->status);
atomic_inc(&nvmeq->dev->ctrl.abort_limit);
blk_mq_free_request(req);
}
@@ -860,6 +966,16 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
struct nvme_command cmd;
/*
+ * Did we miss an interrupt?
+ */
+ if (__nvme_poll(nvmeq, req->tag)) {
+ dev_warn(dev->ctrl.device,
+ "I/O %d QID %d timeout, completion polled\n",
+ req->tag, nvmeq->qid);
+ return BLK_EH_HANDLED;
+ }
+
+ /*
* Shutdown immediately if controller times out while starting. The
* reset work will see the pci device disabled when it gets the forced
* cancellation error. All outstanding requests are completed on
@@ -870,7 +986,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
"I/O %d QID %d timeout, disable controller\n",
req->tag, nvmeq->qid);
nvme_dev_disable(dev, false);
- req->errors = NVME_SC_CANCELLED;
+ nvme_req(req)->flags |= NVME_REQ_CANCELLED;
return BLK_EH_HANDLED;
}
@@ -890,7 +1006,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
* Mark the request as handled, since the inline shutdown
* forces all outstanding requests to complete.
*/
- req->errors = NVME_SC_CANCELLED;
+ nvme_req(req)->flags |= NVME_REQ_CANCELLED;
return BLK_EH_HANDLED;
}
@@ -1098,6 +1214,7 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
nvmeq->cq_phase = 1;
nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth));
+ nvme_dbbuf_init(dev, nvmeq, qid);
dev->online_queues++;
spin_unlock_irq(&nvmeq->q_lock);
}
@@ -1130,18 +1247,18 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid)
return result;
}
-static struct blk_mq_ops nvme_mq_admin_ops = {
+static const struct blk_mq_ops nvme_mq_admin_ops = {
.queue_rq = nvme_queue_rq,
- .complete = nvme_complete_rq,
+ .complete = nvme_pci_complete_rq,
.init_hctx = nvme_admin_init_hctx,
.exit_hctx = nvme_admin_exit_hctx,
.init_request = nvme_admin_init_request,
.timeout = nvme_timeout,
};
-static struct blk_mq_ops nvme_mq_ops = {
+static const struct blk_mq_ops nvme_mq_ops = {
.queue_rq = nvme_queue_rq,
- .complete = nvme_complete_rq,
+ .complete = nvme_pci_complete_rq,
.init_hctx = nvme_init_hctx,
.init_request = nvme_init_request,
.map_queues = nvme_pci_map_queues,
@@ -1570,6 +1687,8 @@ static int nvme_dev_add(struct nvme_dev *dev)
if (blk_mq_alloc_tag_set(&dev->tagset))
return 0;
dev->ctrl.tagset = &dev->tagset;
+
+ nvme_dbbuf_set(dev);
} else {
blk_mq_update_nr_hw_queues(&dev->tagset, dev->online_queues - 1);
@@ -1756,6 +1875,7 @@ static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl)
{
struct nvme_dev *dev = to_nvme_dev(ctrl);
+ nvme_dbbuf_dma_free(dev);
put_device(dev->dev);
if (dev->tagset.tags)
blk_mq_free_tag_set(&dev->tagset);
@@ -1823,6 +1943,13 @@ static void nvme_reset_work(struct work_struct *work)
dev->ctrl.opal_dev = NULL;
}
+ if (dev->ctrl.oacs & NVME_CTRL_OACS_DBBUF_SUPP) {
+ result = nvme_dbbuf_dma_alloc(dev);
+ if (result)
+ dev_warn(dev->dev,
+ "unable to allocate dma for dbbuf\n");
+ }
+
result = nvme_setup_io_queues(dev);
if (result)
goto out;
@@ -2159,13 +2286,13 @@ static const struct pci_error_handlers nvme_err_handler = {
static const struct pci_device_id nvme_id_table[] = {
{ PCI_VDEVICE(INTEL, 0x0953),
.driver_data = NVME_QUIRK_STRIPE_SIZE |
- NVME_QUIRK_DISCARD_ZEROES, },
+ NVME_QUIRK_DEALLOCATE_ZEROES, },
{ PCI_VDEVICE(INTEL, 0x0a53),
.driver_data = NVME_QUIRK_STRIPE_SIZE |
- NVME_QUIRK_DISCARD_ZEROES, },
+ NVME_QUIRK_DEALLOCATE_ZEROES, },
{ PCI_VDEVICE(INTEL, 0x0a54),
.driver_data = NVME_QUIRK_STRIPE_SIZE |
- NVME_QUIRK_DISCARD_ZEROES, },
+ NVME_QUIRK_DEALLOCATE_ZEROES, },
{ PCI_VDEVICE(INTEL, 0x5845), /* Qemu emulated controller */
.driver_data = NVME_QUIRK_IDENTIFY_CNS, },
{ PCI_DEVICE(0x1c58, 0x0003), /* HGST adapter */
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 16f84eb0b95e..29cf88ac3f61 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -34,7 +34,7 @@
#include "fabrics.h"
-#define NVME_RDMA_CONNECT_TIMEOUT_MS 1000 /* 1 second */
+#define NVME_RDMA_CONNECT_TIMEOUT_MS 3000 /* 3 second */
#define NVME_RDMA_MAX_SEGMENT_SIZE 0xffffff /* 24-bit SGL field */
@@ -118,7 +118,6 @@ struct nvme_rdma_ctrl {
struct nvme_rdma_qe async_event_sqe;
- int reconnect_delay;
struct delayed_work reconnect_work;
struct list_head list;
@@ -129,14 +128,8 @@ struct nvme_rdma_ctrl {
u64 cap;
u32 max_fr_pages;
- union {
- struct sockaddr addr;
- struct sockaddr_in addr_in;
- };
- union {
- struct sockaddr src_addr;
- struct sockaddr_in src_addr_in;
- };
+ struct sockaddr_storage addr;
+ struct sockaddr_storage src_addr;
struct nvme_ctrl ctrl;
};
@@ -569,11 +562,12 @@ static int nvme_rdma_init_queue(struct nvme_rdma_ctrl *ctrl,
return PTR_ERR(queue->cm_id);
}
- queue->cm_error = -ETIMEDOUT;
if (ctrl->ctrl.opts->mask & NVMF_OPT_HOST_TRADDR)
- src_addr = &ctrl->src_addr;
+ src_addr = (struct sockaddr *)&ctrl->src_addr;
- ret = rdma_resolve_addr(queue->cm_id, src_addr, &ctrl->addr,
+ queue->cm_error = -ETIMEDOUT;
+ ret = rdma_resolve_addr(queue->cm_id, src_addr,
+ (struct sockaddr *)&ctrl->addr,
NVME_RDMA_CONNECT_TIMEOUT_MS);
if (ret) {
dev_info(ctrl->ctrl.device,
@@ -712,6 +706,26 @@ free_ctrl:
kfree(ctrl);
}
+static void nvme_rdma_reconnect_or_remove(struct nvme_rdma_ctrl *ctrl)
+{
+ /* If we are resetting/deleting then do nothing */
+ if (ctrl->ctrl.state != NVME_CTRL_RECONNECTING) {
+ WARN_ON_ONCE(ctrl->ctrl.state == NVME_CTRL_NEW ||
+ ctrl->ctrl.state == NVME_CTRL_LIVE);
+ return;
+ }
+
+ if (nvmf_should_reconnect(&ctrl->ctrl)) {
+ dev_info(ctrl->ctrl.device, "Reconnecting in %d seconds...\n",
+ ctrl->ctrl.opts->reconnect_delay);
+ queue_delayed_work(nvme_rdma_wq, &ctrl->reconnect_work,
+ ctrl->ctrl.opts->reconnect_delay * HZ);
+ } else {
+ dev_info(ctrl->ctrl.device, "Removing controller...\n");
+ queue_work(nvme_rdma_wq, &ctrl->delete_work);
+ }
+}
+
static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
{
struct nvme_rdma_ctrl *ctrl = container_of(to_delayed_work(work),
@@ -719,6 +733,8 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
bool changed;
int ret;
+ ++ctrl->ctrl.opts->nr_reconnects;
+
if (ctrl->queue_count > 1) {
nvme_rdma_free_io_queues(ctrl);
@@ -763,6 +779,7 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
WARN_ON_ONCE(!changed);
+ ctrl->ctrl.opts->nr_reconnects = 0;
if (ctrl->queue_count > 1) {
nvme_start_queues(&ctrl->ctrl);
@@ -777,13 +794,9 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
stop_admin_q:
blk_mq_stop_hw_queues(ctrl->ctrl.admin_q);
requeue:
- /* Make sure we are not resetting/deleting */
- if (ctrl->ctrl.state == NVME_CTRL_RECONNECTING) {
- dev_info(ctrl->ctrl.device,
- "Failed reconnect attempt, requeueing...\n");
- queue_delayed_work(nvme_rdma_wq, &ctrl->reconnect_work,
- ctrl->reconnect_delay * HZ);
- }
+ dev_info(ctrl->ctrl.device, "Failed reconnect attempt %d\n",
+ ctrl->ctrl.opts->nr_reconnects);
+ nvme_rdma_reconnect_or_remove(ctrl);
}
static void nvme_rdma_error_recovery_work(struct work_struct *work)
@@ -810,11 +823,7 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work)
blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
nvme_cancel_request, &ctrl->ctrl);
- dev_info(ctrl->ctrl.device, "reconnecting in %d seconds\n",
- ctrl->reconnect_delay);
-
- queue_delayed_work(nvme_rdma_wq, &ctrl->reconnect_work,
- ctrl->reconnect_delay * HZ);
+ nvme_rdma_reconnect_or_remove(ctrl);
}
static void nvme_rdma_error_recovery(struct nvme_rdma_ctrl *ctrl)
@@ -1169,8 +1178,7 @@ static int nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue,
wc->ex.invalidate_rkey == req->mr->rkey)
req->mr->need_inval = false;
- req->req.result = cqe->result;
- blk_mq_complete_request(rq, le16_to_cpu(cqe->status) >> 1);
+ nvme_end_request(rq, cqe->status, cqe->result);
return ret;
}
@@ -1407,7 +1415,7 @@ nvme_rdma_timeout(struct request *rq, bool reserved)
nvme_rdma_error_recovery(req->queue->ctrl);
/* fail with DNR on cmd timeout */
- rq->errors = NVME_SC_ABORT_REQ | NVME_SC_DNR;
+ nvme_req(rq)->status = NVME_SC_ABORT_REQ | NVME_SC_DNR;
return BLK_EH_HANDLED;
}
@@ -1509,27 +1517,12 @@ static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
static void nvme_rdma_complete_rq(struct request *rq)
{
struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
- struct nvme_rdma_queue *queue = req->queue;
- int error = 0;
-
- nvme_rdma_unmap_data(queue, rq);
- if (unlikely(rq->errors)) {
- if (nvme_req_needs_retry(rq, rq->errors)) {
- nvme_requeue_req(rq);
- return;
- }
-
- if (blk_rq_is_passthrough(rq))
- error = rq->errors;
- else
- error = nvme_error_status(rq->errors);
- }
-
- blk_mq_end_request(rq, error);
+ nvme_rdma_unmap_data(req->queue, rq);
+ nvme_complete_rq(rq);
}
-static struct blk_mq_ops nvme_rdma_mq_ops = {
+static const struct blk_mq_ops nvme_rdma_mq_ops = {
.queue_rq = nvme_rdma_queue_rq,
.complete = nvme_rdma_complete_rq,
.init_request = nvme_rdma_init_request,
@@ -1540,7 +1533,7 @@ static struct blk_mq_ops nvme_rdma_mq_ops = {
.timeout = nvme_rdma_timeout,
};
-static struct blk_mq_ops nvme_rdma_admin_mq_ops = {
+static const struct blk_mq_ops nvme_rdma_admin_mq_ops = {
.queue_rq = nvme_rdma_queue_rq,
.complete = nvme_rdma_complete_rq,
.init_request = nvme_rdma_init_admin_request,
@@ -1857,27 +1850,13 @@ out_free_io_queues:
return ret;
}
-static int nvme_rdma_parse_ipaddr(struct sockaddr_in *in_addr, char *p)
-{
- u8 *addr = (u8 *)&in_addr->sin_addr.s_addr;
- size_t buflen = strlen(p);
-
- /* XXX: handle IPv6 addresses */
-
- if (buflen > INET_ADDRSTRLEN)
- return -EINVAL;
- if (in4_pton(p, buflen, addr, '\0', NULL) == 0)
- return -EINVAL;
- in_addr->sin_family = AF_INET;
- return 0;
-}
-
static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
struct nvmf_ctrl_options *opts)
{
struct nvme_rdma_ctrl *ctrl;
int ret;
bool changed;
+ char *port;
ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
if (!ctrl)
@@ -1885,40 +1864,33 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
ctrl->ctrl.opts = opts;
INIT_LIST_HEAD(&ctrl->list);
- ret = nvme_rdma_parse_ipaddr(&ctrl->addr_in, opts->traddr);
+ if (opts->mask & NVMF_OPT_TRSVCID)
+ port = opts->trsvcid;
+ else
+ port = __stringify(NVME_RDMA_IP_PORT);
+
+ ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
+ opts->traddr, port, &ctrl->addr);
if (ret) {
- pr_err("malformed IP address passed: %s\n", opts->traddr);
+ pr_err("malformed address passed: %s:%s\n", opts->traddr, port);
goto out_free_ctrl;
}
if (opts->mask & NVMF_OPT_HOST_TRADDR) {
- ret = nvme_rdma_parse_ipaddr(&ctrl->src_addr_in,
- opts->host_traddr);
+ ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
+ opts->host_traddr, NULL, &ctrl->src_addr);
if (ret) {
- pr_err("malformed src IP address passed: %s\n",
+ pr_err("malformed src address passed: %s\n",
opts->host_traddr);
goto out_free_ctrl;
}
}
- if (opts->mask & NVMF_OPT_TRSVCID) {
- u16 port;
-
- ret = kstrtou16(opts->trsvcid, 0, &port);
- if (ret)
- goto out_free_ctrl;
-
- ctrl->addr_in.sin_port = cpu_to_be16(port);
- } else {
- ctrl->addr_in.sin_port = cpu_to_be16(NVME_RDMA_IP_PORT);
- }
-
ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_rdma_ctrl_ops,
0 /* no quirks, we're perfect! */);
if (ret)
goto out_free_ctrl;
- ctrl->reconnect_delay = opts->reconnect_delay;
INIT_DELAYED_WORK(&ctrl->reconnect_work,
nvme_rdma_reconnect_ctrl_work);
INIT_WORK(&ctrl->err_work, nvme_rdma_error_recovery_work);
@@ -1977,7 +1949,7 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
WARN_ON_ONCE(!changed);
- dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISp\n",
+ dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISpcs\n",
ctrl->ctrl.opts->subsysnqn, &ctrl->addr);
kref_get(&ctrl->ctrl.kref);
@@ -2013,7 +1985,7 @@ static struct nvmf_transport_ops nvme_rdma_transport = {
.name = "rdma",
.required_opts = NVMF_OPT_TRADDR,
.allowed_opts = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY |
- NVMF_OPT_HOST_TRADDR,
+ NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO,
.create_ctrl = nvme_rdma_create_ctrl,
};
@@ -2055,12 +2027,20 @@ static int __init nvme_rdma_init_module(void)
return -ENOMEM;
ret = ib_register_client(&nvme_rdma_ib_client);
- if (ret) {
- destroy_workqueue(nvme_rdma_wq);
- return ret;
- }
+ if (ret)
+ goto err_destroy_wq;
+
+ ret = nvmf_register_transport(&nvme_rdma_transport);
+ if (ret)
+ goto err_unreg_client;
+
+ return 0;
- return nvmf_register_transport(&nvme_rdma_transport);
+err_unreg_client:
+ ib_unregister_client(&nvme_rdma_ib_client);
+err_destroy_wq:
+ destroy_workqueue(nvme_rdma_wq);
+ return ret;
}
static void __exit nvme_rdma_cleanup_module(void)
diff --git a/drivers/nvme/host/scsi.c b/drivers/nvme/host/scsi.c
index f49ae2758bb7..1f7671e631dd 100644
--- a/drivers/nvme/host/scsi.c
+++ b/drivers/nvme/host/scsi.c
@@ -1609,7 +1609,7 @@ static int nvme_trans_do_nvme_io(struct nvme_ns *ns, struct sg_io_hdr *hdr,
struct nvme_command c;
u8 opcode = (is_write ? nvme_cmd_write : nvme_cmd_read);
u16 control;
- u32 max_blocks = queue_max_hw_sectors(ns->queue);
+ u32 max_blocks = queue_max_hw_sectors(ns->queue) >> (ns->lba_shift - 9);
num_cmds = nvme_trans_io_get_num_cmds(hdr, cdb_info, max_blocks);
@@ -2138,15 +2138,6 @@ static int nvme_trans_request_sense(struct nvme_ns *ns, struct sg_io_hdr *hdr,
return res;
}
-static int nvme_trans_security_protocol(struct nvme_ns *ns,
- struct sg_io_hdr *hdr,
- u8 *cmd)
-{
- return nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION,
- ILLEGAL_REQUEST, SCSI_ASC_ILLEGAL_COMMAND,
- SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
-}
-
static int nvme_trans_synchronize_cache(struct nvme_ns *ns,
struct sg_io_hdr *hdr)
{
@@ -2414,10 +2405,6 @@ static int nvme_scsi_translate(struct nvme_ns *ns, struct sg_io_hdr *hdr)
case REQUEST_SENSE:
retcode = nvme_trans_request_sense(ns, hdr, cmd);
break;
- case SECURITY_PROTOCOL_IN:
- case SECURITY_PROTOCOL_OUT:
- retcode = nvme_trans_security_protocol(ns, hdr, cmd);
- break;
case SYNCHRONIZE_CACHE:
retcode = nvme_trans_synchronize_cache(ns, hdr);
break;
diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index 76450b0c55f1..ff1f97006322 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -121,7 +121,7 @@ static void nvmet_execute_get_log_page(struct nvmet_req *req)
}
switch (req->cmd->get_log_page.lid) {
- case 0x01:
+ case NVME_LOG_ERROR:
/*
* We currently never set the More bit in the status field,
* so all error log entries are invalid and can be zeroed out.
@@ -129,7 +129,7 @@ static void nvmet_execute_get_log_page(struct nvmet_req *req)
* mandatory log page.
*/
break;
- case 0x02:
+ case NVME_LOG_SMART:
/*
* XXX: fill out actual smart log
*
@@ -149,7 +149,7 @@ static void nvmet_execute_get_log_page(struct nvmet_req *req)
goto err;
}
break;
- case 0x03:
+ case NVME_LOG_FW_SLOT:
/*
* We only support a single firmware slot which always is
* active, so we can zero out the whole firmware slot log and
@@ -480,31 +480,25 @@ static void nvmet_execute_keep_alive(struct nvmet_req *req)
nvmet_req_complete(req, 0);
}
-int nvmet_parse_admin_cmd(struct nvmet_req *req)
+u16 nvmet_parse_admin_cmd(struct nvmet_req *req)
{
struct nvme_command *cmd = req->cmd;
+ u16 ret;
req->ns = NULL;
- if (unlikely(!(req->sq->ctrl->cc & NVME_CC_ENABLE))) {
- pr_err("nvmet: got admin cmd %d while CC.EN == 0\n",
- cmd->common.opcode);
- return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR;
- }
- if (unlikely(!(req->sq->ctrl->csts & NVME_CSTS_RDY))) {
- pr_err("nvmet: got admin cmd %d while CSTS.RDY == 0\n",
- cmd->common.opcode);
- return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR;
- }
+ ret = nvmet_check_ctrl_status(req, cmd);
+ if (unlikely(ret))
+ return ret;
switch (cmd->common.opcode) {
case nvme_admin_get_log_page:
req->data_len = nvmet_get_log_page_len(cmd);
switch (cmd->get_log_page.lid) {
- case 0x01:
- case 0x02:
- case 0x03:
+ case NVME_LOG_ERROR:
+ case NVME_LOG_SMART:
+ case NVME_LOG_FW_SLOT:
req->execute = nvmet_execute_get_log_page;
return 0;
}
@@ -545,6 +539,7 @@ int nvmet_parse_admin_cmd(struct nvmet_req *req)
return 0;
}
- pr_err("nvmet: unhandled cmd %d\n", cmd->common.opcode);
+ pr_err("unhandled cmd %d on qid %d\n", cmd->common.opcode,
+ req->sq->qid);
return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
}
diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index 798653b329b2..cf90713043da 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -273,8 +273,8 @@ int nvmet_ns_enable(struct nvmet_ns *ns)
ns->bdev = blkdev_get_by_path(ns->device_path, FMODE_READ | FMODE_WRITE,
NULL);
if (IS_ERR(ns->bdev)) {
- pr_err("nvmet: failed to open block device %s: (%ld)\n",
- ns->device_path, PTR_ERR(ns->bdev));
+ pr_err("failed to open block device %s: (%ld)\n",
+ ns->device_path, PTR_ERR(ns->bdev));
ret = PTR_ERR(ns->bdev);
ns->bdev = NULL;
goto out_unlock;
@@ -661,6 +661,23 @@ out:
return status;
}
+u16 nvmet_check_ctrl_status(struct nvmet_req *req, struct nvme_command *cmd)
+{
+ if (unlikely(!(req->sq->ctrl->cc & NVME_CC_ENABLE))) {
+ pr_err("got io cmd %d while CC.EN == 0 on qid = %d\n",
+ cmd->common.opcode, req->sq->qid);
+ return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR;
+ }
+
+ if (unlikely(!(req->sq->ctrl->csts & NVME_CSTS_RDY))) {
+ pr_err("got io cmd %d while CSTS.RDY == 0 on qid = %d\n",
+ cmd->common.opcode, req->sq->qid);
+ req->ns = NULL;
+ return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR;
+ }
+ return 0;
+}
+
static bool __nvmet_host_allowed(struct nvmet_subsys *subsys,
const char *hostnqn)
{
diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c
index af8aabf05335..1aaf597e81fc 100644
--- a/drivers/nvme/target/discovery.c
+++ b/drivers/nvme/target/discovery.c
@@ -159,15 +159,15 @@ out:
nvmet_req_complete(req, status);
}
-int nvmet_parse_discovery_cmd(struct nvmet_req *req)
+u16 nvmet_parse_discovery_cmd(struct nvmet_req *req)
{
struct nvme_command *cmd = req->cmd;
req->ns = NULL;
if (unlikely(!(req->sq->ctrl->csts & NVME_CSTS_RDY))) {
- pr_err("nvmet: got cmd %d while not ready\n",
- cmd->common.opcode);
+ pr_err("got cmd %d while not ready\n",
+ cmd->common.opcode);
return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
}
@@ -180,8 +180,8 @@ int nvmet_parse_discovery_cmd(struct nvmet_req *req)
req->execute = nvmet_execute_get_disc_log_page;
return 0;
default:
- pr_err("nvmet: unsupported get_log_page lid %d\n",
- cmd->get_log_page.lid);
+ pr_err("unsupported get_log_page lid %d\n",
+ cmd->get_log_page.lid);
return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
}
case nvme_admin_identify:
@@ -192,17 +192,16 @@ int nvmet_parse_discovery_cmd(struct nvmet_req *req)
nvmet_execute_identify_disc_ctrl;
return 0;
default:
- pr_err("nvmet: unsupported identify cns %d\n",
- cmd->identify.cns);
+ pr_err("unsupported identify cns %d\n",
+ cmd->identify.cns);
return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
}
default:
- pr_err("nvmet: unsupported cmd %d\n",
- cmd->common.opcode);
+ pr_err("unsupported cmd %d\n", cmd->common.opcode);
return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
}
- pr_err("nvmet: unhandled cmd %d\n", cmd->common.opcode);
+ pr_err("unhandled cmd %d\n", cmd->common.opcode);
return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
}
diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c
index 8bd022af3df6..3cc17269504b 100644
--- a/drivers/nvme/target/fabrics-cmd.c
+++ b/drivers/nvme/target/fabrics-cmd.c
@@ -73,7 +73,7 @@ static void nvmet_execute_prop_get(struct nvmet_req *req)
nvmet_req_complete(req, status);
}
-int nvmet_parse_fabrics_cmd(struct nvmet_req *req)
+u16 nvmet_parse_fabrics_cmd(struct nvmet_req *req)
{
struct nvme_command *cmd = req->cmd;
@@ -122,7 +122,15 @@ static void nvmet_execute_admin_connect(struct nvmet_req *req)
struct nvmet_ctrl *ctrl = NULL;
u16 status = 0;
- d = kmap(sg_page(req->sg)) + req->sg->offset;
+ d = kmalloc(sizeof(*d), GFP_KERNEL);
+ if (!d) {
+ status = NVME_SC_INTERNAL;
+ goto complete;
+ }
+
+ status = nvmet_copy_from_sgl(req, 0, d, sizeof(*d));
+ if (status)
+ goto out;
/* zero out initial completion result, assign values as needed */
req->rsp->result.u32 = 0;
@@ -143,7 +151,7 @@ static void nvmet_execute_admin_connect(struct nvmet_req *req)
}
status = nvmet_alloc_ctrl(d->subsysnqn, d->hostnqn, req,
- le32_to_cpu(c->kato), &ctrl);
+ le32_to_cpu(c->kato), &ctrl);
if (status)
goto out;
@@ -158,7 +166,8 @@ static void nvmet_execute_admin_connect(struct nvmet_req *req)
req->rsp->result.u16 = cpu_to_le16(ctrl->cntlid);
out:
- kunmap(sg_page(req->sg));
+ kfree(d);
+complete:
nvmet_req_complete(req, status);
}
@@ -170,7 +179,15 @@ static void nvmet_execute_io_connect(struct nvmet_req *req)
u16 qid = le16_to_cpu(c->qid);
u16 status = 0;
- d = kmap(sg_page(req->sg)) + req->sg->offset;
+ d = kmalloc(sizeof(*d), GFP_KERNEL);
+ if (!d) {
+ status = NVME_SC_INTERNAL;
+ goto complete;
+ }
+
+ status = nvmet_copy_from_sgl(req, 0, d, sizeof(*d));
+ if (status)
+ goto out;
/* zero out initial completion result, assign values as needed */
req->rsp->result.u32 = 0;
@@ -183,8 +200,8 @@ static void nvmet_execute_io_connect(struct nvmet_req *req)
}
status = nvmet_ctrl_find_get(d->subsysnqn, d->hostnqn,
- le16_to_cpu(d->cntlid),
- req, &ctrl);
+ le16_to_cpu(d->cntlid),
+ req, &ctrl);
if (status)
goto out;
@@ -205,7 +222,8 @@ static void nvmet_execute_io_connect(struct nvmet_req *req)
pr_info("adding queue %d to ctrl %d.\n", qid, ctrl->cntlid);
out:
- kunmap(sg_page(req->sg));
+ kfree(d);
+complete:
nvmet_req_complete(req, status);
return;
@@ -214,7 +232,7 @@ out_ctrl_put:
goto out;
}
-int nvmet_parse_connect_cmd(struct nvmet_req *req)
+u16 nvmet_parse_connect_cmd(struct nvmet_req *req)
{
struct nvme_command *cmd = req->cmd;
diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c
index 8f483ee7868c..62eba29c85fb 100644
--- a/drivers/nvme/target/fc.c
+++ b/drivers/nvme/target/fc.c
@@ -82,10 +82,13 @@ struct nvmet_fc_fcp_iod {
enum nvmet_fcp_datadir io_dir;
bool active;
bool abort;
+ bool aborted;
+ bool writedataactive;
spinlock_t flock;
struct nvmet_req req;
struct work_struct work;
+ struct work_struct done_work;
struct nvmet_fc_tgtport *tgtport;
struct nvmet_fc_tgt_queue *queue;
@@ -116,7 +119,7 @@ struct nvmet_fc_tgt_queue {
u16 qid;
u16 sqsize;
u16 ersp_ratio;
- u16 sqhd;
+ __le16 sqhd;
int cpu;
atomic_t connected;
atomic_t sqtail;
@@ -213,6 +216,7 @@ static DEFINE_IDA(nvmet_fc_tgtport_cnt);
static void nvmet_fc_handle_ls_rqst_work(struct work_struct *work);
static void nvmet_fc_handle_fcp_rqst_work(struct work_struct *work);
+static void nvmet_fc_fcp_rqst_op_done_work(struct work_struct *work);
static void nvmet_fc_tgt_a_put(struct nvmet_fc_tgt_assoc *assoc);
static int nvmet_fc_tgt_a_get(struct nvmet_fc_tgt_assoc *assoc);
static void nvmet_fc_tgt_q_put(struct nvmet_fc_tgt_queue *queue);
@@ -414,9 +418,13 @@ nvmet_fc_prep_fcp_iodlist(struct nvmet_fc_tgtport *tgtport,
for (i = 0; i < queue->sqsize; fod++, i++) {
INIT_WORK(&fod->work, nvmet_fc_handle_fcp_rqst_work);
+ INIT_WORK(&fod->done_work, nvmet_fc_fcp_rqst_op_done_work);
fod->tgtport = tgtport;
fod->queue = queue;
fod->active = false;
+ fod->abort = false;
+ fod->aborted = false;
+ fod->fcpreq = NULL;
list_add_tail(&fod->fcp_list, &queue->fod_list);
spin_lock_init(&fod->flock);
@@ -463,7 +471,6 @@ nvmet_fc_alloc_fcp_iod(struct nvmet_fc_tgt_queue *queue)
if (fod) {
list_del(&fod->fcp_list);
fod->active = true;
- fod->abort = false;
/*
* no queue reference is taken, as it was taken by the
* queue lookup just prior to the allocation. The iod
@@ -479,17 +486,30 @@ static void
nvmet_fc_free_fcp_iod(struct nvmet_fc_tgt_queue *queue,
struct nvmet_fc_fcp_iod *fod)
{
+ struct nvmefc_tgt_fcp_req *fcpreq = fod->fcpreq;
+ struct nvmet_fc_tgtport *tgtport = fod->tgtport;
unsigned long flags;
+ fc_dma_sync_single_for_cpu(tgtport->dev, fod->rspdma,
+ sizeof(fod->rspiubuf), DMA_TO_DEVICE);
+
+ fcpreq->nvmet_fc_private = NULL;
+
spin_lock_irqsave(&queue->qlock, flags);
list_add_tail(&fod->fcp_list, &fod->queue->fod_list);
fod->active = false;
+ fod->abort = false;
+ fod->aborted = false;
+ fod->writedataactive = false;
+ fod->fcpreq = NULL;
spin_unlock_irqrestore(&queue->qlock, flags);
/*
* release the reference taken at queue lookup and fod allocation
*/
nvmet_fc_tgt_q_put(queue);
+
+ tgtport->ops->fcp_req_release(&tgtport->fc_target_port, fcpreq);
}
static int
@@ -616,32 +636,12 @@ nvmet_fc_tgt_q_get(struct nvmet_fc_tgt_queue *queue)
static void
-nvmet_fc_abort_op(struct nvmet_fc_tgtport *tgtport,
- struct nvmefc_tgt_fcp_req *fcpreq)
-{
- int ret;
-
- fcpreq->op = NVMET_FCOP_ABORT;
- fcpreq->offset = 0;
- fcpreq->timeout = 0;
- fcpreq->transfer_length = 0;
- fcpreq->transferred_length = 0;
- fcpreq->fcp_error = 0;
- fcpreq->sg_cnt = 0;
-
- ret = tgtport->ops->fcp_op(&tgtport->fc_target_port, fcpreq);
- if (ret)
- /* should never reach here !! */
- WARN_ON(1);
-}
-
-
-static void
nvmet_fc_delete_target_queue(struct nvmet_fc_tgt_queue *queue)
{
+ struct nvmet_fc_tgtport *tgtport = queue->assoc->tgtport;
struct nvmet_fc_fcp_iod *fod = queue->fod;
unsigned long flags;
- int i;
+ int i, writedataactive;
bool disconnect;
disconnect = atomic_xchg(&queue->connected, 0);
@@ -652,7 +652,20 @@ nvmet_fc_delete_target_queue(struct nvmet_fc_tgt_queue *queue)
if (fod->active) {
spin_lock(&fod->flock);
fod->abort = true;
+ writedataactive = fod->writedataactive;
spin_unlock(&fod->flock);
+ /*
+ * only call lldd abort routine if waiting for
+ * writedata. other outstanding ops should finish
+ * on their own.
+ */
+ if (writedataactive) {
+ spin_lock(&fod->flock);
+ fod->aborted = true;
+ spin_unlock(&fod->flock);
+ tgtport->ops->fcp_abort(
+ &tgtport->fc_target_port, fod->fcpreq);
+ }
}
}
spin_unlock_irqrestore(&queue->qlock, flags);
@@ -846,7 +859,8 @@ nvmet_fc_register_targetport(struct nvmet_fc_port_info *pinfo,
int ret, idx;
if (!template->xmt_ls_rsp || !template->fcp_op ||
- !template->targetport_delete ||
+ !template->fcp_abort ||
+ !template->fcp_req_release || !template->targetport_delete ||
!template->max_hw_queues || !template->max_sgl_segments ||
!template->max_dif_sgl_segments || !template->dma_boundary) {
ret = -EINVAL;
@@ -1044,7 +1058,7 @@ EXPORT_SYMBOL_GPL(nvmet_fc_unregister_targetport);
static void
-nvmet_fc_format_rsp_hdr(void *buf, u8 ls_cmd, u32 desc_len, u8 rqst_ls_cmd)
+nvmet_fc_format_rsp_hdr(void *buf, u8 ls_cmd, __be32 desc_len, u8 rqst_ls_cmd)
{
struct fcnvme_ls_acc_hdr *acc = buf;
@@ -1189,8 +1203,8 @@ nvmet_fc_ls_create_association(struct nvmet_fc_tgtport *tgtport,
validation_errors[ret]);
iod->lsreq->rsplen = nvmet_fc_format_rjt(acc,
NVME_FC_MAX_LS_BUFFER_SIZE, rqst->w0.ls_cmd,
- ELS_RJT_LOGIC,
- ELS_EXPL_NONE, 0);
+ FCNVME_RJT_RC_LOGIC,
+ FCNVME_RJT_EXP_NONE, 0);
return;
}
@@ -1281,8 +1295,9 @@ nvmet_fc_ls_create_connection(struct nvmet_fc_tgtport *tgtport,
iod->lsreq->rsplen = nvmet_fc_format_rjt(acc,
NVME_FC_MAX_LS_BUFFER_SIZE, rqst->w0.ls_cmd,
(ret == VERR_NO_ASSOC) ?
- ELS_RJT_PROT : ELS_RJT_LOGIC,
- ELS_EXPL_NONE, 0);
+ FCNVME_RJT_RC_INV_ASSOC :
+ FCNVME_RJT_RC_LOGIC,
+ FCNVME_RJT_EXP_NONE, 0);
return;
}
@@ -1369,8 +1384,12 @@ nvmet_fc_ls_disconnect(struct nvmet_fc_tgtport *tgtport,
validation_errors[ret]);
iod->lsreq->rsplen = nvmet_fc_format_rjt(acc,
NVME_FC_MAX_LS_BUFFER_SIZE, rqst->w0.ls_cmd,
- (ret == 8) ? ELS_RJT_PROT : ELS_RJT_LOGIC,
- ELS_EXPL_NONE, 0);
+ (ret == VERR_NO_ASSOC) ?
+ FCNVME_RJT_RC_INV_ASSOC :
+ (ret == VERR_NO_CONN) ?
+ FCNVME_RJT_RC_INV_CONN :
+ FCNVME_RJT_RC_LOGIC,
+ FCNVME_RJT_EXP_NONE, 0);
return;
}
@@ -1479,7 +1498,7 @@ nvmet_fc_handle_ls_rqst(struct nvmet_fc_tgtport *tgtport,
default:
iod->lsreq->rsplen = nvmet_fc_format_rjt(iod->rspbuf,
NVME_FC_MAX_LS_BUFFER_SIZE, w0->ls_cmd,
- ELS_RJT_INVAL, ELS_EXPL_NONE, 0);
+ FCNVME_RJT_RC_INVAL, FCNVME_RJT_EXP_NONE, 0);
}
nvmet_fc_xmt_ls_rsp(tgtport, iod);
@@ -1619,6 +1638,8 @@ nvmet_fc_free_tgt_pgs(struct nvmet_fc_fcp_iod *fod)
for_each_sg(fod->data_sg, sg, fod->data_sg_cnt, count)
__free_page(sg_page(sg));
kfree(fod->data_sg);
+ fod->data_sg = NULL;
+ fod->data_sg_cnt = 0;
}
@@ -1679,7 +1700,7 @@ nvmet_fc_prep_fcp_rsp(struct nvmet_fc_tgtport *tgtport,
xfr_length != fod->total_length ||
(le16_to_cpu(cqe->status) & 0xFFFE) || cqewd[0] || cqewd[1] ||
(sqe->flags & (NVME_CMD_FUSE_FIRST | NVME_CMD_FUSE_SECOND)) ||
- queue_90percent_full(fod->queue, cqe->sq_head))
+ queue_90percent_full(fod->queue, le16_to_cpu(cqe->sq_head)))
send_ersp = true;
/* re-set the fields */
@@ -1704,6 +1725,26 @@ nvmet_fc_prep_fcp_rsp(struct nvmet_fc_tgtport *tgtport,
static void nvmet_fc_xmt_fcp_op_done(struct nvmefc_tgt_fcp_req *fcpreq);
static void
+nvmet_fc_abort_op(struct nvmet_fc_tgtport *tgtport,
+ struct nvmet_fc_fcp_iod *fod)
+{
+ struct nvmefc_tgt_fcp_req *fcpreq = fod->fcpreq;
+
+ /* data no longer needed */
+ nvmet_fc_free_tgt_pgs(fod);
+
+ /*
+ * if an ABTS was received or we issued the fcp_abort early
+ * don't call abort routine again.
+ */
+ /* no need to take lock - lock was taken earlier to get here */
+ if (!fod->aborted)
+ tgtport->ops->fcp_abort(&tgtport->fc_target_port, fcpreq);
+
+ nvmet_fc_free_fcp_iod(fod->queue, fod);
+}
+
+static void
nvmet_fc_xmt_fcp_rsp(struct nvmet_fc_tgtport *tgtport,
struct nvmet_fc_fcp_iod *fod)
{
@@ -1716,7 +1757,7 @@ nvmet_fc_xmt_fcp_rsp(struct nvmet_fc_tgtport *tgtport,
ret = tgtport->ops->fcp_op(&tgtport->fc_target_port, fod->fcpreq);
if (ret)
- nvmet_fc_abort_op(tgtport, fod->fcpreq);
+ nvmet_fc_abort_op(tgtport, fod);
}
static void
@@ -1725,6 +1766,7 @@ nvmet_fc_transfer_fcp_data(struct nvmet_fc_tgtport *tgtport,
{
struct nvmefc_tgt_fcp_req *fcpreq = fod->fcpreq;
struct scatterlist *sg, *datasg;
+ unsigned long flags;
u32 tlen, sg_off;
int ret;
@@ -1789,10 +1831,13 @@ nvmet_fc_transfer_fcp_data(struct nvmet_fc_tgtport *tgtport,
*/
fod->abort = true;
- if (op == NVMET_FCOP_WRITEDATA)
+ if (op == NVMET_FCOP_WRITEDATA) {
+ spin_lock_irqsave(&fod->flock, flags);
+ fod->writedataactive = false;
+ spin_unlock_irqrestore(&fod->flock, flags);
nvmet_req_complete(&fod->req,
NVME_SC_FC_TRANSPORT_ERROR);
- else /* NVMET_FCOP_READDATA or NVMET_FCOP_READDATA_RSP */ {
+ } else /* NVMET_FCOP_READDATA or NVMET_FCOP_READDATA_RSP */ {
fcpreq->fcp_error = ret;
fcpreq->transferred_length = 0;
nvmet_fc_xmt_fcp_op_done(fod->fcpreq);
@@ -1800,32 +1845,54 @@ nvmet_fc_transfer_fcp_data(struct nvmet_fc_tgtport *tgtport,
}
}
+static inline bool
+__nvmet_fc_fod_op_abort(struct nvmet_fc_fcp_iod *fod, bool abort)
+{
+ struct nvmefc_tgt_fcp_req *fcpreq = fod->fcpreq;
+ struct nvmet_fc_tgtport *tgtport = fod->tgtport;
+
+ /* if in the middle of an io and we need to tear down */
+ if (abort) {
+ if (fcpreq->op == NVMET_FCOP_WRITEDATA) {
+ nvmet_req_complete(&fod->req,
+ NVME_SC_FC_TRANSPORT_ERROR);
+ return true;
+ }
+
+ nvmet_fc_abort_op(tgtport, fod);
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * actual done handler for FCP operations when completed by the lldd
+ */
static void
-nvmet_fc_xmt_fcp_op_done(struct nvmefc_tgt_fcp_req *fcpreq)
+nvmet_fc_fod_op_done(struct nvmet_fc_fcp_iod *fod)
{
- struct nvmet_fc_fcp_iod *fod = fcpreq->nvmet_fc_private;
+ struct nvmefc_tgt_fcp_req *fcpreq = fod->fcpreq;
struct nvmet_fc_tgtport *tgtport = fod->tgtport;
unsigned long flags;
bool abort;
spin_lock_irqsave(&fod->flock, flags);
abort = fod->abort;
+ fod->writedataactive = false;
spin_unlock_irqrestore(&fod->flock, flags);
- /* if in the middle of an io and we need to tear down */
- if (abort && fcpreq->op != NVMET_FCOP_ABORT) {
- /* data no longer needed */
- nvmet_fc_free_tgt_pgs(fod);
-
- nvmet_req_complete(&fod->req, fcpreq->fcp_error);
- return;
- }
-
switch (fcpreq->op) {
case NVMET_FCOP_WRITEDATA:
+ if (__nvmet_fc_fod_op_abort(fod, abort))
+ return;
if (fcpreq->fcp_error ||
fcpreq->transferred_length != fcpreq->transfer_length) {
+ spin_lock(&fod->flock);
+ fod->abort = true;
+ spin_unlock(&fod->flock);
+
nvmet_req_complete(&fod->req,
NVME_SC_FC_TRANSPORT_ERROR);
return;
@@ -1833,6 +1900,10 @@ nvmet_fc_xmt_fcp_op_done(struct nvmefc_tgt_fcp_req *fcpreq)
fod->offset += fcpreq->transferred_length;
if (fod->offset != fod->total_length) {
+ spin_lock_irqsave(&fod->flock, flags);
+ fod->writedataactive = true;
+ spin_unlock_irqrestore(&fod->flock, flags);
+
/* transfer the next chunk */
nvmet_fc_transfer_fcp_data(tgtport, fod,
NVMET_FCOP_WRITEDATA);
@@ -1847,12 +1918,11 @@ nvmet_fc_xmt_fcp_op_done(struct nvmefc_tgt_fcp_req *fcpreq)
case NVMET_FCOP_READDATA:
case NVMET_FCOP_READDATA_RSP:
+ if (__nvmet_fc_fod_op_abort(fod, abort))
+ return;
if (fcpreq->fcp_error ||
fcpreq->transferred_length != fcpreq->transfer_length) {
- /* data no longer needed */
- nvmet_fc_free_tgt_pgs(fod);
-
- nvmet_fc_abort_op(tgtport, fod->fcpreq);
+ nvmet_fc_abort_op(tgtport, fod);
return;
}
@@ -1861,8 +1931,6 @@ nvmet_fc_xmt_fcp_op_done(struct nvmefc_tgt_fcp_req *fcpreq)
if (fcpreq->op == NVMET_FCOP_READDATA_RSP) {
/* data no longer needed */
nvmet_fc_free_tgt_pgs(fod);
- fc_dma_sync_single_for_cpu(tgtport->dev, fod->rspdma,
- sizeof(fod->rspiubuf), DMA_TO_DEVICE);
nvmet_fc_free_fcp_iod(fod->queue, fod);
return;
}
@@ -1885,19 +1953,38 @@ nvmet_fc_xmt_fcp_op_done(struct nvmefc_tgt_fcp_req *fcpreq)
break;
case NVMET_FCOP_RSP:
- case NVMET_FCOP_ABORT:
- fc_dma_sync_single_for_cpu(tgtport->dev, fod->rspdma,
- sizeof(fod->rspiubuf), DMA_TO_DEVICE);
+ if (__nvmet_fc_fod_op_abort(fod, abort))
+ return;
nvmet_fc_free_fcp_iod(fod->queue, fod);
break;
default:
- nvmet_fc_free_tgt_pgs(fod);
- nvmet_fc_abort_op(tgtport, fod->fcpreq);
break;
}
}
+static void
+nvmet_fc_fcp_rqst_op_done_work(struct work_struct *work)
+{
+ struct nvmet_fc_fcp_iod *fod =
+ container_of(work, struct nvmet_fc_fcp_iod, done_work);
+
+ nvmet_fc_fod_op_done(fod);
+}
+
+static void
+nvmet_fc_xmt_fcp_op_done(struct nvmefc_tgt_fcp_req *fcpreq)
+{
+ struct nvmet_fc_fcp_iod *fod = fcpreq->nvmet_fc_private;
+ struct nvmet_fc_tgt_queue *queue = fod->queue;
+
+ if (fod->tgtport->ops->target_features & NVMET_FCTGTFEAT_OPDONE_IN_ISR)
+ /* context switch so completion is not in ISR context */
+ queue_work_on(queue->cpu, queue->work_q, &fod->done_work);
+ else
+ nvmet_fc_fod_op_done(fod);
+}
+
/*
* actual completion handler after execution by the nvmet layer
*/
@@ -1919,10 +2006,7 @@ __nvmet_fc_fcp_nvme_cmd_done(struct nvmet_fc_tgtport *tgtport,
fod->queue->sqhd = cqe->sq_head;
if (abort) {
- /* data no longer needed */
- nvmet_fc_free_tgt_pgs(fod);
-
- nvmet_fc_abort_op(tgtport, fod->fcpreq);
+ nvmet_fc_abort_op(tgtport, fod);
return;
}
@@ -1971,7 +2055,7 @@ nvmet_fc_fcp_nvme_cmd_done(struct nvmet_req *nvme_req)
/*
* Actual processing routine for received FC-NVME LS Requests from the LLD
*/
-void
+static void
nvmet_fc_handle_fcp_rqst(struct nvmet_fc_tgtport *tgtport,
struct nvmet_fc_fcp_iod *fod)
{
@@ -2018,8 +2102,8 @@ nvmet_fc_handle_fcp_rqst(struct nvmet_fc_tgtport *tgtport,
&fod->queue->nvme_cq,
&fod->queue->nvme_sq,
&nvmet_fc_tgt_fcp_ops);
- if (!ret) { /* bad SQE content */
- nvmet_fc_abort_op(tgtport, fod->fcpreq);
+ if (!ret) { /* bad SQE content or invalid ctrl state */
+ nvmet_fc_abort_op(tgtport, fod);
return;
}
@@ -2059,7 +2143,7 @@ nvmet_fc_handle_fcp_rqst(struct nvmet_fc_tgtport *tgtport,
return;
transport_error:
- nvmet_fc_abort_op(tgtport, fod->fcpreq);
+ nvmet_fc_abort_op(tgtport, fod);
}
/*
@@ -2089,7 +2173,7 @@ nvmet_fc_handle_fcp_rqst_work(struct work_struct *work)
* If this routine returns error, the lldd should abort the exchange.
*
* @target_port: pointer to the (registered) target port the FCP CMD IU
- * was receive on.
+ * was received on.
* @fcpreq: pointer to a fcpreq request structure to be used to reference
* the exchange corresponding to the FCP Exchange.
* @cmdiubuf: pointer to the buffer containing the FCP CMD IU
@@ -2112,7 +2196,6 @@ nvmet_fc_rcv_fcp_req(struct nvmet_fc_target_port *target_port,
(be16_to_cpu(cmdiu->iu_len) != (sizeof(*cmdiu)/4)))
return -EIO;
-
queue = nvmet_fc_find_target_queue(tgtport,
be64_to_cpu(cmdiu->connection_id));
if (!queue)
@@ -2142,12 +2225,68 @@ nvmet_fc_rcv_fcp_req(struct nvmet_fc_target_port *target_port,
((queue->qid - 1) % tgtport->ops->max_hw_queues) : 0;
memcpy(&fod->cmdiubuf, cmdiubuf, cmdiubuf_len);
- queue_work_on(queue->cpu, queue->work_q, &fod->work);
+ if (tgtport->ops->target_features & NVMET_FCTGTFEAT_CMD_IN_ISR)
+ queue_work_on(queue->cpu, queue->work_q, &fod->work);
+ else
+ nvmet_fc_handle_fcp_rqst(tgtport, fod);
return 0;
}
EXPORT_SYMBOL_GPL(nvmet_fc_rcv_fcp_req);
+/**
+ * nvmet_fc_rcv_fcp_abort - transport entry point called by an LLDD
+ * upon the reception of an ABTS for a FCP command
+ *
+ * Notify the transport that an ABTS has been received for a FCP command
+ * that had been given to the transport via nvmet_fc_rcv_fcp_req(). The
+ * LLDD believes the command is still being worked on
+ * (template_ops->fcp_req_release() has not been called).
+ *
+ * The transport will wait for any outstanding work (an op to the LLDD,
+ * which the lldd should complete with error due to the ABTS; or the
+ * completion from the nvmet layer of the nvme command), then will
+ * stop processing and call the nvmet_fc_rcv_fcp_req() callback to
+ * return the i/o context to the LLDD. The LLDD may send the BA_ACC
+ * to the ABTS either after return from this function (assuming any
+ * outstanding op work has been terminated) or upon the callback being
+ * called.
+ *
+ * @target_port: pointer to the (registered) target port the FCP CMD IU
+ * was received on.
+ * @fcpreq: pointer to the fcpreq request structure that corresponds
+ * to the exchange that received the ABTS.
+ */
+void
+nvmet_fc_rcv_fcp_abort(struct nvmet_fc_target_port *target_port,
+ struct nvmefc_tgt_fcp_req *fcpreq)
+{
+ struct nvmet_fc_fcp_iod *fod = fcpreq->nvmet_fc_private;
+ struct nvmet_fc_tgt_queue *queue;
+ unsigned long flags;
+
+ if (!fod || fod->fcpreq != fcpreq)
+ /* job appears to have already completed, ignore abort */
+ return;
+
+ queue = fod->queue;
+
+ spin_lock_irqsave(&queue->qlock, flags);
+ if (fod->active) {
+ /*
+ * mark as abort. The abort handler, invoked upon completion
+ * of any work, will detect the aborted status and do the
+ * callback.
+ */
+ spin_lock(&fod->flock);
+ fod->abort = true;
+ fod->aborted = true;
+ spin_unlock(&fod->flock);
+ }
+ spin_unlock_irqrestore(&queue->qlock, flags);
+}
+EXPORT_SYMBOL_GPL(nvmet_fc_rcv_fcp_abort);
+
enum {
FCT_TRADDR_ERR = 0,
FCT_TRADDR_WWNN = 1 << 0,
@@ -2177,7 +2316,7 @@ nvmet_fc_parse_traddr(struct nvmet_fc_traddr *traddr, char *buf)
if (!options)
return -ENOMEM;
- while ((p = strsep(&o, ",\n")) != NULL) {
+ while ((p = strsep(&o, ":\n")) != NULL) {
if (!*p)
continue;
@@ -2238,6 +2377,7 @@ nvmet_fc_add_port(struct nvmet_port *port)
if (!tgtport->port) {
tgtport->port = port;
port->priv = tgtport;
+ nvmet_fc_tgtport_get(tgtport);
ret = 0;
} else
ret = -EALREADY;
diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c
index 4e8e6a22bce1..15551ef79c8c 100644
--- a/drivers/nvme/target/fcloop.c
+++ b/drivers/nvme/target/fcloop.c
@@ -246,11 +246,19 @@ struct fcloop_lsreq {
struct fcloop_fcpreq {
struct fcloop_tport *tport;
struct nvmefc_fcp_req *fcpreq;
+ spinlock_t reqlock;
u16 status;
+ bool active;
+ bool aborted;
struct work_struct work;
struct nvmefc_tgt_fcp_req tgt_fcp_req;
};
+struct fcloop_ini_fcpreq {
+ struct nvmefc_fcp_req *fcpreq;
+ struct fcloop_fcpreq *tfcp_req;
+ struct work_struct iniwork;
+};
static inline struct fcloop_lsreq *
tgt_ls_req_to_lsreq(struct nvmefc_tgt_ls_req *tgt_lsreq)
@@ -341,7 +349,21 @@ fcloop_xmt_ls_rsp(struct nvmet_fc_target_port *tport,
}
/*
- * FCP IO operation done. call back up initiator "done" flows.
+ * FCP IO operation done by initiator abort.
+ * call back up initiator "done" flows.
+ */
+static void
+fcloop_tgt_fcprqst_ini_done_work(struct work_struct *work)
+{
+ struct fcloop_ini_fcpreq *inireq =
+ container_of(work, struct fcloop_ini_fcpreq, iniwork);
+
+ inireq->fcpreq->done(inireq->fcpreq);
+}
+
+/*
+ * FCP IO operation done by target completion.
+ * call back up initiator "done" flows.
*/
static void
fcloop_tgt_fcprqst_done_work(struct work_struct *work)
@@ -349,12 +371,18 @@ fcloop_tgt_fcprqst_done_work(struct work_struct *work)
struct fcloop_fcpreq *tfcp_req =
container_of(work, struct fcloop_fcpreq, work);
struct fcloop_tport *tport = tfcp_req->tport;
- struct nvmefc_fcp_req *fcpreq = tfcp_req->fcpreq;
+ struct nvmefc_fcp_req *fcpreq;
- if (tport->remoteport) {
+ spin_lock(&tfcp_req->reqlock);
+ fcpreq = tfcp_req->fcpreq;
+ spin_unlock(&tfcp_req->reqlock);
+
+ if (tport->remoteport && fcpreq) {
fcpreq->status = tfcp_req->status;
fcpreq->done(fcpreq);
}
+
+ kfree(tfcp_req);
}
@@ -364,20 +392,25 @@ fcloop_fcp_req(struct nvme_fc_local_port *localport,
void *hw_queue_handle,
struct nvmefc_fcp_req *fcpreq)
{
- struct fcloop_fcpreq *tfcp_req = fcpreq->private;
struct fcloop_rport *rport = remoteport->private;
+ struct fcloop_ini_fcpreq *inireq = fcpreq->private;
+ struct fcloop_fcpreq *tfcp_req;
int ret = 0;
- INIT_WORK(&tfcp_req->work, fcloop_tgt_fcprqst_done_work);
+ if (!rport->targetport)
+ return -ECONNREFUSED;
- if (!rport->targetport) {
- tfcp_req->status = NVME_SC_FC_TRANSPORT_ERROR;
- schedule_work(&tfcp_req->work);
- return ret;
- }
+ tfcp_req = kzalloc(sizeof(*tfcp_req), GFP_KERNEL);
+ if (!tfcp_req)
+ return -ENOMEM;
+ inireq->fcpreq = fcpreq;
+ inireq->tfcp_req = tfcp_req;
+ INIT_WORK(&inireq->iniwork, fcloop_tgt_fcprqst_ini_done_work);
tfcp_req->fcpreq = fcpreq;
tfcp_req->tport = rport->targetport->private;
+ spin_lock_init(&tfcp_req->reqlock);
+ INIT_WORK(&tfcp_req->work, fcloop_tgt_fcprqst_done_work);
ret = nvmet_fc_rcv_fcp_req(rport->targetport, &tfcp_req->tgt_fcp_req,
fcpreq->cmdaddr, fcpreq->cmdlen);
@@ -444,63 +477,129 @@ fcloop_fcp_op(struct nvmet_fc_target_port *tgtport,
struct nvmefc_tgt_fcp_req *tgt_fcpreq)
{
struct fcloop_fcpreq *tfcp_req = tgt_fcp_req_to_fcpreq(tgt_fcpreq);
- struct nvmefc_fcp_req *fcpreq = tfcp_req->fcpreq;
+ struct nvmefc_fcp_req *fcpreq;
u32 rsplen = 0, xfrlen = 0;
- int fcp_err = 0;
+ int fcp_err = 0, active, aborted;
u8 op = tgt_fcpreq->op;
+ spin_lock(&tfcp_req->reqlock);
+ fcpreq = tfcp_req->fcpreq;
+ active = tfcp_req->active;
+ aborted = tfcp_req->aborted;
+ tfcp_req->active = true;
+ spin_unlock(&tfcp_req->reqlock);
+
+ if (unlikely(active))
+ /* illegal - call while i/o active */
+ return -EALREADY;
+
+ if (unlikely(aborted)) {
+ /* target transport has aborted i/o prior */
+ spin_lock(&tfcp_req->reqlock);
+ tfcp_req->active = false;
+ spin_unlock(&tfcp_req->reqlock);
+ tgt_fcpreq->transferred_length = 0;
+ tgt_fcpreq->fcp_error = -ECANCELED;
+ tgt_fcpreq->done(tgt_fcpreq);
+ return 0;
+ }
+
+ /*
+ * if fcpreq is NULL, the I/O has been aborted (from
+ * initiator side). For the target side, act as if all is well
+ * but don't actually move data.
+ */
+
switch (op) {
case NVMET_FCOP_WRITEDATA:
xfrlen = tgt_fcpreq->transfer_length;
- fcloop_fcp_copy_data(op, tgt_fcpreq->sg, fcpreq->first_sgl,
- tgt_fcpreq->offset, xfrlen);
- fcpreq->transferred_length += xfrlen;
+ if (fcpreq) {
+ fcloop_fcp_copy_data(op, tgt_fcpreq->sg,
+ fcpreq->first_sgl, tgt_fcpreq->offset,
+ xfrlen);
+ fcpreq->transferred_length += xfrlen;
+ }
break;
case NVMET_FCOP_READDATA:
case NVMET_FCOP_READDATA_RSP:
xfrlen = tgt_fcpreq->transfer_length;
- fcloop_fcp_copy_data(op, tgt_fcpreq->sg, fcpreq->first_sgl,
- tgt_fcpreq->offset, xfrlen);
- fcpreq->transferred_length += xfrlen;
+ if (fcpreq) {
+ fcloop_fcp_copy_data(op, tgt_fcpreq->sg,
+ fcpreq->first_sgl, tgt_fcpreq->offset,
+ xfrlen);
+ fcpreq->transferred_length += xfrlen;
+ }
if (op == NVMET_FCOP_READDATA)
break;
/* Fall-Thru to RSP handling */
case NVMET_FCOP_RSP:
- rsplen = ((fcpreq->rsplen < tgt_fcpreq->rsplen) ?
- fcpreq->rsplen : tgt_fcpreq->rsplen);
- memcpy(fcpreq->rspaddr, tgt_fcpreq->rspaddr, rsplen);
- if (rsplen < tgt_fcpreq->rsplen)
- fcp_err = -E2BIG;
- fcpreq->rcv_rsplen = rsplen;
- fcpreq->status = 0;
+ if (fcpreq) {
+ rsplen = ((fcpreq->rsplen < tgt_fcpreq->rsplen) ?
+ fcpreq->rsplen : tgt_fcpreq->rsplen);
+ memcpy(fcpreq->rspaddr, tgt_fcpreq->rspaddr, rsplen);
+ if (rsplen < tgt_fcpreq->rsplen)
+ fcp_err = -E2BIG;
+ fcpreq->rcv_rsplen = rsplen;
+ fcpreq->status = 0;
+ }
tfcp_req->status = 0;
break;
- case NVMET_FCOP_ABORT:
- tfcp_req->status = NVME_SC_FC_TRANSPORT_ABORTED;
- break;
-
default:
fcp_err = -EINVAL;
break;
}
+ spin_lock(&tfcp_req->reqlock);
+ tfcp_req->active = false;
+ spin_unlock(&tfcp_req->reqlock);
+
tgt_fcpreq->transferred_length = xfrlen;
tgt_fcpreq->fcp_error = fcp_err;
tgt_fcpreq->done(tgt_fcpreq);
- if ((!fcp_err) && (op == NVMET_FCOP_RSP ||
- op == NVMET_FCOP_READDATA_RSP ||
- op == NVMET_FCOP_ABORT))
- schedule_work(&tfcp_req->work);
-
return 0;
}
static void
+fcloop_tgt_fcp_abort(struct nvmet_fc_target_port *tgtport,
+ struct nvmefc_tgt_fcp_req *tgt_fcpreq)
+{
+ struct fcloop_fcpreq *tfcp_req = tgt_fcp_req_to_fcpreq(tgt_fcpreq);
+ int active;
+
+ /*
+ * mark aborted only in case there were 2 threads in transport
+ * (one doing io, other doing abort) and only kills ops posted
+ * after the abort request
+ */
+ spin_lock(&tfcp_req->reqlock);
+ active = tfcp_req->active;
+ tfcp_req->aborted = true;
+ spin_unlock(&tfcp_req->reqlock);
+
+ tfcp_req->status = NVME_SC_FC_TRANSPORT_ABORTED;
+
+ /*
+ * nothing more to do. If io wasn't active, the transport should
+ * immediately call the req_release. If it was active, the op
+ * will complete, and the lldd should call req_release.
+ */
+}
+
+static void
+fcloop_fcp_req_release(struct nvmet_fc_target_port *tgtport,
+ struct nvmefc_tgt_fcp_req *tgt_fcpreq)
+{
+ struct fcloop_fcpreq *tfcp_req = tgt_fcp_req_to_fcpreq(tgt_fcpreq);
+
+ schedule_work(&tfcp_req->work);
+}
+
+static void
fcloop_ls_abort(struct nvme_fc_local_port *localport,
struct nvme_fc_remote_port *remoteport,
struct nvmefc_ls_req *lsreq)
@@ -513,6 +612,27 @@ fcloop_fcp_abort(struct nvme_fc_local_port *localport,
void *hw_queue_handle,
struct nvmefc_fcp_req *fcpreq)
{
+ struct fcloop_rport *rport = remoteport->private;
+ struct fcloop_ini_fcpreq *inireq = fcpreq->private;
+ struct fcloop_fcpreq *tfcp_req = inireq->tfcp_req;
+
+ if (!tfcp_req)
+ /* abort has already been called */
+ return;
+
+ if (rport->targetport)
+ nvmet_fc_rcv_fcp_abort(rport->targetport,
+ &tfcp_req->tgt_fcp_req);
+
+ /* break initiator/target relationship for io */
+ spin_lock(&tfcp_req->reqlock);
+ inireq->tfcp_req = NULL;
+ tfcp_req->fcpreq = NULL;
+ spin_unlock(&tfcp_req->reqlock);
+
+ /* post the aborted io completion */
+ fcpreq->status = -ECANCELED;
+ schedule_work(&inireq->iniwork);
}
static void
@@ -546,7 +666,7 @@ fcloop_targetport_delete(struct nvmet_fc_target_port *targetport)
#define FCLOOP_SGL_SEGS 256
#define FCLOOP_DMABOUND_4G 0xFFFFFFFF
-struct nvme_fc_port_template fctemplate = {
+static struct nvme_fc_port_template fctemplate = {
.localport_delete = fcloop_localport_delete,
.remoteport_delete = fcloop_remoteport_delete,
.create_queue = fcloop_create_queue,
@@ -563,20 +683,23 @@ struct nvme_fc_port_template fctemplate = {
.local_priv_sz = sizeof(struct fcloop_lport),
.remote_priv_sz = sizeof(struct fcloop_rport),
.lsrqst_priv_sz = sizeof(struct fcloop_lsreq),
- .fcprqst_priv_sz = sizeof(struct fcloop_fcpreq),
+ .fcprqst_priv_sz = sizeof(struct fcloop_ini_fcpreq),
};
-struct nvmet_fc_target_template tgttemplate = {
+static struct nvmet_fc_target_template tgttemplate = {
.targetport_delete = fcloop_targetport_delete,
.xmt_ls_rsp = fcloop_xmt_ls_rsp,
.fcp_op = fcloop_fcp_op,
+ .fcp_abort = fcloop_tgt_fcp_abort,
+ .fcp_req_release = fcloop_fcp_req_release,
.max_hw_queues = FCLOOP_HW_QUEUES,
.max_sgl_segments = FCLOOP_SGL_SEGS,
.max_dif_sgl_segments = FCLOOP_SGL_SEGS,
.dma_boundary = FCLOOP_DMABOUND_4G,
/* optional features */
- .target_features = NVMET_FCTGTFEAT_READDATA_RSP |
- NVMET_FCTGTFEAT_NEEDS_CMD_CPUSCHED,
+ .target_features = NVMET_FCTGTFEAT_CMD_IN_ISR |
+ NVMET_FCTGTFEAT_NEEDS_CMD_CPUSCHED |
+ NVMET_FCTGTFEAT_OPDONE_IN_ISR,
/* sizes of additional private data for data structures */
.target_priv_sz = sizeof(struct fcloop_tport),
};
diff --git a/drivers/nvme/target/io-cmd.c b/drivers/nvme/target/io-cmd.c
index 6b0baa9caab9..c77940d80fc8 100644
--- a/drivers/nvme/target/io-cmd.c
+++ b/drivers/nvme/target/io-cmd.c
@@ -184,7 +184,7 @@ static void nvmet_execute_write_zeroes(struct nvmet_req *req)
(req->ns->blksize_shift - 9)) + 1;
if (__blkdev_issue_zeroout(req->ns->bdev, sector, nr_sector,
- GFP_KERNEL, &bio, true))
+ GFP_KERNEL, &bio, 0))
status = NVME_SC_INTERNAL | NVME_SC_DNR;
if (bio) {
@@ -196,26 +196,19 @@ static void nvmet_execute_write_zeroes(struct nvmet_req *req)
}
}
-int nvmet_parse_io_cmd(struct nvmet_req *req)
+u16 nvmet_parse_io_cmd(struct nvmet_req *req)
{
struct nvme_command *cmd = req->cmd;
+ u16 ret;
- if (unlikely(!(req->sq->ctrl->cc & NVME_CC_ENABLE))) {
- pr_err("nvmet: got io cmd %d while CC.EN == 0\n",
- cmd->common.opcode);
+ ret = nvmet_check_ctrl_status(req, cmd);
+ if (unlikely(ret)) {
req->ns = NULL;
- return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR;
- }
-
- if (unlikely(!(req->sq->ctrl->csts & NVME_CSTS_RDY))) {
- pr_err("nvmet: got io cmd %d while CSTS.RDY == 0\n",
- cmd->common.opcode);
- req->ns = NULL;
- return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR;
+ return ret;
}
req->ns = nvmet_find_namespace(req->sq->ctrl, cmd->rw.nsid);
- if (!req->ns)
+ if (unlikely(!req->ns))
return NVME_SC_INVALID_NS | NVME_SC_DNR;
switch (cmd->common.opcode) {
@@ -237,7 +230,8 @@ int nvmet_parse_io_cmd(struct nvmet_req *req)
req->execute = nvmet_execute_write_zeroes;
return 0;
default:
- pr_err("nvmet: unhandled cmd %d\n", cmd->common.opcode);
+ pr_err("unhandled cmd %d on qid %d\n", cmd->common.opcode,
+ req->sq->qid);
return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
}
}
diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c
index c7b0b6a52708..304f1c87c160 100644
--- a/drivers/nvme/target/loop.c
+++ b/drivers/nvme/target/loop.c
@@ -13,12 +13,10 @@
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/scatterlist.h>
-#include <linux/delay.h>
#include <linux/blk-mq.h>
#include <linux/nvme.h>
#include <linux/module.h>
#include <linux/parser.h>
-#include <linux/t10-pi.h>
#include "nvmet.h"
#include "../host/nvme.h"
#include "../host/fabrics.h"
@@ -93,31 +91,26 @@ static inline int nvme_loop_queue_idx(struct nvme_loop_queue *queue)
static void nvme_loop_complete_rq(struct request *req)
{
struct nvme_loop_iod *iod = blk_mq_rq_to_pdu(req);
- int error = 0;
nvme_cleanup_cmd(req);
sg_free_table_chained(&iod->sg_table, true);
+ nvme_complete_rq(req);
+}
- if (unlikely(req->errors)) {
- if (nvme_req_needs_retry(req, req->errors)) {
- nvme_requeue_req(req);
- return;
- }
-
- if (blk_rq_is_passthrough(req))
- error = req->errors;
- else
- error = nvme_error_status(req->errors);
- }
+static struct blk_mq_tags *nvme_loop_tagset(struct nvme_loop_queue *queue)
+{
+ u32 queue_idx = nvme_loop_queue_idx(queue);
- blk_mq_end_request(req, error);
+ if (queue_idx == 0)
+ return queue->ctrl->admin_tag_set.tags[queue_idx];
+ return queue->ctrl->tag_set.tags[queue_idx - 1];
}
static void nvme_loop_queue_response(struct nvmet_req *req)
{
- struct nvme_loop_iod *iod =
- container_of(req, struct nvme_loop_iod, req);
- struct nvme_completion *cqe = &iod->rsp;
+ struct nvme_loop_queue *queue =
+ container_of(req->sq, struct nvme_loop_queue, nvme_sq);
+ struct nvme_completion *cqe = req->rsp;
/*
* AEN requests are special as they don't time out and can
@@ -125,15 +118,22 @@ static void nvme_loop_queue_response(struct nvmet_req *req)
* aborts. We don't even bother to allocate a struct request
* for them but rather special case them here.
*/
- if (unlikely(nvme_loop_queue_idx(iod->queue) == 0 &&
+ if (unlikely(nvme_loop_queue_idx(queue) == 0 &&
cqe->command_id >= NVME_LOOP_AQ_BLKMQ_DEPTH)) {
- nvme_complete_async_event(&iod->queue->ctrl->ctrl, cqe->status,
+ nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status,
&cqe->result);
} else {
- struct request *rq = blk_mq_rq_from_pdu(iod);
+ struct request *rq;
+
+ rq = blk_mq_tag_to_rq(nvme_loop_tagset(queue), cqe->command_id);
+ if (!rq) {
+ dev_err(queue->ctrl->ctrl.device,
+ "tag 0x%x on queue %d not found\n",
+ cqe->command_id, nvme_loop_queue_idx(queue));
+ return;
+ }
- iod->nvme_req.result = cqe->result;
- blk_mq_complete_request(rq, le16_to_cpu(cqe->status) >> 1);
+ nvme_end_request(rq, cqe->status, cqe->result);
}
}
@@ -154,7 +154,7 @@ nvme_loop_timeout(struct request *rq, bool reserved)
schedule_work(&iod->queue->ctrl->reset_work);
/* fail with DNR on admin cmd timeout */
- rq->errors = NVME_SC_ABORT_REQ | NVME_SC_DNR;
+ nvme_req(rq)->status = NVME_SC_ABORT_REQ | NVME_SC_DNR;
return BLK_EH_HANDLED;
}
@@ -268,7 +268,7 @@ static int nvme_loop_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data,
return 0;
}
-static struct blk_mq_ops nvme_loop_mq_ops = {
+static const struct blk_mq_ops nvme_loop_mq_ops = {
.queue_rq = nvme_loop_queue_rq,
.complete = nvme_loop_complete_rq,
.init_request = nvme_loop_init_request,
@@ -276,7 +276,7 @@ static struct blk_mq_ops nvme_loop_mq_ops = {
.timeout = nvme_loop_timeout,
};
-static struct blk_mq_ops nvme_loop_admin_mq_ops = {
+static const struct blk_mq_ops nvme_loop_admin_mq_ops = {
.queue_rq = nvme_loop_queue_rq,
.complete = nvme_loop_complete_rq,
.init_request = nvme_loop_init_admin_request,
@@ -349,6 +349,19 @@ out_destroy_queues:
return ret;
}
+static int nvme_loop_connect_io_queues(struct nvme_loop_ctrl *ctrl)
+{
+ int i, ret;
+
+ for (i = 1; i < ctrl->queue_count; i++) {
+ ret = nvmf_connect_io_queue(&ctrl->ctrl, i);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl)
{
int error;
@@ -490,7 +503,7 @@ static void nvme_loop_reset_ctrl_work(struct work_struct *work)
struct nvme_loop_ctrl *ctrl = container_of(work,
struct nvme_loop_ctrl, reset_work);
bool changed;
- int i, ret;
+ int ret;
nvme_loop_shutdown_ctrl(ctrl);
@@ -502,11 +515,9 @@ static void nvme_loop_reset_ctrl_work(struct work_struct *work)
if (ret)
goto out_destroy_admin;
- for (i = 1; i < ctrl->queue_count; i++) {
- ret = nvmf_connect_io_queue(&ctrl->ctrl, i);
- if (ret)
- goto out_destroy_io;
- }
+ ret = nvme_loop_connect_io_queues(ctrl);
+ if (ret)
+ goto out_destroy_io;
changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
WARN_ON_ONCE(!changed);
@@ -559,7 +570,7 @@ static const struct nvme_ctrl_ops nvme_loop_ctrl_ops = {
static int nvme_loop_create_io_queues(struct nvme_loop_ctrl *ctrl)
{
- int ret, i;
+ int ret;
ret = nvme_loop_init_io_queues(ctrl);
if (ret)
@@ -588,11 +599,9 @@ static int nvme_loop_create_io_queues(struct nvme_loop_ctrl *ctrl)
goto out_free_tagset;
}
- for (i = 1; i < ctrl->queue_count; i++) {
- ret = nvmf_connect_io_queue(&ctrl->ctrl, i);
- if (ret)
- goto out_cleanup_connect_q;
- }
+ ret = nvme_loop_connect_io_queues(ctrl);
+ if (ret)
+ goto out_cleanup_connect_q;
return 0;
@@ -736,7 +745,12 @@ static int __init nvme_loop_init_module(void)
ret = nvmet_register_transport(&nvme_loop_ops);
if (ret)
return ret;
- return nvmf_register_transport(&nvme_loop_transport);
+
+ ret = nvmf_register_transport(&nvme_loop_transport);
+ if (ret)
+ nvmet_unregister_transport(&nvme_loop_ops);
+
+ return ret;
}
static void __exit nvme_loop_cleanup_module(void)
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index f7ff15f17ca9..7cb77ba5993b 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -253,11 +253,11 @@ struct nvmet_async_event {
u8 log_page;
};
-int nvmet_parse_connect_cmd(struct nvmet_req *req);
-int nvmet_parse_io_cmd(struct nvmet_req *req);
-int nvmet_parse_admin_cmd(struct nvmet_req *req);
-int nvmet_parse_discovery_cmd(struct nvmet_req *req);
-int nvmet_parse_fabrics_cmd(struct nvmet_req *req);
+u16 nvmet_parse_connect_cmd(struct nvmet_req *req);
+u16 nvmet_parse_io_cmd(struct nvmet_req *req);
+u16 nvmet_parse_admin_cmd(struct nvmet_req *req);
+u16 nvmet_parse_discovery_cmd(struct nvmet_req *req);
+u16 nvmet_parse_fabrics_cmd(struct nvmet_req *req);
bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq,
struct nvmet_sq *sq, struct nvmet_fabrics_ops *ops);
@@ -278,6 +278,7 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
u16 nvmet_ctrl_find_get(const char *subsysnqn, const char *hostnqn, u16 cntlid,
struct nvmet_req *req, struct nvmet_ctrl **ret);
void nvmet_ctrl_put(struct nvmet_ctrl *ctrl);
+u16 nvmet_check_ctrl_status(struct nvmet_req *req, struct nvme_command *cmd);
struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn,
enum nvme_subsys_type type);
diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c
index ecc4fe862561..99c69018a35f 100644
--- a/drivers/nvme/target/rdma.c
+++ b/drivers/nvme/target/rdma.c
@@ -1199,6 +1199,11 @@ static int nvmet_rdma_queue_connect(struct rdma_cm_id *cm_id,
}
queue->port = cm_id->context;
+ if (queue->host_qid == 0) {
+ /* Let inflight controller teardown complete */
+ flush_scheduled_work();
+ }
+
ret = nvmet_rdma_cm_accept(cm_id, queue, &event->param.conn);
if (ret)
goto release_queue;
@@ -1427,12 +1432,16 @@ restart:
static int nvmet_rdma_add_port(struct nvmet_port *port)
{
struct rdma_cm_id *cm_id;
- struct sockaddr_in addr_in;
- u16 port_in;
+ struct sockaddr_storage addr = { };
+ __kernel_sa_family_t af;
int ret;
switch (port->disc_addr.adrfam) {
case NVMF_ADDR_FAMILY_IP4:
+ af = AF_INET;
+ break;
+ case NVMF_ADDR_FAMILY_IP6:
+ af = AF_INET6;
break;
default:
pr_err("address family %d not supported\n",
@@ -1440,13 +1449,13 @@ static int nvmet_rdma_add_port(struct nvmet_port *port)
return -EINVAL;
}
- ret = kstrtou16(port->disc_addr.trsvcid, 0, &port_in);
- if (ret)
+ ret = inet_pton_with_scope(&init_net, af, port->disc_addr.traddr,
+ port->disc_addr.trsvcid, &addr);
+ if (ret) {
+ pr_err("malformed ip/port passed: %s:%s\n",
+ port->disc_addr.traddr, port->disc_addr.trsvcid);
return ret;
-
- addr_in.sin_family = AF_INET;
- addr_in.sin_addr.s_addr = in_aton(port->disc_addr.traddr);
- addr_in.sin_port = htons(port_in);
+ }
cm_id = rdma_create_id(&init_net, nvmet_rdma_cm_handler, port,
RDMA_PS_TCP, IB_QPT_RC);
@@ -1455,20 +1464,32 @@ static int nvmet_rdma_add_port(struct nvmet_port *port)
return PTR_ERR(cm_id);
}
- ret = rdma_bind_addr(cm_id, (struct sockaddr *)&addr_in);
+ /*
+ * Allow both IPv4 and IPv6 sockets to bind a single port
+ * at the same time.
+ */
+ ret = rdma_set_afonly(cm_id, 1);
+ if (ret) {
+ pr_err("rdma_set_afonly failed (%d)\n", ret);
+ goto out_destroy_id;
+ }
+
+ ret = rdma_bind_addr(cm_id, (struct sockaddr *)&addr);
if (ret) {
- pr_err("binding CM ID to %pISpc failed (%d)\n", &addr_in, ret);
+ pr_err("binding CM ID to %pISpcs failed (%d)\n",
+ (struct sockaddr *)&addr, ret);
goto out_destroy_id;
}
ret = rdma_listen(cm_id, 128);
if (ret) {
- pr_err("listening to %pISpc failed (%d)\n", &addr_in, ret);
+ pr_err("listening to %pISpcs failed (%d)\n",
+ (struct sockaddr *)&addr, ret);
goto out_destroy_id;
}
- pr_info("enabling port %d (%pISpc)\n",
- le16_to_cpu(port->disc_addr.portid), &addr_in);
+ pr_info("enabling port %d (%pISpcs)\n",
+ le16_to_cpu(port->disc_addr.portid), (struct sockaddr *)&addr);
port->priv = cm_id;
return 0;
diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig
index 4bc88eb52712..e1bffc9bb194 100644
--- a/drivers/platform/x86/Kconfig
+++ b/drivers/platform/x86/Kconfig
@@ -141,6 +141,14 @@ config DELL_WMI_AIO
To compile this driver as a module, choose M here: the module will
be called dell-wmi-aio.
+config DELL_WMI_LED
+ tristate "External LED on Dell Business Netbooks"
+ depends on LEDS_CLASS
+ depends on ACPI_WMI
+ help
+ This adds support for the Latitude 2100 and similar
+ notebooks that have an external LED.
+
config DELL_SMO8800
tristate "Dell Latitude freefall driver (ACPI SMO88XX)"
depends on ACPI
diff --git a/drivers/platform/x86/Makefile b/drivers/platform/x86/Makefile
index 299d0f9e40f7..776b3a7a4984 100644
--- a/drivers/platform/x86/Makefile
+++ b/drivers/platform/x86/Makefile
@@ -15,6 +15,7 @@ obj-$(CONFIG_DELL_SMBIOS) += dell-smbios.o
obj-$(CONFIG_DELL_LAPTOP) += dell-laptop.o
obj-$(CONFIG_DELL_WMI) += dell-wmi.o
obj-$(CONFIG_DELL_WMI_AIO) += dell-wmi-aio.o
+obj-$(CONFIG_DELL_WMI_LED) += dell-wmi-led.o
obj-$(CONFIG_DELL_SMO8800) += dell-smo8800.o
obj-$(CONFIG_DELL_RBTN) += dell-rbtn.o
obj-$(CONFIG_ACER_WMI) += acer-wmi.o
diff --git a/drivers/platform/x86/dell-laptop.c b/drivers/platform/x86/dell-laptop.c
index f57dd282a002..2e237bad4995 100644
--- a/drivers/platform/x86/dell-laptop.c
+++ b/drivers/platform/x86/dell-laptop.c
@@ -29,6 +29,7 @@
#include <linux/mm.h>
#include <linux/i8042.h>
#include <linux/debugfs.h>
+#include <linux/dell-led.h>
#include <linux/seq_file.h>
#include <acpi/video.h>
#include "dell-rbtn.h"
@@ -42,6 +43,8 @@
#define KBD_LED_AUTO_50_TOKEN 0x02EB
#define KBD_LED_AUTO_75_TOKEN 0x02EC
#define KBD_LED_AUTO_100_TOKEN 0x02F6
+#define GLOBAL_MIC_MUTE_ENABLE 0x0364
+#define GLOBAL_MIC_MUTE_DISABLE 0x0365
struct quirk_entry {
u8 touchpad_led;
@@ -1978,6 +1981,31 @@ static void kbd_led_exit(void)
led_classdev_unregister(&kbd_led);
}
+int dell_micmute_led_set(int state)
+{
+ struct calling_interface_buffer *buffer;
+ struct calling_interface_token *token;
+
+ if (state == 0)
+ token = dell_smbios_find_token(GLOBAL_MIC_MUTE_DISABLE);
+ else if (state == 1)
+ token = dell_smbios_find_token(GLOBAL_MIC_MUTE_ENABLE);
+ else
+ return -EINVAL;
+
+ if (!token)
+ return -ENODEV;
+
+ buffer = dell_smbios_get_buffer();
+ buffer->input[0] = token->location;
+ buffer->input[1] = token->value;
+ dell_smbios_send_request(1, 0);
+ dell_smbios_release_buffer();
+
+ return state;
+}
+EXPORT_SYMBOL_GPL(dell_micmute_led_set);
+
static int __init dell_init(void)
{
struct calling_interface_buffer *buffer;
diff --git a/drivers/leds/dell-led.c b/drivers/platform/x86/dell-wmi-led.c
index b3d6e9c15cf9..a0c7e99530ef 100644
--- a/drivers/leds/dell-led.c
+++ b/drivers/platform/x86/dell-wmi-led.c
@@ -1,6 +1,4 @@
/*
- * dell_led.c - Dell LED Driver
- *
* Copyright (C) 2010 Dell Inc.
* Louis Davis <louis_davis@dell.com>
* Jim Dailey <jim_dailey@dell.com>
@@ -15,16 +13,12 @@
#include <linux/leds.h>
#include <linux/slab.h>
#include <linux/module.h>
-#include <linux/dmi.h>
-#include <linux/dell-led.h>
-#include "../platform/x86/dell-smbios.h"
MODULE_AUTHOR("Louis Davis/Jim Dailey");
MODULE_DESCRIPTION("Dell LED Control Driver");
MODULE_LICENSE("GPL");
#define DELL_LED_BIOS_GUID "F6E4FE6E-909D-47cb-8BAB-C9F6F2F8D396"
-#define DELL_APP_GUID "A80593CE-A997-11DA-B012-B622A1EF5492"
MODULE_ALIAS("wmi:" DELL_LED_BIOS_GUID);
/* Error Result Codes: */
@@ -43,53 +37,6 @@ MODULE_ALIAS("wmi:" DELL_LED_BIOS_GUID);
#define CMD_LED_OFF 17
#define CMD_LED_BLINK 18
-#define GLOBAL_MIC_MUTE_ENABLE 0x364
-#define GLOBAL_MIC_MUTE_DISABLE 0x365
-
-static int dell_micmute_led_set(int state)
-{
- struct calling_interface_buffer *buffer;
- struct calling_interface_token *token;
-
- if (!wmi_has_guid(DELL_APP_GUID))
- return -ENODEV;
-
- if (state == 0)
- token = dell_smbios_find_token(GLOBAL_MIC_MUTE_DISABLE);
- else if (state == 1)
- token = dell_smbios_find_token(GLOBAL_MIC_MUTE_ENABLE);
- else
- return -EINVAL;
-
- if (!token)
- return -ENODEV;
-
- buffer = dell_smbios_get_buffer();
- buffer->input[0] = token->location;
- buffer->input[1] = token->value;
- dell_smbios_send_request(1, 0);
- dell_smbios_release_buffer();
-
- return state;
-}
-
-int dell_app_wmi_led_set(int whichled, int on)
-{
- int state = 0;
-
- switch (whichled) {
- case DELL_LED_MICMUTE:
- state = dell_micmute_led_set(on);
- break;
- default:
- pr_warn("led type %x is not supported\n", whichled);
- break;
- }
-
- return state;
-}
-EXPORT_SYMBOL_GPL(dell_app_wmi_led_set);
-
struct bios_args {
unsigned char length;
unsigned char result_code;
@@ -99,37 +46,29 @@ struct bios_args {
unsigned char off_time;
};
-static int dell_led_perform_fn(u8 length,
- u8 result_code,
- u8 device_id,
- u8 command,
- u8 on_time,
- u8 off_time)
+static int dell_led_perform_fn(u8 length, u8 result_code, u8 device_id,
+ u8 command, u8 on_time, u8 off_time)
{
- struct bios_args *bios_return;
- u8 return_code;
- union acpi_object *obj;
struct acpi_buffer output = { ACPI_ALLOCATE_BUFFER, NULL };
+ struct bios_args *bios_return;
struct acpi_buffer input;
+ union acpi_object *obj;
acpi_status status;
+ u8 return_code;
- struct bios_args args;
- args.length = length;
- args.result_code = result_code;
- args.device_id = device_id;
- args.command = command;
- args.on_time = on_time;
- args.off_time = off_time;
+ struct bios_args args = {
+ .length = length,
+ .result_code = result_code,
+ .device_id = device_id,
+ .command = command,
+ .on_time = on_time,
+ .off_time = off_time
+ };
input.length = sizeof(struct bios_args);
input.pointer = &args;
- status = wmi_evaluate_method(DELL_LED_BIOS_GUID,
- 1,
- 1,
- &input,
- &output);
-
+ status = wmi_evaluate_method(DELL_LED_BIOS_GUID, 1, 1, &input, &output);
if (ACPI_FAILURE(status))
return status;
@@ -137,7 +76,7 @@ static int dell_led_perform_fn(u8 length,
if (!obj)
return -EINVAL;
- else if (obj->type != ACPI_TYPE_BUFFER) {
+ if (obj->type != ACPI_TYPE_BUFFER) {
kfree(obj);
return -EINVAL;
}
@@ -170,8 +109,7 @@ static int led_off(void)
0); /* not used */
}
-static int led_blink(unsigned char on_eighths,
- unsigned char off_eighths)
+static int led_blink(unsigned char on_eighths, unsigned char off_eighths)
{
return dell_led_perform_fn(5, /* Length of command */
INTERFACE_ERROR, /* Init to INTERFACE_ERROR */
@@ -182,7 +120,7 @@ static int led_blink(unsigned char on_eighths,
}
static void dell_led_set(struct led_classdev *led_cdev,
- enum led_brightness value)
+ enum led_brightness value)
{
if (value == LED_OFF)
led_off();
@@ -191,27 +129,22 @@ static void dell_led_set(struct led_classdev *led_cdev,
}
static int dell_led_blink(struct led_classdev *led_cdev,
- unsigned long *delay_on,
- unsigned long *delay_off)
+ unsigned long *delay_on, unsigned long *delay_off)
{
unsigned long on_eighths;
unsigned long off_eighths;
- /* The Dell LED delay is based on 125ms intervals.
- Need to round up to next interval. */
+ /*
+ * The Dell LED delay is based on 125ms intervals.
+ * Need to round up to next interval.
+ */
- on_eighths = (*delay_on + 124) / 125;
- if (0 == on_eighths)
- on_eighths = 1;
- if (on_eighths > 255)
- on_eighths = 255;
+ on_eighths = DIV_ROUND_UP(*delay_on, 125);
+ on_eighths = clamp_t(unsigned long, on_eighths, 1, 255);
*delay_on = on_eighths * 125;
- off_eighths = (*delay_off + 124) / 125;
- if (0 == off_eighths)
- off_eighths = 1;
- if (off_eighths > 255)
- off_eighths = 255;
+ off_eighths = DIV_ROUND_UP(*delay_off, 125);
+ off_eighths = clamp_t(unsigned long, off_eighths, 1, 255);
*delay_off = off_eighths * 125;
led_blink(on_eighths, off_eighths);
@@ -232,29 +165,21 @@ static int __init dell_led_init(void)
{
int error = 0;
- if (!wmi_has_guid(DELL_LED_BIOS_GUID) && !wmi_has_guid(DELL_APP_GUID))
+ if (!wmi_has_guid(DELL_LED_BIOS_GUID))
return -ENODEV;
- if (wmi_has_guid(DELL_LED_BIOS_GUID)) {
- error = led_off();
- if (error != 0)
- return -ENODEV;
-
- error = led_classdev_register(NULL, &dell_led);
- }
+ error = led_off();
+ if (error != 0)
+ return -ENODEV;
- return error;
+ return led_classdev_register(NULL, &dell_led);
}
static void __exit dell_led_exit(void)
{
- int error = 0;
+ led_classdev_unregister(&dell_led);
- if (wmi_has_guid(DELL_LED_BIOS_GUID)) {
- error = led_off();
- if (error == 0)
- led_classdev_unregister(&dell_led);
- }
+ led_off();
}
module_init(dell_led_init);
diff --git a/drivers/power/reset/Kconfig b/drivers/power/reset/Kconfig
index b8cacccf18c8..13f1714cf6f7 100644
--- a/drivers/power/reset/Kconfig
+++ b/drivers/power/reset/Kconfig
@@ -67,6 +67,15 @@ config POWER_RESET_BRCMSTB
Say Y here if you have a Broadcom STB board and you wish
to have restart support.
+config POWER_RESET_GEMINI_POWEROFF
+ bool "Cortina Gemini power-off driver"
+ depends on ARCH_GEMINI || COMPILE_TEST
+ depends on OF && HAS_IOMEM
+ default ARCH_GEMINI
+ help
+ This driver supports turning off the Cortina Gemini SoC.
+ Select this if you're building a kernel with Gemini SoC support.
+
config POWER_RESET_GPIO
bool "GPIO power-off driver"
depends on OF_GPIO
diff --git a/drivers/power/reset/Makefile b/drivers/power/reset/Makefile
index 11dae3b56ff9..58cf5b30559f 100644
--- a/drivers/power/reset/Makefile
+++ b/drivers/power/reset/Makefile
@@ -5,6 +5,7 @@ obj-$(CONFIG_POWER_RESET_AT91_SAMA5D2_SHDWC) += at91-sama5d2_shdwc.o
obj-$(CONFIG_POWER_RESET_AXXIA) += axxia-reset.o
obj-$(CONFIG_POWER_RESET_BRCMKONA) += brcm-kona-reset.o
obj-$(CONFIG_POWER_RESET_BRCMSTB) += brcmstb-reboot.o
+obj-$(CONFIG_POWER_RESET_GEMINI_POWEROFF) += gemini-poweroff.o
obj-$(CONFIG_POWER_RESET_GPIO) += gpio-poweroff.o
obj-$(CONFIG_POWER_RESET_GPIO_RESTART) += gpio-restart.o
obj-$(CONFIG_POWER_RESET_HISI) += hisi-reboot.o
diff --git a/drivers/power/reset/gemini-poweroff.c b/drivers/power/reset/gemini-poweroff.c
new file mode 100644
index 000000000000..de878fd26f27
--- /dev/null
+++ b/drivers/power/reset/gemini-poweroff.c
@@ -0,0 +1,160 @@
+/*
+ * Gemini power management controller
+ * Copyright (C) 2017 Linus Walleij <linus.walleij@linaro.org>
+ *
+ * Inspired by code from the SL3516 board support by Jason Lee
+ * Inspired by code from Janos Laube <janos.dev@gmail.com>
+ */
+#include <linux/of.h>
+#include <linux/of_platform.h>
+#include <linux/platform_device.h>
+#include <linux/pm.h>
+#include <linux/bitops.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/reboot.h>
+
+#define GEMINI_PWC_ID 0x00010500
+#define GEMINI_PWC_IDREG 0x00
+#define GEMINI_PWC_CTRLREG 0x04
+#define GEMINI_PWC_STATREG 0x08
+
+#define GEMINI_CTRL_SHUTDOWN BIT(0)
+#define GEMINI_CTRL_ENABLE BIT(1)
+#define GEMINI_CTRL_IRQ_CLR BIT(2)
+
+#define GEMINI_STAT_CIR BIT(4)
+#define GEMINI_STAT_RTC BIT(5)
+#define GEMINI_STAT_POWERBUTTON BIT(6)
+
+struct gemini_powercon {
+ struct device *dev;
+ void __iomem *base;
+};
+
+static irqreturn_t gemini_powerbutton_interrupt(int irq, void *data)
+{
+ struct gemini_powercon *gpw = data;
+ u32 val;
+
+ /* ACK the IRQ */
+ val = readl(gpw->base + GEMINI_PWC_CTRLREG);
+ val |= GEMINI_CTRL_IRQ_CLR;
+ writel(val, gpw->base + GEMINI_PWC_CTRLREG);
+
+ val = readl(gpw->base + GEMINI_PWC_STATREG);
+ val &= 0x70U;
+ switch (val) {
+ case GEMINI_STAT_CIR:
+ dev_info(gpw->dev, "infrared poweroff\n");
+ orderly_poweroff(true);
+ break;
+ case GEMINI_STAT_RTC:
+ dev_info(gpw->dev, "RTC poweroff\n");
+ orderly_poweroff(true);
+ break;
+ case GEMINI_STAT_POWERBUTTON:
+ dev_info(gpw->dev, "poweroff button pressed\n");
+ orderly_poweroff(true);
+ break;
+ default:
+ dev_info(gpw->dev, "other power management IRQ\n");
+ break;
+ }
+
+ return IRQ_HANDLED;
+}
+
+/* This callback needs this static local as it has void as argument */
+static struct gemini_powercon *gpw_poweroff;
+
+static void gemini_poweroff(void)
+{
+ struct gemini_powercon *gpw = gpw_poweroff;
+ u32 val;
+
+ dev_crit(gpw->dev, "Gemini power off\n");
+ val = readl(gpw->base + GEMINI_PWC_CTRLREG);
+ val |= GEMINI_CTRL_ENABLE | GEMINI_CTRL_IRQ_CLR;
+ writel(val, gpw->base + GEMINI_PWC_CTRLREG);
+
+ val &= ~GEMINI_CTRL_ENABLE;
+ val |= GEMINI_CTRL_SHUTDOWN;
+ writel(val, gpw->base + GEMINI_PWC_CTRLREG);
+}
+
+static int gemini_poweroff_probe(struct platform_device *pdev)
+{
+ struct device *dev = &pdev->dev;
+ struct resource *res;
+ struct gemini_powercon *gpw;
+ u32 val;
+ int irq;
+ int ret;
+
+ gpw = devm_kzalloc(dev, sizeof(*gpw), GFP_KERNEL);
+ if (!gpw)
+ return -ENOMEM;
+
+ res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+ gpw->base = devm_ioremap_resource(dev, res);
+ if (IS_ERR(gpw->base))
+ return PTR_ERR(gpw->base);
+
+ irq = platform_get_irq(pdev, 0);
+ if (!irq)
+ return -EINVAL;
+
+ gpw->dev = dev;
+
+ val = readl(gpw->base + GEMINI_PWC_IDREG);
+ val &= 0xFFFFFF00U;
+ if (val != GEMINI_PWC_ID) {
+ dev_err(dev, "wrong power controller ID: %08x\n",
+ val);
+ return -ENODEV;
+ }
+
+ /* Clear the power management IRQ */
+ val = readl(gpw->base + GEMINI_PWC_CTRLREG);
+ val |= GEMINI_CTRL_IRQ_CLR;
+ writel(val, gpw->base + GEMINI_PWC_CTRLREG);
+
+ ret = devm_request_irq(dev, irq, gemini_powerbutton_interrupt, 0,
+ "poweroff", gpw);
+ if (ret)
+ return ret;
+
+ pm_power_off = gemini_poweroff;
+ gpw_poweroff = gpw;
+
+ /*
+ * Enable the power controller. This is crucial on Gemini
+ * systems: if this is not done, pressing the power button
+ * will result in unconditional poweroff without any warning.
+ * This makes the kernel handle the poweroff.
+ */
+ val = readl(gpw->base + GEMINI_PWC_CTRLREG);
+ val |= GEMINI_CTRL_ENABLE;
+ writel(val, gpw->base + GEMINI_PWC_CTRLREG);
+
+ dev_info(dev, "Gemini poweroff driver registered\n");
+
+ return 0;
+}
+
+static const struct of_device_id gemini_poweroff_of_match[] = {
+ {
+ .compatible = "cortina,gemini-power-controller",
+ },
+ {}
+};
+
+static struct platform_driver gemini_poweroff_driver = {
+ .probe = gemini_poweroff_probe,
+ .driver = {
+ .name = "gemini-poweroff",
+ .of_match_table = gemini_poweroff_of_match,
+ },
+};
+builtin_platform_driver(gemini_poweroff_driver);
diff --git a/drivers/power/reset/syscon-poweroff.c b/drivers/power/reset/syscon-poweroff.c
index b68338399e5e..f9f1cb54fbf9 100644
--- a/drivers/power/reset/syscon-poweroff.c
+++ b/drivers/power/reset/syscon-poweroff.c
@@ -28,12 +28,13 @@
static struct regmap *map;
static u32 offset;
+static u32 value;
static u32 mask;
static void syscon_poweroff(void)
{
/* Issue the poweroff */
- regmap_write(map, offset, mask);
+ regmap_update_bits(map, offset, mask, value);
mdelay(1000);
@@ -43,6 +44,7 @@ static void syscon_poweroff(void)
static int syscon_poweroff_probe(struct platform_device *pdev)
{
char symname[KSYM_NAME_LEN];
+ int mask_err, value_err;
map = syscon_regmap_lookup_by_phandle(pdev->dev.of_node, "regmap");
if (IS_ERR(map)) {
@@ -55,11 +57,22 @@ static int syscon_poweroff_probe(struct platform_device *pdev)
return -EINVAL;
}
- if (of_property_read_u32(pdev->dev.of_node, "mask", &mask)) {
- dev_err(&pdev->dev, "unable to read 'mask'");
+ value_err = of_property_read_u32(pdev->dev.of_node, "value", &value);
+ mask_err = of_property_read_u32(pdev->dev.of_node, "mask", &mask);
+ if (value_err && mask_err) {
+ dev_err(&pdev->dev, "unable to read 'value' and 'mask'");
return -EINVAL;
}
+ if (value_err) {
+ /* support old binding */
+ value = mask;
+ mask = 0xFFFFFFFF;
+ } else if (mask_err) {
+ /* support value without mask*/
+ mask = 0xFFFFFFFF;
+ }
+
if (pm_power_off) {
lookup_symbol_name((ulong)pm_power_off, symname);
dev_err(&pdev->dev,
diff --git a/drivers/power/supply/Kconfig b/drivers/power/supply/Kconfig
index da54ac88f068..da922756149f 100644
--- a/drivers/power/supply/Kconfig
+++ b/drivers/power/supply/Kconfig
@@ -117,6 +117,12 @@ config BATTERY_DS2782
Say Y here to enable support for the DS2782/DS2786 standalone battery
gas-gauge.
+config BATTERY_LEGO_EV3
+ tristate "LEGO MINDSTORMS EV3 battery"
+ depends on OF && IIO && GPIOLIB
+ help
+ Say Y here to enable support for the LEGO MINDSTORMS EV3 battery.
+
config BATTERY_PMU
tristate "Apple PMU battery"
depends on PPC32 && ADB_PMU
@@ -317,6 +323,14 @@ config BATTERY_RX51
Say Y here to enable support for battery information on Nokia
RX-51, also known as N900 tablet.
+config CHARGER_CPCAP
+ tristate "CPCAP PMIC Charger Driver"
+ depends on MFD_CPCAP && IIO
+ default MFD_CPCAP
+ help
+ Say Y to enable support for CPCAP PMIC charger driver for Motorola
+ mobile devices such as Droid 4.
+
config CHARGER_ISP1704
tristate "ISP1704 USB Charger Detection"
depends on USB_PHY
@@ -438,6 +452,7 @@ config CHARGER_BQ2415X
config CHARGER_BQ24190
tristate "TI BQ24190 battery charger driver"
depends on I2C
+ depends on EXTCON
depends on GPIOLIB || COMPILE_TEST
help
Say Y to enable support for the TI BQ24190 battery charger.
diff --git a/drivers/power/supply/Makefile b/drivers/power/supply/Makefile
index 3789a2c06fdf..39fc733e6cc4 100644
--- a/drivers/power/supply/Makefile
+++ b/drivers/power/supply/Makefile
@@ -25,6 +25,7 @@ obj-$(CONFIG_BATTERY_DS2781) += ds2781_battery.o
obj-$(CONFIG_BATTERY_DS2782) += ds2782_battery.o
obj-$(CONFIG_BATTERY_GAUGE_LTC2941) += ltc2941-battery-gauge.o
obj-$(CONFIG_BATTERY_GOLDFISH) += goldfish_battery.o
+obj-$(CONFIG_BATTERY_LEGO_EV3) += lego_ev3_battery.o
obj-$(CONFIG_BATTERY_PMU) += pmu_battery.o
obj-$(CONFIG_BATTERY_OLPC) += olpc_battery.o
obj-$(CONFIG_BATTERY_TOSA) += tosa_battery.o
@@ -51,6 +52,7 @@ obj-$(CONFIG_CHARGER_PCF50633) += pcf50633-charger.o
obj-$(CONFIG_BATTERY_JZ4740) += jz4740-battery.o
obj-$(CONFIG_BATTERY_RX51) += rx51_battery.o
obj-$(CONFIG_AB8500_BM) += ab8500_bmdata.o ab8500_charger.o ab8500_fg.o ab8500_btemp.o abx500_chargalg.o pm2301_charger.o
+obj-$(CONFIG_CHARGER_CPCAP) += cpcap-charger.o
obj-$(CONFIG_CHARGER_ISP1704) += isp1704_charger.o
obj-$(CONFIG_CHARGER_MAX8903) += max8903_charger.o
obj-$(CONFIG_CHARGER_TWL4030) += twl4030_charger.o
diff --git a/drivers/power/supply/ab8500_bmdata.c b/drivers/power/supply/ab8500_bmdata.c
index d29864533093..8c49586015d0 100644
--- a/drivers/power/supply/ab8500_bmdata.c
+++ b/drivers/power/supply/ab8500_bmdata.c
@@ -430,10 +430,10 @@ static const struct abx500_maxim_parameters ab8500_maxi_params = {
};
static const struct abx500_maxim_parameters abx540_maxi_params = {
- .ena_maxi = true,
- .chg_curr = 3000,
- .wait_cycles = 10,
- .charger_curr_step = 200,
+ .ena_maxi = true,
+ .chg_curr = 3000,
+ .wait_cycles = 10,
+ .charger_curr_step = 200,
};
static const struct abx500_bm_charger_parameters chg = {
diff --git a/drivers/power/supply/bq24190_charger.c b/drivers/power/supply/bq24190_charger.c
index a4f08492abeb..bd9e5c3d8cc2 100644
--- a/drivers/power/supply/bq24190_charger.c
+++ b/drivers/power/supply/bq24190_charger.c
@@ -11,16 +11,15 @@
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
+#include <linux/extcon.h>
#include <linux/of_irq.h>
#include <linux/of_device.h>
#include <linux/pm_runtime.h>
#include <linux/power_supply.h>
+#include <linux/workqueue.h>
#include <linux/gpio.h>
#include <linux/i2c.h>
-#include <linux/power/bq24190_charger.h>
-
-
#define BQ24190_MANUFACTURER "Texas Instruments"
#define BQ24190_REG_ISC 0x00 /* Input Source Control */
@@ -39,6 +38,9 @@
#define BQ24190_REG_POC_WDT_RESET_SHIFT 6
#define BQ24190_REG_POC_CHG_CONFIG_MASK (BIT(5) | BIT(4))
#define BQ24190_REG_POC_CHG_CONFIG_SHIFT 4
+#define BQ24190_REG_POC_CHG_CONFIG_DISABLE 0x0
+#define BQ24190_REG_POC_CHG_CONFIG_CHARGE 0x1
+#define BQ24190_REG_POC_CHG_CONFIG_OTG 0x2
#define BQ24190_REG_POC_SYS_MIN_MASK (BIT(3) | BIT(2) | BIT(1))
#define BQ24190_REG_POC_SYS_MIN_SHIFT 1
#define BQ24190_REG_POC_BOOST_LIM_MASK BIT(0)
@@ -151,10 +153,12 @@ struct bq24190_dev_info {
struct device *dev;
struct power_supply *charger;
struct power_supply *battery;
+ struct extcon_dev *extcon;
+ struct notifier_block extcon_nb;
+ struct delayed_work extcon_work;
char model_name[I2C_NAME_SIZE];
- kernel_ulong_t model;
- unsigned int gpio_int;
- unsigned int irq;
+ bool initialized;
+ bool irq_event;
struct mutex f_reg_lock;
u8 f_reg;
u8 ss_reg;
@@ -168,6 +172,12 @@ struct bq24190_dev_info {
* number at that index in the array is the real-world value that it
* represents.
*/
+
+/* REG00[2:0] (IINLIM) in uAh */
+static const int bq24190_isc_iinlim_values[] = {
+ 100000, 150000, 500000, 900000, 1200000, 1500000, 2000000, 3000000
+};
+
/* REG02[7:2] (ICHG) in uAh */
static const int bq24190_ccc_ichg_values[] = {
512000, 576000, 640000, 704000, 768000, 832000, 896000, 960000,
@@ -418,6 +428,7 @@ static ssize_t bq24190_sysfs_show(struct device *dev,
struct power_supply *psy = dev_get_drvdata(dev);
struct bq24190_dev_info *bdi = power_supply_get_drvdata(psy);
struct bq24190_sysfs_field_info *info;
+ ssize_t count;
int ret;
u8 v;
@@ -425,11 +436,20 @@ static ssize_t bq24190_sysfs_show(struct device *dev,
if (!info)
return -EINVAL;
+ ret = pm_runtime_get_sync(bdi->dev);
+ if (ret < 0)
+ return ret;
+
ret = bq24190_read_mask(bdi, info->reg, info->mask, info->shift, &v);
if (ret)
- return ret;
+ count = ret;
+ else
+ count = scnprintf(buf, PAGE_SIZE, "%hhx\n", v);
+
+ pm_runtime_mark_last_busy(bdi->dev);
+ pm_runtime_put_autosuspend(bdi->dev);
- return scnprintf(buf, PAGE_SIZE, "%hhx\n", v);
+ return count;
}
static ssize_t bq24190_sysfs_store(struct device *dev,
@@ -449,9 +469,16 @@ static ssize_t bq24190_sysfs_store(struct device *dev,
if (ret < 0)
return ret;
+ ret = pm_runtime_get_sync(bdi->dev);
+ if (ret < 0)
+ return ret;
+
ret = bq24190_write_mask(bdi, info->reg, info->mask, info->shift, v);
if (ret)
- return ret;
+ count = ret;
+
+ pm_runtime_mark_last_busy(bdi->dev);
+ pm_runtime_put_autosuspend(bdi->dev);
return count;
}
@@ -523,16 +550,13 @@ static int bq24190_register_reset(struct bq24190_dev_info *bdi)
if (ret < 0)
return ret;
- if (!v)
- break;
+ if (v == 0)
+ return 0;
- udelay(10);
+ usleep_range(100, 200);
} while (--limit);
- if (!limit)
- return -EIO;
-
- return 0;
+ return -EIO;
}
/* Charger power supply property routines */
@@ -793,7 +817,9 @@ static int bq24190_charger_get_property(struct power_supply *psy,
dev_dbg(bdi->dev, "prop: %d\n", psp);
- pm_runtime_get_sync(bdi->dev);
+ ret = pm_runtime_get_sync(bdi->dev);
+ if (ret < 0)
+ return ret;
switch (psp) {
case POWER_SUPPLY_PROP_CHARGE_TYPE:
@@ -833,7 +859,9 @@ static int bq24190_charger_get_property(struct power_supply *psy,
ret = -ENODATA;
}
- pm_runtime_put_sync(bdi->dev);
+ pm_runtime_mark_last_busy(bdi->dev);
+ pm_runtime_put_autosuspend(bdi->dev);
+
return ret;
}
@@ -846,7 +874,9 @@ static int bq24190_charger_set_property(struct power_supply *psy,
dev_dbg(bdi->dev, "prop: %d\n", psp);
- pm_runtime_get_sync(bdi->dev);
+ ret = pm_runtime_get_sync(bdi->dev);
+ if (ret < 0)
+ return ret;
switch (psp) {
case POWER_SUPPLY_PROP_CHARGE_TYPE:
@@ -862,7 +892,9 @@ static int bq24190_charger_set_property(struct power_supply *psy,
ret = -EINVAL;
}
- pm_runtime_put_sync(bdi->dev);
+ pm_runtime_mark_last_busy(bdi->dev);
+ pm_runtime_put_autosuspend(bdi->dev);
+
return ret;
}
@@ -1063,7 +1095,9 @@ static int bq24190_battery_get_property(struct power_supply *psy,
dev_dbg(bdi->dev, "prop: %d\n", psp);
- pm_runtime_get_sync(bdi->dev);
+ ret = pm_runtime_get_sync(bdi->dev);
+ if (ret < 0)
+ return ret;
switch (psp) {
case POWER_SUPPLY_PROP_STATUS:
@@ -1091,7 +1125,9 @@ static int bq24190_battery_get_property(struct power_supply *psy,
ret = -ENODATA;
}
- pm_runtime_put_sync(bdi->dev);
+ pm_runtime_mark_last_busy(bdi->dev);
+ pm_runtime_put_autosuspend(bdi->dev);
+
return ret;
}
@@ -1104,7 +1140,9 @@ static int bq24190_battery_set_property(struct power_supply *psy,
dev_dbg(bdi->dev, "prop: %d\n", psp);
- pm_runtime_get_sync(bdi->dev);
+ ret = pm_runtime_get_sync(bdi->dev);
+ if (ret < 0)
+ return ret;
switch (psp) {
case POWER_SUPPLY_PROP_ONLINE:
@@ -1117,7 +1155,9 @@ static int bq24190_battery_set_property(struct power_supply *psy,
ret = -EINVAL;
}
- pm_runtime_put_sync(bdi->dev);
+ pm_runtime_mark_last_busy(bdi->dev);
+ pm_runtime_put_autosuspend(bdi->dev);
+
return ret;
}
@@ -1157,9 +1197,8 @@ static const struct power_supply_desc bq24190_battery_desc = {
.property_is_writeable = bq24190_battery_property_is_writeable,
};
-static irqreturn_t bq24190_irq_handler_thread(int irq, void *data)
+static void bq24190_check_status(struct bq24190_dev_info *bdi)
{
- struct bq24190_dev_info *bdi = data;
const u8 battery_mask_ss = BQ24190_REG_SS_CHRG_STAT_MASK;
const u8 battery_mask_f = BQ24190_REG_F_BAT_FAULT_MASK
| BQ24190_REG_F_NTC_FAULT_MASK;
@@ -1167,12 +1206,10 @@ static irqreturn_t bq24190_irq_handler_thread(int irq, void *data)
u8 ss_reg = 0, f_reg = 0;
int i, ret;
- pm_runtime_get_sync(bdi->dev);
-
ret = bq24190_read(bdi, BQ24190_REG_SS, &ss_reg);
if (ret < 0) {
dev_err(bdi->dev, "Can't read SS reg: %d\n", ret);
- goto out;
+ return;
}
i = 0;
@@ -1180,12 +1217,17 @@ static irqreturn_t bq24190_irq_handler_thread(int irq, void *data)
ret = bq24190_read(bdi, BQ24190_REG_F, &f_reg);
if (ret < 0) {
dev_err(bdi->dev, "Can't read F reg: %d\n", ret);
- goto out;
+ return;
}
} while (f_reg && ++i < 2);
+ /* ignore over/under voltage fault after disconnect */
+ if (f_reg == (1 << BQ24190_REG_F_CHRG_FAULT_SHIFT) &&
+ !(ss_reg & BQ24190_REG_SS_PG_STAT_MASK))
+ f_reg = 0;
+
if (f_reg != bdi->f_reg) {
- dev_info(bdi->dev,
+ dev_warn(bdi->dev,
"Fault: boost %d, charge %d, battery %d, ntc %d\n",
!!(f_reg & BQ24190_REG_F_BOOST_FAULT_MASK),
!!(f_reg & BQ24190_REG_F_CHRG_FAULT_MASK),
@@ -1229,90 +1271,126 @@ static irqreturn_t bq24190_irq_handler_thread(int irq, void *data)
if (alert_battery)
power_supply_changed(bdi->battery);
-out:
- pm_runtime_put_sync(bdi->dev);
-
dev_dbg(bdi->dev, "ss_reg: 0x%02x, f_reg: 0x%02x\n", ss_reg, f_reg);
+}
+
+static irqreturn_t bq24190_irq_handler_thread(int irq, void *data)
+{
+ struct bq24190_dev_info *bdi = data;
+ int error;
+
+ bdi->irq_event = true;
+ error = pm_runtime_get_sync(bdi->dev);
+ if (error < 0) {
+ dev_warn(bdi->dev, "pm_runtime_get failed: %i\n", error);
+ pm_runtime_put_noidle(bdi->dev);
+ return IRQ_NONE;
+ }
+ bq24190_check_status(bdi);
+ pm_runtime_mark_last_busy(bdi->dev);
+ pm_runtime_put_autosuspend(bdi->dev);
+ bdi->irq_event = false;
return IRQ_HANDLED;
}
-static int bq24190_hw_init(struct bq24190_dev_info *bdi)
+static void bq24190_extcon_work(struct work_struct *work)
{
+ struct bq24190_dev_info *bdi =
+ container_of(work, struct bq24190_dev_info, extcon_work.work);
+ int error, iinlim = 0;
u8 v;
- int ret;
-
- pm_runtime_get_sync(bdi->dev);
- /* First check that the device really is what its supposed to be */
- ret = bq24190_read_mask(bdi, BQ24190_REG_VPRS,
- BQ24190_REG_VPRS_PN_MASK,
- BQ24190_REG_VPRS_PN_SHIFT,
- &v);
- if (ret < 0)
- goto out;
+ error = pm_runtime_get_sync(bdi->dev);
+ if (error < 0) {
+ dev_warn(bdi->dev, "pm_runtime_get failed: %i\n", error);
+ pm_runtime_put_noidle(bdi->dev);
+ return;
+ }
- if (v != bdi->model) {
- ret = -ENODEV;
- goto out;
+ if (extcon_get_state(bdi->extcon, EXTCON_CHG_USB_SDP) == 1)
+ iinlim = 500000;
+ else if (extcon_get_state(bdi->extcon, EXTCON_CHG_USB_CDP) == 1 ||
+ extcon_get_state(bdi->extcon, EXTCON_CHG_USB_ACA) == 1)
+ iinlim = 1500000;
+ else if (extcon_get_state(bdi->extcon, EXTCON_CHG_USB_DCP) == 1)
+ iinlim = 2000000;
+
+ if (iinlim) {
+ error = bq24190_set_field_val(bdi, BQ24190_REG_ISC,
+ BQ24190_REG_ISC_IINLIM_MASK,
+ BQ24190_REG_ISC_IINLIM_SHIFT,
+ bq24190_isc_iinlim_values,
+ ARRAY_SIZE(bq24190_isc_iinlim_values),
+ iinlim);
+ if (error < 0)
+ dev_err(bdi->dev, "Can't set IINLIM: %d\n", error);
}
- ret = bq24190_register_reset(bdi);
- if (ret < 0)
- goto out;
+ /* if no charger found and in USB host mode, set OTG 5V boost, else normal */
+ if (!iinlim && extcon_get_state(bdi->extcon, EXTCON_USB_HOST) == 1)
+ v = BQ24190_REG_POC_CHG_CONFIG_OTG;
+ else
+ v = BQ24190_REG_POC_CHG_CONFIG_CHARGE;
- ret = bq24190_set_mode_host(bdi);
- if (ret < 0)
- goto out;
+ error = bq24190_write_mask(bdi, BQ24190_REG_POC,
+ BQ24190_REG_POC_CHG_CONFIG_MASK,
+ BQ24190_REG_POC_CHG_CONFIG_SHIFT,
+ v);
+ if (error < 0)
+ dev_err(bdi->dev, "Can't set CHG_CONFIG: %d\n", error);
- ret = bq24190_read(bdi, BQ24190_REG_SS, &bdi->ss_reg);
-out:
- pm_runtime_put_sync(bdi->dev);
- return ret;
+ pm_runtime_mark_last_busy(bdi->dev);
+ pm_runtime_put_autosuspend(bdi->dev);
}
-#ifdef CONFIG_OF
-static int bq24190_setup_dt(struct bq24190_dev_info *bdi)
+static int bq24190_extcon_event(struct notifier_block *nb, unsigned long event,
+ void *param)
{
- bdi->irq = irq_of_parse_and_map(bdi->dev->of_node, 0);
- if (bdi->irq <= 0)
- return -1;
+ struct bq24190_dev_info *bdi =
+ container_of(nb, struct bq24190_dev_info, extcon_nb);
- return 0;
-}
-#else
-static int bq24190_setup_dt(struct bq24190_dev_info *bdi)
-{
- return -1;
+ /*
+ * The Power-Good detection may take up to 220ms, sometimes
+ * the external charger detection is quicker, and the bq24190 will
+ * reset to iinlim based on its own charger detection (which is not
+ * hooked up when using external charger detection) resulting in
+ * a too low default 500mA iinlim. Delay applying the extcon value
+ * for 300ms to avoid this.
+ */
+ queue_delayed_work(system_wq, &bdi->extcon_work, msecs_to_jiffies(300));
+
+ return NOTIFY_OK;
}
-#endif
-static int bq24190_setup_pdata(struct bq24190_dev_info *bdi,
- struct bq24190_platform_data *pdata)
+static int bq24190_hw_init(struct bq24190_dev_info *bdi)
{
+ u8 v;
int ret;
- if (!gpio_is_valid(pdata->gpio_int))
- return -1;
-
- ret = gpio_request(pdata->gpio_int, dev_name(bdi->dev));
+ /* First check that the device really is what its supposed to be */
+ ret = bq24190_read_mask(bdi, BQ24190_REG_VPRS,
+ BQ24190_REG_VPRS_PN_MASK,
+ BQ24190_REG_VPRS_PN_SHIFT,
+ &v);
if (ret < 0)
- return -1;
+ return ret;
- ret = gpio_direction_input(pdata->gpio_int);
- if (ret < 0)
- goto out;
+ if (v != BQ24190_REG_VPRS_PN_24190 &&
+ v != BQ24190_REG_VPRS_PN_24192I) {
+ dev_err(bdi->dev, "Error unknown model: 0x%02x\n", v);
+ return -ENODEV;
+ }
- bdi->irq = gpio_to_irq(pdata->gpio_int);
- if (!bdi->irq)
- goto out;
+ ret = bq24190_register_reset(bdi);
+ if (ret < 0)
+ return ret;
- bdi->gpio_int = pdata->gpio_int;
- return 0;
+ ret = bq24190_set_mode_host(bdi);
+ if (ret < 0)
+ return ret;
-out:
- gpio_free(pdata->gpio_int);
- return -1;
+ return bq24190_read(bdi, BQ24190_REG_SS, &bdi->ss_reg);
}
static int bq24190_probe(struct i2c_client *client,
@@ -1320,9 +1398,9 @@ static int bq24190_probe(struct i2c_client *client,
{
struct i2c_adapter *adapter = to_i2c_adapter(client->dev.parent);
struct device *dev = &client->dev;
- struct bq24190_platform_data *pdata = client->dev.platform_data;
struct power_supply_config charger_cfg = {}, battery_cfg = {};
struct bq24190_dev_info *bdi;
+ const char *name;
int ret;
if (!i2c_check_functionality(adapter, I2C_FUNC_SMBUS_BYTE_DATA)) {
@@ -1338,7 +1416,6 @@ static int bq24190_probe(struct i2c_client *client,
bdi->client = client;
bdi->dev = dev;
- bdi->model = id->driver_data;
strncpy(bdi->model_name, id->name, I2C_NAME_SIZE);
mutex_init(&bdi->f_reg_lock);
bdi->f_reg = 0;
@@ -1346,23 +1423,43 @@ static int bq24190_probe(struct i2c_client *client,
i2c_set_clientdata(client, bdi);
- if (dev->of_node)
- ret = bq24190_setup_dt(bdi);
- else
- ret = bq24190_setup_pdata(bdi, pdata);
-
- if (ret) {
+ if (!client->irq) {
dev_err(dev, "Can't get irq info\n");
return -EINVAL;
}
+ /*
+ * Devicetree platforms should get extcon via phandle (not yet supported).
+ * On ACPI platforms, extcon clients may invoke us with:
+ * struct property_entry pe[] =
+ * { PROPERTY_ENTRY_STRING("extcon-name", client_name), ... };
+ * struct i2c_board_info bi =
+ * { .type = "bq24190", .addr = 0x6b, .properties = pe, .irq = irq };
+ * struct i2c_adapter ad = { ... };
+ * i2c_add_adapter(&ad);
+ * i2c_new_device(&ad, &bi);
+ */
+ if (device_property_read_string(dev, "extcon-name", &name) == 0) {
+ bdi->extcon = extcon_get_extcon_dev(name);
+ if (!bdi->extcon)
+ return -EPROBE_DEFER;
+
+ dev_info(bdi->dev, "using extcon device %s\n", name);
+ }
+
pm_runtime_enable(dev);
- pm_runtime_resume(dev);
+ pm_runtime_use_autosuspend(dev);
+ pm_runtime_set_autosuspend_delay(dev, 600);
+ ret = pm_runtime_get_sync(dev);
+ if (ret < 0) {
+ dev_err(dev, "pm_runtime_get failed: %i\n", ret);
+ goto out_pmrt;
+ }
ret = bq24190_hw_init(bdi);
if (ret < 0) {
dev_err(dev, "Hardware init failed\n");
- goto out1;
+ goto out_pmrt;
}
charger_cfg.drv_data = bdi;
@@ -1373,7 +1470,7 @@ static int bq24190_probe(struct i2c_client *client,
if (IS_ERR(bdi->charger)) {
dev_err(dev, "Can't register charger\n");
ret = PTR_ERR(bdi->charger);
- goto out1;
+ goto out_pmrt;
}
battery_cfg.drv_data = bdi;
@@ -1382,87 +1479,160 @@ static int bq24190_probe(struct i2c_client *client,
if (IS_ERR(bdi->battery)) {
dev_err(dev, "Can't register battery\n");
ret = PTR_ERR(bdi->battery);
- goto out2;
+ goto out_charger;
}
ret = bq24190_sysfs_create_group(bdi);
if (ret) {
dev_err(dev, "Can't create sysfs entries\n");
- goto out3;
+ goto out_battery;
}
- ret = devm_request_threaded_irq(dev, bdi->irq, NULL,
+ bdi->initialized = true;
+
+ ret = devm_request_threaded_irq(dev, client->irq, NULL,
bq24190_irq_handler_thread,
IRQF_TRIGGER_FALLING | IRQF_ONESHOT,
"bq24190-charger", bdi);
if (ret < 0) {
dev_err(dev, "Can't set up irq handler\n");
- goto out4;
+ goto out_sysfs;
+ }
+
+ if (bdi->extcon) {
+ INIT_DELAYED_WORK(&bdi->extcon_work, bq24190_extcon_work);
+ bdi->extcon_nb.notifier_call = bq24190_extcon_event;
+ ret = devm_extcon_register_notifier_all(dev, bdi->extcon,
+ &bdi->extcon_nb);
+ if (ret) {
+ dev_err(dev, "Can't register extcon\n");
+ goto out_sysfs;
+ }
+
+ /* Sync initial cable state */
+ queue_delayed_work(system_wq, &bdi->extcon_work, 0);
}
+ enable_irq_wake(client->irq);
+
+ pm_runtime_mark_last_busy(dev);
+ pm_runtime_put_autosuspend(dev);
+
return 0;
-out4:
+out_sysfs:
bq24190_sysfs_remove_group(bdi);
-out3:
+out_battery:
power_supply_unregister(bdi->battery);
-out2:
+out_charger:
power_supply_unregister(bdi->charger);
-out1:
+out_pmrt:
+ pm_runtime_put_sync(dev);
+ pm_runtime_dont_use_autosuspend(dev);
pm_runtime_disable(dev);
- if (bdi->gpio_int)
- gpio_free(bdi->gpio_int);
return ret;
}
static int bq24190_remove(struct i2c_client *client)
{
struct bq24190_dev_info *bdi = i2c_get_clientdata(client);
+ int error;
- pm_runtime_get_sync(bdi->dev);
- bq24190_register_reset(bdi);
- pm_runtime_put_sync(bdi->dev);
+ error = pm_runtime_get_sync(bdi->dev);
+ if (error < 0) {
+ dev_warn(bdi->dev, "pm_runtime_get failed: %i\n", error);
+ pm_runtime_put_noidle(bdi->dev);
+ }
+ bq24190_register_reset(bdi);
bq24190_sysfs_remove_group(bdi);
power_supply_unregister(bdi->battery);
power_supply_unregister(bdi->charger);
+ if (error >= 0)
+ pm_runtime_put_sync(bdi->dev);
+ pm_runtime_dont_use_autosuspend(bdi->dev);
pm_runtime_disable(bdi->dev);
- if (bdi->gpio_int)
- gpio_free(bdi->gpio_int);
+ return 0;
+}
+
+static __maybe_unused int bq24190_runtime_suspend(struct device *dev)
+{
+ struct i2c_client *client = to_i2c_client(dev);
+ struct bq24190_dev_info *bdi = i2c_get_clientdata(client);
+
+ if (!bdi->initialized)
+ return 0;
+
+ dev_dbg(bdi->dev, "%s\n", __func__);
+
+ return 0;
+}
+
+static __maybe_unused int bq24190_runtime_resume(struct device *dev)
+{
+ struct i2c_client *client = to_i2c_client(dev);
+ struct bq24190_dev_info *bdi = i2c_get_clientdata(client);
+
+ if (!bdi->initialized)
+ return 0;
+
+ if (!bdi->irq_event) {
+ dev_dbg(bdi->dev, "checking events on possible wakeirq\n");
+ bq24190_check_status(bdi);
+ }
return 0;
}
-#ifdef CONFIG_PM_SLEEP
-static int bq24190_pm_suspend(struct device *dev)
+static __maybe_unused int bq24190_pm_suspend(struct device *dev)
{
struct i2c_client *client = to_i2c_client(dev);
struct bq24190_dev_info *bdi = i2c_get_clientdata(client);
+ int error;
+
+ error = pm_runtime_get_sync(bdi->dev);
+ if (error < 0) {
+ dev_warn(bdi->dev, "pm_runtime_get failed: %i\n", error);
+ pm_runtime_put_noidle(bdi->dev);
+ }
- pm_runtime_get_sync(bdi->dev);
bq24190_register_reset(bdi);
- pm_runtime_put_sync(bdi->dev);
+
+ if (error >= 0) {
+ pm_runtime_mark_last_busy(bdi->dev);
+ pm_runtime_put_autosuspend(bdi->dev);
+ }
return 0;
}
-static int bq24190_pm_resume(struct device *dev)
+static __maybe_unused int bq24190_pm_resume(struct device *dev)
{
struct i2c_client *client = to_i2c_client(dev);
struct bq24190_dev_info *bdi = i2c_get_clientdata(client);
+ int error;
bdi->f_reg = 0;
bdi->ss_reg = BQ24190_REG_SS_VBUS_STAT_MASK; /* impossible state */
- pm_runtime_get_sync(bdi->dev);
+ error = pm_runtime_get_sync(bdi->dev);
+ if (error < 0) {
+ dev_warn(bdi->dev, "pm_runtime_get failed: %i\n", error);
+ pm_runtime_put_noidle(bdi->dev);
+ }
+
bq24190_register_reset(bdi);
bq24190_set_mode_host(bdi);
bq24190_read(bdi, BQ24190_REG_SS, &bdi->ss_reg);
- pm_runtime_put_sync(bdi->dev);
+
+ if (error >= 0) {
+ pm_runtime_mark_last_busy(bdi->dev);
+ pm_runtime_put_autosuspend(bdi->dev);
+ }
/* Things may have changed while suspended so alert upper layer */
power_supply_changed(bdi->charger);
@@ -1470,17 +1640,16 @@ static int bq24190_pm_resume(struct device *dev)
return 0;
}
-#endif
-static SIMPLE_DEV_PM_OPS(bq24190_pm_ops, bq24190_pm_suspend, bq24190_pm_resume);
+static const struct dev_pm_ops bq24190_pm_ops = {
+ SET_RUNTIME_PM_OPS(bq24190_runtime_suspend, bq24190_runtime_resume,
+ NULL)
+ SET_SYSTEM_SLEEP_PM_OPS(bq24190_pm_suspend, bq24190_pm_resume)
+};
-/*
- * Only support the bq24190 right now. The bq24192, bq24192i, and bq24193
- * are similar but not identical so the driver needs to be extended to
- * support them.
- */
static const struct i2c_device_id bq24190_i2c_ids[] = {
- { "bq24190", BQ24190_REG_VPRS_PN_24190 },
+ { "bq24190" },
+ { "bq24192i" },
{ },
};
MODULE_DEVICE_TABLE(i2c, bq24190_i2c_ids);
diff --git a/drivers/power/supply/bq25890_charger.c b/drivers/power/supply/bq25890_charger.c
index f993a55cde20..8e2c41ded171 100644
--- a/drivers/power/supply/bq25890_charger.c
+++ b/drivers/power/supply/bq25890_charger.c
@@ -723,7 +723,7 @@ static int bq25890_irq_probe(struct bq25890_device *bq)
{
struct gpio_desc *irq;
- irq = devm_gpiod_get_index(bq->dev, BQ25890_IRQ_PIN, 0, GPIOD_IN);
+ irq = devm_gpiod_get(bq->dev, BQ25890_IRQ_PIN, GPIOD_IN);
if (IS_ERR(irq)) {
dev_err(bq->dev, "Could not probe irq pin.\n");
return PTR_ERR(irq);
diff --git a/drivers/power/supply/charger-manager.c b/drivers/power/supply/charger-manager.c
index e664ca7c0afd..adc3761831e1 100644
--- a/drivers/power/supply/charger-manager.c
+++ b/drivers/power/supply/charger-manager.c
@@ -1198,7 +1198,7 @@ static int charger_extcon_notifier(struct notifier_block *self,
static int charger_extcon_init(struct charger_manager *cm,
struct charger_cable *cable)
{
- int ret = 0;
+ int ret;
/*
* Charger manager use Extcon framework to identify
@@ -1232,7 +1232,7 @@ static int charger_manager_register_extcon(struct charger_manager *cm)
{
struct charger_desc *desc = cm->desc;
struct charger_regulator *charger;
- int ret = 0;
+ int ret;
int i;
int j;
@@ -1255,15 +1255,14 @@ static int charger_manager_register_extcon(struct charger_manager *cm)
if (ret < 0) {
dev_err(cm->dev, "Cannot initialize charger(%s)\n",
charger->regulator_name);
- goto err;
+ return ret;
}
cable->charger = charger;
cable->cm = cm;
}
}
-err:
- return ret;
+ return 0;
}
/* help function of sysfs node to control charger(regulator) */
@@ -1372,7 +1371,7 @@ static int charger_manager_register_sysfs(struct charger_manager *cm)
int chargers_externally_control = 1;
char buf[11];
char *str;
- int ret = 0;
+ int ret;
int i;
/* Create sysfs entry to control charger(regulator) */
@@ -1382,10 +1381,9 @@ static int charger_manager_register_sysfs(struct charger_manager *cm)
snprintf(buf, 10, "charger.%d", i);
str = devm_kzalloc(cm->dev,
sizeof(char) * (strlen(buf) + 1), GFP_KERNEL);
- if (!str) {
- ret = -ENOMEM;
- goto err;
- }
+ if (!str)
+ return -ENOMEM;
+
strcpy(str, buf);
charger->attrs[0] = &charger->attr_name.attr;
@@ -1426,19 +1424,16 @@ static int charger_manager_register_sysfs(struct charger_manager *cm)
if (ret < 0) {
dev_err(cm->dev, "Cannot create sysfs entry of %s regulator\n",
charger->regulator_name);
- ret = -EINVAL;
- goto err;
+ return ret;
}
}
if (chargers_externally_control) {
dev_err(cm->dev, "Cannot register regulator because charger-manager must need at least one charger for charging battery\n");
- ret = -EINVAL;
- goto err;
+ return -EINVAL;
}
-err:
- return ret;
+ return 0;
}
static int cm_init_thermal_data(struct charger_manager *cm,
@@ -1626,7 +1621,7 @@ static int charger_manager_probe(struct platform_device *pdev)
{
struct charger_desc *desc = cm_get_drv_data(pdev);
struct charger_manager *cm;
- int ret = 0, i = 0;
+ int ret, i = 0;
int j = 0;
union power_supply_propval val;
struct power_supply *fuel_gauge;
@@ -1887,14 +1882,12 @@ MODULE_DEVICE_TABLE(platform, charger_manager_id);
static int cm_suspend_noirq(struct device *dev)
{
- int ret = 0;
-
if (device_may_wakeup(dev)) {
device_set_wakeup_capable(dev, false);
- ret = -EAGAIN;
+ return -EAGAIN;
}
- return ret;
+ return 0;
}
static bool cm_need_to_awake(void)
diff --git a/drivers/power/supply/cpcap-charger.c b/drivers/power/supply/cpcap-charger.c
new file mode 100644
index 000000000000..543a1bd21ab9
--- /dev/null
+++ b/drivers/power/supply/cpcap-charger.c
@@ -0,0 +1,681 @@
+/*
+ * Motorola CPCAP PMIC battery charger driver
+ *
+ * Copyright (C) 2017 Tony Lindgren <tony@atomide.com>
+ *
+ * Rewritten for Linux power framework with some parts based on
+ * on earlier driver found in the Motorola Linux kernel:
+ *
+ * Copyright (C) 2009-2010 Motorola, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/atomic.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/err.h>
+#include <linux/interrupt.h>
+#include <linux/notifier.h>
+#include <linux/of.h>
+#include <linux/of_platform.h>
+#include <linux/platform_device.h>
+#include <linux/power_supply.h>
+#include <linux/regmap.h>
+
+#include <linux/gpio/consumer.h>
+#include <linux/usb/phy_companion.h>
+#include <linux/phy/omap_usb.h>
+#include <linux/usb/otg.h>
+#include <linux/iio/consumer.h>
+#include <linux/mfd/motorola-cpcap.h>
+
+/* CPCAP_REG_CRM register bits */
+#define CPCAP_REG_CRM_UNUSED_641_15 BIT(15) /* 641 = register number */
+#define CPCAP_REG_CRM_UNUSED_641_14 BIT(14) /* 641 = register number */
+#define CPCAP_REG_CRM_CHRG_LED_EN BIT(13)
+#define CPCAP_REG_CRM_RVRSMODE BIT(12)
+#define CPCAP_REG_CRM_ICHRG_TR1 BIT(11)
+#define CPCAP_REG_CRM_ICHRG_TR0 BIT(10)
+#define CPCAP_REG_CRM_FET_OVRD BIT(9)
+#define CPCAP_REG_CRM_FET_CTRL BIT(8)
+#define CPCAP_REG_CRM_VCHRG3 BIT(7)
+#define CPCAP_REG_CRM_VCHRG2 BIT(6)
+#define CPCAP_REG_CRM_VCHRG1 BIT(5)
+#define CPCAP_REG_CRM_VCHRG0 BIT(4)
+#define CPCAP_REG_CRM_ICHRG3 BIT(3)
+#define CPCAP_REG_CRM_ICHRG2 BIT(2)
+#define CPCAP_REG_CRM_ICHRG1 BIT(1)
+#define CPCAP_REG_CRM_ICHRG0 BIT(0)
+
+/* CPCAP_REG_CRM trickle charge voltages */
+#define CPCAP_REG_CRM_TR(val) (((val) & 0x3) << 10)
+#define CPCAP_REG_CRM_TR_0A00 CPCAP_REG_CRM_TR(0x0)
+#define CPCAP_REG_CRM_TR_0A24 CPCAP_REG_CRM_TR(0x1)
+#define CPCAP_REG_CRM_TR_0A48 CPCAP_REG_CRM_TR(0x2)
+#define CPCAP_REG_CRM_TR_0A72 CPCAP_REG_CRM_TR(0x4)
+
+/* CPCAP_REG_CRM charge voltages */
+#define CPCAP_REG_CRM_VCHRG(val) (((val) & 0xf) << 4)
+#define CPCAP_REG_CRM_VCHRG_3V80 CPCAP_REG_CRM_VCHRG(0x0)
+#define CPCAP_REG_CRM_VCHRG_4V10 CPCAP_REG_CRM_VCHRG(0x1)
+#define CPCAP_REG_CRM_VCHRG_4V15 CPCAP_REG_CRM_VCHRG(0x2)
+#define CPCAP_REG_CRM_VCHRG_4V20 CPCAP_REG_CRM_VCHRG(0x3)
+#define CPCAP_REG_CRM_VCHRG_4V22 CPCAP_REG_CRM_VCHRG(0x4)
+#define CPCAP_REG_CRM_VCHRG_4V24 CPCAP_REG_CRM_VCHRG(0x5)
+#define CPCAP_REG_CRM_VCHRG_4V26 CPCAP_REG_CRM_VCHRG(0x6)
+#define CPCAP_REG_CRM_VCHRG_4V28 CPCAP_REG_CRM_VCHRG(0x7)
+#define CPCAP_REG_CRM_VCHRG_4V30 CPCAP_REG_CRM_VCHRG(0x8)
+#define CPCAP_REG_CRM_VCHRG_4V32 CPCAP_REG_CRM_VCHRG(0x9)
+#define CPCAP_REG_CRM_VCHRG_4V34 CPCAP_REG_CRM_VCHRG(0xa)
+#define CPCAP_REG_CRM_VCHRG_4V36 CPCAP_REG_CRM_VCHRG(0xb)
+#define CPCAP_REG_CRM_VCHRG_4V38 CPCAP_REG_CRM_VCHRG(0xc)
+#define CPCAP_REG_CRM_VCHRG_4V40 CPCAP_REG_CRM_VCHRG(0xd)
+#define CPCAP_REG_CRM_VCHRG_4V42 CPCAP_REG_CRM_VCHRG(0xe)
+#define CPCAP_REG_CRM_VCHRG_4V44 CPCAP_REG_CRM_VCHRG(0xf)
+
+/* CPCAP_REG_CRM charge currents */
+#define CPCAP_REG_CRM_ICHRG(val) (((val) & 0xf) << 0)
+#define CPCAP_REG_CRM_ICHRG_0A000 CPCAP_REG_CRM_ICHRG(0x0)
+#define CPCAP_REG_CRM_ICHRG_0A070 CPCAP_REG_CRM_ICHRG(0x1)
+#define CPCAP_REG_CRM_ICHRG_0A176 CPCAP_REG_CRM_ICHRG(0x2)
+#define CPCAP_REG_CRM_ICHRG_0A264 CPCAP_REG_CRM_ICHRG(0x3)
+#define CPCAP_REG_CRM_ICHRG_0A352 CPCAP_REG_CRM_ICHRG(0x4)
+#define CPCAP_REG_CRM_ICHRG_0A440 CPCAP_REG_CRM_ICHRG(0x5)
+#define CPCAP_REG_CRM_ICHRG_0A528 CPCAP_REG_CRM_ICHRG(0x6)
+#define CPCAP_REG_CRM_ICHRG_0A616 CPCAP_REG_CRM_ICHRG(0x7)
+#define CPCAP_REG_CRM_ICHRG_0A704 CPCAP_REG_CRM_ICHRG(0x8)
+#define CPCAP_REG_CRM_ICHRG_0A792 CPCAP_REG_CRM_ICHRG(0x9)
+#define CPCAP_REG_CRM_ICHRG_0A880 CPCAP_REG_CRM_ICHRG(0xa)
+#define CPCAP_REG_CRM_ICHRG_0A968 CPCAP_REG_CRM_ICHRG(0xb)
+#define CPCAP_REG_CRM_ICHRG_1A056 CPCAP_REG_CRM_ICHRG(0xc)
+#define CPCAP_REG_CRM_ICHRG_1A144 CPCAP_REG_CRM_ICHRG(0xd)
+#define CPCAP_REG_CRM_ICHRG_1A584 CPCAP_REG_CRM_ICHRG(0xe)
+#define CPCAP_REG_CRM_ICHRG_NO_LIMIT CPCAP_REG_CRM_ICHRG(0xf)
+
+enum {
+ CPCAP_CHARGER_IIO_BATTDET,
+ CPCAP_CHARGER_IIO_VOLTAGE,
+ CPCAP_CHARGER_IIO_VBUS,
+ CPCAP_CHARGER_IIO_CHRG_CURRENT,
+ CPCAP_CHARGER_IIO_BATT_CURRENT,
+ CPCAP_CHARGER_IIO_NR,
+};
+
+struct cpcap_charger_ddata {
+ struct device *dev;
+ struct regmap *reg;
+ struct list_head irq_list;
+ struct delayed_work detect_work;
+ struct delayed_work vbus_work;
+ struct gpio_desc *gpio[2]; /* gpio_reven0 & 1 */
+
+ struct iio_channel *channels[CPCAP_CHARGER_IIO_NR];
+
+ struct power_supply *usb;
+
+ struct phy_companion comparator; /* For USB VBUS */
+ bool vbus_enabled;
+ atomic_t active;
+
+ int status;
+};
+
+struct cpcap_interrupt_desc {
+ int irq;
+ struct list_head node;
+ const char *name;
+};
+
+struct cpcap_charger_ints_state {
+ bool chrg_det;
+ bool rvrs_chrg;
+ bool vbusov;
+
+ bool chrg_se1b;
+ bool rvrs_mode;
+ bool chrgcurr1;
+ bool vbusvld;
+
+ bool battdetb;
+};
+
+static enum power_supply_property cpcap_charger_props[] = {
+ POWER_SUPPLY_PROP_STATUS,
+ POWER_SUPPLY_PROP_ONLINE,
+ POWER_SUPPLY_PROP_VOLTAGE_NOW,
+ POWER_SUPPLY_PROP_CURRENT_NOW,
+};
+
+static bool cpcap_charger_battery_found(struct cpcap_charger_ddata *ddata)
+{
+ struct iio_channel *channel;
+ int error, value;
+
+ channel = ddata->channels[CPCAP_CHARGER_IIO_BATTDET];
+ error = iio_read_channel_raw(channel, &value);
+ if (error < 0) {
+ dev_warn(ddata->dev, "%s failed: %i\n", __func__, error);
+
+ return false;
+ }
+
+ return value == 1;
+}
+
+static int cpcap_charger_get_charge_voltage(struct cpcap_charger_ddata *ddata)
+{
+ struct iio_channel *channel;
+ int error, value = 0;
+
+ channel = ddata->channels[CPCAP_CHARGER_IIO_VOLTAGE];
+ error = iio_read_channel_processed(channel, &value);
+ if (error < 0) {
+ dev_warn(ddata->dev, "%s failed: %i\n", __func__, error);
+
+ return 0;
+ }
+
+ return value;
+}
+
+static int cpcap_charger_get_charge_current(struct cpcap_charger_ddata *ddata)
+{
+ struct iio_channel *channel;
+ int error, value = 0;
+
+ channel = ddata->channels[CPCAP_CHARGER_IIO_CHRG_CURRENT];
+ error = iio_read_channel_processed(channel, &value);
+ if (error < 0) {
+ dev_warn(ddata->dev, "%s failed: %i\n", __func__, error);
+
+ return 0;
+ }
+
+ return value;
+}
+
+static int cpcap_charger_get_property(struct power_supply *psy,
+ enum power_supply_property psp,
+ union power_supply_propval *val)
+{
+ struct cpcap_charger_ddata *ddata = dev_get_drvdata(psy->dev.parent);
+
+ switch (psp) {
+ case POWER_SUPPLY_PROP_STATUS:
+ val->intval = ddata->status;
+ break;
+ case POWER_SUPPLY_PROP_VOLTAGE_NOW:
+ if (ddata->status == POWER_SUPPLY_STATUS_CHARGING)
+ val->intval = cpcap_charger_get_charge_voltage(ddata) *
+ 1000;
+ else
+ val->intval = 0;
+ break;
+ case POWER_SUPPLY_PROP_CURRENT_NOW:
+ if (ddata->status == POWER_SUPPLY_STATUS_CHARGING)
+ val->intval = cpcap_charger_get_charge_current(ddata) *
+ 1000;
+ else
+ val->intval = 0;
+ break;
+ case POWER_SUPPLY_PROP_ONLINE:
+ val->intval = ddata->status == POWER_SUPPLY_STATUS_CHARGING;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static void cpcap_charger_set_cable_path(struct cpcap_charger_ddata *ddata,
+ bool enabled)
+{
+ if (!ddata->gpio[0])
+ return;
+
+ gpiod_set_value(ddata->gpio[0], enabled);
+}
+
+static void cpcap_charger_set_inductive_path(struct cpcap_charger_ddata *ddata,
+ bool enabled)
+{
+ if (!ddata->gpio[1])
+ return;
+
+ gpiod_set_value(ddata->gpio[1], enabled);
+}
+
+static int cpcap_charger_set_state(struct cpcap_charger_ddata *ddata,
+ int max_voltage, int charge_current,
+ int trickle_current)
+{
+ bool enable;
+ int error;
+
+ enable = max_voltage && (charge_current || trickle_current);
+ dev_dbg(ddata->dev, "%s enable: %i\n", __func__, enable);
+
+ if (!enable) {
+ error = regmap_update_bits(ddata->reg, CPCAP_REG_CRM,
+ 0x3fff,
+ CPCAP_REG_CRM_FET_OVRD |
+ CPCAP_REG_CRM_FET_CTRL);
+ if (error) {
+ ddata->status = POWER_SUPPLY_STATUS_UNKNOWN;
+ goto out_err;
+ }
+
+ ddata->status = POWER_SUPPLY_STATUS_DISCHARGING;
+
+ return 0;
+ }
+
+ error = regmap_update_bits(ddata->reg, CPCAP_REG_CRM, 0x3fff,
+ CPCAP_REG_CRM_CHRG_LED_EN |
+ trickle_current |
+ CPCAP_REG_CRM_FET_OVRD |
+ CPCAP_REG_CRM_FET_CTRL |
+ max_voltage |
+ charge_current);
+ if (error) {
+ ddata->status = POWER_SUPPLY_STATUS_UNKNOWN;
+ goto out_err;
+ }
+
+ ddata->status = POWER_SUPPLY_STATUS_CHARGING;
+
+ return 0;
+
+out_err:
+ dev_err(ddata->dev, "%s failed with %i\n", __func__, error);
+
+ return error;
+}
+
+static bool cpcap_charger_vbus_valid(struct cpcap_charger_ddata *ddata)
+{
+ int error, value = 0;
+ struct iio_channel *channel =
+ ddata->channels[CPCAP_CHARGER_IIO_VBUS];
+
+ error = iio_read_channel_processed(channel, &value);
+ if (error >= 0)
+ return value > 3900 ? true : false;
+
+ dev_err(ddata->dev, "error reading VBUS: %i\n", error);
+
+ return false;
+}
+
+/* VBUS control functions for the USB PHY companion */
+
+static void cpcap_charger_vbus_work(struct work_struct *work)
+{
+ struct cpcap_charger_ddata *ddata;
+ bool vbus = false;
+ int error;
+
+ ddata = container_of(work, struct cpcap_charger_ddata,
+ vbus_work.work);
+
+ if (ddata->vbus_enabled) {
+ vbus = cpcap_charger_vbus_valid(ddata);
+ if (vbus) {
+ dev_info(ddata->dev, "VBUS already provided\n");
+
+ return;
+ }
+
+ cpcap_charger_set_cable_path(ddata, false);
+ cpcap_charger_set_inductive_path(ddata, false);
+
+ error = cpcap_charger_set_state(ddata, 0, 0, 0);
+ if (error)
+ goto out_err;
+
+ error = regmap_update_bits(ddata->reg, CPCAP_REG_CRM,
+ CPCAP_REG_CRM_RVRSMODE,
+ CPCAP_REG_CRM_RVRSMODE);
+ if (error)
+ goto out_err;
+ } else {
+ error = regmap_update_bits(ddata->reg, CPCAP_REG_CRM,
+ CPCAP_REG_CRM_RVRSMODE, 0);
+ if (error)
+ goto out_err;
+
+ cpcap_charger_set_cable_path(ddata, true);
+ cpcap_charger_set_inductive_path(ddata, true);
+ }
+
+ return;
+
+out_err:
+ dev_err(ddata->dev, "%s could not %s vbus: %i\n", __func__,
+ ddata->vbus_enabled ? "enable" : "disable", error);
+}
+
+static int cpcap_charger_set_vbus(struct phy_companion *comparator,
+ bool enabled)
+{
+ struct cpcap_charger_ddata *ddata =
+ container_of(comparator, struct cpcap_charger_ddata,
+ comparator);
+
+ ddata->vbus_enabled = enabled;
+ schedule_delayed_work(&ddata->vbus_work, 0);
+
+ return 0;
+}
+
+/* Charger interrupt handling functions */
+
+static int cpcap_charger_get_ints_state(struct cpcap_charger_ddata *ddata,
+ struct cpcap_charger_ints_state *s)
+{
+ int val, error;
+
+ error = regmap_read(ddata->reg, CPCAP_REG_INTS1, &val);
+ if (error)
+ return error;
+
+ s->chrg_det = val & BIT(13);
+ s->rvrs_chrg = val & BIT(12);
+ s->vbusov = val & BIT(11);
+
+ error = regmap_read(ddata->reg, CPCAP_REG_INTS2, &val);
+ if (error)
+ return error;
+
+ s->chrg_se1b = val & BIT(13);
+ s->rvrs_mode = val & BIT(6);
+ s->chrgcurr1 = val & BIT(4);
+ s->vbusvld = val & BIT(3);
+
+ error = regmap_read(ddata->reg, CPCAP_REG_INTS4, &val);
+ if (error)
+ return error;
+
+ s->battdetb = val & BIT(6);
+
+ return 0;
+}
+
+static void cpcap_usb_detect(struct work_struct *work)
+{
+ struct cpcap_charger_ddata *ddata;
+ struct cpcap_charger_ints_state s;
+ int error;
+
+ ddata = container_of(work, struct cpcap_charger_ddata,
+ detect_work.work);
+
+ error = cpcap_charger_get_ints_state(ddata, &s);
+ if (error)
+ return;
+
+ if (cpcap_charger_vbus_valid(ddata) && s.chrgcurr1) {
+ int max_current;
+
+ if (cpcap_charger_battery_found(ddata))
+ max_current = CPCAP_REG_CRM_ICHRG_1A584;
+ else
+ max_current = CPCAP_REG_CRM_ICHRG_0A528;
+
+ error = cpcap_charger_set_state(ddata,
+ CPCAP_REG_CRM_VCHRG_4V20,
+ max_current,
+ CPCAP_REG_CRM_TR_0A72);
+ if (error)
+ goto out_err;
+ } else {
+ error = cpcap_charger_set_state(ddata, 0, 0, 0);
+ if (error)
+ goto out_err;
+ }
+
+ return;
+
+out_err:
+ dev_err(ddata->dev, "%s failed with %i\n", __func__, error);
+}
+
+static irqreturn_t cpcap_charger_irq_thread(int irq, void *data)
+{
+ struct cpcap_charger_ddata *ddata = data;
+
+ if (!atomic_read(&ddata->active))
+ return IRQ_NONE;
+
+ schedule_delayed_work(&ddata->detect_work, 0);
+
+ return IRQ_HANDLED;
+}
+
+static int cpcap_usb_init_irq(struct platform_device *pdev,
+ struct cpcap_charger_ddata *ddata,
+ const char *name)
+{
+ struct cpcap_interrupt_desc *d;
+ int irq, error;
+
+ irq = platform_get_irq_byname(pdev, name);
+ if (!irq)
+ return -ENODEV;
+
+ error = devm_request_threaded_irq(ddata->dev, irq, NULL,
+ cpcap_charger_irq_thread,
+ IRQF_SHARED,
+ name, ddata);
+ if (error) {
+ dev_err(ddata->dev, "could not get irq %s: %i\n",
+ name, error);
+
+ return error;
+ }
+
+ d = devm_kzalloc(ddata->dev, sizeof(*d), GFP_KERNEL);
+ if (!d)
+ return -ENOMEM;
+
+ d->name = name;
+ d->irq = irq;
+ list_add(&d->node, &ddata->irq_list);
+
+ return 0;
+}
+
+static const char * const cpcap_charger_irqs[] = {
+ /* REG_INT_0 */
+ "chrg_det", "rvrs_chrg",
+
+ /* REG_INT1 */
+ "chrg_se1b", "se0conn", "rvrs_mode", "chrgcurr1", "vbusvld",
+
+ /* REG_INT_3 */
+ "battdetb",
+};
+
+static int cpcap_usb_init_interrupts(struct platform_device *pdev,
+ struct cpcap_charger_ddata *ddata)
+{
+ int i, error;
+
+ for (i = 0; i < ARRAY_SIZE(cpcap_charger_irqs); i++) {
+ error = cpcap_usb_init_irq(pdev, ddata, cpcap_charger_irqs[i]);
+ if (error)
+ return error;
+ }
+
+ return 0;
+}
+
+static void cpcap_charger_init_optional_gpios(struct cpcap_charger_ddata *ddata)
+{
+ int i;
+
+ for (i = 0; i < 2; i++) {
+ ddata->gpio[i] = devm_gpiod_get_index(ddata->dev, "mode",
+ i, GPIOD_OUT_HIGH);
+ if (IS_ERR(ddata->gpio[i])) {
+ dev_info(ddata->dev, "no mode change GPIO%i: %li\n",
+ i, PTR_ERR(ddata->gpio[i]));
+ ddata->gpio[i] = NULL;
+ }
+ }
+}
+
+static int cpcap_charger_init_iio(struct cpcap_charger_ddata *ddata)
+{
+ const char * const names[CPCAP_CHARGER_IIO_NR] = {
+ "battdetb", "battp", "vbus", "chg_isense", "batti",
+ };
+ int error, i;
+
+ for (i = 0; i < CPCAP_CHARGER_IIO_NR; i++) {
+ ddata->channels[i] = devm_iio_channel_get(ddata->dev,
+ names[i]);
+ if (IS_ERR(ddata->channels[i])) {
+ error = PTR_ERR(ddata->channels[i]);
+ goto out_err;
+ }
+
+ if (!ddata->channels[i]->indio_dev) {
+ error = -ENXIO;
+ goto out_err;
+ }
+ }
+
+ return 0;
+
+out_err:
+ dev_err(ddata->dev, "could not initialize VBUS or ID IIO: %i\n",
+ error);
+
+ return error;
+}
+
+static const struct power_supply_desc cpcap_charger_usb_desc = {
+ .name = "cpcap_usb",
+ .type = POWER_SUPPLY_TYPE_USB,
+ .properties = cpcap_charger_props,
+ .num_properties = ARRAY_SIZE(cpcap_charger_props),
+ .get_property = cpcap_charger_get_property,
+};
+
+#ifdef CONFIG_OF
+static const struct of_device_id cpcap_charger_id_table[] = {
+ {
+ .compatible = "motorola,mapphone-cpcap-charger",
+ },
+ {},
+};
+MODULE_DEVICE_TABLE(of, cpcap_charger_id_table);
+#endif
+
+static int cpcap_charger_probe(struct platform_device *pdev)
+{
+ struct cpcap_charger_ddata *ddata;
+ const struct of_device_id *of_id;
+ int error;
+
+ of_id = of_match_device(of_match_ptr(cpcap_charger_id_table),
+ &pdev->dev);
+ if (!of_id)
+ return -EINVAL;
+
+ ddata = devm_kzalloc(&pdev->dev, sizeof(*ddata), GFP_KERNEL);
+ if (!ddata)
+ return -ENOMEM;
+
+ ddata->dev = &pdev->dev;
+
+ ddata->reg = dev_get_regmap(ddata->dev->parent, NULL);
+ if (!ddata->reg)
+ return -ENODEV;
+
+ INIT_LIST_HEAD(&ddata->irq_list);
+ INIT_DELAYED_WORK(&ddata->detect_work, cpcap_usb_detect);
+ INIT_DELAYED_WORK(&ddata->vbus_work, cpcap_charger_vbus_work);
+ platform_set_drvdata(pdev, ddata);
+
+ error = cpcap_charger_init_iio(ddata);
+ if (error)
+ return error;
+
+ atomic_set(&ddata->active, 1);
+
+ ddata->usb = devm_power_supply_register(ddata->dev,
+ &cpcap_charger_usb_desc,
+ NULL);
+ if (IS_ERR(ddata->usb)) {
+ error = PTR_ERR(ddata->usb);
+ dev_err(ddata->dev, "failed to register USB charger: %i\n",
+ error);
+
+ return error;
+ }
+
+ error = cpcap_usb_init_interrupts(pdev, ddata);
+ if (error)
+ return error;
+
+ ddata->comparator.set_vbus = cpcap_charger_set_vbus;
+ error = omap_usb2_set_comparator(&ddata->comparator);
+ if (error == -ENODEV) {
+ dev_info(ddata->dev, "charger needs phy, deferring probe\n");
+ return -EPROBE_DEFER;
+ }
+
+ cpcap_charger_init_optional_gpios(ddata);
+
+ schedule_delayed_work(&ddata->detect_work, 0);
+
+ return 0;
+}
+
+static int cpcap_charger_remove(struct platform_device *pdev)
+{
+ struct cpcap_charger_ddata *ddata = platform_get_drvdata(pdev);
+ int error;
+
+ atomic_set(&ddata->active, 0);
+ error = omap_usb2_set_comparator(NULL);
+ if (error)
+ dev_warn(ddata->dev, "could not clear USB comparator: %i\n",
+ error);
+
+ error = cpcap_charger_set_state(ddata, 0, 0, 0);
+ if (error)
+ dev_warn(ddata->dev, "could not clear charger: %i\n",
+ error);
+ cancel_delayed_work_sync(&ddata->vbus_work);
+ cancel_delayed_work_sync(&ddata->detect_work);
+
+ return 0;
+}
+
+static struct platform_driver cpcap_charger_driver = {
+ .probe = cpcap_charger_probe,
+ .driver = {
+ .name = "cpcap-charger",
+ .of_match_table = of_match_ptr(cpcap_charger_id_table),
+ },
+ .remove = cpcap_charger_remove,
+};
+module_platform_driver(cpcap_charger_driver);
+
+MODULE_AUTHOR("Tony Lindgren <tony@atomide.com>");
+MODULE_DESCRIPTION("CPCAP Battery Charger Interface driver");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS("platform:cpcap-charger");
diff --git a/drivers/power/supply/lego_ev3_battery.c b/drivers/power/supply/lego_ev3_battery.c
new file mode 100644
index 000000000000..7b993d669f7f
--- /dev/null
+++ b/drivers/power/supply/lego_ev3_battery.c
@@ -0,0 +1,228 @@
+/*
+ * Battery driver for LEGO MINDSTORMS EV3
+ *
+ * Copyright (C) 2017 David Lechner <david@lechnology.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+
+ * This program is distributed "as is" WITHOUT ANY WARRANTY of any
+ * kind, whether express or implied; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/delay.h>
+#include <linux/err.h>
+#include <linux/gpio/consumer.h>
+#include <linux/iio/consumer.h>
+#include <linux/iio/types.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/of_device.h>
+#include <linux/platform_device.h>
+#include <linux/power_supply.h>
+
+struct lego_ev3_battery {
+ struct iio_channel *iio_v;
+ struct iio_channel *iio_i;
+ struct gpio_desc *rechargeable_gpio;
+ struct power_supply *psy;
+ int technology;
+ int v_max;
+ int v_min;
+};
+
+static int lego_ev3_battery_get_property(struct power_supply *psy,
+ enum power_supply_property psp,
+ union power_supply_propval *val)
+{
+ struct lego_ev3_battery *batt = power_supply_get_drvdata(psy);
+ int val2;
+
+ switch (psp) {
+ case POWER_SUPPLY_PROP_TECHNOLOGY:
+ val->intval = batt->technology;
+ break;
+ case POWER_SUPPLY_PROP_VOLTAGE_NOW:
+ /* battery voltage is iio channel * 2 + Vce of transistor */
+ iio_read_channel_processed(batt->iio_v, &val->intval);
+ val->intval *= 2000;
+ val->intval += 200000;
+ /* plus adjust for shunt resistor drop */
+ iio_read_channel_processed(batt->iio_i, &val2);
+ val2 *= 1000;
+ val2 /= 15;
+ val->intval += val2;
+ break;
+ case POWER_SUPPLY_PROP_VOLTAGE_MAX_DESIGN:
+ val->intval = batt->v_max;
+ break;
+ case POWER_SUPPLY_PROP_VOLTAGE_MIN_DESIGN:
+ val->intval = batt->v_min;
+ break;
+ case POWER_SUPPLY_PROP_CURRENT_NOW:
+ /* battery current is iio channel / 15 / 0.05 ohms */
+ iio_read_channel_processed(batt->iio_i, &val->intval);
+ val->intval *= 20000;
+ val->intval /= 15;
+ break;
+ case POWER_SUPPLY_PROP_SCOPE:
+ val->intval = POWER_SUPPLY_SCOPE_SYSTEM;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int lego_ev3_battery_set_property(struct power_supply *psy,
+ enum power_supply_property psp,
+ const union power_supply_propval *val)
+{
+ struct lego_ev3_battery *batt = power_supply_get_drvdata(psy);
+
+ switch (psp) {
+ case POWER_SUPPLY_PROP_TECHNOLOGY:
+ /*
+ * Only allow changing technology from Unknown to NiMH. Li-ion
+ * batteries are automatically detected and should not be
+ * overridden. Rechargeable AA batteries, on the other hand,
+ * cannot be automatically detected, and so must be manually
+ * specified. This should only be set once during system init,
+ * so there is no mechanism to go back to Unknown.
+ */
+ if (batt->technology != POWER_SUPPLY_TECHNOLOGY_UNKNOWN)
+ return -EINVAL;
+ switch (val->intval) {
+ case POWER_SUPPLY_TECHNOLOGY_NiMH:
+ batt->technology = POWER_SUPPLY_TECHNOLOGY_NiMH;
+ batt->v_max = 7800000;
+ batt->v_min = 5400000;
+ break;
+ default:
+ return -EINVAL;
+ }
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int lego_ev3_battery_property_is_writeable(struct power_supply *psy,
+ enum power_supply_property psp)
+{
+ struct lego_ev3_battery *batt = power_supply_get_drvdata(psy);
+
+ return psp == POWER_SUPPLY_PROP_TECHNOLOGY &&
+ batt->technology == POWER_SUPPLY_TECHNOLOGY_UNKNOWN;
+}
+
+static enum power_supply_property lego_ev3_battery_props[] = {
+ POWER_SUPPLY_PROP_TECHNOLOGY,
+ POWER_SUPPLY_PROP_VOLTAGE_NOW,
+ POWER_SUPPLY_PROP_VOLTAGE_MAX_DESIGN,
+ POWER_SUPPLY_PROP_VOLTAGE_MIN_DESIGN,
+ POWER_SUPPLY_PROP_CURRENT_NOW,
+ POWER_SUPPLY_PROP_SCOPE,
+};
+
+static const struct power_supply_desc lego_ev3_battery_desc = {
+ .name = "lego-ev3-battery",
+ .type = POWER_SUPPLY_TYPE_BATTERY,
+ .properties = lego_ev3_battery_props,
+ .num_properties = ARRAY_SIZE(lego_ev3_battery_props),
+ .get_property = lego_ev3_battery_get_property,
+ .set_property = lego_ev3_battery_set_property,
+ .property_is_writeable = lego_ev3_battery_property_is_writeable,
+};
+
+static int lego_ev3_battery_probe(struct platform_device *pdev)
+{
+ struct device *dev = &pdev->dev;
+ struct lego_ev3_battery *batt;
+ struct power_supply_config psy_cfg = {};
+ int err;
+
+ batt = devm_kzalloc(dev, sizeof(*batt), GFP_KERNEL);
+ if (!batt)
+ return -ENOMEM;
+
+ platform_set_drvdata(pdev, batt);
+
+ batt->iio_v = devm_iio_channel_get(dev, "voltage");
+ err = PTR_ERR_OR_ZERO(batt->iio_v);
+ if (err) {
+ if (err != -EPROBE_DEFER)
+ dev_err(dev, "Failed to get voltage iio channel\n");
+ return err;
+ }
+
+ batt->iio_i = devm_iio_channel_get(dev, "current");
+ err = PTR_ERR_OR_ZERO(batt->iio_i);
+ if (err) {
+ if (err != -EPROBE_DEFER)
+ dev_err(dev, "Failed to get current iio channel\n");
+ return err;
+ }
+
+ batt->rechargeable_gpio = devm_gpiod_get(dev, "rechargeable", GPIOD_IN);
+ err = PTR_ERR_OR_ZERO(batt->rechargeable_gpio);
+ if (err) {
+ if (err != -EPROBE_DEFER)
+ dev_err(dev, "Failed to get rechargeable gpio\n");
+ return err;
+ }
+
+ /*
+ * The rechargeable battery indication switch cannot be changed without
+ * removing the battery, so we only need to read it once.
+ */
+ if (gpiod_get_value(batt->rechargeable_gpio)) {
+ /* 2-cell Li-ion, 7.4V nominal */
+ batt->technology = POWER_SUPPLY_TECHNOLOGY_LION;
+ batt->v_max = 84000000;
+ batt->v_min = 60000000;
+ } else {
+ /* 6x AA Alkaline, 9V nominal */
+ batt->technology = POWER_SUPPLY_TECHNOLOGY_UNKNOWN;
+ batt->v_max = 90000000;
+ batt->v_min = 48000000;
+ }
+
+ psy_cfg.of_node = pdev->dev.of_node;
+ psy_cfg.drv_data = batt;
+
+ batt->psy = devm_power_supply_register(dev, &lego_ev3_battery_desc,
+ &psy_cfg);
+ err = PTR_ERR_OR_ZERO(batt->psy);
+ if (err) {
+ dev_err(dev, "failed to register power supply\n");
+ return err;
+ }
+
+ return 0;
+}
+
+static const struct of_device_id of_lego_ev3_battery_match[] = {
+ { .compatible = "lego,ev3-battery", },
+ { }
+};
+MODULE_DEVICE_TABLE(of, of_lego_ev3_battery_match);
+
+static struct platform_driver lego_ev3_battery_driver = {
+ .driver = {
+ .name = "lego-ev3-battery",
+ .of_match_table = of_lego_ev3_battery_match,
+ },
+ .probe = lego_ev3_battery_probe,
+};
+module_platform_driver(lego_ev3_battery_driver);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("David Lechner <david@lechnology.com>");
+MODULE_DESCRIPTION("LEGO MINDSTORMS EV3 Battery Driver");
diff --git a/drivers/power/supply/lp8788-charger.c b/drivers/power/supply/lp8788-charger.c
index 509e2b341bd6..677f7c40b25a 100644
--- a/drivers/power/supply/lp8788-charger.c
+++ b/drivers/power/supply/lp8788-charger.c
@@ -651,7 +651,7 @@ static ssize_t lp8788_show_eoc_time(struct device *dev,
{
struct lp8788_charger *pchg = dev_get_drvdata(dev);
char *stime[] = { "400ms", "5min", "10min", "15min",
- "20min", "25min", "30min" "No timeout" };
+ "20min", "25min", "30min", "No timeout" };
u8 val;
lp8788_read_byte(pchg->lp, LP8788_CHG_EOC, &val);
diff --git a/drivers/power/supply/ltc2941-battery-gauge.c b/drivers/power/supply/ltc2941-battery-gauge.c
index 4adf2ba021ce..7efb908f4451 100644
--- a/drivers/power/supply/ltc2941-battery-gauge.c
+++ b/drivers/power/supply/ltc2941-battery-gauge.c
@@ -9,6 +9,7 @@
*/
#include <linux/kernel.h>
#include <linux/module.h>
+#include <linux/of_device.h>
#include <linux/types.h>
#include <linux/errno.h>
#include <linux/swab.h>
@@ -61,7 +62,7 @@ struct ltc294x_info {
struct power_supply *supply; /* Supply pointer */
struct power_supply_desc supply_desc; /* Supply description */
struct delayed_work work; /* Work scheduler */
- int num_regs; /* Number of registers (chip type) */
+ unsigned long num_regs; /* Number of registers (chip type) */
int charge; /* Last charge register content */
int r_sense; /* mOhm */
int Qlsb; /* nAh */
@@ -387,7 +388,7 @@ static int ltc294x_i2c_probe(struct i2c_client *client,
np = of_node_get(client->dev.of_node);
- info->num_regs = id->driver_data;
+ info->num_regs = (unsigned long)of_device_get_match_data(&client->dev);
info->supply_desc.name = np->name;
/* r_sense can be negative, when sense+ is connected to the battery
@@ -497,9 +498,23 @@ static const struct i2c_device_id ltc294x_i2c_id[] = {
};
MODULE_DEVICE_TABLE(i2c, ltc294x_i2c_id);
+static const struct of_device_id ltc294x_i2c_of_match[] = {
+ {
+ .compatible = "lltc,ltc2941",
+ .data = (void *)LTC2941_NUM_REGS
+ },
+ {
+ .compatible = "lltc,ltc2943",
+ .data = (void *)LTC2943_NUM_REGS
+ },
+ { },
+};
+MODULE_DEVICE_TABLE(of, ltc294x_i2c_of_match);
+
static struct i2c_driver ltc294x_driver = {
.driver = {
.name = "LTC2941",
+ .of_match_table = ltc294x_i2c_of_match,
.pm = LTC294X_PM_OPS,
},
.probe = ltc294x_i2c_probe,
diff --git a/drivers/power/supply/max17040_battery.c b/drivers/power/supply/max17040_battery.c
index e7c3649b31a0..33c40f79d23d 100644
--- a/drivers/power/supply/max17040_battery.c
+++ b/drivers/power/supply/max17040_battery.c
@@ -277,9 +277,17 @@ static const struct i2c_device_id max17040_id[] = {
};
MODULE_DEVICE_TABLE(i2c, max17040_id);
+static const struct of_device_id max17040_of_match[] = {
+ { .compatible = "maxim,max17040" },
+ { .compatible = "maxim,max77836-battery" },
+ { },
+};
+MODULE_DEVICE_TABLE(of, max17040_of_match);
+
static struct i2c_driver max17040_i2c_driver = {
.driver = {
.name = "max17040",
+ .of_match_table = max17040_of_match,
.pm = MAX17040_PM_OPS,
},
.probe = max17040_probe,
diff --git a/drivers/power/supply/sbs-charger.c b/drivers/power/supply/sbs-charger.c
index 353765a5f44c..15947dbb511e 100644
--- a/drivers/power/supply/sbs-charger.c
+++ b/drivers/power/supply/sbs-charger.c
@@ -137,10 +137,7 @@ static enum power_supply_property sbs_properties[] = {
static bool sbs_readable_reg(struct device *dev, unsigned int reg)
{
- if (reg < SBS_CHARGER_REG_SPEC_INFO)
- return false;
- else
- return true;
+ return reg >= SBS_CHARGER_REG_SPEC_INFO;
}
static bool sbs_volatile_reg(struct device *dev, unsigned int reg)
diff --git a/drivers/power/supply/tps65217_charger.c b/drivers/power/supply/tps65217_charger.c
index 29b61e81b385..1f5234098aaf 100644
--- a/drivers/power/supply/tps65217_charger.c
+++ b/drivers/power/supply/tps65217_charger.c
@@ -58,8 +58,6 @@ static int tps65217_config_charger(struct tps65217_charger *charger)
{
int ret;
- dev_dbg(charger->dev, "%s\n", __func__);
-
/*
* tps65217 rev. G, p. 31 (see p. 32 for NTC schematic)
*
@@ -205,8 +203,6 @@ static int tps65217_charger_probe(struct platform_device *pdev)
int ret;
int i;
- dev_dbg(&pdev->dev, "%s\n", __func__);
-
charger = devm_kzalloc(&pdev->dev, sizeof(*charger), GFP_KERNEL);
if (!charger)
return -ENOMEM;
diff --git a/drivers/power/supply/twl4030_charger.c b/drivers/power/supply/twl4030_charger.c
index bcd4dc304f27..990ff3d218bc 100644
--- a/drivers/power/supply/twl4030_charger.c
+++ b/drivers/power/supply/twl4030_charger.c
@@ -1117,7 +1117,7 @@ fail:
return ret;
}
-static int __exit twl4030_bci_remove(struct platform_device *pdev)
+static int twl4030_bci_remove(struct platform_device *pdev)
{
struct twl4030_bci *bci = platform_get_drvdata(pdev);
@@ -1148,11 +1148,11 @@ MODULE_DEVICE_TABLE(of, twl_bci_of_match);
static struct platform_driver twl4030_bci_driver = {
.probe = twl4030_bci_probe,
+ .remove = twl4030_bci_remove,
.driver = {
.name = "twl4030_bci",
.of_match_table = of_match_ptr(twl_bci_of_match),
},
- .remove = __exit_p(twl4030_bci_remove),
};
module_platform_driver(twl4030_bci_driver);
diff --git a/drivers/sbus/char/jsflash.c b/drivers/sbus/char/jsflash.c
index 6ff61dad5e21..62fed9dc893e 100644
--- a/drivers/sbus/char/jsflash.c
+++ b/drivers/sbus/char/jsflash.c
@@ -183,11 +183,33 @@ static void jsfd_read(char *buf, unsigned long p, size_t togo) {
}
}
-static void jsfd_do_request(struct request_queue *q)
+static int jsfd_queue;
+
+static struct request *jsfd_next_request(void)
+{
+ struct request_queue *q;
+ struct request *rq;
+ int old_pos = jsfd_queue;
+
+ do {
+ q = jsfd_disk[jsfd_queue]->queue;
+ if (++jsfd_queue == JSF_MAX)
+ jsfd_queue = 0;
+ if (q) {
+ rq = blk_fetch_request(q);
+ if (rq)
+ return rq;
+ }
+ } while (jsfd_queue != old_pos);
+
+ return NULL;
+}
+
+static void jsfd_request(void)
{
struct request *req;
- req = blk_fetch_request(q);
+ req = jsfd_next_request();
while (req) {
struct jsfd_part *jdp = req->rq_disk->private_data;
unsigned long offset = blk_rq_pos(req) << 9;
@@ -211,10 +233,15 @@ static void jsfd_do_request(struct request_queue *q)
err = 0;
end:
if (!__blk_end_request_cur(req, err))
- req = blk_fetch_request(q);
+ req = jsfd_next_request();
}
}
+static void jsfd_do_request(struct request_queue *q)
+{
+ jsfd_request();
+}
+
/*
* The memory devices use the full 32/64 bits of the offset, and so we cannot
* check against negative addresses: they are ok. The return value is weird,
@@ -544,8 +571,6 @@ static int jsflash_init(void)
return 0;
}
-static struct request_queue *jsf_queue;
-
static int jsfd_init(void)
{
static DEFINE_SPINLOCK(lock);
@@ -562,6 +587,11 @@ static int jsfd_init(void)
struct gendisk *disk = alloc_disk(1);
if (!disk)
goto out;
+ disk->queue = blk_init_queue(jsfd_do_request, &lock);
+ if (!disk->queue) {
+ put_disk(disk);
+ goto out;
+ }
jsfd_disk[i] = disk;
}
@@ -570,13 +600,6 @@ static int jsfd_init(void)
goto out;
}
- jsf_queue = blk_init_queue(jsfd_do_request, &lock);
- if (!jsf_queue) {
- err = -ENOMEM;
- unregister_blkdev(JSFD_MAJOR, "jsfd");
- goto out;
- }
-
for (i = 0; i < JSF_MAX; i++) {
struct gendisk *disk = jsfd_disk[i];
if ((i & JSF_PART_MASK) >= JSF_NPART) continue;
@@ -589,7 +612,6 @@ static int jsfd_init(void)
disk->fops = &jsfd_fops;
set_capacity(disk, jdp->dsize >> 9);
disk->private_data = jdp;
- disk->queue = jsf_queue;
add_disk(disk);
set_disk_ro(disk, 1);
}
@@ -619,6 +641,7 @@ static void __exit jsflash_cleanup_module(void)
for (i = 0; i < JSF_MAX; i++) {
if ((i & JSF_PART_MASK) >= JSF_NPART) continue;
del_gendisk(jsfd_disk[i]);
+ blk_cleanup_queue(jsfd_disk[i]->queue);
put_disk(jsfd_disk[i]);
}
if (jsf0.busy)
@@ -628,7 +651,6 @@ static void __exit jsflash_cleanup_module(void)
misc_deregister(&jsf_dev);
unregister_blkdev(JSFD_MAJOR, "jsfd");
- blk_cleanup_queue(jsf_queue);
}
module_init(jsflash_init_module);
diff --git a/drivers/scsi/Makefile b/drivers/scsi/Makefile
index fc2855565a51..93dbe58c47c8 100644
--- a/drivers/scsi/Makefile
+++ b/drivers/scsi/Makefile
@@ -166,6 +166,7 @@ scsi_mod-y += scsi_scan.o scsi_sysfs.o scsi_devinfo.o
scsi_mod-$(CONFIG_SCSI_NETLINK) += scsi_netlink.o
scsi_mod-$(CONFIG_SYSCTL) += scsi_sysctl.o
scsi_mod-$(CONFIG_SCSI_PROC_FS) += scsi_proc.o
+scsi_mod-$(CONFIG_BLK_DEBUG_FS) += scsi_debugfs.o
scsi_mod-y += scsi_trace.o scsi_logging.o
scsi_mod-$(CONFIG_PM) += scsi_pm.o
scsi_mod-$(CONFIG_SCSI_DH) += scsi_dh.o
diff --git a/drivers/scsi/lpfc/lpfc.h b/drivers/scsi/lpfc/lpfc.h
index 257bbdd0f0b8..6d7840b096e6 100644
--- a/drivers/scsi/lpfc/lpfc.h
+++ b/drivers/scsi/lpfc/lpfc.h
@@ -56,7 +56,7 @@ struct lpfc_sli2_slim;
#define LPFC_MAX_SG_SEG_CNT 4096 /* sg element count per scsi cmnd */
#define LPFC_MAX_SGL_SEG_CNT 512 /* SGL element count per scsi cmnd */
#define LPFC_MAX_BPL_SEG_CNT 4096 /* BPL element count per scsi cmnd */
-#define LPFC_MIN_NVME_SEG_CNT 254
+#define LPFC_MAX_NVME_SEG_CNT 128 /* max SGL element cnt per NVME cmnd */
#define LPFC_MAX_SGE_SIZE 0x80000000 /* Maximum data allowed in a SGE */
#define LPFC_IOCB_LIST_CNT 2250 /* list of IOCBs for fast-path usage. */
@@ -474,6 +474,8 @@ struct lpfc_vport {
unsigned long rcv_buffer_time_stamp;
uint32_t vport_flag;
#define STATIC_VPORT 1
+#define FAWWPN_SET 2
+#define FAWWPN_PARAM_CHG 4
uint16_t fdmi_num_disc;
uint32_t fdmi_hba_mask;
@@ -781,6 +783,7 @@ struct lpfc_hba {
uint32_t cfg_nvmet_fb_size;
uint32_t cfg_total_seg_cnt;
uint32_t cfg_sg_seg_cnt;
+ uint32_t cfg_nvme_seg_cnt;
uint32_t cfg_sg_dma_buf_size;
uint64_t cfg_soft_wwnn;
uint64_t cfg_soft_wwpn;
diff --git a/drivers/scsi/lpfc/lpfc_attr.c b/drivers/scsi/lpfc/lpfc_attr.c
index 22819afbaef5..513fd07715cd 100644
--- a/drivers/scsi/lpfc/lpfc_attr.c
+++ b/drivers/scsi/lpfc/lpfc_attr.c
@@ -2292,6 +2292,8 @@ lpfc_soft_wwn_enable_store(struct device *dev, struct device_attribute *attr,
struct lpfc_vport *vport = (struct lpfc_vport *) shost->hostdata;
struct lpfc_hba *phba = vport->phba;
unsigned int cnt = count;
+ uint8_t vvvl = vport->fc_sparam.cmn.valid_vendor_ver_level;
+ u32 *fawwpn_key = (uint32_t *)&vport->fc_sparam.un.vendorVersion[0];
/*
* We're doing a simple sanity check for soft_wwpn setting.
@@ -2305,6 +2307,12 @@ lpfc_soft_wwn_enable_store(struct device *dev, struct device_attribute *attr,
* here. The intent is to protect against the random user or
* application that is just writing attributes.
*/
+ if (vvvl == 1 && cpu_to_be32(*fawwpn_key) == FAPWWN_KEY_VENDOR) {
+ lpfc_printf_log(phba, KERN_ERR, LOG_INIT,
+ "0051 "LPFC_DRIVER_NAME" soft wwpn can not"
+ " be enabled: fawwpn is enabled\n");
+ return -EINVAL;
+ }
/* count may include a LF at end of string */
if (buf[cnt-1] == '\n')
@@ -3335,7 +3343,7 @@ LPFC_ATTR_R(enable_fc4_type, LPFC_ENABLE_FCP,
* percentage will go to NVME.
*/
LPFC_ATTR_R(xri_split, 50, 10, 90,
- "Division of XRI resources between SCSI and NVME");
+ "Division of XRI resources between SCSI and NVME");
/*
# lpfc_log_verbose: Only turn this flag on if you are willing to risk being
diff --git a/drivers/scsi/lpfc/lpfc_bsg.c b/drivers/scsi/lpfc/lpfc_bsg.c
index 18157d2840a3..a1686c2d863c 100644
--- a/drivers/scsi/lpfc/lpfc_bsg.c
+++ b/drivers/scsi/lpfc/lpfc_bsg.c
@@ -2486,6 +2486,10 @@ static int lpfcdiag_loop_self_reg(struct lpfc_hba *phba, uint16_t *rpi)
mbox, *rpi);
else {
*rpi = lpfc_sli4_alloc_rpi(phba);
+ if (*rpi == LPFC_RPI_ALLOC_ERROR) {
+ mempool_free(mbox, phba->mbox_mem_pool);
+ return -EBUSY;
+ }
status = lpfc_reg_rpi(phba, phba->pport->vpi,
phba->pport->fc_myDID,
(uint8_t *)&phba->pport->fc_sparam,
diff --git a/drivers/scsi/lpfc/lpfc_crtn.h b/drivers/scsi/lpfc/lpfc_crtn.h
index 54e6ac42fbcd..944b32ca4931 100644
--- a/drivers/scsi/lpfc/lpfc_crtn.h
+++ b/drivers/scsi/lpfc/lpfc_crtn.h
@@ -24,6 +24,7 @@ typedef int (*node_filter)(struct lpfc_nodelist *, void *);
struct fc_rport;
struct fc_frame_header;
+struct lpfc_nvmet_rcv_ctx;
void lpfc_down_link(struct lpfc_hba *, LPFC_MBOXQ_t *);
void lpfc_sli_read_link_ste(struct lpfc_hba *);
void lpfc_dump_mem(struct lpfc_hba *, LPFC_MBOXQ_t *, uint16_t, uint16_t);
@@ -99,7 +100,7 @@ void lpfc_issue_reg_vpi(struct lpfc_hba *, struct lpfc_vport *);
int lpfc_check_sli_ndlp(struct lpfc_hba *, struct lpfc_sli_ring *,
struct lpfc_iocbq *, struct lpfc_nodelist *);
-void lpfc_nlp_init(struct lpfc_vport *, struct lpfc_nodelist *, uint32_t);
+struct lpfc_nodelist *lpfc_nlp_init(struct lpfc_vport *vport, uint32_t did);
struct lpfc_nodelist *lpfc_nlp_get(struct lpfc_nodelist *);
int lpfc_nlp_put(struct lpfc_nodelist *);
int lpfc_nlp_not_used(struct lpfc_nodelist *ndlp);
@@ -245,6 +246,10 @@ struct hbq_dmabuf *lpfc_sli4_rb_alloc(struct lpfc_hba *);
void lpfc_sli4_rb_free(struct lpfc_hba *, struct hbq_dmabuf *);
struct rqb_dmabuf *lpfc_sli4_nvmet_alloc(struct lpfc_hba *phba);
void lpfc_sli4_nvmet_free(struct lpfc_hba *phba, struct rqb_dmabuf *dmab);
+void lpfc_nvmet_rq_post(struct lpfc_hba *phba, struct lpfc_nvmet_rcv_ctx *ctxp,
+ struct lpfc_dmabuf *mp);
+int lpfc_nvmet_rcv_unsol_abort(struct lpfc_vport *vport,
+ struct fc_frame_header *fc_hdr);
void lpfc_sli4_build_dflt_fcf_record(struct lpfc_hba *, struct fcf_record *,
uint16_t);
int lpfc_sli4_rq_put(struct lpfc_queue *hq, struct lpfc_queue *dq,
@@ -302,6 +307,8 @@ int lpfc_sli_check_eratt(struct lpfc_hba *);
void lpfc_sli_handle_slow_ring_event(struct lpfc_hba *,
struct lpfc_sli_ring *, uint32_t);
void lpfc_sli4_handle_received_buffer(struct lpfc_hba *, struct hbq_dmabuf *);
+void lpfc_sli4_seq_abort_rsp(struct lpfc_vport *vport,
+ struct fc_frame_header *fc_hdr, bool aborted);
void lpfc_sli_def_mbox_cmpl(struct lpfc_hba *, LPFC_MBOXQ_t *);
void lpfc_sli4_unreg_rpi_cmpl_clr(struct lpfc_hba *, LPFC_MBOXQ_t *);
int lpfc_sli_issue_iocb(struct lpfc_hba *, uint32_t,
diff --git a/drivers/scsi/lpfc/lpfc_ct.c b/drivers/scsi/lpfc/lpfc_ct.c
index d3e9af983015..1487406aea77 100644
--- a/drivers/scsi/lpfc/lpfc_ct.c
+++ b/drivers/scsi/lpfc/lpfc_ct.c
@@ -537,19 +537,53 @@ lpfc_prep_node_fc4type(struct lpfc_vport *vport, uint32_t Did, uint8_t fc4_type)
}
}
+static void
+lpfc_ns_rsp_audit_did(struct lpfc_vport *vport, uint32_t Did, uint8_t fc4_type)
+{
+ struct lpfc_hba *phba = vport->phba;
+ struct lpfc_nodelist *ndlp = NULL;
+ struct Scsi_Host *shost = lpfc_shost_from_vport(vport);
+
+ /*
+ * To conserve rpi's, filter out addresses for other
+ * vports on the same physical HBAs.
+ */
+ if (Did != vport->fc_myDID &&
+ (!lpfc_find_vport_by_did(phba, Did) ||
+ vport->cfg_peer_port_login)) {
+ if (!phba->nvmet_support) {
+ /* FCPI/NVMEI path. Process Did */
+ lpfc_prep_node_fc4type(vport, Did, fc4_type);
+ return;
+ }
+ /* NVMET path. NVMET only cares about NVMEI nodes. */
+ list_for_each_entry(ndlp, &vport->fc_nodes, nlp_listp) {
+ if (ndlp->nlp_type != NLP_NVME_INITIATOR ||
+ ndlp->nlp_state != NLP_STE_UNMAPPED_NODE)
+ continue;
+ spin_lock_irq(shost->host_lock);
+ if (ndlp->nlp_DID == Did)
+ ndlp->nlp_flag &= ~NLP_NVMET_RECOV;
+ else
+ ndlp->nlp_flag |= NLP_NVMET_RECOV;
+ spin_unlock_irq(shost->host_lock);
+ }
+ }
+}
+
static int
lpfc_ns_rsp(struct lpfc_vport *vport, struct lpfc_dmabuf *mp, uint8_t fc4_type,
uint32_t Size)
{
- struct lpfc_hba *phba = vport->phba;
struct lpfc_sli_ct_request *Response =
(struct lpfc_sli_ct_request *) mp->virt;
- struct lpfc_nodelist *ndlp = NULL;
struct lpfc_dmabuf *mlast, *next_mp;
uint32_t *ctptr = (uint32_t *) & Response->un.gid.PortType;
uint32_t Did, CTentry;
int Cnt;
struct list_head head;
+ struct Scsi_Host *shost = lpfc_shost_from_vport(vport);
+ struct lpfc_nodelist *ndlp = NULL;
lpfc_set_disctmo(vport);
vport->num_disc_nodes = 0;
@@ -574,19 +608,7 @@ lpfc_ns_rsp(struct lpfc_vport *vport, struct lpfc_dmabuf *mp, uint8_t fc4_type,
/* Get next DID from NameServer List */
CTentry = *ctptr++;
Did = ((be32_to_cpu(CTentry)) & Mask_DID);
-
- ndlp = NULL;
-
- /*
- * Check for rscn processing or not
- * To conserve rpi's, filter out addresses for other
- * vports on the same physical HBAs.
- */
- if ((Did != vport->fc_myDID) &&
- ((lpfc_find_vport_by_did(phba, Did) == NULL) ||
- vport->cfg_peer_port_login))
- lpfc_prep_node_fc4type(vport, Did, fc4_type);
-
+ lpfc_ns_rsp_audit_did(vport, Did, fc4_type);
if (CTentry & (cpu_to_be32(SLI_CT_LAST_ENTRY)))
goto nsout1;
@@ -596,6 +618,22 @@ lpfc_ns_rsp(struct lpfc_vport *vport, struct lpfc_dmabuf *mp, uint8_t fc4_type,
}
+ /* All GID_FT entries processed. If the driver is running in
+ * in target mode, put impacted nodes into recovery and drop
+ * the RPI to flush outstanding IO.
+ */
+ if (vport->phba->nvmet_support) {
+ list_for_each_entry(ndlp, &vport->fc_nodes, nlp_listp) {
+ if (!(ndlp->nlp_flag & NLP_NVMET_RECOV))
+ continue;
+ lpfc_disc_state_machine(vport, ndlp, NULL,
+ NLP_EVT_DEVICE_RECOVERY);
+ spin_lock_irq(shost->host_lock);
+ ndlp->nlp_flag &= ~NLP_NVMET_RECOV;
+ spin_lock_irq(shost->host_lock);
+ }
+ }
+
nsout1:
list_del(&head);
return 0;
diff --git a/drivers/scsi/lpfc/lpfc_debugfs.c b/drivers/scsi/lpfc/lpfc_debugfs.c
index 913eed822cb8..fce549a91911 100644
--- a/drivers/scsi/lpfc/lpfc_debugfs.c
+++ b/drivers/scsi/lpfc/lpfc_debugfs.c
@@ -745,73 +745,102 @@ lpfc_debugfs_nvmestat_data(struct lpfc_vport *vport, char *buf, int size)
{
struct lpfc_hba *phba = vport->phba;
struct lpfc_nvmet_tgtport *tgtp;
+ struct lpfc_nvmet_rcv_ctx *ctxp, *next_ctxp;
int len = 0;
+ int cnt;
if (phba->nvmet_support) {
if (!phba->targetport)
return len;
tgtp = (struct lpfc_nvmet_tgtport *)phba->targetport->private;
- len += snprintf(buf+len, size-len,
+ len += snprintf(buf + len, size - len,
"\nNVME Targetport Statistics\n");
- len += snprintf(buf+len, size-len,
+ len += snprintf(buf + len, size - len,
"LS: Rcv %08x Drop %08x Abort %08x\n",
atomic_read(&tgtp->rcv_ls_req_in),
atomic_read(&tgtp->rcv_ls_req_drop),
atomic_read(&tgtp->xmt_ls_abort));
if (atomic_read(&tgtp->rcv_ls_req_in) !=
atomic_read(&tgtp->rcv_ls_req_out)) {
- len += snprintf(buf+len, size-len,
+ len += snprintf(buf + len, size - len,
"Rcv LS: in %08x != out %08x\n",
atomic_read(&tgtp->rcv_ls_req_in),
atomic_read(&tgtp->rcv_ls_req_out));
}
- len += snprintf(buf+len, size-len,
+ len += snprintf(buf + len, size - len,
"LS: Xmt %08x Drop %08x Cmpl %08x Err %08x\n",
atomic_read(&tgtp->xmt_ls_rsp),
atomic_read(&tgtp->xmt_ls_drop),
atomic_read(&tgtp->xmt_ls_rsp_cmpl),
atomic_read(&tgtp->xmt_ls_rsp_error));
- len += snprintf(buf+len, size-len,
+ len += snprintf(buf + len, size - len,
"FCP: Rcv %08x Drop %08x\n",
atomic_read(&tgtp->rcv_fcp_cmd_in),
atomic_read(&tgtp->rcv_fcp_cmd_drop));
if (atomic_read(&tgtp->rcv_fcp_cmd_in) !=
atomic_read(&tgtp->rcv_fcp_cmd_out)) {
- len += snprintf(buf+len, size-len,
+ len += snprintf(buf + len, size - len,
"Rcv FCP: in %08x != out %08x\n",
atomic_read(&tgtp->rcv_fcp_cmd_in),
atomic_read(&tgtp->rcv_fcp_cmd_out));
}
- len += snprintf(buf+len, size-len,
- "FCP Rsp: read %08x readrsp %08x write %08x rsp %08x\n",
+ len += snprintf(buf + len, size - len,
+ "FCP Rsp: read %08x readrsp %08x "
+ "write %08x rsp %08x\n",
atomic_read(&tgtp->xmt_fcp_read),
atomic_read(&tgtp->xmt_fcp_read_rsp),
atomic_read(&tgtp->xmt_fcp_write),
atomic_read(&tgtp->xmt_fcp_rsp));
- len += snprintf(buf+len, size-len,
+ len += snprintf(buf + len, size - len,
"FCP Rsp: abort %08x drop %08x\n",
atomic_read(&tgtp->xmt_fcp_abort),
atomic_read(&tgtp->xmt_fcp_drop));
- len += snprintf(buf+len, size-len,
+ len += snprintf(buf + len, size - len,
"FCP Rsp Cmpl: %08x err %08x drop %08x\n",
atomic_read(&tgtp->xmt_fcp_rsp_cmpl),
atomic_read(&tgtp->xmt_fcp_rsp_error),
atomic_read(&tgtp->xmt_fcp_rsp_drop));
- len += snprintf(buf+len, size-len,
+ len += snprintf(buf + len, size - len,
"ABORT: Xmt %08x Err %08x Cmpl %08x",
atomic_read(&tgtp->xmt_abort_rsp),
atomic_read(&tgtp->xmt_abort_rsp_error),
atomic_read(&tgtp->xmt_abort_cmpl));
- len += snprintf(buf+len, size-len, "\n");
+ len += snprintf(buf + len, size - len, "\n");
+
+ cnt = 0;
+ spin_lock(&phba->sli4_hba.abts_nvme_buf_list_lock);
+ list_for_each_entry_safe(ctxp, next_ctxp,
+ &phba->sli4_hba.lpfc_abts_nvmet_ctx_list,
+ list) {
+ cnt++;
+ }
+ spin_unlock(&phba->sli4_hba.abts_nvme_buf_list_lock);
+ if (cnt) {
+ len += snprintf(buf + len, size - len,
+ "ABORT: %d ctx entries\n", cnt);
+ spin_lock(&phba->sli4_hba.abts_nvme_buf_list_lock);
+ list_for_each_entry_safe(ctxp, next_ctxp,
+ &phba->sli4_hba.lpfc_abts_nvmet_ctx_list,
+ list) {
+ if (len >= (size - LPFC_DEBUG_OUT_LINE_SZ))
+ break;
+ len += snprintf(buf + len, size - len,
+ "Entry: oxid %x state %x "
+ "flag %x\n",
+ ctxp->oxid, ctxp->state,
+ ctxp->flag);
+ }
+ spin_unlock(&phba->sli4_hba.abts_nvme_buf_list_lock);
+ }
} else {
if (!(phba->cfg_enable_fc4_type & LPFC_ENABLE_NVME))
return len;
@@ -3128,8 +3157,6 @@ __lpfc_idiag_print_rqpair(struct lpfc_queue *qp, struct lpfc_queue *datqp,
datqp->queue_id, datqp->entry_count,
datqp->entry_size, datqp->host_index,
datqp->hba_index);
- len += snprintf(pbuffer + len, LPFC_QUE_INFO_GET_BUF_SIZE - len, "\n");
-
return len;
}
@@ -5700,10 +5727,8 @@ lpfc_debugfs_terminate(struct lpfc_vport *vport)
#ifdef CONFIG_SCSI_LPFC_DEBUG_FS
struct lpfc_hba *phba = vport->phba;
- if (vport->disc_trc) {
- kfree(vport->disc_trc);
- vport->disc_trc = NULL;
- }
+ kfree(vport->disc_trc);
+ vport->disc_trc = NULL;
debugfs_remove(vport->debug_disc_trc); /* discovery_trace */
vport->debug_disc_trc = NULL;
@@ -5770,10 +5795,8 @@ lpfc_debugfs_terminate(struct lpfc_vport *vport)
debugfs_remove(phba->debug_readRef); /* readRef */
phba->debug_readRef = NULL;
- if (phba->slow_ring_trc) {
- kfree(phba->slow_ring_trc);
- phba->slow_ring_trc = NULL;
- }
+ kfree(phba->slow_ring_trc);
+ phba->slow_ring_trc = NULL;
/* slow_ring_trace */
debugfs_remove(phba->debug_slow_ring_trc);
diff --git a/drivers/scsi/lpfc/lpfc_disc.h b/drivers/scsi/lpfc/lpfc_disc.h
index f4ff99d95db3..9d5a379f4b15 100644
--- a/drivers/scsi/lpfc/lpfc_disc.h
+++ b/drivers/scsi/lpfc/lpfc_disc.h
@@ -157,6 +157,7 @@ struct lpfc_node_rrq {
#define NLP_LOGO_SND 0x00000100 /* sent LOGO request for this entry */
#define NLP_RNID_SND 0x00000400 /* sent RNID request for this entry */
#define NLP_ELS_SND_MASK 0x000007e0 /* sent ELS request for this entry */
+#define NLP_NVMET_RECOV 0x00001000 /* NVMET auditing node for recovery. */
#define NLP_DEFER_RM 0x00010000 /* Remove this ndlp if no longer used */
#define NLP_DELAY_TMO 0x00020000 /* delay timeout is running for node */
#define NLP_NPR_2B_DISC 0x00040000 /* node is included in num_disc_nodes */
diff --git a/drivers/scsi/lpfc/lpfc_els.c b/drivers/scsi/lpfc/lpfc_els.c
index a5ca37e45fb6..67827e397431 100644
--- a/drivers/scsi/lpfc/lpfc_els.c
+++ b/drivers/scsi/lpfc/lpfc_els.c
@@ -603,9 +603,11 @@ lpfc_check_clean_addr_bit(struct lpfc_vport *vport,
memcmp(&vport->fabric_portname, &sp->portName,
sizeof(struct lpfc_name)) ||
memcmp(&vport->fabric_nodename, &sp->nodeName,
- sizeof(struct lpfc_name)))
+ sizeof(struct lpfc_name)) ||
+ (vport->vport_flag & FAWWPN_PARAM_CHG)) {
fabric_param_changed = 1;
-
+ vport->vport_flag &= ~FAWWPN_PARAM_CHG;
+ }
/*
* Word 1 Bit 31 in common service parameter is overloaded.
* Word 1 Bit 31 in FLOGI request is multiple NPort request
@@ -895,10 +897,9 @@ lpfc_cmpl_els_flogi_nport(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp,
* Cannot find existing Fabric ndlp, so allocate a
* new one
*/
- ndlp = mempool_alloc(phba->nlp_mem_pool, GFP_KERNEL);
+ ndlp = lpfc_nlp_init(vport, PT2PT_RemoteID);
if (!ndlp)
goto fail;
- lpfc_nlp_init(vport, ndlp, PT2PT_RemoteID);
} else if (!NLP_CHK_NODE_ACT(ndlp)) {
ndlp = lpfc_enable_node(vport, ndlp,
NLP_STE_UNUSED_NODE);
@@ -1364,7 +1365,6 @@ lpfc_els_abort_flogi(struct lpfc_hba *phba)
int
lpfc_initial_flogi(struct lpfc_vport *vport)
{
- struct lpfc_hba *phba = vport->phba;
struct lpfc_nodelist *ndlp;
vport->port_state = LPFC_FLOGI;
@@ -1374,10 +1374,9 @@ lpfc_initial_flogi(struct lpfc_vport *vport)
ndlp = lpfc_findnode_did(vport, Fabric_DID);
if (!ndlp) {
/* Cannot find existing Fabric ndlp, so allocate a new one */
- ndlp = mempool_alloc(phba->nlp_mem_pool, GFP_KERNEL);
+ ndlp = lpfc_nlp_init(vport, Fabric_DID);
if (!ndlp)
return 0;
- lpfc_nlp_init(vport, ndlp, Fabric_DID);
/* Set the node type */
ndlp->nlp_type |= NLP_FABRIC;
/* Put ndlp onto node list */
@@ -1418,17 +1417,15 @@ lpfc_initial_flogi(struct lpfc_vport *vport)
int
lpfc_initial_fdisc(struct lpfc_vport *vport)
{
- struct lpfc_hba *phba = vport->phba;
struct lpfc_nodelist *ndlp;
/* First look for the Fabric ndlp */
ndlp = lpfc_findnode_did(vport, Fabric_DID);
if (!ndlp) {
/* Cannot find existing Fabric ndlp, so allocate a new one */
- ndlp = mempool_alloc(phba->nlp_mem_pool, GFP_KERNEL);
+ ndlp = lpfc_nlp_init(vport, Fabric_DID);
if (!ndlp)
return 0;
- lpfc_nlp_init(vport, ndlp, Fabric_DID);
/* Put ndlp onto node list */
lpfc_enqueue_node(vport, ndlp);
} else if (!NLP_CHK_NODE_ACT(ndlp)) {
@@ -1564,14 +1561,13 @@ lpfc_plogi_confirm_nport(struct lpfc_hba *phba, uint32_t *prsp,
phba->active_rrq_pool);
return ndlp;
}
- new_ndlp = mempool_alloc(phba->nlp_mem_pool, GFP_ATOMIC);
+ new_ndlp = lpfc_nlp_init(vport, ndlp->nlp_DID);
if (!new_ndlp) {
if (active_rrqs_xri_bitmap)
mempool_free(active_rrqs_xri_bitmap,
phba->active_rrq_pool);
return ndlp;
}
- lpfc_nlp_init(vport, new_ndlp, ndlp->nlp_DID);
} else if (!NLP_CHK_NODE_ACT(new_ndlp)) {
rc = memcmp(&ndlp->nlp_portname, name,
sizeof(struct lpfc_name));
@@ -2845,10 +2841,9 @@ lpfc_issue_els_scr(struct lpfc_vport *vport, uint32_t nportid, uint8_t retry)
ndlp = lpfc_findnode_did(vport, nportid);
if (!ndlp) {
- ndlp = mempool_alloc(phba->nlp_mem_pool, GFP_KERNEL);
+ ndlp = lpfc_nlp_init(vport, nportid);
if (!ndlp)
return 1;
- lpfc_nlp_init(vport, ndlp, nportid);
lpfc_enqueue_node(vport, ndlp);
} else if (!NLP_CHK_NODE_ACT(ndlp)) {
ndlp = lpfc_enable_node(vport, ndlp, NLP_STE_UNUSED_NODE);
@@ -2938,10 +2933,9 @@ lpfc_issue_els_farpr(struct lpfc_vport *vport, uint32_t nportid, uint8_t retry)
ndlp = lpfc_findnode_did(vport, nportid);
if (!ndlp) {
- ndlp = mempool_alloc(phba->nlp_mem_pool, GFP_KERNEL);
+ ndlp = lpfc_nlp_init(vport, nportid);
if (!ndlp)
return 1;
- lpfc_nlp_init(vport, ndlp, nportid);
lpfc_enqueue_node(vport, ndlp);
} else if (!NLP_CHK_NODE_ACT(ndlp)) {
ndlp = lpfc_enable_node(vport, ndlp, NLP_STE_UNUSED_NODE);
@@ -4403,7 +4397,7 @@ lpfc_els_rsp_prli_acc(struct lpfc_vport *vport, struct lpfc_iocbq *oldiocb,
pcmd = (uint8_t *) (((struct lpfc_dmabuf *) elsiocb->context2)->virt);
memset(pcmd, 0, cmdsize);
- *((uint32_t *) (pcmd)) = (ELS_CMD_ACC | (ELS_CMD_PRLI & ~ELS_RSP_MASK));
+ *((uint32_t *)(pcmd)) = elsrspcmd;
pcmd += sizeof(uint32_t);
/* For PRLI, remainder of payload is PRLI parameter page */
@@ -5867,8 +5861,11 @@ lpfc_rscn_recovery_check(struct lpfc_vport *vport)
(ndlp->nlp_state == NLP_STE_UNUSED_NODE) ||
!lpfc_rscn_payload_check(vport, ndlp->nlp_DID))
continue;
+
+ /* NVME Target mode does not do RSCN Recovery. */
if (vport->phba->nvmet_support)
continue;
+
lpfc_disc_state_machine(vport, ndlp, NULL,
NLP_EVT_DEVICE_RECOVERY);
lpfc_cancel_retry_delay_tmo(vport, ndlp);
@@ -6133,7 +6130,6 @@ int
lpfc_els_handle_rscn(struct lpfc_vport *vport)
{
struct lpfc_nodelist *ndlp;
- struct lpfc_hba *phba = vport->phba;
/* Ignore RSCN if the port is being torn down. */
if (vport->load_flag & FC_UNLOADING) {
@@ -6157,22 +6153,16 @@ lpfc_els_handle_rscn(struct lpfc_vport *vport)
ndlp = lpfc_findnode_did(vport, NameServer_DID);
if (ndlp && NLP_CHK_NODE_ACT(ndlp)
&& ndlp->nlp_state == NLP_STE_UNMAPPED_NODE) {
- /* Good ndlp, issue CT Request to NameServer */
+ /* Good ndlp, issue CT Request to NameServer. Need to
+ * know how many gidfts were issued. If none, then just
+ * flush the RSCN. Otherwise, the outstanding requests
+ * need to complete.
+ */
vport->gidft_inp = 0;
- if (lpfc_issue_gidft(vport) == 0)
- /* Wait for NameServer query cmpl before we can
- * continue
- */
+ if (lpfc_issue_gidft(vport) > 0)
return 1;
} else {
- /* If login to NameServer does not exist, issue one */
- /* Good status, issue PLOGI to NameServer */
- ndlp = lpfc_findnode_did(vport, NameServer_DID);
- if (ndlp && NLP_CHK_NODE_ACT(ndlp))
- /* Wait for NameServer login cmpl before we can
- continue */
- return 1;
-
+ /* Nameserver login in question. Revalidate. */
if (ndlp) {
ndlp = lpfc_enable_node(vport, ndlp,
NLP_STE_PLOGI_ISSUE);
@@ -6182,12 +6172,11 @@ lpfc_els_handle_rscn(struct lpfc_vport *vport)
}
ndlp->nlp_prev_state = NLP_STE_UNUSED_NODE;
} else {
- ndlp = mempool_alloc(phba->nlp_mem_pool, GFP_KERNEL);
+ ndlp = lpfc_nlp_init(vport, NameServer_DID);
if (!ndlp) {
lpfc_els_flush_rscn(vport);
return 0;
}
- lpfc_nlp_init(vport, ndlp, NameServer_DID);
ndlp->nlp_prev_state = ndlp->nlp_state;
lpfc_nlp_set_state(vport, ndlp, NLP_STE_PLOGI_ISSUE);
}
@@ -7746,11 +7735,9 @@ lpfc_els_unsol_buffer(struct lpfc_hba *phba, struct lpfc_sli_ring *pring,
ndlp = lpfc_findnode_did(vport, did);
if (!ndlp) {
/* Cannot find existing Fabric ndlp, so allocate a new one */
- ndlp = mempool_alloc(phba->nlp_mem_pool, GFP_KERNEL);
+ ndlp = lpfc_nlp_init(vport, did);
if (!ndlp)
goto dropit;
-
- lpfc_nlp_init(vport, ndlp, did);
lpfc_nlp_set_state(vport, ndlp, NLP_STE_NPR_NODE);
newnode = 1;
if ((did & Fabric_DID_MASK) == Fabric_DID_MASK)
@@ -8193,7 +8180,6 @@ lpfc_els_unsol_event(struct lpfc_hba *phba, struct lpfc_sli_ring *pring,
static void
lpfc_start_fdmi(struct lpfc_vport *vport)
{
- struct lpfc_hba *phba = vport->phba;
struct lpfc_nodelist *ndlp;
/* If this is the first time, allocate an ndlp and initialize
@@ -8202,9 +8188,8 @@ lpfc_start_fdmi(struct lpfc_vport *vport)
*/
ndlp = lpfc_findnode_did(vport, FDMI_DID);
if (!ndlp) {
- ndlp = mempool_alloc(phba->nlp_mem_pool, GFP_KERNEL);
+ ndlp = lpfc_nlp_init(vport, FDMI_DID);
if (ndlp) {
- lpfc_nlp_init(vport, ndlp, FDMI_DID);
ndlp->nlp_type |= NLP_FABRIC;
} else {
return;
@@ -8257,7 +8242,7 @@ lpfc_do_scr_ns_plogi(struct lpfc_hba *phba, struct lpfc_vport *vport)
ndlp = lpfc_findnode_did(vport, NameServer_DID);
if (!ndlp) {
- ndlp = mempool_alloc(phba->nlp_mem_pool, GFP_KERNEL);
+ ndlp = lpfc_nlp_init(vport, NameServer_DID);
if (!ndlp) {
if (phba->fc_topology == LPFC_TOPOLOGY_LOOP) {
lpfc_disc_start(vport);
@@ -8268,7 +8253,6 @@ lpfc_do_scr_ns_plogi(struct lpfc_hba *phba, struct lpfc_vport *vport)
"0251 NameServer login: no memory\n");
return;
}
- lpfc_nlp_init(vport, ndlp, NameServer_DID);
} else if (!NLP_CHK_NODE_ACT(ndlp)) {
ndlp = lpfc_enable_node(vport, ndlp, NLP_STE_UNUSED_NODE);
if (!ndlp) {
@@ -8771,7 +8755,7 @@ lpfc_issue_els_fdisc(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp,
pcmd += sizeof(uint32_t); /* Node Name */
pcmd += sizeof(uint32_t); /* Node Name */
memcpy(pcmd, &vport->fc_nodename, 8);
-
+ memset(sp->un.vendorVersion, 0, sizeof(sp->un.vendorVersion));
lpfc_set_disctmo(vport);
phba->fc_stat.elsXmitFDISC++;
diff --git a/drivers/scsi/lpfc/lpfc_hbadisc.c b/drivers/scsi/lpfc/lpfc_hbadisc.c
index 180b072beef6..0482c5580331 100644
--- a/drivers/scsi/lpfc/lpfc_hbadisc.c
+++ b/drivers/scsi/lpfc/lpfc_hbadisc.c
@@ -3002,6 +3002,7 @@ lpfc_mbx_cmpl_read_sparam(struct lpfc_hba *phba, LPFC_MBOXQ_t *pmb)
MAILBOX_t *mb = &pmb->u.mb;
struct lpfc_dmabuf *mp = (struct lpfc_dmabuf *) pmb->context1;
struct lpfc_vport *vport = pmb->vport;
+ struct Scsi_Host *shost = lpfc_shost_from_vport(vport);
struct serv_parm *sp = &vport->fc_sparam;
uint32_t ed_tov;
@@ -3031,6 +3032,7 @@ lpfc_mbx_cmpl_read_sparam(struct lpfc_hba *phba, LPFC_MBOXQ_t *pmb)
}
lpfc_update_vport_wwn(vport);
+ fc_host_port_name(shost) = wwn_to_u64(vport->fc_portname.u.wwn);
if (vport->port_type == LPFC_PHYSICAL_PORT) {
memcpy(&phba->wwnn, &vport->fc_nodename, sizeof(phba->wwnn));
memcpy(&phba->wwpn, &vport->fc_portname, sizeof(phba->wwnn));
@@ -3309,6 +3311,7 @@ lpfc_mbx_cmpl_read_topology(struct lpfc_hba *phba, LPFC_MBOXQ_t *pmb)
struct lpfc_sli_ring *pring;
MAILBOX_t *mb = &pmb->u.mb;
struct lpfc_dmabuf *mp = (struct lpfc_dmabuf *) (pmb->context1);
+ uint8_t attn_type;
/* Unblock ELS traffic */
pring = lpfc_phba_elsring(phba);
@@ -3325,6 +3328,7 @@ lpfc_mbx_cmpl_read_topology(struct lpfc_hba *phba, LPFC_MBOXQ_t *pmb)
}
la = (struct lpfc_mbx_read_top *) &pmb->u.mb.un.varReadTop;
+ attn_type = bf_get(lpfc_mbx_read_top_att_type, la);
memcpy(&phba->alpa_map[0], mp->virt, 128);
@@ -3337,7 +3341,7 @@ lpfc_mbx_cmpl_read_topology(struct lpfc_hba *phba, LPFC_MBOXQ_t *pmb)
if (phba->fc_eventTag <= la->eventTag) {
phba->fc_stat.LinkMultiEvent++;
- if (bf_get(lpfc_mbx_read_top_att_type, la) == LPFC_ATT_LINK_UP)
+ if (attn_type == LPFC_ATT_LINK_UP)
if (phba->fc_eventTag != 0)
lpfc_linkdown(phba);
}
@@ -3353,7 +3357,7 @@ lpfc_mbx_cmpl_read_topology(struct lpfc_hba *phba, LPFC_MBOXQ_t *pmb)
}
phba->link_events++;
- if ((bf_get(lpfc_mbx_read_top_att_type, la) == LPFC_ATT_LINK_UP) &&
+ if ((attn_type == LPFC_ATT_LINK_UP) &&
!(phba->sli.sli_flag & LPFC_MENLO_MAINT)) {
phba->fc_stat.LinkUp++;
if (phba->link_flag & LS_LOOPBACK_MODE) {
@@ -3379,8 +3383,8 @@ lpfc_mbx_cmpl_read_topology(struct lpfc_hba *phba, LPFC_MBOXQ_t *pmb)
phba->wait_4_mlo_maint_flg);
}
lpfc_mbx_process_link_up(phba, la);
- } else if (bf_get(lpfc_mbx_read_top_att_type, la) ==
- LPFC_ATT_LINK_DOWN) {
+ } else if (attn_type == LPFC_ATT_LINK_DOWN ||
+ attn_type == LPFC_ATT_UNEXP_WWPN) {
phba->fc_stat.LinkDown++;
if (phba->link_flag & LS_LOOPBACK_MODE)
lpfc_printf_log(phba, KERN_ERR, LOG_LINK_EVENT,
@@ -3389,6 +3393,14 @@ lpfc_mbx_cmpl_read_topology(struct lpfc_hba *phba, LPFC_MBOXQ_t *pmb)
"Data: x%x x%x x%x\n",
la->eventTag, phba->fc_eventTag,
phba->pport->port_state, vport->fc_flag);
+ else if (attn_type == LPFC_ATT_UNEXP_WWPN)
+ lpfc_printf_log(phba, KERN_ERR, LOG_LINK_EVENT,
+ "1313 Link Down UNEXP WWPN Event x%x received "
+ "Data: x%x x%x x%x x%x x%x\n",
+ la->eventTag, phba->fc_eventTag,
+ phba->pport->port_state, vport->fc_flag,
+ bf_get(lpfc_mbx_read_top_mm, la),
+ bf_get(lpfc_mbx_read_top_fa, la));
else
lpfc_printf_log(phba, KERN_ERR, LOG_LINK_EVENT,
"1305 Link Down Event x%x received "
@@ -3399,8 +3411,8 @@ lpfc_mbx_cmpl_read_topology(struct lpfc_hba *phba, LPFC_MBOXQ_t *pmb)
bf_get(lpfc_mbx_read_top_fa, la));
lpfc_mbx_issue_link_down(phba);
}
- if ((phba->sli.sli_flag & LPFC_MENLO_MAINT) &&
- ((bf_get(lpfc_mbx_read_top_att_type, la) == LPFC_ATT_LINK_UP))) {
+ if (phba->sli.sli_flag & LPFC_MENLO_MAINT &&
+ attn_type == LPFC_ATT_LINK_UP) {
if (phba->link_state != LPFC_LINK_DOWN) {
phba->fc_stat.LinkDown++;
lpfc_printf_log(phba, KERN_ERR, LOG_LINK_EVENT,
@@ -4136,7 +4148,6 @@ lpfc_nlp_state_cleanup(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp,
int old_state, int new_state)
{
struct Scsi_Host *shost = lpfc_shost_from_vport(vport);
- struct lpfc_hba *phba = vport->phba;
if (new_state == NLP_STE_UNMAPPED_NODE) {
ndlp->nlp_flag &= ~NLP_NODEV_REMOVE;
@@ -4155,14 +4166,14 @@ lpfc_nlp_state_cleanup(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp,
lpfc_unregister_remote_port(ndlp);
}
- /* Notify the NVME transport of this rport's loss */
- if (((phba->cfg_enable_fc4_type == LPFC_ENABLE_BOTH) ||
- (phba->cfg_enable_fc4_type == LPFC_ENABLE_NVME)) &&
- (vport->phba->nvmet_support == 0) &&
- ((ndlp->nlp_fc4_type & NLP_FC4_NVME) ||
- (ndlp->nlp_DID == Fabric_DID))) {
+ /* Notify the NVME transport of this rport's loss on the
+ * Initiator. For NVME Target, should upcall transport
+ * in the else clause when API available.
+ */
+ if (ndlp->nlp_fc4_type & NLP_FC4_NVME) {
vport->phba->nport_event_cnt++;
- lpfc_nvme_unregister_port(vport, ndlp);
+ if (vport->phba->nvmet_support == 0)
+ lpfc_nvme_unregister_port(vport, ndlp);
}
}
@@ -4368,10 +4379,17 @@ lpfc_enable_node(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp,
uint32_t did;
unsigned long flags;
unsigned long *active_rrqs_xri_bitmap = NULL;
+ int rpi = LPFC_RPI_ALLOC_ERROR;
if (!ndlp)
return NULL;
+ if (phba->sli_rev == LPFC_SLI_REV4) {
+ rpi = lpfc_sli4_alloc_rpi(vport->phba);
+ if (rpi == LPFC_RPI_ALLOC_ERROR)
+ return NULL;
+ }
+
spin_lock_irqsave(&phba->ndlp_lock, flags);
/* The ndlp should not be in memory free mode */
if (NLP_CHK_FREE_REQ(ndlp)) {
@@ -4381,7 +4399,7 @@ lpfc_enable_node(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp,
"usgmap:x%x refcnt:%d\n",
(void *)ndlp, ndlp->nlp_usg_map,
kref_read(&ndlp->kref));
- return NULL;
+ goto free_rpi;
}
/* The ndlp should not already be in active mode */
if (NLP_CHK_NODE_ACT(ndlp)) {
@@ -4391,7 +4409,7 @@ lpfc_enable_node(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp,
"usgmap:x%x refcnt:%d\n",
(void *)ndlp, ndlp->nlp_usg_map,
kref_read(&ndlp->kref));
- return NULL;
+ goto free_rpi;
}
/* Keep the original DID */
@@ -4409,7 +4427,7 @@ lpfc_enable_node(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp,
spin_unlock_irqrestore(&phba->ndlp_lock, flags);
if (vport->phba->sli_rev == LPFC_SLI_REV4) {
- ndlp->nlp_rpi = lpfc_sli4_alloc_rpi(vport->phba);
+ ndlp->nlp_rpi = rpi;
lpfc_printf_vlog(vport, KERN_INFO, LOG_NODE,
"0008 rpi:%x DID:%x flg:%x refcnt:%d "
"map:%x %p\n", ndlp->nlp_rpi, ndlp->nlp_DID,
@@ -4426,6 +4444,11 @@ lpfc_enable_node(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp,
"node enable: did:x%x",
ndlp->nlp_DID, 0, 0);
return ndlp;
+
+free_rpi:
+ if (phba->sli_rev == LPFC_SLI_REV4)
+ lpfc_sli4_free_rpi(vport->phba, rpi);
+ return NULL;
}
void
@@ -5104,65 +5127,82 @@ lpfc_setup_disc_node(struct lpfc_vport *vport, uint32_t did)
ndlp = lpfc_findnode_did(vport, did);
if (!ndlp) {
+ if (vport->phba->nvmet_support)
+ return NULL;
if ((vport->fc_flag & FC_RSCN_MODE) != 0 &&
lpfc_rscn_payload_check(vport, did) == 0)
return NULL;
- ndlp = (struct lpfc_nodelist *)
- mempool_alloc(vport->phba->nlp_mem_pool, GFP_KERNEL);
+ ndlp = lpfc_nlp_init(vport, did);
if (!ndlp)
return NULL;
- lpfc_nlp_init(vport, ndlp, did);
lpfc_nlp_set_state(vport, ndlp, NLP_STE_NPR_NODE);
- if (vport->phba->nvmet_support)
- return ndlp;
spin_lock_irq(shost->host_lock);
ndlp->nlp_flag |= NLP_NPR_2B_DISC;
spin_unlock_irq(shost->host_lock);
return ndlp;
} else if (!NLP_CHK_NODE_ACT(ndlp)) {
+ if (vport->phba->nvmet_support)
+ return NULL;
ndlp = lpfc_enable_node(vport, ndlp, NLP_STE_NPR_NODE);
if (!ndlp)
return NULL;
- if (vport->phba->nvmet_support)
- return ndlp;
spin_lock_irq(shost->host_lock);
ndlp->nlp_flag |= NLP_NPR_2B_DISC;
spin_unlock_irq(shost->host_lock);
return ndlp;
}
+ /* The NVME Target does not want to actively manage an rport.
+ * The goal is to allow the target to reset its state and clear
+ * pending IO in preparation for the initiator to recover.
+ */
if ((vport->fc_flag & FC_RSCN_MODE) &&
!(vport->fc_flag & FC_NDISC_ACTIVE)) {
if (lpfc_rscn_payload_check(vport, did)) {
- /* If we've already received a PLOGI from this NPort
- * we don't need to try to discover it again.
- */
- if (ndlp->nlp_flag & NLP_RCV_PLOGI)
- return NULL;
/* Since this node is marked for discovery,
* delay timeout is not needed.
*/
lpfc_cancel_retry_delay_tmo(vport, ndlp);
+
+ /* NVME Target mode waits until rport is known to be
+ * impacted by the RSCN before it transitions. No
+ * active management - just go to NPR provided the
+ * node had a valid login.
+ */
if (vport->phba->nvmet_support)
return ndlp;
+
+ /* If we've already received a PLOGI from this NPort
+ * we don't need to try to discover it again.
+ */
+ if (ndlp->nlp_flag & NLP_RCV_PLOGI)
+ return NULL;
+
spin_lock_irq(shost->host_lock);
ndlp->nlp_flag |= NLP_NPR_2B_DISC;
spin_unlock_irq(shost->host_lock);
} else
ndlp = NULL;
} else {
- /* If we've already received a PLOGI from this NPort,
- * or we are already in the process of discovery on it,
- * we don't need to try to discover it again.
+ /* If the initiator received a PLOGI from this NPort or if the
+ * initiator is already in the process of discovery on it,
+ * there's no need to try to discover it again.
*/
if (ndlp->nlp_state == NLP_STE_ADISC_ISSUE ||
ndlp->nlp_state == NLP_STE_PLOGI_ISSUE ||
- ndlp->nlp_flag & NLP_RCV_PLOGI)
+ (!vport->phba->nvmet_support &&
+ ndlp->nlp_flag & NLP_RCV_PLOGI))
return NULL;
- lpfc_nlp_set_state(vport, ndlp, NLP_STE_NPR_NODE);
+
if (vport->phba->nvmet_support)
return ndlp;
+
+ /* Moving to NPR state clears unsolicited flags and
+ * allows for rediscovery
+ */
+ lpfc_nlp_set_state(vport, ndlp, NLP_STE_NPR_NODE);
+
spin_lock_irq(shost->host_lock);
ndlp->nlp_flag |= NLP_NPR_2B_DISC;
spin_unlock_irq(shost->host_lock);
@@ -5887,16 +5927,31 @@ lpfc_find_vport_by_vpid(struct lpfc_hba *phba, uint16_t vpi)
return NULL;
}
-void
-lpfc_nlp_init(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp,
- uint32_t did)
+struct lpfc_nodelist *
+lpfc_nlp_init(struct lpfc_vport *vport, uint32_t did)
{
+ struct lpfc_nodelist *ndlp;
+ int rpi = LPFC_RPI_ALLOC_ERROR;
+
+ if (vport->phba->sli_rev == LPFC_SLI_REV4) {
+ rpi = lpfc_sli4_alloc_rpi(vport->phba);
+ if (rpi == LPFC_RPI_ALLOC_ERROR)
+ return NULL;
+ }
+
+ ndlp = mempool_alloc(vport->phba->nlp_mem_pool, GFP_KERNEL);
+ if (!ndlp) {
+ if (vport->phba->sli_rev == LPFC_SLI_REV4)
+ lpfc_sli4_free_rpi(vport->phba, rpi);
+ return NULL;
+ }
+
memset(ndlp, 0, sizeof (struct lpfc_nodelist));
lpfc_initialize_node(vport, ndlp, did);
INIT_LIST_HEAD(&ndlp->nlp_listp);
if (vport->phba->sli_rev == LPFC_SLI_REV4) {
- ndlp->nlp_rpi = lpfc_sli4_alloc_rpi(vport->phba);
+ ndlp->nlp_rpi = rpi;
lpfc_printf_vlog(vport, KERN_INFO, LOG_NODE,
"0007 rpi:%x DID:%x flg:%x refcnt:%d "
"map:%x %p\n", ndlp->nlp_rpi, ndlp->nlp_DID,
@@ -5918,7 +5973,7 @@ lpfc_nlp_init(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp,
"node init: did:x%x",
ndlp->nlp_DID, 0, 0);
- return;
+ return ndlp;
}
/* This routine releases all resources associated with a specifc NPort's ndlp
diff --git a/drivers/scsi/lpfc/lpfc_hw.h b/drivers/scsi/lpfc/lpfc_hw.h
index 15ca21484150..26a5647e057e 100644
--- a/drivers/scsi/lpfc/lpfc_hw.h
+++ b/drivers/scsi/lpfc/lpfc_hw.h
@@ -509,6 +509,8 @@ struct class_parms {
uint8_t word3Reserved2; /* Fc Word 3, bit 0: 7 */
};
+#define FAPWWN_KEY_VENDOR 0x42524344 /*valid vendor version fawwpn key*/
+
struct serv_parm { /* Structure is in Big Endian format */
struct csp cmn;
struct lpfc_name portName;
@@ -2885,6 +2887,7 @@ struct lpfc_mbx_read_top {
#define LPFC_ATT_RESERVED 0x00 /* Reserved - attType */
#define LPFC_ATT_LINK_UP 0x01 /* Link is up */
#define LPFC_ATT_LINK_DOWN 0x02 /* Link is down */
+#define LPFC_ATT_UNEXP_WWPN 0x06 /* Link is down Unexpected WWWPN */
uint32_t word3;
#define lpfc_mbx_read_top_alpa_granted_SHIFT 24
#define lpfc_mbx_read_top_alpa_granted_MASK 0x000000FF
diff --git a/drivers/scsi/lpfc/lpfc_hw4.h b/drivers/scsi/lpfc/lpfc_hw4.h
index 15277705cb6b..1d12f2be36bc 100644
--- a/drivers/scsi/lpfc/lpfc_hw4.h
+++ b/drivers/scsi/lpfc/lpfc_hw4.h
@@ -2720,6 +2720,9 @@ struct lpfc_mbx_request_features {
#define lpfc_mbx_rq_ftr_rq_ifip_SHIFT 7
#define lpfc_mbx_rq_ftr_rq_ifip_MASK 0x00000001
#define lpfc_mbx_rq_ftr_rq_ifip_WORD word2
+#define lpfc_mbx_rq_ftr_rq_iaar_SHIFT 9
+#define lpfc_mbx_rq_ftr_rq_iaar_MASK 0x00000001
+#define lpfc_mbx_rq_ftr_rq_iaar_WORD word2
#define lpfc_mbx_rq_ftr_rq_perfh_SHIFT 11
#define lpfc_mbx_rq_ftr_rq_perfh_MASK 0x00000001
#define lpfc_mbx_rq_ftr_rq_perfh_WORD word2
@@ -3853,6 +3856,7 @@ struct lpfc_acqe_fc_la {
#define LPFC_FC_LA_TYPE_NO_HARD_ALPA 0x3
#define LPFC_FC_LA_TYPE_MDS_LINK_DOWN 0x4
#define LPFC_FC_LA_TYPE_MDS_LOOPBACK 0x5
+#define LPFC_FC_LA_TYPE_UNEXP_WWPN 0x6
#define lpfc_acqe_fc_la_port_type_SHIFT 6
#define lpfc_acqe_fc_la_port_type_MASK 0x00000003
#define lpfc_acqe_fc_la_port_type_WORD word0
diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c
index 6cc561b04211..90ae354a9c45 100644
--- a/drivers/scsi/lpfc/lpfc_init.c
+++ b/drivers/scsi/lpfc/lpfc_init.c
@@ -42,6 +42,10 @@
#include <scsi/scsi_device.h>
#include <scsi/scsi_host.h>
#include <scsi/scsi_transport_fc.h>
+#include <scsi/scsi_tcq.h>
+#include <scsi/fc/fc_fs.h>
+
+#include <linux/nvme-fc-driver.h>
#include "lpfc_hw4.h"
#include "lpfc_hw.h"
@@ -52,6 +56,7 @@
#include "lpfc.h"
#include "lpfc_scsi.h"
#include "lpfc_nvme.h"
+#include "lpfc_nvmet.h"
#include "lpfc_logmsg.h"
#include "lpfc_crtn.h"
#include "lpfc_vport.h"
@@ -335,6 +340,9 @@ lpfc_dump_wakeup_param_cmpl(struct lpfc_hba *phba, LPFC_MBOXQ_t *pmboxq)
void
lpfc_update_vport_wwn(struct lpfc_vport *vport)
{
+ uint8_t vvvl = vport->fc_sparam.cmn.valid_vendor_ver_level;
+ u32 *fawwpn_key = (u32 *)&vport->fc_sparam.un.vendorVersion[0];
+
/* If the soft name exists then update it using the service params */
if (vport->phba->cfg_soft_wwnn)
u64_to_wwn(vport->phba->cfg_soft_wwnn,
@@ -354,9 +362,25 @@ lpfc_update_vport_wwn(struct lpfc_vport *vport)
memcpy(&vport->fc_sparam.nodeName, &vport->fc_nodename,
sizeof(struct lpfc_name));
- if (vport->fc_portname.u.wwn[0] == 0 || vport->phba->cfg_soft_wwpn)
+ /*
+ * If the port name has changed, then set the Param changes flag
+ * to unreg the login
+ */
+ if (vport->fc_portname.u.wwn[0] != 0 &&
+ memcmp(&vport->fc_portname, &vport->fc_sparam.portName,
+ sizeof(struct lpfc_name)))
+ vport->vport_flag |= FAWWPN_PARAM_CHG;
+
+ if (vport->fc_portname.u.wwn[0] == 0 ||
+ vport->phba->cfg_soft_wwpn ||
+ (vvvl == 1 && cpu_to_be32(*fawwpn_key) == FAPWWN_KEY_VENDOR) ||
+ vport->vport_flag & FAWWPN_SET) {
memcpy(&vport->fc_portname, &vport->fc_sparam.portName,
sizeof(struct lpfc_name));
+ vport->vport_flag &= ~FAWWPN_SET;
+ if (vvvl == 1 && cpu_to_be32(*fawwpn_key) == FAPWWN_KEY_VENDOR)
+ vport->vport_flag |= FAWWPN_SET;
+ }
else
memcpy(&vport->fc_sparam.portName, &vport->fc_portname,
sizeof(struct lpfc_name));
@@ -1003,8 +1027,10 @@ static int
lpfc_hba_down_post_s4(struct lpfc_hba *phba)
{
struct lpfc_scsi_buf *psb, *psb_next;
+ struct lpfc_nvmet_rcv_ctx *ctxp, *ctxp_next;
LIST_HEAD(aborts);
LIST_HEAD(nvme_aborts);
+ LIST_HEAD(nvmet_aborts);
unsigned long iflag = 0;
struct lpfc_sglq *sglq_entry = NULL;
@@ -1027,16 +1053,10 @@ lpfc_hba_down_post_s4(struct lpfc_hba *phba)
list_for_each_entry(sglq_entry,
&phba->sli4_hba.lpfc_abts_els_sgl_list, list)
sglq_entry->state = SGL_FREED;
- list_for_each_entry(sglq_entry,
- &phba->sli4_hba.lpfc_abts_nvmet_sgl_list, list)
- sglq_entry->state = SGL_FREED;
list_splice_init(&phba->sli4_hba.lpfc_abts_els_sgl_list,
&phba->sli4_hba.lpfc_els_sgl_list);
- if (phba->sli4_hba.nvme_wq)
- list_splice_init(&phba->sli4_hba.lpfc_abts_nvmet_sgl_list,
- &phba->sli4_hba.lpfc_nvmet_sgl_list);
spin_unlock(&phba->sli4_hba.sgl_list_lock);
/* abts_scsi_buf_list_lock required because worker thread uses this
@@ -1053,6 +1073,8 @@ lpfc_hba_down_post_s4(struct lpfc_hba *phba)
spin_lock(&phba->sli4_hba.abts_nvme_buf_list_lock);
list_splice_init(&phba->sli4_hba.lpfc_abts_nvme_buf_list,
&nvme_aborts);
+ list_splice_init(&phba->sli4_hba.lpfc_abts_nvmet_ctx_list,
+ &nvmet_aborts);
spin_unlock(&phba->sli4_hba.abts_nvme_buf_list_lock);
}
@@ -1066,13 +1088,20 @@ lpfc_hba_down_post_s4(struct lpfc_hba *phba)
list_splice(&aborts, &phba->lpfc_scsi_buf_list_put);
spin_unlock_irqrestore(&phba->scsi_buf_list_put_lock, iflag);
- list_for_each_entry_safe(psb, psb_next, &nvme_aborts, list) {
- psb->pCmd = NULL;
- psb->status = IOSTAT_SUCCESS;
+ if (phba->cfg_enable_fc4_type & LPFC_ENABLE_NVME) {
+ list_for_each_entry_safe(psb, psb_next, &nvme_aborts, list) {
+ psb->pCmd = NULL;
+ psb->status = IOSTAT_SUCCESS;
+ }
+ spin_lock_irqsave(&phba->nvme_buf_list_put_lock, iflag);
+ list_splice(&nvme_aborts, &phba->lpfc_nvme_buf_list_put);
+ spin_unlock_irqrestore(&phba->nvme_buf_list_put_lock, iflag);
+
+ list_for_each_entry_safe(ctxp, ctxp_next, &nvmet_aborts, list) {
+ ctxp->flag &= ~(LPFC_NVMET_XBUSY | LPFC_NVMET_ABORT_OP);
+ lpfc_nvmet_rq_post(phba, ctxp, &ctxp->rqb_buffer->hbuf);
+ }
}
- spin_lock_irqsave(&phba->nvme_buf_list_put_lock, iflag);
- list_splice(&nvme_aborts, &phba->lpfc_nvme_buf_list_put);
- spin_unlock_irqrestore(&phba->nvme_buf_list_put_lock, iflag);
lpfc_sli4_free_sp_events(phba);
return 0;
@@ -2874,34 +2903,38 @@ lpfc_sli4_node_prep(struct lpfc_hba *phba)
{
struct lpfc_nodelist *ndlp, *next_ndlp;
struct lpfc_vport **vports;
- int i;
+ int i, rpi;
+ unsigned long flags;
if (phba->sli_rev != LPFC_SLI_REV4)
return;
vports = lpfc_create_vport_work_array(phba);
- if (vports != NULL) {
- for (i = 0; i <= phba->max_vports && vports[i] != NULL; i++) {
- if (vports[i]->load_flag & FC_UNLOADING)
- continue;
+ if (vports == NULL)
+ return;
- list_for_each_entry_safe(ndlp, next_ndlp,
- &vports[i]->fc_nodes,
- nlp_listp) {
- if (NLP_CHK_NODE_ACT(ndlp)) {
- ndlp->nlp_rpi =
- lpfc_sli4_alloc_rpi(phba);
- lpfc_printf_vlog(ndlp->vport, KERN_INFO,
- LOG_NODE,
- "0009 rpi:%x DID:%x "
- "flg:%x map:%x %p\n",
- ndlp->nlp_rpi,
- ndlp->nlp_DID,
- ndlp->nlp_flag,
- ndlp->nlp_usg_map,
- ndlp);
- }
+ for (i = 0; i <= phba->max_vports && vports[i] != NULL; i++) {
+ if (vports[i]->load_flag & FC_UNLOADING)
+ continue;
+
+ list_for_each_entry_safe(ndlp, next_ndlp,
+ &vports[i]->fc_nodes,
+ nlp_listp) {
+ if (!NLP_CHK_NODE_ACT(ndlp))
+ continue;
+ rpi = lpfc_sli4_alloc_rpi(phba);
+ if (rpi == LPFC_RPI_ALLOC_ERROR) {
+ spin_lock_irqsave(&phba->ndlp_lock, flags);
+ NLP_CLR_NODE_ACT(ndlp);
+ spin_unlock_irqrestore(&phba->ndlp_lock, flags);
+ continue;
}
+ ndlp->nlp_rpi = rpi;
+ lpfc_printf_vlog(ndlp->vport, KERN_INFO, LOG_NODE,
+ "0009 rpi:%x DID:%x "
+ "flg:%x map:%x %p\n", ndlp->nlp_rpi,
+ ndlp->nlp_DID, ndlp->nlp_flag,
+ ndlp->nlp_usg_map, ndlp);
}
}
lpfc_destroy_vport_work_array(phba, vports);
@@ -3508,6 +3541,12 @@ lpfc_sli4_scsi_sgl_update(struct lpfc_hba *phba)
spin_unlock(&phba->scsi_buf_list_put_lock);
spin_unlock_irq(&phba->scsi_buf_list_get_lock);
+ lpfc_printf_log(phba, KERN_INFO, LOG_SLI,
+ "6060 Current allocated SCSI xri-sgl count:%d, "
+ "maximum SCSI xri count:%d (split:%d)\n",
+ phba->sli4_hba.scsi_xri_cnt,
+ phba->sli4_hba.scsi_xri_max, phba->cfg_xri_split);
+
if (phba->sli4_hba.scsi_xri_cnt > phba->sli4_hba.scsi_xri_max) {
/* max scsi xri shrinked below the allocated scsi buffers */
scsi_xri_cnt = phba->sli4_hba.scsi_xri_cnt -
@@ -4508,9 +4547,15 @@ lpfc_sli4_async_fc_evt(struct lpfc_hba *phba, struct lpfc_acqe_fc_la *acqe_fc)
/* Parse and translate link attention fields */
la = (struct lpfc_mbx_read_top *)&pmb->u.mb.un.varReadTop;
la->eventTag = acqe_fc->event_tag;
- bf_set(lpfc_mbx_read_top_att_type, la,
- LPFC_FC_LA_TYPE_LINK_DOWN);
+ if (phba->sli4_hba.link_state.status ==
+ LPFC_FC_LA_TYPE_UNEXP_WWPN) {
+ bf_set(lpfc_mbx_read_top_att_type, la,
+ LPFC_FC_LA_TYPE_UNEXP_WWPN);
+ } else {
+ bf_set(lpfc_mbx_read_top_att_type, la,
+ LPFC_FC_LA_TYPE_LINK_DOWN);
+ }
/* Invoke the mailbox command callback function */
lpfc_mbx_cmpl_read_topology(phba, pmb);
@@ -4716,10 +4761,9 @@ lpfc_sli4_perform_vport_cvl(struct lpfc_vport *vport)
ndlp = lpfc_findnode_did(vport, Fabric_DID);
if (!ndlp) {
/* Cannot find existing Fabric ndlp, so allocate a new one */
- ndlp = mempool_alloc(phba->nlp_mem_pool, GFP_KERNEL);
+ ndlp = lpfc_nlp_init(vport, Fabric_DID);
if (!ndlp)
return 0;
- lpfc_nlp_init(vport, ndlp, Fabric_DID);
/* Set the node type */
ndlp->nlp_type |= NLP_FABRIC;
/* Put ndlp onto node list */
@@ -5778,6 +5822,7 @@ lpfc_sli4_driver_resource_setup(struct lpfc_hba *phba)
/* Initialize the Abort nvme buffer list used by driver */
spin_lock_init(&phba->sli4_hba.abts_nvme_buf_list_lock);
INIT_LIST_HEAD(&phba->sli4_hba.lpfc_abts_nvme_buf_list);
+ INIT_LIST_HEAD(&phba->sli4_hba.lpfc_abts_nvmet_ctx_list);
/* Fast-path XRI aborted CQ Event work queue list */
INIT_LIST_HEAD(&phba->sli4_hba.sp_nvme_xri_aborted_work_queue);
}
@@ -5809,6 +5854,12 @@ lpfc_sli4_driver_resource_setup(struct lpfc_hba *phba)
INIT_LIST_HEAD(&phba->sli4_hba.lpfc_vfi_blk_list);
INIT_LIST_HEAD(&phba->lpfc_vpi_blk_list);
+ /* Initialize mboxq lists. If the early init routines fail
+ * these lists need to be correctly initialized.
+ */
+ INIT_LIST_HEAD(&phba->sli.mboxq);
+ INIT_LIST_HEAD(&phba->sli.mboxq_cmpl);
+
/* initialize optic_state to 0xFF */
phba->sli4_hba.lnk_info.optic_state = 0xff;
@@ -5874,6 +5925,7 @@ lpfc_sli4_driver_resource_setup(struct lpfc_hba *phba)
"READ_NV, mbxStatus x%x\n",
bf_get(lpfc_mqe_command, &mboxq->u.mqe),
bf_get(lpfc_mqe_status, &mboxq->u.mqe));
+ mempool_free(mboxq, phba->mbox_mem_pool);
rc = -EIO;
goto out_free_bsmbx;
}
@@ -6398,7 +6450,7 @@ lpfc_init_sgl_list(struct lpfc_hba *phba)
INIT_LIST_HEAD(&phba->sli4_hba.lpfc_els_sgl_list);
INIT_LIST_HEAD(&phba->sli4_hba.lpfc_abts_els_sgl_list);
INIT_LIST_HEAD(&phba->sli4_hba.lpfc_nvmet_sgl_list);
- INIT_LIST_HEAD(&phba->sli4_hba.lpfc_abts_nvmet_sgl_list);
+ INIT_LIST_HEAD(&phba->sli4_hba.lpfc_abts_nvmet_ctx_list);
/* els xri-sgl book keeping */
phba->sli4_hba.els_xri_cnt = 0;
@@ -7799,7 +7851,7 @@ lpfc_alloc_fcp_wq_cq(struct lpfc_hba *phba, int wqidx)
/* Create Fast Path FCP WQs */
wqesize = (phba->fcp_embed_io) ?
- LPFC_WQE128_SIZE : phba->sli4_hba.wq_esize;
+ LPFC_WQE128_SIZE : phba->sli4_hba.wq_esize;
qdesc = lpfc_sli4_queue_alloc(phba, wqesize, phba->sli4_hba.wq_ecount);
if (!qdesc) {
lpfc_printf_log(phba, KERN_ERR, LOG_INIT,
@@ -7830,7 +7882,7 @@ int
lpfc_sli4_queue_create(struct lpfc_hba *phba)
{
struct lpfc_queue *qdesc;
- int idx, io_channel, max;
+ int idx, io_channel;
/*
* Create HBA Record arrays.
@@ -7991,15 +8043,6 @@ lpfc_sli4_queue_create(struct lpfc_hba *phba)
if (lpfc_alloc_nvme_wq_cq(phba, idx))
goto out_error;
- /* allocate MRQ CQs */
- max = phba->cfg_nvme_io_channel;
- if (max < phba->cfg_nvmet_mrq)
- max = phba->cfg_nvmet_mrq;
-
- for (idx = 0; idx < max; idx++)
- if (lpfc_alloc_nvme_wq_cq(phba, idx))
- goto out_error;
-
if (phba->nvmet_support) {
for (idx = 0; idx < phba->cfg_nvmet_mrq; idx++) {
qdesc = lpfc_sli4_queue_alloc(phba,
@@ -8221,11 +8264,11 @@ lpfc_sli4_queue_destroy(struct lpfc_hba *phba)
/* Release FCP cqs */
lpfc_sli4_release_queues(&phba->sli4_hba.fcp_cq,
- phba->cfg_fcp_io_channel);
+ phba->cfg_fcp_io_channel);
/* Release FCP wqs */
lpfc_sli4_release_queues(&phba->sli4_hba.fcp_wq,
- phba->cfg_fcp_io_channel);
+ phba->cfg_fcp_io_channel);
/* Release FCP CQ mapping array */
lpfc_sli4_release_queue_map(&phba->sli4_hba.fcp_cq_map);
@@ -8571,15 +8614,15 @@ lpfc_sli4_queue_setup(struct lpfc_hba *phba)
lpfc_printf_log(phba, KERN_ERR, LOG_INIT,
"0528 %s not allocated\n",
phba->sli4_hba.mbx_cq ?
- "Mailbox WQ" : "Mailbox CQ");
+ "Mailbox WQ" : "Mailbox CQ");
rc = -ENOMEM;
goto out_destroy;
}
rc = lpfc_create_wq_cq(phba, phba->sli4_hba.hba_eq[0],
- phba->sli4_hba.mbx_cq,
- phba->sli4_hba.mbx_wq,
- NULL, 0, LPFC_MBOX);
+ phba->sli4_hba.mbx_cq,
+ phba->sli4_hba.mbx_wq,
+ NULL, 0, LPFC_MBOX);
if (rc) {
lpfc_printf_log(phba, KERN_ERR, LOG_INIT,
"0529 Failed setup of mailbox WQ/CQ: rc = 0x%x\n",
@@ -9934,17 +9977,19 @@ lpfc_sli4_xri_exchange_busy_wait(struct lpfc_hba *phba)
{
int wait_time = 0;
int nvme_xri_cmpl = 1;
+ int nvmet_xri_cmpl = 1;
int fcp_xri_cmpl = 1;
int els_xri_cmpl = list_empty(&phba->sli4_hba.lpfc_abts_els_sgl_list);
- int nvmet_xri_cmpl =
- list_empty(&phba->sli4_hba.lpfc_abts_nvmet_sgl_list);
if (phba->cfg_enable_fc4_type & LPFC_ENABLE_FCP)
fcp_xri_cmpl =
list_empty(&phba->sli4_hba.lpfc_abts_scsi_buf_list);
- if (phba->cfg_enable_fc4_type & LPFC_ENABLE_NVME)
+ if (phba->cfg_enable_fc4_type & LPFC_ENABLE_NVME) {
nvme_xri_cmpl =
list_empty(&phba->sli4_hba.lpfc_abts_nvme_buf_list);
+ nvmet_xri_cmpl =
+ list_empty(&phba->sli4_hba.lpfc_abts_nvmet_ctx_list);
+ }
while (!fcp_xri_cmpl || !els_xri_cmpl || !nvme_xri_cmpl ||
!nvmet_xri_cmpl) {
@@ -9970,9 +10015,12 @@ lpfc_sli4_xri_exchange_busy_wait(struct lpfc_hba *phba)
msleep(LPFC_XRI_EXCH_BUSY_WAIT_T1);
wait_time += LPFC_XRI_EXCH_BUSY_WAIT_T1;
}
- if (phba->cfg_enable_fc4_type & LPFC_ENABLE_NVME)
+ if (phba->cfg_enable_fc4_type & LPFC_ENABLE_NVME) {
nvme_xri_cmpl = list_empty(
&phba->sli4_hba.lpfc_abts_nvme_buf_list);
+ nvmet_xri_cmpl = list_empty(
+ &phba->sli4_hba.lpfc_abts_nvmet_ctx_list);
+ }
if (phba->cfg_enable_fc4_type & LPFC_ENABLE_FCP)
fcp_xri_cmpl = list_empty(
@@ -9981,8 +10029,6 @@ lpfc_sli4_xri_exchange_busy_wait(struct lpfc_hba *phba)
els_xri_cmpl =
list_empty(&phba->sli4_hba.lpfc_abts_els_sgl_list);
- nvmet_xri_cmpl =
- list_empty(&phba->sli4_hba.lpfc_abts_nvmet_sgl_list);
}
}
@@ -10048,9 +10094,14 @@ lpfc_sli4_hba_unset(struct lpfc_hba *phba)
/* Stop kthread signal shall trigger work_done one more time */
kthread_stop(phba->worker_thread);
+ /* Unset the queues shared with the hardware then release all
+ * allocated resources.
+ */
+ lpfc_sli4_queue_unset(phba);
+ lpfc_sli4_queue_destroy(phba);
+
/* Reset SLI4 HBA FCoE function */
lpfc_pci_function_reset(phba);
- lpfc_sli4_queue_destroy(phba);
/* Stop the SLI4 device port */
phba->pport->work_port_events = 0;
@@ -10306,6 +10357,7 @@ lpfc_pci_probe_one_s3(struct pci_dev *pdev, const struct pci_device_id *pid)
}
/* Initialize and populate the iocb list per host */
+
error = lpfc_init_iocb_list(phba, LPFC_IOCB_LIST_CNT);
if (error) {
lpfc_printf_log(phba, KERN_ERR, LOG_INIT,
@@ -11051,7 +11103,7 @@ lpfc_pci_probe_one_s4(struct pci_dev *pdev, const struct pci_device_id *pid)
struct lpfc_hba *phba;
struct lpfc_vport *vport = NULL;
struct Scsi_Host *shost = NULL;
- int error;
+ int error, cnt;
uint32_t cfg_mode, intr_mode;
/* Allocate memory for HBA structure */
@@ -11085,12 +11137,15 @@ lpfc_pci_probe_one_s4(struct pci_dev *pdev, const struct pci_device_id *pid)
goto out_unset_pci_mem_s4;
}
- /* Initialize and populate the iocb list per host */
+ cnt = phba->cfg_iocb_cnt * 1024;
+ if (phba->nvmet_support)
+ cnt += phba->cfg_nvmet_mrq_post * phba->cfg_nvmet_mrq;
+ /* Initialize and populate the iocb list per host */
lpfc_printf_log(phba, KERN_INFO, LOG_INIT,
- "2821 initialize iocb list %d.\n",
- phba->cfg_iocb_cnt*1024);
- error = lpfc_init_iocb_list(phba, phba->cfg_iocb_cnt*1024);
+ "2821 initialize iocb list %d total %d\n",
+ phba->cfg_iocb_cnt, cnt);
+ error = lpfc_init_iocb_list(phba, cnt);
if (error) {
lpfc_printf_log(phba, KERN_ERR, LOG_INIT,
@@ -11177,7 +11232,9 @@ lpfc_pci_probe_one_s4(struct pci_dev *pdev, const struct pci_device_id *pid)
if ((phba->nvmet_support == 0) &&
(phba->cfg_enable_fc4_type & LPFC_ENABLE_NVME)) {
/* Create NVME binding with nvme_fc_transport. This
- * ensures the vport is initialized.
+ * ensures the vport is initialized. If the localport
+ * create fails, it should not unload the driver to
+ * support field issues.
*/
error = lpfc_nvme_create_localport(vport);
if (error) {
@@ -11185,7 +11242,6 @@ lpfc_pci_probe_one_s4(struct pci_dev *pdev, const struct pci_device_id *pid)
"6004 NVME registration failed, "
"error x%x\n",
error);
- goto out_disable_intr;
}
}
@@ -11984,6 +12040,7 @@ int
lpfc_fof_queue_create(struct lpfc_hba *phba)
{
struct lpfc_queue *qdesc;
+ uint32_t wqesize;
/* Create FOF EQ */
qdesc = lpfc_sli4_queue_alloc(phba, phba->sli4_hba.eq_esize,
@@ -12004,8 +12061,11 @@ lpfc_fof_queue_create(struct lpfc_hba *phba)
phba->sli4_hba.oas_cq = qdesc;
/* Create OAS WQ */
- qdesc = lpfc_sli4_queue_alloc(phba, phba->sli4_hba.wq_esize,
+ wqesize = (phba->fcp_embed_io) ?
+ LPFC_WQE128_SIZE : phba->sli4_hba.wq_esize;
+ qdesc = lpfc_sli4_queue_alloc(phba, wqesize,
phba->sli4_hba.wq_ecount);
+
if (!qdesc)
goto out_error;
diff --git a/drivers/scsi/lpfc/lpfc_mbox.c b/drivers/scsi/lpfc/lpfc_mbox.c
index a928f5187fa4..ce25a18367b5 100644
--- a/drivers/scsi/lpfc/lpfc_mbox.c
+++ b/drivers/scsi/lpfc/lpfc_mbox.c
@@ -2083,9 +2083,12 @@ lpfc_request_features(struct lpfc_hba *phba, struct lpfcMboxq *mboxq)
if (phba->max_vpi && phba->cfg_enable_npiv)
bf_set(lpfc_mbx_rq_ftr_rq_npiv, &mboxq->u.mqe.un.req_ftrs, 1);
- if (phba->nvmet_support)
+ if (phba->nvmet_support) {
bf_set(lpfc_mbx_rq_ftr_rq_mrqp, &mboxq->u.mqe.un.req_ftrs, 1);
-
+ /* iaab/iaar NOT set for now */
+ bf_set(lpfc_mbx_rq_ftr_rq_iaab, &mboxq->u.mqe.un.req_ftrs, 0);
+ bf_set(lpfc_mbx_rq_ftr_rq_iaar, &mboxq->u.mqe.un.req_ftrs, 0);
+ }
return;
}
diff --git a/drivers/scsi/lpfc/lpfc_nportdisc.c b/drivers/scsi/lpfc/lpfc_nportdisc.c
index 061626bdf701..8777c2d5f50d 100644
--- a/drivers/scsi/lpfc/lpfc_nportdisc.c
+++ b/drivers/scsi/lpfc/lpfc_nportdisc.c
@@ -361,8 +361,12 @@ lpfc_rcv_plogi(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp,
case NLP_STE_PRLI_ISSUE:
case NLP_STE_UNMAPPED_NODE:
case NLP_STE_MAPPED_NODE:
- /* lpfc_plogi_confirm_nport skips fabric did, handle it here */
- if (!(ndlp->nlp_type & NLP_FABRIC)) {
+ /* For initiators, lpfc_plogi_confirm_nport skips fabric did.
+ * For target mode, execute implicit logo.
+ * Fabric nodes go into NPR.
+ */
+ if (!(ndlp->nlp_type & NLP_FABRIC) &&
+ !(phba->nvmet_support)) {
lpfc_els_rsp_acc(vport, ELS_CMD_PLOGI, cmdiocb,
ndlp, NULL);
return 1;
diff --git a/drivers/scsi/lpfc/lpfc_nvme.c b/drivers/scsi/lpfc/lpfc_nvme.c
index 0024de1c6c1f..8008c8205fb6 100644
--- a/drivers/scsi/lpfc/lpfc_nvme.c
+++ b/drivers/scsi/lpfc/lpfc_nvme.c
@@ -401,6 +401,7 @@ lpfc_nvme_ls_req(struct nvme_fc_local_port *pnvme_lport,
struct lpfc_nodelist *ndlp;
struct ulp_bde64 *bpl;
struct lpfc_dmabuf *bmp;
+ uint16_t ntype, nstate;
/* there are two dma buf in the request, actually there is one and
* the second one is just the start address + cmd size.
@@ -417,11 +418,26 @@ lpfc_nvme_ls_req(struct nvme_fc_local_port *pnvme_lport,
vport = lport->vport;
ndlp = lpfc_findnode_did(vport, pnvme_rport->port_id);
- if (!ndlp) {
- lpfc_printf_vlog(vport, KERN_ERR, LOG_NVME_DISC,
- "6043 Could not find node for DID %x\n",
+ if (!ndlp || !NLP_CHK_NODE_ACT(ndlp)) {
+ lpfc_printf_vlog(vport, KERN_ERR, LOG_NODE | LOG_NVME_IOERR,
+ "6051 DID x%06x not an active rport.\n",
pnvme_rport->port_id);
- return 1;
+ return -ENODEV;
+ }
+
+ /* The remote node has to be a mapped nvme target or an
+ * unmapped nvme initiator or it's an error.
+ */
+ ntype = ndlp->nlp_type;
+ nstate = ndlp->nlp_state;
+ if ((ntype & NLP_NVME_TARGET && nstate != NLP_STE_MAPPED_NODE) ||
+ (ntype & NLP_NVME_INITIATOR && nstate != NLP_STE_UNMAPPED_NODE)) {
+ lpfc_printf_vlog(vport, KERN_ERR, LOG_NODE | LOG_NVME_IOERR,
+ "6088 DID x%06x not ready for "
+ "IO. State x%x, Type x%x\n",
+ pnvme_rport->port_id,
+ ndlp->nlp_state, ndlp->nlp_type);
+ return -ENODEV;
}
bmp = kmalloc(sizeof(struct lpfc_dmabuf), GFP_KERNEL);
if (!bmp) {
@@ -456,7 +472,7 @@ lpfc_nvme_ls_req(struct nvme_fc_local_port *pnvme_lport,
/* Expand print to include key fields. */
lpfc_printf_vlog(vport, KERN_INFO, LOG_NVME_DISC,
- "6051 ENTER. lport %p, rport %p lsreq%p rqstlen:%d "
+ "6149 ENTER. lport %p, rport %p lsreq%p rqstlen:%d "
"rsplen:%d %pad %pad\n",
pnvme_lport, pnvme_rport,
pnvme_lsreq, pnvme_lsreq->rqstlen,
@@ -745,6 +761,7 @@ lpfc_nvme_io_cmd_wqe_cmpl(struct lpfc_hba *phba, struct lpfc_iocbq *pwqeIn,
struct nvme_fc_cmd_iu *cp;
struct lpfc_nvme_rport *rport;
struct lpfc_nodelist *ndlp;
+ struct lpfc_nvme_fcpreq_priv *freqpriv;
unsigned long flags;
uint32_t code;
uint16_t cid, sqhd, data;
@@ -772,9 +789,8 @@ lpfc_nvme_io_cmd_wqe_cmpl(struct lpfc_hba *phba, struct lpfc_iocbq *pwqeIn,
ndlp = rport->ndlp;
if (!ndlp || !NLP_CHK_NODE_ACT(ndlp)) {
lpfc_printf_vlog(vport, KERN_ERR, LOG_NODE | LOG_NVME_IOERR,
- "6061 rport %p, ndlp %p, DID x%06x ndlp "
- "not ready.\n",
- rport, ndlp, rport->remoteport->port_id);
+ "6061 rport %p, DID x%06x node not ready.\n",
+ rport, rport->remoteport->port_id);
ndlp = lpfc_findnode_did(vport, rport->remoteport->port_id);
if (!ndlp) {
@@ -853,15 +869,18 @@ lpfc_nvme_io_cmd_wqe_cmpl(struct lpfc_hba *phba, struct lpfc_iocbq *pwqeIn,
break;
lpfc_printf_vlog(vport, KERN_ERR, LOG_NVME_IOERR,
"6081 NVME Completion Protocol Error: "
- "status x%x result x%x placed x%x\n",
+ "xri %x status x%x result x%x "
+ "placed x%x\n",
+ lpfc_ncmd->cur_iocbq.sli4_xritag,
lpfc_ncmd->status, lpfc_ncmd->result,
wcqe->total_data_placed);
break;
default:
out_err:
lpfc_printf_vlog(vport, KERN_ERR, LOG_NVME_IOERR,
- "6072 NVME Completion Error: "
+ "6072 NVME Completion Error: xri %x "
"status x%x result x%x placed x%x\n",
+ lpfc_ncmd->cur_iocbq.sli4_xritag,
lpfc_ncmd->status, lpfc_ncmd->result,
wcqe->total_data_placed);
nCmd->transferred_length = 0;
@@ -900,6 +919,8 @@ out_err:
phba->cpucheck_cmpl_io[lpfc_ncmd->cpu]++;
}
#endif
+ freqpriv = nCmd->private;
+ freqpriv->nvme_buf = NULL;
nCmd->done(nCmd);
spin_lock_irqsave(&phba->hbalock, flags);
@@ -1099,12 +1120,12 @@ lpfc_nvme_prep_io_dma(struct lpfc_vport *vport,
first_data_sgl = sgl;
lpfc_ncmd->seg_cnt = nCmd->sg_cnt;
- if (lpfc_ncmd->seg_cnt > phba->cfg_sg_seg_cnt) {
+ if (lpfc_ncmd->seg_cnt > phba->cfg_nvme_seg_cnt) {
lpfc_printf_log(phba, KERN_ERR, LOG_NVME_IOERR,
"6058 Too many sg segments from "
"NVME Transport. Max %d, "
"nvmeIO sg_cnt %d\n",
- phba->cfg_sg_seg_cnt,
+ phba->cfg_nvme_seg_cnt,
lpfc_ncmd->seg_cnt);
lpfc_ncmd->seg_cnt = 0;
return 1;
@@ -1196,6 +1217,7 @@ lpfc_nvme_fcp_io_submit(struct nvme_fc_local_port *pnvme_lport,
struct lpfc_nvme_buf *lpfc_ncmd;
struct lpfc_nvme_rport *rport;
struct lpfc_nvme_qhandle *lpfc_queue_info;
+ struct lpfc_nvme_fcpreq_priv *freqpriv = pnvme_fcreq->private;
#ifdef CONFIG_SCSI_LPFC_DEBUG_FS
uint64_t start = 0;
#endif
@@ -1274,7 +1296,7 @@ lpfc_nvme_fcp_io_submit(struct nvme_fc_local_port *pnvme_lport,
* Do not let the IO hang out forever. There is no midlayer issuing
* an abort so inform the FW of the maximum IO pending time.
*/
- pnvme_fcreq->private = (void *)lpfc_ncmd;
+ freqpriv->nvme_buf = lpfc_ncmd;
lpfc_ncmd->nvmeCmd = pnvme_fcreq;
lpfc_ncmd->nrport = rport;
lpfc_ncmd->ndlp = ndlp;
@@ -1404,6 +1426,7 @@ lpfc_nvme_fcp_abort(struct nvme_fc_local_port *pnvme_lport,
struct lpfc_nvme_buf *lpfc_nbuf;
struct lpfc_iocbq *abts_buf;
struct lpfc_iocbq *nvmereq_wqe;
+ struct lpfc_nvme_fcpreq_priv *freqpriv = pnvme_fcreq->private;
union lpfc_wqe *abts_wqe;
unsigned long flags;
int ret_val;
@@ -1414,7 +1437,7 @@ lpfc_nvme_fcp_abort(struct nvme_fc_local_port *pnvme_lport,
phba = vport->phba;
/* Announce entry to new IO submit field. */
- lpfc_printf_vlog(vport, KERN_ERR, LOG_NVME_ABTS,
+ lpfc_printf_vlog(vport, KERN_INFO, LOG_NVME_ABTS,
"6002 Abort Request to rport DID x%06x "
"for nvme_fc_req %p\n",
pnvme_rport->port_id,
@@ -1444,7 +1467,7 @@ lpfc_nvme_fcp_abort(struct nvme_fc_local_port *pnvme_lport,
/* The remote node has to be ready to send an abort. */
if ((ndlp->nlp_state != NLP_STE_MAPPED_NODE) &&
!(ndlp->nlp_type & NLP_NVME_TARGET)) {
- lpfc_printf_vlog(vport, KERN_ERR, LOG_NODE | LOG_NVME_ABTS,
+ lpfc_printf_vlog(vport, KERN_ERR, LOG_NVME_ABTS,
"6048 rport %p, DID x%06x not ready for "
"IO. State x%x, Type x%x\n",
rport, pnvme_rport->port_id,
@@ -1459,27 +1482,28 @@ lpfc_nvme_fcp_abort(struct nvme_fc_local_port *pnvme_lport,
/* driver queued commands are in process of being flushed */
if (phba->hba_flag & HBA_NVME_IOQ_FLUSH) {
spin_unlock_irqrestore(&phba->hbalock, flags);
- lpfc_printf_vlog(vport, KERN_ERR, LOG_NVME,
+ lpfc_printf_vlog(vport, KERN_ERR, LOG_NVME_ABTS,
"6139 Driver in reset cleanup - flushing "
"NVME Req now. hba_flag x%x\n",
phba->hba_flag);
return;
}
- lpfc_nbuf = (struct lpfc_nvme_buf *)pnvme_fcreq->private;
+ lpfc_nbuf = freqpriv->nvme_buf;
if (!lpfc_nbuf) {
spin_unlock_irqrestore(&phba->hbalock, flags);
- lpfc_printf_vlog(vport, KERN_ERR, LOG_NVME,
+ lpfc_printf_vlog(vport, KERN_ERR, LOG_NVME_ABTS,
"6140 NVME IO req has no matching lpfc nvme "
"io buffer. Skipping abort req.\n");
return;
} else if (!lpfc_nbuf->nvmeCmd) {
spin_unlock_irqrestore(&phba->hbalock, flags);
- lpfc_printf_vlog(vport, KERN_ERR, LOG_NVME,
+ lpfc_printf_vlog(vport, KERN_ERR, LOG_NVME_ABTS,
"6141 lpfc NVME IO req has no nvme_fcreq "
"io buffer. Skipping abort req.\n");
return;
}
+ nvmereq_wqe = &lpfc_nbuf->cur_iocbq;
/*
* The lpfc_nbuf and the mapped nvme_fcreq in the driver's
@@ -1490,23 +1514,22 @@ lpfc_nvme_fcp_abort(struct nvme_fc_local_port *pnvme_lport,
*/
if (lpfc_nbuf->nvmeCmd != pnvme_fcreq) {
spin_unlock_irqrestore(&phba->hbalock, flags);
- lpfc_printf_vlog(vport, KERN_ERR, LOG_NVME,
+ lpfc_printf_vlog(vport, KERN_ERR, LOG_NVME_ABTS,
"6143 NVME req mismatch: "
"lpfc_nbuf %p nvmeCmd %p, "
- "pnvme_fcreq %p. Skipping Abort\n",
+ "pnvme_fcreq %p. Skipping Abort xri x%x\n",
lpfc_nbuf, lpfc_nbuf->nvmeCmd,
- pnvme_fcreq);
+ pnvme_fcreq, nvmereq_wqe->sli4_xritag);
return;
}
/* Don't abort IOs no longer on the pending queue. */
- nvmereq_wqe = &lpfc_nbuf->cur_iocbq;
if (!(nvmereq_wqe->iocb_flag & LPFC_IO_ON_TXCMPLQ)) {
spin_unlock_irqrestore(&phba->hbalock, flags);
- lpfc_printf_vlog(vport, KERN_ERR, LOG_NVME,
+ lpfc_printf_vlog(vport, KERN_ERR, LOG_NVME_ABTS,
"6142 NVME IO req %p not queued - skipping "
- "abort req\n",
- pnvme_fcreq);
+ "abort req xri x%x\n",
+ pnvme_fcreq, nvmereq_wqe->sli4_xritag);
return;
}
@@ -1517,21 +1540,22 @@ lpfc_nvme_fcp_abort(struct nvme_fc_local_port *pnvme_lport,
/* Outstanding abort is in progress */
if (nvmereq_wqe->iocb_flag & LPFC_DRIVER_ABORTED) {
spin_unlock_irqrestore(&phba->hbalock, flags);
- lpfc_printf_vlog(vport, KERN_ERR, LOG_NVME,
+ lpfc_printf_vlog(vport, KERN_ERR, LOG_NVME_ABTS,
"6144 Outstanding NVME I/O Abort Request "
"still pending on nvme_fcreq %p, "
- "lpfc_ncmd %p\n",
- pnvme_fcreq, lpfc_nbuf);
+ "lpfc_ncmd %p xri x%x\n",
+ pnvme_fcreq, lpfc_nbuf,
+ nvmereq_wqe->sli4_xritag);
return;
}
abts_buf = __lpfc_sli_get_iocbq(phba);
if (!abts_buf) {
spin_unlock_irqrestore(&phba->hbalock, flags);
- lpfc_printf_vlog(vport, KERN_ERR, LOG_NVME,
+ lpfc_printf_vlog(vport, KERN_ERR, LOG_NVME_ABTS,
"6136 No available abort wqes. Skipping "
- "Abts req for nvme_fcreq %p.\n",
- pnvme_fcreq);
+ "Abts req for nvme_fcreq %p xri x%x\n",
+ pnvme_fcreq, nvmereq_wqe->sli4_xritag);
return;
}
@@ -1580,7 +1604,7 @@ lpfc_nvme_fcp_abort(struct nvme_fc_local_port *pnvme_lport,
ret_val = lpfc_sli4_issue_wqe(phba, LPFC_FCP_RING, abts_buf);
spin_unlock_irqrestore(&phba->hbalock, flags);
if (ret_val == IOCB_ERROR) {
- lpfc_printf_vlog(vport, KERN_ERR, LOG_NVME,
+ lpfc_printf_vlog(vport, KERN_ERR, LOG_NVME_ABTS,
"6137 Failed abts issue_wqe with status x%x "
"for nvme_fcreq %p.\n",
ret_val, pnvme_fcreq);
@@ -1588,8 +1612,8 @@ lpfc_nvme_fcp_abort(struct nvme_fc_local_port *pnvme_lport,
return;
}
- lpfc_printf_vlog(vport, KERN_ERR, LOG_NVME,
- "6138 Transport Abort NVME Request Issued for\n"
+ lpfc_printf_vlog(vport, KERN_INFO, LOG_NVME_ABTS,
+ "6138 Transport Abort NVME Request Issued for "
"ox_id x%x on reqtag x%x\n",
nvmereq_wqe->sli4_xritag,
abts_buf->iotag);
@@ -1618,7 +1642,7 @@ static struct nvme_fc_port_template lpfc_nvme_template = {
.local_priv_sz = sizeof(struct lpfc_nvme_lport),
.remote_priv_sz = sizeof(struct lpfc_nvme_rport),
.lsrqst_priv_sz = 0,
- .fcprqst_priv_sz = 0,
+ .fcprqst_priv_sz = sizeof(struct lpfc_nvme_fcpreq_priv),
};
/**
@@ -2049,7 +2073,7 @@ lpfc_get_nvme_buf(struct lpfc_hba *phba, struct lpfc_nodelist *ndlp)
if (lpfc_test_rrq_active(phba, ndlp,
lpfc_ncmd->cur_iocbq.sli4_lxritag))
continue;
- list_del(&lpfc_ncmd->list);
+ list_del_init(&lpfc_ncmd->list);
found = 1;
break;
}
@@ -2064,7 +2088,7 @@ lpfc_get_nvme_buf(struct lpfc_hba *phba, struct lpfc_nodelist *ndlp)
if (lpfc_test_rrq_active(
phba, ndlp, lpfc_ncmd->cur_iocbq.sli4_lxritag))
continue;
- list_del(&lpfc_ncmd->list);
+ list_del_init(&lpfc_ncmd->list);
found = 1;
break;
}
@@ -2092,6 +2116,12 @@ lpfc_release_nvme_buf(struct lpfc_hba *phba, struct lpfc_nvme_buf *lpfc_ncmd)
lpfc_ncmd->nonsg_phys = 0;
if (lpfc_ncmd->flags & LPFC_SBUF_XBUSY) {
+ lpfc_printf_log(phba, KERN_INFO, LOG_NVME_ABTS,
+ "6310 XB release deferred for "
+ "ox_id x%x on reqtag x%x\n",
+ lpfc_ncmd->cur_iocbq.sli4_xritag,
+ lpfc_ncmd->cur_iocbq.iotag);
+
spin_lock_irqsave(&phba->sli4_hba.abts_nvme_buf_list_lock,
iflag);
lpfc_ncmd->nvmeCmd = NULL;
@@ -2142,8 +2172,18 @@ lpfc_nvme_create_localport(struct lpfc_vport *vport)
nfcp_info.node_name = wwn_to_u64(vport->fc_nodename.u.wwn);
nfcp_info.port_name = wwn_to_u64(vport->fc_portname.u.wwn);
- /* For now need + 1 to get around NVME transport logic */
- lpfc_nvme_template.max_sgl_segments = phba->cfg_sg_seg_cnt + 1;
+ /* Limit to LPFC_MAX_NVME_SEG_CNT.
+ * For now need + 1 to get around NVME transport logic.
+ */
+ if (phba->cfg_sg_seg_cnt > LPFC_MAX_NVME_SEG_CNT) {
+ lpfc_printf_vlog(vport, KERN_INFO, LOG_NVME | LOG_INIT,
+ "6300 Reducing sg segment cnt to %d\n",
+ LPFC_MAX_NVME_SEG_CNT);
+ phba->cfg_nvme_seg_cnt = LPFC_MAX_NVME_SEG_CNT;
+ } else {
+ phba->cfg_nvme_seg_cnt = phba->cfg_sg_seg_cnt;
+ }
+ lpfc_nvme_template.max_sgl_segments = phba->cfg_nvme_seg_cnt + 1;
lpfc_nvme_template.max_hw_queues = phba->cfg_nvme_io_channel;
/* localport is allocated from the stack, but the registration
@@ -2249,12 +2289,23 @@ lpfc_nvme_destroy_localport(struct lpfc_vport *vport)
void
lpfc_nvme_update_localport(struct lpfc_vport *vport)
{
+#if (IS_ENABLED(CONFIG_NVME_FC))
struct nvme_fc_local_port *localport;
struct lpfc_nvme_lport *lport;
localport = vport->localport;
+ if (!localport) {
+ lpfc_printf_vlog(vport, KERN_WARNING, LOG_NVME,
+ "6710 Update NVME fail. No localport\n");
+ return;
+ }
lport = (struct lpfc_nvme_lport *)localport->private;
-
+ if (!lport) {
+ lpfc_printf_vlog(vport, KERN_WARNING, LOG_NVME,
+ "6171 Update NVME fail. localP %p, No lport\n",
+ localport);
+ return;
+ }
lpfc_printf_vlog(vport, KERN_INFO, LOG_NVME,
"6012 Update NVME lport %p did x%x\n",
localport, vport->fc_myDID);
@@ -2268,7 +2319,7 @@ lpfc_nvme_update_localport(struct lpfc_vport *vport)
lpfc_printf_vlog(vport, KERN_INFO, LOG_NVME_DISC,
"6030 bound lport %p to DID x%06x\n",
lport, localport->port_id);
-
+#endif
}
int
@@ -2409,6 +2460,7 @@ lpfc_nvme_unregister_port(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp)
struct lpfc_nvme_lport *lport;
struct lpfc_nvme_rport *rport;
struct nvme_fc_remote_port *remoteport;
+ unsigned long wait_tmo;
localport = vport->localport;
@@ -2451,11 +2503,12 @@ lpfc_nvme_unregister_port(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp)
* before proceeding. This guarantees the transport and driver
* have completed the unreg process.
*/
- ret = wait_for_completion_timeout(&rport->rport_unreg_done, 5);
+ wait_tmo = msecs_to_jiffies(5000);
+ ret = wait_for_completion_timeout(&rport->rport_unreg_done,
+ wait_tmo);
if (ret == 0) {
lpfc_printf_vlog(vport, KERN_ERR, LOG_NVME_DISC,
- "6169 Unreg nvme wait failed %d\n",
- ret);
+ "6169 Unreg nvme wait timeout\n");
}
}
return;
@@ -2463,7 +2516,7 @@ lpfc_nvme_unregister_port(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp)
input_err:
#endif
lpfc_printf_vlog(vport, KERN_ERR, LOG_NVME_DISC,
- "6168: State error: lport %p, rport%p FCID x%06x\n",
+ "6168 State error: lport %p, rport%p FCID x%06x\n",
vport->localport, ndlp->rport, ndlp->nlp_DID);
}
@@ -2494,7 +2547,7 @@ lpfc_sli4_nvme_xri_aborted(struct lpfc_hba *phba,
&phba->sli4_hba.lpfc_abts_nvme_buf_list,
list) {
if (lpfc_ncmd->cur_iocbq.sli4_xritag == xri) {
- list_del(&lpfc_ncmd->list);
+ list_del_init(&lpfc_ncmd->list);
lpfc_ncmd->flags &= ~LPFC_SBUF_XBUSY;
lpfc_ncmd->status = IOSTAT_SUCCESS;
spin_unlock(
@@ -2510,6 +2563,12 @@ lpfc_sli4_nvme_xri_aborted(struct lpfc_hba *phba,
rxid, 1);
lpfc_sli4_abts_err_handler(phba, ndlp, axri);
}
+
+ lpfc_printf_log(phba, KERN_INFO, LOG_NVME_ABTS,
+ "6311 XRI Aborted xri x%x tag x%x "
+ "released\n",
+ xri, lpfc_ncmd->cur_iocbq.iotag);
+
lpfc_release_nvme_buf(phba, lpfc_ncmd);
if (rrq_empty)
lpfc_worker_wake_up(phba);
@@ -2518,4 +2577,8 @@ lpfc_sli4_nvme_xri_aborted(struct lpfc_hba *phba,
}
spin_unlock(&phba->sli4_hba.abts_nvme_buf_list_lock);
spin_unlock_irqrestore(&phba->hbalock, iflag);
+
+ lpfc_printf_log(phba, KERN_INFO, LOG_NVME_ABTS,
+ "6312 XRI Aborted xri x%x not found\n", xri);
+
}
diff --git a/drivers/scsi/lpfc/lpfc_nvme.h b/drivers/scsi/lpfc/lpfc_nvme.h
index 1347deb8dd6c..ec32f45daa66 100644
--- a/drivers/scsi/lpfc/lpfc_nvme.h
+++ b/drivers/scsi/lpfc/lpfc_nvme.h
@@ -21,12 +21,7 @@
* included with this package. *
********************************************************************/
-#define LPFC_NVME_MIN_SEGS 16
-#define LPFC_NVME_DEFAULT_SEGS 66 /* 256K IOs - 64 + 2 */
-#define LPFC_NVME_MAX_SEGS 510
-#define LPFC_NVMET_MIN_POSTBUF 16
-#define LPFC_NVMET_DEFAULT_POSTBUF 1024
-#define LPFC_NVMET_MAX_POSTBUF 4096
+#define LPFC_NVME_DEFAULT_SEGS (64 + 1) /* 256K IOs */
#define LPFC_NVME_WQSIZE 256
#define LPFC_NVME_ERSP_LEN 0x20
@@ -102,3 +97,7 @@ struct lpfc_nvme_buf {
uint64_t ts_data_nvme;
#endif
};
+
+struct lpfc_nvme_fcpreq_priv {
+ struct lpfc_nvme_buf *nvme_buf;
+};
diff --git a/drivers/scsi/lpfc/lpfc_nvmet.c b/drivers/scsi/lpfc/lpfc_nvmet.c
index acba1b67e505..94434e621c33 100644
--- a/drivers/scsi/lpfc/lpfc_nvmet.c
+++ b/drivers/scsi/lpfc/lpfc_nvmet.c
@@ -71,6 +71,26 @@ static int lpfc_nvmet_unsol_ls_issue_abort(struct lpfc_hba *,
struct lpfc_nvmet_rcv_ctx *,
uint32_t, uint16_t);
+void
+lpfc_nvmet_defer_release(struct lpfc_hba *phba, struct lpfc_nvmet_rcv_ctx *ctxp)
+{
+ unsigned long iflag;
+
+ lpfc_printf_log(phba, KERN_ERR, LOG_NVME_ABTS,
+ "6313 NVMET Defer ctx release xri x%x flg x%x\n",
+ ctxp->oxid, ctxp->flag);
+
+ spin_lock_irqsave(&phba->sli4_hba.abts_nvme_buf_list_lock, iflag);
+ if (ctxp->flag & LPFC_NVMET_CTX_RLS) {
+ spin_unlock_irqrestore(&phba->sli4_hba.abts_nvme_buf_list_lock,
+ iflag);
+ return;
+ }
+ ctxp->flag |= LPFC_NVMET_CTX_RLS;
+ list_add_tail(&ctxp->list, &phba->sli4_hba.lpfc_abts_nvmet_ctx_list);
+ spin_unlock_irqrestore(&phba->sli4_hba.abts_nvme_buf_list_lock, iflag);
+}
+
/**
* lpfc_nvmet_xmt_ls_rsp_cmp - Completion handler for LS Response
* @phba: Pointer to HBA context object.
@@ -139,6 +159,11 @@ lpfc_nvmet_rq_post(struct lpfc_hba *phba, struct lpfc_nvmet_rcv_ctx *ctxp,
struct lpfc_dmabuf *mp)
{
if (ctxp) {
+ if (ctxp->flag)
+ lpfc_printf_log(phba, KERN_INFO, LOG_NVME_ABTS,
+ "6314 rq_post ctx xri x%x flag x%x\n",
+ ctxp->oxid, ctxp->flag);
+
if (ctxp->txrdy) {
pci_pool_free(phba->txrdy_payload_pool, ctxp->txrdy,
ctxp->txrdy_phys);
@@ -337,39 +362,55 @@ lpfc_nvmet_xmt_fcp_op_cmp(struct lpfc_hba *phba, struct lpfc_iocbq *cmdwqe,
#endif
ctxp = cmdwqe->context2;
+ ctxp->flag &= ~LPFC_NVMET_IO_INP;
+
rsp = &ctxp->ctx.fcp_req;
op = rsp->op;
- ctxp->flag &= ~LPFC_NVMET_IO_INP;
status = bf_get(lpfc_wcqe_c_status, wcqe);
result = wcqe->parameter;
- if (!phba->targetport)
- goto out;
+ if (phba->targetport)
+ tgtp = (struct lpfc_nvmet_tgtport *)phba->targetport->private;
+ else
+ tgtp = NULL;
lpfc_nvmeio_data(phba, "NVMET FCP CMPL: xri x%x op x%x status x%x\n",
ctxp->oxid, op, status);
- tgtp = (struct lpfc_nvmet_tgtport *)phba->targetport->private;
if (status) {
rsp->fcp_error = NVME_SC_DATA_XFER_ERROR;
rsp->transferred_length = 0;
- atomic_inc(&tgtp->xmt_fcp_rsp_error);
+ if (tgtp)
+ atomic_inc(&tgtp->xmt_fcp_rsp_error);
+
+ /* pick up SLI4 exhange busy condition */
+ if (bf_get(lpfc_wcqe_c_xb, wcqe)) {
+ ctxp->flag |= LPFC_NVMET_XBUSY;
+
+ lpfc_printf_log(phba, KERN_INFO, LOG_NVME_ABTS,
+ "6315 IO Cmpl XBUSY: xri x%x: %x/%x\n",
+ ctxp->oxid, status, result);
+ } else {
+ ctxp->flag &= ~LPFC_NVMET_XBUSY;
+ }
+
} else {
rsp->fcp_error = NVME_SC_SUCCESS;
if (op == NVMET_FCOP_RSP)
rsp->transferred_length = rsp->rsplen;
else
rsp->transferred_length = rsp->transfer_length;
- atomic_inc(&tgtp->xmt_fcp_rsp_cmpl);
+ if (tgtp)
+ atomic_inc(&tgtp->xmt_fcp_rsp_cmpl);
}
-out:
if ((op == NVMET_FCOP_READDATA_RSP) ||
(op == NVMET_FCOP_RSP)) {
/* Sanity check */
ctxp->state = LPFC_NVMET_STE_DONE;
ctxp->entry_cnt++;
+
#ifdef CONFIG_SCSI_LPFC_DEBUG_FS
if (phba->ktime_on) {
if (rsp->op == NVMET_FCOP_READDATA_RSP) {
@@ -408,9 +449,7 @@ out:
if (phba->ktime_on)
lpfc_nvmet_ktime(phba, ctxp);
#endif
- /* Let Abort cmpl repost the context */
- if (!(ctxp->flag & LPFC_NVMET_ABORT_OP))
- lpfc_nvmet_rq_post(phba, ctxp, &ctxp->rqb_buffer->hbuf);
+ /* lpfc_nvmet_xmt_fcp_release() will recycle the context */
} else {
ctxp->entry_cnt++;
start_clean = offsetof(struct lpfc_iocbq, wqe);
@@ -519,7 +558,6 @@ lpfc_nvmet_xmt_fcp_op(struct nvmet_fc_target_port *tgtport,
container_of(rsp, struct lpfc_nvmet_rcv_ctx, ctx.fcp_req);
struct lpfc_hba *phba = ctxp->phba;
struct lpfc_iocbq *nvmewqeq;
- unsigned long iflags;
int rc;
#ifdef CONFIG_SCSI_LPFC_DEBUG_FS
@@ -544,32 +582,12 @@ lpfc_nvmet_xmt_fcp_op(struct nvmet_fc_target_port *tgtport,
}
#endif
- if (rsp->op == NVMET_FCOP_ABORT) {
- lpfc_printf_log(phba, KERN_INFO, LOG_NVME_ABTS,
- "6103 Abort op: oxri x%x %d cnt %d\n",
- ctxp->oxid, ctxp->state, ctxp->entry_cnt);
-
- lpfc_nvmeio_data(phba, "NVMET FCP ABRT: "
- "xri x%x state x%x cnt x%x\n",
- ctxp->oxid, ctxp->state, ctxp->entry_cnt);
-
- atomic_inc(&lpfc_nvmep->xmt_fcp_abort);
- ctxp->entry_cnt++;
- ctxp->flag |= LPFC_NVMET_ABORT_OP;
- if (ctxp->flag & LPFC_NVMET_IO_INP)
- lpfc_nvmet_sol_fcp_issue_abort(phba, ctxp, ctxp->sid,
- ctxp->oxid);
- else
- lpfc_nvmet_unsol_fcp_issue_abort(phba, ctxp, ctxp->sid,
- ctxp->oxid);
- return 0;
- }
-
/* Sanity check */
- if (ctxp->state == LPFC_NVMET_STE_ABORT) {
+ if ((ctxp->flag & LPFC_NVMET_ABTS_RCV) ||
+ (ctxp->state == LPFC_NVMET_STE_ABORT)) {
atomic_inc(&lpfc_nvmep->xmt_fcp_drop);
lpfc_printf_log(phba, KERN_ERR, LOG_NVME_IOERR,
- "6102 Bad state IO x%x aborted\n",
+ "6102 IO xri x%x aborted\n",
ctxp->oxid);
rc = -ENXIO;
goto aerr;
@@ -594,10 +612,7 @@ lpfc_nvmet_xmt_fcp_op(struct nvmet_fc_target_port *tgtport,
lpfc_nvmeio_data(phba, "NVMET FCP CMND: xri x%x op x%x len x%x\n",
ctxp->oxid, rsp->op, rsp->rsplen);
- /* For now we take hbalock */
- spin_lock_irqsave(&phba->hbalock, iflags);
rc = lpfc_sli4_issue_wqe(phba, LPFC_FCP_RING, nvmewqeq);
- spin_unlock_irqrestore(&phba->hbalock, iflags);
if (rc == WQE_SUCCESS) {
ctxp->flag |= LPFC_NVMET_IO_INP;
#ifdef CONFIG_SCSI_LPFC_DEBUG_FS
@@ -634,10 +649,79 @@ lpfc_nvmet_targetport_delete(struct nvmet_fc_target_port *targetport)
complete(&tport->tport_unreg_done);
}
+static void
+lpfc_nvmet_xmt_fcp_abort(struct nvmet_fc_target_port *tgtport,
+ struct nvmefc_tgt_fcp_req *req)
+{
+ struct lpfc_nvmet_tgtport *lpfc_nvmep = tgtport->private;
+ struct lpfc_nvmet_rcv_ctx *ctxp =
+ container_of(req, struct lpfc_nvmet_rcv_ctx, ctx.fcp_req);
+ struct lpfc_hba *phba = ctxp->phba;
+ unsigned long flags;
+
+ lpfc_printf_log(phba, KERN_INFO, LOG_NVME_ABTS,
+ "6103 Abort op: oxri x%x flg x%x cnt %d\n",
+ ctxp->oxid, ctxp->flag, ctxp->entry_cnt);
+
+ lpfc_nvmeio_data(phba, "NVMET FCP ABRT: "
+ "xri x%x flg x%x cnt x%x\n",
+ ctxp->oxid, ctxp->flag, ctxp->entry_cnt);
+
+ atomic_inc(&lpfc_nvmep->xmt_fcp_abort);
+ ctxp->entry_cnt++;
+ spin_lock_irqsave(&ctxp->ctxlock, flags);
+
+ /* Since iaab/iaar are NOT set, we need to check
+ * if the firmware is in process of aborting IO
+ */
+ if (ctxp->flag & LPFC_NVMET_XBUSY) {
+ spin_unlock_irqrestore(&ctxp->ctxlock, flags);
+ return;
+ }
+ ctxp->flag |= LPFC_NVMET_ABORT_OP;
+ if (ctxp->flag & LPFC_NVMET_IO_INP)
+ lpfc_nvmet_sol_fcp_issue_abort(phba, ctxp, ctxp->sid,
+ ctxp->oxid);
+ else
+ lpfc_nvmet_unsol_fcp_issue_abort(phba, ctxp, ctxp->sid,
+ ctxp->oxid);
+ spin_unlock_irqrestore(&ctxp->ctxlock, flags);
+}
+
+static void
+lpfc_nvmet_xmt_fcp_release(struct nvmet_fc_target_port *tgtport,
+ struct nvmefc_tgt_fcp_req *rsp)
+{
+ struct lpfc_nvmet_rcv_ctx *ctxp =
+ container_of(rsp, struct lpfc_nvmet_rcv_ctx, ctx.fcp_req);
+ struct lpfc_hba *phba = ctxp->phba;
+ unsigned long flags;
+ bool aborting = false;
+
+ spin_lock_irqsave(&ctxp->ctxlock, flags);
+ if ((ctxp->flag & LPFC_NVMET_ABORT_OP) ||
+ (ctxp->flag & LPFC_NVMET_XBUSY)) {
+ aborting = true;
+ /* let the abort path do the real release */
+ lpfc_nvmet_defer_release(phba, ctxp);
+ }
+ spin_unlock_irqrestore(&ctxp->ctxlock, flags);
+
+ lpfc_nvmeio_data(phba, "NVMET FCP FREE: xri x%x ste %d\n", ctxp->oxid,
+ ctxp->state, 0);
+
+ if (aborting)
+ return;
+
+ lpfc_nvmet_rq_post(phba, ctxp, &ctxp->rqb_buffer->hbuf);
+}
+
static struct nvmet_fc_target_template lpfc_tgttemplate = {
.targetport_delete = lpfc_nvmet_targetport_delete,
.xmt_ls_rsp = lpfc_nvmet_xmt_ls_rsp,
.fcp_op = lpfc_nvmet_xmt_fcp_op,
+ .fcp_abort = lpfc_nvmet_xmt_fcp_abort,
+ .fcp_req_release = lpfc_nvmet_xmt_fcp_release,
.max_hw_queues = 1,
.max_sgl_segments = LPFC_NVMET_DEFAULT_SEGS,
@@ -666,10 +750,23 @@ lpfc_nvmet_create_targetport(struct lpfc_hba *phba)
pinfo.port_name = wwn_to_u64(vport->fc_portname.u.wwn);
pinfo.port_id = vport->fc_myDID;
+ /* Limit to LPFC_MAX_NVME_SEG_CNT.
+ * For now need + 1 to get around NVME transport logic.
+ */
+ if (phba->cfg_sg_seg_cnt > LPFC_MAX_NVME_SEG_CNT) {
+ lpfc_printf_log(phba, KERN_INFO, LOG_NVME | LOG_INIT,
+ "6400 Reducing sg segment cnt to %d\n",
+ LPFC_MAX_NVME_SEG_CNT);
+ phba->cfg_nvme_seg_cnt = LPFC_MAX_NVME_SEG_CNT;
+ } else {
+ phba->cfg_nvme_seg_cnt = phba->cfg_sg_seg_cnt;
+ }
+ lpfc_tgttemplate.max_sgl_segments = phba->cfg_nvme_seg_cnt + 1;
lpfc_tgttemplate.max_hw_queues = phba->cfg_nvme_io_channel;
- lpfc_tgttemplate.max_sgl_segments = phba->cfg_sg_seg_cnt;
lpfc_tgttemplate.target_features = NVMET_FCTGTFEAT_READDATA_RSP |
- NVMET_FCTGTFEAT_NEEDS_CMD_CPUSCHED;
+ NVMET_FCTGTFEAT_NEEDS_CMD_CPUSCHED |
+ NVMET_FCTGTFEAT_CMD_IN_ISR |
+ NVMET_FCTGTFEAT_OPDONE_IN_ISR;
#if (IS_ENABLED(CONFIG_NVME_TARGET_FC))
error = nvmet_fc_register_targetport(&pinfo, &lpfc_tgttemplate,
@@ -750,7 +847,120 @@ void
lpfc_sli4_nvmet_xri_aborted(struct lpfc_hba *phba,
struct sli4_wcqe_xri_aborted *axri)
{
- /* TODO: work in progress */
+ uint16_t xri = bf_get(lpfc_wcqe_xa_xri, axri);
+ uint16_t rxid = bf_get(lpfc_wcqe_xa_remote_xid, axri);
+ struct lpfc_nvmet_rcv_ctx *ctxp, *next_ctxp;
+ struct lpfc_nodelist *ndlp;
+ unsigned long iflag = 0;
+ int rrq_empty = 0;
+ bool released = false;
+
+ lpfc_printf_log(phba, KERN_INFO, LOG_NVME_ABTS,
+ "6317 XB aborted xri x%x rxid x%x\n", xri, rxid);
+
+ if (!(phba->cfg_enable_fc4_type & LPFC_ENABLE_NVME))
+ return;
+ spin_lock_irqsave(&phba->hbalock, iflag);
+ spin_lock(&phba->sli4_hba.abts_nvme_buf_list_lock);
+ list_for_each_entry_safe(ctxp, next_ctxp,
+ &phba->sli4_hba.lpfc_abts_nvmet_ctx_list,
+ list) {
+ if (ctxp->rqb_buffer->sglq->sli4_xritag != xri)
+ continue;
+
+ /* Check if we already received a free context call
+ * and we have completed processing an abort situation.
+ */
+ if (ctxp->flag & LPFC_NVMET_CTX_RLS &&
+ !(ctxp->flag & LPFC_NVMET_ABORT_OP)) {
+ list_del(&ctxp->list);
+ released = true;
+ }
+ ctxp->flag &= ~LPFC_NVMET_XBUSY;
+ spin_unlock(&phba->sli4_hba.abts_nvme_buf_list_lock);
+
+ rrq_empty = list_empty(&phba->active_rrq_list);
+ spin_unlock_irqrestore(&phba->hbalock, iflag);
+ ndlp = lpfc_findnode_did(phba->pport, ctxp->sid);
+ if (ndlp && NLP_CHK_NODE_ACT(ndlp) &&
+ (ndlp->nlp_state == NLP_STE_UNMAPPED_NODE ||
+ ndlp->nlp_state == NLP_STE_MAPPED_NODE)) {
+ lpfc_set_rrq_active(phba, ndlp,
+ ctxp->rqb_buffer->sglq->sli4_lxritag,
+ rxid, 1);
+ lpfc_sli4_abts_err_handler(phba, ndlp, axri);
+ }
+
+ lpfc_printf_log(phba, KERN_INFO, LOG_NVME_ABTS,
+ "6318 XB aborted %x flg x%x (%x)\n",
+ ctxp->oxid, ctxp->flag, released);
+ if (released)
+ lpfc_nvmet_rq_post(phba, ctxp,
+ &ctxp->rqb_buffer->hbuf);
+ if (rrq_empty)
+ lpfc_worker_wake_up(phba);
+ return;
+ }
+ spin_unlock(&phba->sli4_hba.abts_nvme_buf_list_lock);
+ spin_unlock_irqrestore(&phba->hbalock, iflag);
+}
+
+int
+lpfc_nvmet_rcv_unsol_abort(struct lpfc_vport *vport,
+ struct fc_frame_header *fc_hdr)
+
+{
+#if (IS_ENABLED(CONFIG_NVME_TARGET_FC))
+ struct lpfc_hba *phba = vport->phba;
+ struct lpfc_nvmet_rcv_ctx *ctxp, *next_ctxp;
+ struct nvmefc_tgt_fcp_req *rsp;
+ uint16_t xri;
+ unsigned long iflag = 0;
+
+ xri = be16_to_cpu(fc_hdr->fh_ox_id);
+
+ spin_lock_irqsave(&phba->hbalock, iflag);
+ spin_lock(&phba->sli4_hba.abts_nvme_buf_list_lock);
+ list_for_each_entry_safe(ctxp, next_ctxp,
+ &phba->sli4_hba.lpfc_abts_nvmet_ctx_list,
+ list) {
+ if (ctxp->rqb_buffer->sglq->sli4_xritag != xri)
+ continue;
+
+ spin_unlock(&phba->sli4_hba.abts_nvme_buf_list_lock);
+ spin_unlock_irqrestore(&phba->hbalock, iflag);
+
+ spin_lock_irqsave(&ctxp->ctxlock, iflag);
+ ctxp->flag |= LPFC_NVMET_ABTS_RCV;
+ spin_unlock_irqrestore(&ctxp->ctxlock, iflag);
+
+ lpfc_nvmeio_data(phba,
+ "NVMET ABTS RCV: xri x%x CPU %02x rjt %d\n",
+ xri, smp_processor_id(), 0);
+
+ lpfc_printf_log(phba, KERN_INFO, LOG_NVME_ABTS,
+ "6319 NVMET Rcv ABTS:acc xri x%x\n", xri);
+
+ rsp = &ctxp->ctx.fcp_req;
+ nvmet_fc_rcv_fcp_abort(phba->targetport, rsp);
+
+ /* Respond with BA_ACC accordingly */
+ lpfc_sli4_seq_abort_rsp(vport, fc_hdr, 1);
+ return 0;
+ }
+ spin_unlock(&phba->sli4_hba.abts_nvme_buf_list_lock);
+ spin_unlock_irqrestore(&phba->hbalock, iflag);
+
+ lpfc_nvmeio_data(phba, "NVMET ABTS RCV: xri x%x CPU %02x rjt %d\n",
+ xri, smp_processor_id(), 1);
+
+ lpfc_printf_log(phba, KERN_INFO, LOG_NVME_ABTS,
+ "6320 NVMET Rcv ABTS:rjt xri x%x\n", xri);
+
+ /* Respond with BA_RJT accordingly */
+ lpfc_sli4_seq_abort_rsp(vport, fc_hdr, 0);
+#endif
+ return 0;
}
void
@@ -940,6 +1150,7 @@ lpfc_nvmet_unsol_fcp_buffer(struct lpfc_hba *phba,
ctxp->rqb_buffer = nvmebuf;
ctxp->entry_cnt = 1;
ctxp->flag = 0;
+ spin_lock_init(&ctxp->ctxlock);
#ifdef CONFIG_SCSI_LPFC_DEBUG_FS
if (phba->ktime_on) {
@@ -962,8 +1173,8 @@ lpfc_nvmet_unsol_fcp_buffer(struct lpfc_hba *phba,
}
#endif
- lpfc_nvmeio_data(phba, "NVMET FCP RCV: xri x%x sz %d from %06x\n",
- oxid, size, sid);
+ lpfc_nvmeio_data(phba, "NVMET FCP RCV: xri x%x sz %d CPU %02x\n",
+ oxid, size, smp_processor_id());
atomic_inc(&tgtp->rcv_fcp_cmd_in);
/*
@@ -1237,11 +1448,11 @@ lpfc_nvmet_prep_fcp_wqe(struct lpfc_hba *phba,
return NULL;
}
- if (rsp->sg_cnt > phba->cfg_sg_seg_cnt) {
+ if (rsp->sg_cnt > phba->cfg_nvme_seg_cnt) {
lpfc_printf_log(phba, KERN_ERR, LOG_NVME_IOERR,
"6109 lpfc_nvmet_prep_fcp_wqe: seg cnt err: "
- "NPORT x%x oxid:x%x\n",
- ctxp->sid, ctxp->oxid);
+ "NPORT x%x oxid:x%x cnt %d\n",
+ ctxp->sid, ctxp->oxid, phba->cfg_nvme_seg_cnt);
return NULL;
}
@@ -1593,6 +1804,8 @@ lpfc_nvmet_sol_fcp_abort_cmp(struct lpfc_hba *phba, struct lpfc_iocbq *cmdwqe,
struct lpfc_nvmet_rcv_ctx *ctxp;
struct lpfc_nvmet_tgtport *tgtp;
uint32_t status, result;
+ unsigned long flags;
+ bool released = false;
ctxp = cmdwqe->context2;
status = bf_get(lpfc_wcqe_c_status, wcqe);
@@ -1601,21 +1814,46 @@ lpfc_nvmet_sol_fcp_abort_cmp(struct lpfc_hba *phba, struct lpfc_iocbq *cmdwqe,
tgtp = (struct lpfc_nvmet_tgtport *)phba->targetport->private;
atomic_inc(&tgtp->xmt_abort_cmpl);
+ ctxp->state = LPFC_NVMET_STE_DONE;
+
+ /* Check if we already received a free context call
+ * and we have completed processing an abort situation.
+ */
+ spin_lock_irqsave(&ctxp->ctxlock, flags);
+ if ((ctxp->flag & LPFC_NVMET_CTX_RLS) &&
+ !(ctxp->flag & LPFC_NVMET_XBUSY)) {
+ list_del(&ctxp->list);
+ released = true;
+ }
+ ctxp->flag &= ~LPFC_NVMET_ABORT_OP;
+ spin_unlock_irqrestore(&ctxp->ctxlock, flags);
+
lpfc_printf_log(phba, KERN_ERR, LOG_NVME_ABTS,
- "6165 Abort cmpl: xri x%x WCQE: %08x %08x %08x %08x\n",
- ctxp->oxid, wcqe->word0, wcqe->total_data_placed,
+ "6165 ABORT cmpl: xri x%x flg x%x (%d) "
+ "WCQE: %08x %08x %08x %08x\n",
+ ctxp->oxid, ctxp->flag, released,
+ wcqe->word0, wcqe->total_data_placed,
result, wcqe->word3);
- ctxp->state = LPFC_NVMET_STE_DONE;
- lpfc_nvmet_rq_post(phba, ctxp, &ctxp->rqb_buffer->hbuf);
+ /*
+ * if transport has released ctx, then can reuse it. Otherwise,
+ * will be recycled by transport release call.
+ */
+ if (released)
+ lpfc_nvmet_rq_post(phba, ctxp, &ctxp->rqb_buffer->hbuf);
cmdwqe->context2 = NULL;
cmdwqe->context3 = NULL;
lpfc_sli_release_iocbq(phba, cmdwqe);
+
+ /* Since iaab/iaar are NOT set, there is no work left.
+ * For LPFC_NVMET_XBUSY, lpfc_sli4_nvmet_xri_aborted
+ * should have been called already.
+ */
}
/**
- * lpfc_nvmet_xmt_fcp_abort_cmp - Completion handler for ABTS
+ * lpfc_nvmet_unsol_fcp_abort_cmp - Completion handler for ABTS
* @phba: Pointer to HBA context object.
* @cmdwqe: Pointer to driver command WQE object.
* @wcqe: Pointer to driver response CQE object.
@@ -1625,12 +1863,14 @@ lpfc_nvmet_sol_fcp_abort_cmp(struct lpfc_hba *phba, struct lpfc_iocbq *cmdwqe,
* The function frees memory resources used for the NVME commands.
**/
static void
-lpfc_nvmet_xmt_fcp_abort_cmp(struct lpfc_hba *phba, struct lpfc_iocbq *cmdwqe,
- struct lpfc_wcqe_complete *wcqe)
+lpfc_nvmet_unsol_fcp_abort_cmp(struct lpfc_hba *phba, struct lpfc_iocbq *cmdwqe,
+ struct lpfc_wcqe_complete *wcqe)
{
struct lpfc_nvmet_rcv_ctx *ctxp;
struct lpfc_nvmet_tgtport *tgtp;
+ unsigned long flags;
uint32_t status, result;
+ bool released = false;
ctxp = cmdwqe->context2;
status = bf_get(lpfc_wcqe_c_status, wcqe);
@@ -1639,23 +1879,55 @@ lpfc_nvmet_xmt_fcp_abort_cmp(struct lpfc_hba *phba, struct lpfc_iocbq *cmdwqe,
tgtp = (struct lpfc_nvmet_tgtport *)phba->targetport->private;
atomic_inc(&tgtp->xmt_abort_cmpl);
+ if (!ctxp) {
+ /* if context is clear, related io alrady complete */
+ lpfc_printf_log(phba, KERN_INFO, LOG_NVME_ABTS,
+ "6070 ABTS cmpl: WCQE: %08x %08x %08x %08x\n",
+ wcqe->word0, wcqe->total_data_placed,
+ result, wcqe->word3);
+ return;
+ }
+
+ /* Sanity check */
+ if (ctxp->state != LPFC_NVMET_STE_ABORT) {
+ lpfc_printf_log(phba, KERN_ERR, LOG_NVME_ABTS,
+ "6112 ABTS Wrong state:%d oxid x%x\n",
+ ctxp->state, ctxp->oxid);
+ }
+
+ /* Check if we already received a free context call
+ * and we have completed processing an abort situation.
+ */
+ ctxp->state = LPFC_NVMET_STE_DONE;
+ spin_lock_irqsave(&ctxp->ctxlock, flags);
+ if ((ctxp->flag & LPFC_NVMET_CTX_RLS) &&
+ !(ctxp->flag & LPFC_NVMET_XBUSY)) {
+ list_del(&ctxp->list);
+ released = true;
+ }
+ ctxp->flag &= ~LPFC_NVMET_ABORT_OP;
+ spin_unlock_irqrestore(&ctxp->ctxlock, flags);
+
lpfc_printf_log(phba, KERN_INFO, LOG_NVME_ABTS,
- "6070 Abort cmpl: ctx %p WCQE: %08x %08x %08x %08x\n",
- ctxp, wcqe->word0, wcqe->total_data_placed,
+ "6316 ABTS cmpl xri x%x flg x%x (%x) "
+ "WCQE: %08x %08x %08x %08x\n",
+ ctxp->oxid, ctxp->flag, released,
+ wcqe->word0, wcqe->total_data_placed,
result, wcqe->word3);
-
- if (ctxp) {
- /* Sanity check */
- if (ctxp->state != LPFC_NVMET_STE_ABORT) {
- lpfc_printf_log(phba, KERN_ERR, LOG_NVME_ABTS,
- "6112 ABORT Wrong state:%d oxid x%x\n",
- ctxp->state, ctxp->oxid);
- }
- ctxp->state = LPFC_NVMET_STE_DONE;
+ /*
+ * if transport has released ctx, then can reuse it. Otherwise,
+ * will be recycled by transport release call.
+ */
+ if (released)
lpfc_nvmet_rq_post(phba, ctxp, &ctxp->rqb_buffer->hbuf);
- cmdwqe->context2 = NULL;
- cmdwqe->context3 = NULL;
- }
+
+ cmdwqe->context2 = NULL;
+ cmdwqe->context3 = NULL;
+
+ /* Since iaab/iaar are NOT set, there is no work left.
+ * For LPFC_NVMET_XBUSY, lpfc_sli4_nvmet_xri_aborted
+ * should have been called already.
+ */
}
/**
@@ -1708,10 +1980,14 @@ lpfc_nvmet_unsol_issue_abort(struct lpfc_hba *phba,
struct lpfc_nodelist *ndlp;
lpfc_printf_log(phba, KERN_INFO, LOG_NVME_ABTS,
- "6067 Abort: sid %x xri x%x/x%x\n",
+ "6067 ABTS: sid %x xri x%x/x%x\n",
sid, xri, ctxp->wqeq->sli4_xritag);
tgtp = (struct lpfc_nvmet_tgtport *)phba->targetport->private;
+ if (!ctxp->wqeq) {
+ ctxp->wqeq = ctxp->rqb_buffer->iocbq;
+ ctxp->wqeq->hba_wqidx = 0;
+ }
ndlp = lpfc_findnode_did(phba->pport, sid);
if (!ndlp || !NLP_CHK_NODE_ACT(ndlp) ||
@@ -1817,10 +2093,11 @@ lpfc_nvmet_sol_fcp_issue_abort(struct lpfc_hba *phba,
(ndlp->nlp_state != NLP_STE_MAPPED_NODE))) {
atomic_inc(&tgtp->xmt_abort_rsp_error);
lpfc_printf_log(phba, KERN_WARNING, LOG_NVME_ABTS,
- "6160 Drop ABTS - wrong NDLP state x%x.\n",
+ "6160 Drop ABORT - wrong NDLP state x%x.\n",
(ndlp) ? ndlp->nlp_state : NLP_STE_MAX_STATE);
/* No failure to an ABTS request. */
+ ctxp->flag &= ~LPFC_NVMET_ABORT_OP;
return 0;
}
@@ -1828,9 +2105,10 @@ lpfc_nvmet_sol_fcp_issue_abort(struct lpfc_hba *phba,
ctxp->abort_wqeq = lpfc_sli_get_iocbq(phba);
if (!ctxp->abort_wqeq) {
lpfc_printf_log(phba, KERN_WARNING, LOG_NVME_ABTS,
- "6161 Abort failed: No wqeqs: "
+ "6161 ABORT failed: No wqeqs: "
"xri: x%x\n", ctxp->oxid);
/* No failure to an ABTS request. */
+ ctxp->flag &= ~LPFC_NVMET_ABORT_OP;
return 0;
}
abts_wqeq = ctxp->abort_wqeq;
@@ -1838,8 +2116,8 @@ lpfc_nvmet_sol_fcp_issue_abort(struct lpfc_hba *phba,
ctxp->state = LPFC_NVMET_STE_ABORT;
/* Announce entry to new IO submit field. */
- lpfc_printf_log(phba, KERN_ERR, LOG_NVME_ABTS,
- "6162 Abort Request to rport DID x%06x "
+ lpfc_printf_log(phba, KERN_INFO, LOG_NVME_ABTS,
+ "6162 ABORT Request to rport DID x%06x "
"for xri x%x x%x\n",
ctxp->sid, ctxp->oxid, ctxp->wqeq->sli4_xritag);
@@ -1855,6 +2133,7 @@ lpfc_nvmet_sol_fcp_issue_abort(struct lpfc_hba *phba,
"NVME Req now. hba_flag x%x oxid x%x\n",
phba->hba_flag, ctxp->oxid);
lpfc_sli_release_iocbq(phba, abts_wqeq);
+ ctxp->flag &= ~LPFC_NVMET_ABORT_OP;
return 0;
}
@@ -1866,6 +2145,7 @@ lpfc_nvmet_sol_fcp_issue_abort(struct lpfc_hba *phba,
"still pending on oxid x%x\n",
ctxp->oxid);
lpfc_sli_release_iocbq(phba, abts_wqeq);
+ ctxp->flag &= ~LPFC_NVMET_ABORT_OP;
return 0;
}
@@ -1913,9 +2193,10 @@ lpfc_nvmet_sol_fcp_issue_abort(struct lpfc_hba *phba,
if (rc == WQE_SUCCESS)
return 0;
+ ctxp->flag &= ~LPFC_NVMET_ABORT_OP;
lpfc_sli_release_iocbq(phba, abts_wqeq);
- lpfc_printf_log(phba, KERN_ERR, LOG_NVME,
- "6166 Failed abts issue_wqe with status x%x "
+ lpfc_printf_log(phba, KERN_ERR, LOG_NVME_ABTS,
+ "6166 Failed ABORT issue_wqe with status x%x "
"for oxid x%x.\n",
rc, ctxp->oxid);
return 1;
@@ -1944,8 +2225,8 @@ lpfc_nvmet_unsol_fcp_issue_abort(struct lpfc_hba *phba,
spin_lock_irqsave(&phba->hbalock, flags);
abts_wqeq = ctxp->wqeq;
- abts_wqeq->wqe_cmpl = lpfc_nvmet_xmt_fcp_abort_cmp;
- abts_wqeq->iocb_cmpl = 0;
+ abts_wqeq->wqe_cmpl = lpfc_nvmet_unsol_fcp_abort_cmp;
+ abts_wqeq->iocb_cmpl = NULL;
abts_wqeq->iocb_flag |= LPFC_IO_NVMET;
rc = lpfc_sli4_issue_wqe(phba, LPFC_FCP_RING, abts_wqeq);
spin_unlock_irqrestore(&phba->hbalock, flags);
@@ -1955,7 +2236,7 @@ lpfc_nvmet_unsol_fcp_issue_abort(struct lpfc_hba *phba,
}
aerr:
- lpfc_nvmet_rq_post(phba, ctxp, &ctxp->rqb_buffer->hbuf);
+ ctxp->flag &= ~LPFC_NVMET_ABORT_OP;
atomic_inc(&tgtp->xmt_abort_rsp_error);
lpfc_printf_log(phba, KERN_WARNING, LOG_NVME_ABTS,
"6135 Failed to Issue ABTS for oxid x%x. Status x%x\n",
diff --git a/drivers/scsi/lpfc/lpfc_nvmet.h b/drivers/scsi/lpfc/lpfc_nvmet.h
index ca96f05c1604..128759fe6650 100644
--- a/drivers/scsi/lpfc/lpfc_nvmet.h
+++ b/drivers/scsi/lpfc/lpfc_nvmet.h
@@ -21,9 +21,7 @@
* included with this package. *
********************************************************************/
-#define LPFC_NVMET_MIN_SEGS 16
-#define LPFC_NVMET_DEFAULT_SEGS 64 /* 256K IOs */
-#define LPFC_NVMET_MAX_SEGS 510
+#define LPFC_NVMET_DEFAULT_SEGS (64 + 1) /* 256K IOs */
#define LPFC_NVMET_SUCCESS_LEN 12
/* Used for NVME Target */
@@ -77,10 +75,12 @@ struct lpfc_nvmet_rcv_ctx {
struct nvmefc_tgt_ls_req ls_req;
struct nvmefc_tgt_fcp_req fcp_req;
} ctx;
+ struct list_head list;
struct lpfc_hba *phba;
struct lpfc_iocbq *wqeq;
struct lpfc_iocbq *abort_wqeq;
dma_addr_t txrdy_phys;
+ spinlock_t ctxlock; /* protect flag access */
uint32_t *txrdy;
uint32_t sid;
uint32_t offset;
@@ -97,8 +97,11 @@ struct lpfc_nvmet_rcv_ctx {
#define LPFC_NVMET_STE_RSP 4
#define LPFC_NVMET_STE_DONE 5
uint16_t flag;
-#define LPFC_NVMET_IO_INP 1
-#define LPFC_NVMET_ABORT_OP 2
+#define LPFC_NVMET_IO_INP 0x1 /* IO is in progress on exchange */
+#define LPFC_NVMET_ABORT_OP 0x2 /* Abort WQE issued on exchange */
+#define LPFC_NVMET_XBUSY 0x4 /* XB bit set on IO cmpl */
+#define LPFC_NVMET_CTX_RLS 0x8 /* ctx free requested */
+#define LPFC_NVMET_ABTS_RCV 0x10 /* ABTS received on exchange */
struct rqb_dmabuf *rqb_buffer;
#ifdef CONFIG_SCSI_LPFC_DEBUG_FS
diff --git a/drivers/scsi/lpfc/lpfc_sli.c b/drivers/scsi/lpfc/lpfc_sli.c
index 1c9fa45df7eb..cf19f4976f5f 100644
--- a/drivers/scsi/lpfc/lpfc_sli.c
+++ b/drivers/scsi/lpfc/lpfc_sli.c
@@ -6338,7 +6338,7 @@ lpfc_sli4_get_allocated_extnts(struct lpfc_hba *phba, uint16_t type,
}
/**
- * lpfc_sli4_repost_sgl_list - Repsot the buffers sgl pages as block
+ * lpfc_sli4_repost_sgl_list - Repost the buffers sgl pages as block
* @phba: pointer to lpfc hba data structure.
* @pring: Pointer to driver SLI ring object.
* @sgl_list: linked link of sgl buffers to post
@@ -13758,7 +13758,10 @@ lpfc_sli4_queue_free(struct lpfc_queue *queue)
lpfc_free_rq_buffer(queue->phba, queue);
kfree(queue->rqbp);
}
- kfree(queue->pring);
+
+ if (!list_empty(&queue->wq_list))
+ list_del(&queue->wq_list);
+
kfree(queue);
return;
}
@@ -14738,6 +14741,9 @@ lpfc_wq_create(struct lpfc_hba *phba, struct lpfc_queue *wq,
case LPFC_Q_CREATE_VERSION_1:
bf_set(lpfc_mbx_wq_create_wqe_count, &wq_create->u.request_1,
wq->entry_count);
+ bf_set(lpfc_mbox_hdr_version, &shdr->request,
+ LPFC_Q_CREATE_VERSION_1);
+
switch (wq->entry_size) {
default:
case 64:
@@ -15561,6 +15567,8 @@ lpfc_wq_destroy(struct lpfc_hba *phba, struct lpfc_queue *wq)
}
/* Remove wq from any list */
list_del_init(&wq->list);
+ kfree(wq->pring);
+ wq->pring = NULL;
mempool_free(mbox, wq->phba->mbox_mem_pool);
return status;
}
@@ -16513,7 +16521,7 @@ lpfc_sli4_xri_inrange(struct lpfc_hba *phba,
* This function sends a basic response to a previous unsol sequence abort
* event after aborting the sequence handling.
**/
-static void
+void
lpfc_sli4_seq_abort_rsp(struct lpfc_vport *vport,
struct fc_frame_header *fc_hdr, bool aborted)
{
@@ -16534,14 +16542,13 @@ lpfc_sli4_seq_abort_rsp(struct lpfc_vport *vport,
ndlp = lpfc_findnode_did(vport, sid);
if (!ndlp) {
- ndlp = mempool_alloc(phba->nlp_mem_pool, GFP_KERNEL);
+ ndlp = lpfc_nlp_init(vport, sid);
if (!ndlp) {
lpfc_printf_vlog(vport, KERN_WARNING, LOG_ELS,
"1268 Failed to allocate ndlp for "
"oxid:x%x SID:x%x\n", oxid, sid);
return;
}
- lpfc_nlp_init(vport, ndlp, sid);
/* Put ndlp onto pport node list */
lpfc_enqueue_node(vport, ndlp);
} else if (!NLP_CHK_NODE_ACT(ndlp)) {
@@ -16690,6 +16697,11 @@ lpfc_sli4_handle_unsol_abort(struct lpfc_vport *vport,
}
lpfc_in_buf_free(phba, &dmabuf->dbuf);
+ if (phba->nvmet_support) {
+ lpfc_nvmet_rcv_unsol_abort(vport, &fc_hdr);
+ return;
+ }
+
/* Respond with BA_ACC or BA_RJT accordingly */
lpfc_sli4_seq_abort_rsp(vport, &fc_hdr, aborted);
}
diff --git a/drivers/scsi/lpfc/lpfc_sli4.h b/drivers/scsi/lpfc/lpfc_sli4.h
index 710458cf11d6..da46471337c8 100644
--- a/drivers/scsi/lpfc/lpfc_sli4.h
+++ b/drivers/scsi/lpfc/lpfc_sli4.h
@@ -620,7 +620,7 @@ struct lpfc_sli4_hba {
struct list_head lpfc_els_sgl_list;
struct list_head lpfc_abts_els_sgl_list;
struct list_head lpfc_nvmet_sgl_list;
- struct list_head lpfc_abts_nvmet_sgl_list;
+ struct list_head lpfc_abts_nvmet_ctx_list;
struct list_head lpfc_abts_scsi_buf_list;
struct list_head lpfc_abts_nvme_buf_list;
struct lpfc_sglq **lpfc_sglq_active_list;
diff --git a/drivers/scsi/lpfc/lpfc_version.h b/drivers/scsi/lpfc/lpfc_version.h
index d4e95e28f4e3..1c26dc67151b 100644
--- a/drivers/scsi/lpfc/lpfc_version.h
+++ b/drivers/scsi/lpfc/lpfc_version.h
@@ -20,7 +20,7 @@
* included with this package. *
*******************************************************************/
-#define LPFC_DRIVER_VERSION "11.2.0.10"
+#define LPFC_DRIVER_VERSION "11.2.0.12"
#define LPFC_DRIVER_NAME "lpfc"
/* Used for SLI 2/3 */
diff --git a/drivers/scsi/lpfc/lpfc_vport.c b/drivers/scsi/lpfc/lpfc_vport.c
index 9a0339dbc024..c714482bf4c5 100644
--- a/drivers/scsi/lpfc/lpfc_vport.c
+++ b/drivers/scsi/lpfc/lpfc_vport.c
@@ -738,10 +738,9 @@ lpfc_vport_delete(struct fc_vport *fc_vport)
ndlp = lpfc_findnode_did(vport, Fabric_DID);
if (!ndlp) {
/* Cannot find existing Fabric ndlp, allocate one */
- ndlp = mempool_alloc(phba->nlp_mem_pool, GFP_KERNEL);
+ ndlp = lpfc_nlp_init(vport, Fabric_DID);
if (!ndlp)
goto skip_logo;
- lpfc_nlp_init(vport, ndlp, Fabric_DID);
/* Indicate free memory when release */
NLP_SET_FREE_REQ(ndlp);
} else {
diff --git a/drivers/scsi/osd/osd_initiator.c b/drivers/scsi/osd/osd_initiator.c
index 6903f03c88af..8a1b94816419 100644
--- a/drivers/scsi/osd/osd_initiator.c
+++ b/drivers/scsi/osd/osd_initiator.c
@@ -477,7 +477,7 @@ static void _set_error_resid(struct osd_request *or, struct request *req,
int error)
{
or->async_error = error;
- or->req_errors = req->errors ? : error;
+ or->req_errors = scsi_req(req)->result ? : error;
or->sense_len = scsi_req(req)->sense_len;
if (or->sense_len)
memcpy(or->sense, scsi_req(req)->sense, or->sense_len);
@@ -489,7 +489,10 @@ static void _set_error_resid(struct osd_request *or, struct request *req,
int osd_execute_request(struct osd_request *or)
{
- int error = blk_execute_rq(or->request->q, NULL, or->request, 0);
+ int error;
+
+ blk_execute_rq(or->request->q, NULL, or->request, 0);
+ error = scsi_req(or->request)->result ? -EIO : 0;
_set_error_resid(or, or->request, error);
return error;
@@ -1602,7 +1605,7 @@ static int _init_blk_request(struct osd_request *or,
req->rq_flags |= RQF_QUIET;
req->timeout = or->timeout;
- req->retries = or->retries;
+ scsi_req(req)->retries = or->retries;
if (has_out) {
or->out.req = req;
diff --git a/drivers/scsi/osst.c b/drivers/scsi/osst.c
index c47f4b349bac..67cbed92f07d 100644
--- a/drivers/scsi/osst.c
+++ b/drivers/scsi/osst.c
@@ -327,7 +327,7 @@ static void osst_end_async(struct request *req, int update)
struct osst_tape *STp = SRpnt->stp;
struct rq_map_data *mdata = &SRpnt->stp->buffer->map_data;
- STp->buffer->cmdstat.midlevel_result = SRpnt->result = req->errors;
+ STp->buffer->cmdstat.midlevel_result = SRpnt->result = rq->result;
#if DEBUG
STp->write_pending = 0;
#endif
@@ -414,7 +414,7 @@ static int osst_execute(struct osst_request *SRpnt, const unsigned char *cmd,
memset(rq->cmd, 0, BLK_MAX_CDB); /* ATAPI hates garbage after CDB */
memcpy(rq->cmd, cmd, rq->cmd_len);
req->timeout = timeout;
- req->retries = retries;
+ rq->retries = retries;
req->end_io_data = SRpnt;
blk_execute_rq_nowait(req->q, NULL, req, 1, osst_end_async);
diff --git a/drivers/scsi/qla2xxx/qla_bsg.c b/drivers/scsi/qla2xxx/qla_bsg.c
index 84c9098cc089..b6e40fd4c3c1 100644
--- a/drivers/scsi/qla2xxx/qla_bsg.c
+++ b/drivers/scsi/qla2xxx/qla_bsg.c
@@ -2553,13 +2553,13 @@ qla24xx_bsg_timeout(struct bsg_job *bsg_job)
ql_log(ql_log_warn, vha, 0x7089,
"mbx abort_command "
"failed.\n");
- bsg_job->req->errors =
+ scsi_req(bsg_job->req)->result =
bsg_reply->result = -EIO;
} else {
ql_dbg(ql_dbg_user, vha, 0x708a,
"mbx abort_command "
"success.\n");
- bsg_job->req->errors =
+ scsi_req(bsg_job->req)->result =
bsg_reply->result = 0;
}
spin_lock_irqsave(&ha->hardware_lock, flags);
@@ -2570,7 +2570,7 @@ qla24xx_bsg_timeout(struct bsg_job *bsg_job)
}
spin_unlock_irqrestore(&ha->hardware_lock, flags);
ql_log(ql_log_info, vha, 0x708b, "SRB not found to abort.\n");
- bsg_job->req->errors = bsg_reply->result = -ENXIO;
+ scsi_req(bsg_job->req)->result = bsg_reply->result = -ENXIO;
return 0;
done:
diff --git a/drivers/scsi/scsi_debugfs.c b/drivers/scsi/scsi_debugfs.c
new file mode 100644
index 000000000000..a97c9507103d
--- /dev/null
+++ b/drivers/scsi/scsi_debugfs.c
@@ -0,0 +1,13 @@
+#include <linux/seq_file.h>
+#include <scsi/scsi_cmnd.h>
+#include <scsi/scsi_dbg.h>
+#include "scsi_debugfs.h"
+
+void scsi_show_rq(struct seq_file *m, struct request *rq)
+{
+ struct scsi_cmnd *cmd = container_of(scsi_req(rq), typeof(*cmd), req);
+ char buf[80];
+
+ __scsi_format_command(buf, sizeof(buf), cmd->cmnd, cmd->cmd_len);
+ seq_printf(m, ", .cmd=%s", buf);
+}
diff --git a/drivers/scsi/scsi_debugfs.h b/drivers/scsi/scsi_debugfs.h
new file mode 100644
index 000000000000..951b043e82d0
--- /dev/null
+++ b/drivers/scsi/scsi_debugfs.h
@@ -0,0 +1,4 @@
+struct request;
+struct seq_file;
+
+void scsi_show_rq(struct seq_file *m, struct request *rq);
diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
index f2cafae150bc..2db412dd4b44 100644
--- a/drivers/scsi/scsi_error.c
+++ b/drivers/scsi/scsi_error.c
@@ -1988,7 +1988,7 @@ static void scsi_eh_lock_door(struct scsi_device *sdev)
req->rq_flags |= RQF_QUIET;
req->timeout = 10 * HZ;
- req->retries = 5;
+ rq->retries = 5;
blk_execute_rq_nowait(req->q, NULL, req, 1, eh_lock_door_done);
}
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index e5a2d590a104..1c3e87d6c48f 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -34,6 +34,7 @@
#include <trace/events/scsi.h>
+#include "scsi_debugfs.h"
#include "scsi_priv.h"
#include "scsi_logging.h"
@@ -229,8 +230,8 @@ void scsi_queue_insert(struct scsi_cmnd *cmd, int reason)
* @rq_flags: flags for ->rq_flags
* @resid: optional residual length
*
- * returns the req->errors value which is the scsi_cmnd result
- * field.
+ * Returns the scsi_cmnd result field if a command was executed, or a negative
+ * Linux error code if we didn't get that far.
*/
int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd,
int data_direction, void *buffer, unsigned bufflen,
@@ -256,7 +257,7 @@ int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd,
rq->cmd_len = COMMAND_SIZE(cmd[0]);
memcpy(rq->cmd, cmd, rq->cmd_len);
- req->retries = retries;
+ rq->retries = retries;
req->timeout = timeout;
req->cmd_flags |= flags;
req->rq_flags |= rq_flags | RQF_QUIET | RQF_PREEMPT;
@@ -281,7 +282,7 @@ int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd,
memcpy(sense, rq->sense, SCSI_SENSE_BUFFERSIZE);
if (sshdr)
scsi_normalize_sense(rq->sense, rq->sense_len, sshdr);
- ret = req->errors;
+ ret = rq->result;
out:
blk_put_request(req);
@@ -797,8 +798,7 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
/*
* __scsi_error_from_host_byte may have reset the host_byte
*/
- req->errors = cmd->result;
-
+ scsi_req(req)->result = cmd->result;
scsi_req(req)->resid_len = scsi_get_resid(cmd);
if (scsi_bidi_cmnd(cmd)) {
@@ -835,7 +835,7 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
/*
* Recovered errors need reporting, but they're always treated as
* success, so fiddle the result code here. For passthrough requests
- * we already took a copy of the original into rq->errors which
+ * we already took a copy of the original into sreq->result which
* is what gets returned to the user
*/
if (sense_valid && (sshdr.sense_key == RECOVERED_ERROR)) {
@@ -1061,10 +1061,10 @@ int scsi_init_io(struct scsi_cmnd *cmd)
struct scsi_device *sdev = cmd->device;
struct request *rq = cmd->request;
bool is_mq = (rq->mq_ctx != NULL);
- int error;
+ int error = BLKPREP_KILL;
if (WARN_ON_ONCE(!blk_rq_nr_phys_segments(rq)))
- return -EINVAL;
+ goto err_exit;
error = scsi_init_sgtable(rq, &cmd->sdb);
if (error)
@@ -1177,7 +1177,7 @@ static int scsi_setup_scsi_cmnd(struct scsi_device *sdev, struct request *req)
cmd->cmd_len = scsi_req(req)->cmd_len;
cmd->cmnd = scsi_req(req)->cmd;
cmd->transfersize = blk_rq_bytes(req);
- cmd->allowed = req->retries;
+ cmd->allowed = scsi_req(req)->retries;
return BLKPREP_OK;
}
@@ -1281,7 +1281,7 @@ scsi_prep_return(struct request_queue *q, struct request *req, int ret)
switch (ret) {
case BLKPREP_KILL:
case BLKPREP_INVALID:
- req->errors = DID_NO_CONNECT << 16;
+ scsi_req(req)->result = DID_NO_CONNECT << 16;
/* release the command and kill it */
if (req->special) {
struct scsi_cmnd *cmd = req->special;
@@ -1905,7 +1905,7 @@ static int scsi_mq_prep_fn(struct request *req)
static void scsi_mq_done(struct scsi_cmnd *cmd)
{
trace_scsi_dispatch_cmd_done(cmd);
- blk_mq_complete_request(cmd->request, cmd->request->errors);
+ blk_mq_complete_request(cmd->request);
}
static int scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
@@ -2154,10 +2154,13 @@ struct request_queue *scsi_alloc_queue(struct scsi_device *sdev)
return q;
}
-static struct blk_mq_ops scsi_mq_ops = {
+static const struct blk_mq_ops scsi_mq_ops = {
.queue_rq = scsi_queue_rq,
.complete = scsi_softirq_done,
.timeout = scsi_timeout,
+#ifdef CONFIG_BLK_DEBUG_FS
+ .show_rq = scsi_show_rq,
+#endif
.init_request = scsi_init_request,
.exit_request = scsi_exit_request,
.map_queues = scsi_map_queues,
diff --git a/drivers/scsi/scsi_transport_sas.c b/drivers/scsi/scsi_transport_sas.c
index cdbb293aca08..9fdbd50c31b4 100644
--- a/drivers/scsi/scsi_transport_sas.c
+++ b/drivers/scsi/scsi_transport_sas.c
@@ -184,9 +184,9 @@ static void sas_smp_request(struct request_queue *q, struct Scsi_Host *shost,
blk_rq_bytes(req->next_rq);
handler = to_sas_internal(shost->transportt)->f->smp_handler;
ret = handler(shost, rphy, req);
- req->errors = ret;
+ scsi_req(req)->result = ret;
- blk_end_request_all(req, ret);
+ blk_end_request_all(req, 0);
spin_lock_irq(q->queue_lock);
}
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 35ad5e8a31ab..0dc95e102e69 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -418,6 +418,46 @@ provisioning_mode_store(struct device *dev, struct device_attribute *attr,
}
static DEVICE_ATTR_RW(provisioning_mode);
+static const char *zeroing_mode[] = {
+ [SD_ZERO_WRITE] = "write",
+ [SD_ZERO_WS] = "writesame",
+ [SD_ZERO_WS16_UNMAP] = "writesame_16_unmap",
+ [SD_ZERO_WS10_UNMAP] = "writesame_10_unmap",
+};
+
+static ssize_t
+zeroing_mode_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct scsi_disk *sdkp = to_scsi_disk(dev);
+
+ return snprintf(buf, 20, "%s\n", zeroing_mode[sdkp->zeroing_mode]);
+}
+
+static ssize_t
+zeroing_mode_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct scsi_disk *sdkp = to_scsi_disk(dev);
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EACCES;
+
+ if (!strncmp(buf, zeroing_mode[SD_ZERO_WRITE], 20))
+ sdkp->zeroing_mode = SD_ZERO_WRITE;
+ else if (!strncmp(buf, zeroing_mode[SD_ZERO_WS], 20))
+ sdkp->zeroing_mode = SD_ZERO_WS;
+ else if (!strncmp(buf, zeroing_mode[SD_ZERO_WS16_UNMAP], 20))
+ sdkp->zeroing_mode = SD_ZERO_WS16_UNMAP;
+ else if (!strncmp(buf, zeroing_mode[SD_ZERO_WS10_UNMAP], 20))
+ sdkp->zeroing_mode = SD_ZERO_WS10_UNMAP;
+ else
+ return -EINVAL;
+
+ return count;
+}
+static DEVICE_ATTR_RW(zeroing_mode);
+
static ssize_t
max_medium_access_timeouts_show(struct device *dev,
struct device_attribute *attr, char *buf)
@@ -496,6 +536,7 @@ static struct attribute *sd_disk_attrs[] = {
&dev_attr_app_tag_own.attr,
&dev_attr_thin_provisioning.attr,
&dev_attr_provisioning_mode.attr,
+ &dev_attr_zeroing_mode.attr,
&dev_attr_max_write_same_blocks.attr,
&dev_attr_max_medium_access_timeouts.attr,
NULL,
@@ -644,26 +685,11 @@ static void sd_config_discard(struct scsi_disk *sdkp, unsigned int mode)
unsigned int logical_block_size = sdkp->device->sector_size;
unsigned int max_blocks = 0;
- q->limits.discard_zeroes_data = 0;
-
- /*
- * When LBPRZ is reported, discard alignment and granularity
- * must be fixed to the logical block size. Otherwise the block
- * layer will drop misaligned portions of the request which can
- * lead to data corruption. If LBPRZ is not set, we honor the
- * device preference.
- */
- if (sdkp->lbprz) {
- q->limits.discard_alignment = 0;
- q->limits.discard_granularity = logical_block_size;
- } else {
- q->limits.discard_alignment = sdkp->unmap_alignment *
- logical_block_size;
- q->limits.discard_granularity =
- max(sdkp->physical_block_size,
- sdkp->unmap_granularity * logical_block_size);
- }
-
+ q->limits.discard_alignment =
+ sdkp->unmap_alignment * logical_block_size;
+ q->limits.discard_granularity =
+ max(sdkp->physical_block_size,
+ sdkp->unmap_granularity * logical_block_size);
sdkp->provisioning_mode = mode;
switch (mode) {
@@ -681,19 +707,16 @@ static void sd_config_discard(struct scsi_disk *sdkp, unsigned int mode)
case SD_LBP_WS16:
max_blocks = min_not_zero(sdkp->max_ws_blocks,
(u32)SD_MAX_WS16_BLOCKS);
- q->limits.discard_zeroes_data = sdkp->lbprz;
break;
case SD_LBP_WS10:
max_blocks = min_not_zero(sdkp->max_ws_blocks,
(u32)SD_MAX_WS10_BLOCKS);
- q->limits.discard_zeroes_data = sdkp->lbprz;
break;
case SD_LBP_ZERO:
max_blocks = min_not_zero(sdkp->max_ws_blocks,
(u32)SD_MAX_WS10_BLOCKS);
- q->limits.discard_zeroes_data = 1;
break;
}
@@ -701,93 +724,122 @@ static void sd_config_discard(struct scsi_disk *sdkp, unsigned int mode)
queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
}
-/**
- * sd_setup_discard_cmnd - unmap blocks on thinly provisioned device
- * @sdp: scsi device to operate on
- * @rq: Request to prepare
- *
- * Will issue either UNMAP or WRITE SAME(16) depending on preference
- * indicated by target device.
- **/
-static int sd_setup_discard_cmnd(struct scsi_cmnd *cmd)
+static int sd_setup_unmap_cmnd(struct scsi_cmnd *cmd)
{
- struct request *rq = cmd->request;
struct scsi_device *sdp = cmd->device;
- struct scsi_disk *sdkp = scsi_disk(rq->rq_disk);
- sector_t sector = blk_rq_pos(rq);
- unsigned int nr_sectors = blk_rq_sectors(rq);
- unsigned int len;
- int ret;
+ struct request *rq = cmd->request;
+ u64 sector = blk_rq_pos(rq) >> (ilog2(sdp->sector_size) - 9);
+ u32 nr_sectors = blk_rq_sectors(rq) >> (ilog2(sdp->sector_size) - 9);
+ unsigned int data_len = 24;
char *buf;
- struct page *page;
-
- sector >>= ilog2(sdp->sector_size) - 9;
- nr_sectors >>= ilog2(sdp->sector_size) - 9;
- page = alloc_page(GFP_ATOMIC | __GFP_ZERO);
- if (!page)
+ rq->special_vec.bv_page = alloc_page(GFP_ATOMIC | __GFP_ZERO);
+ if (!rq->special_vec.bv_page)
return BLKPREP_DEFER;
+ rq->special_vec.bv_offset = 0;
+ rq->special_vec.bv_len = data_len;
+ rq->rq_flags |= RQF_SPECIAL_PAYLOAD;
- switch (sdkp->provisioning_mode) {
- case SD_LBP_UNMAP:
- buf = page_address(page);
+ cmd->cmd_len = 10;
+ cmd->cmnd[0] = UNMAP;
+ cmd->cmnd[8] = 24;
- cmd->cmd_len = 10;
- cmd->cmnd[0] = UNMAP;
- cmd->cmnd[8] = 24;
+ buf = page_address(rq->special_vec.bv_page);
+ put_unaligned_be16(6 + 16, &buf[0]);
+ put_unaligned_be16(16, &buf[2]);
+ put_unaligned_be64(sector, &buf[8]);
+ put_unaligned_be32(nr_sectors, &buf[16]);
- put_unaligned_be16(6 + 16, &buf[0]);
- put_unaligned_be16(16, &buf[2]);
- put_unaligned_be64(sector, &buf[8]);
- put_unaligned_be32(nr_sectors, &buf[16]);
+ cmd->allowed = SD_MAX_RETRIES;
+ cmd->transfersize = data_len;
+ rq->timeout = SD_TIMEOUT;
+ scsi_req(rq)->resid_len = data_len;
- len = 24;
- break;
+ return scsi_init_io(cmd);
+}
- case SD_LBP_WS16:
- cmd->cmd_len = 16;
- cmd->cmnd[0] = WRITE_SAME_16;
+static int sd_setup_write_same16_cmnd(struct scsi_cmnd *cmd, bool unmap)
+{
+ struct scsi_device *sdp = cmd->device;
+ struct request *rq = cmd->request;
+ u64 sector = blk_rq_pos(rq) >> (ilog2(sdp->sector_size) - 9);
+ u32 nr_sectors = blk_rq_sectors(rq) >> (ilog2(sdp->sector_size) - 9);
+ u32 data_len = sdp->sector_size;
+
+ rq->special_vec.bv_page = alloc_page(GFP_ATOMIC | __GFP_ZERO);
+ if (!rq->special_vec.bv_page)
+ return BLKPREP_DEFER;
+ rq->special_vec.bv_offset = 0;
+ rq->special_vec.bv_len = data_len;
+ rq->rq_flags |= RQF_SPECIAL_PAYLOAD;
+
+ cmd->cmd_len = 16;
+ cmd->cmnd[0] = WRITE_SAME_16;
+ if (unmap)
cmd->cmnd[1] = 0x8; /* UNMAP */
- put_unaligned_be64(sector, &cmd->cmnd[2]);
- put_unaligned_be32(nr_sectors, &cmd->cmnd[10]);
+ put_unaligned_be64(sector, &cmd->cmnd[2]);
+ put_unaligned_be32(nr_sectors, &cmd->cmnd[10]);
- len = sdkp->device->sector_size;
- break;
+ cmd->allowed = SD_MAX_RETRIES;
+ cmd->transfersize = data_len;
+ rq->timeout = unmap ? SD_TIMEOUT : SD_WRITE_SAME_TIMEOUT;
+ scsi_req(rq)->resid_len = data_len;
- case SD_LBP_WS10:
- case SD_LBP_ZERO:
- cmd->cmd_len = 10;
- cmd->cmnd[0] = WRITE_SAME;
- if (sdkp->provisioning_mode == SD_LBP_WS10)
- cmd->cmnd[1] = 0x8; /* UNMAP */
- put_unaligned_be32(sector, &cmd->cmnd[2]);
- put_unaligned_be16(nr_sectors, &cmd->cmnd[7]);
+ return scsi_init_io(cmd);
+}
- len = sdkp->device->sector_size;
- break;
+static int sd_setup_write_same10_cmnd(struct scsi_cmnd *cmd, bool unmap)
+{
+ struct scsi_device *sdp = cmd->device;
+ struct request *rq = cmd->request;
+ u64 sector = blk_rq_pos(rq) >> (ilog2(sdp->sector_size) - 9);
+ u32 nr_sectors = blk_rq_sectors(rq) >> (ilog2(sdp->sector_size) - 9);
+ u32 data_len = sdp->sector_size;
- default:
- ret = BLKPREP_INVALID;
- goto out;
- }
+ rq->special_vec.bv_page = alloc_page(GFP_ATOMIC | __GFP_ZERO);
+ if (!rq->special_vec.bv_page)
+ return BLKPREP_DEFER;
+ rq->special_vec.bv_offset = 0;
+ rq->special_vec.bv_len = data_len;
+ rq->rq_flags |= RQF_SPECIAL_PAYLOAD;
- rq->timeout = SD_TIMEOUT;
+ cmd->cmd_len = 10;
+ cmd->cmnd[0] = WRITE_SAME;
+ if (unmap)
+ cmd->cmnd[1] = 0x8; /* UNMAP */
+ put_unaligned_be32(sector, &cmd->cmnd[2]);
+ put_unaligned_be16(nr_sectors, &cmd->cmnd[7]);
- cmd->transfersize = len;
cmd->allowed = SD_MAX_RETRIES;
+ cmd->transfersize = data_len;
+ rq->timeout = unmap ? SD_TIMEOUT : SD_WRITE_SAME_TIMEOUT;
+ scsi_req(rq)->resid_len = data_len;
- rq->special_vec.bv_page = page;
- rq->special_vec.bv_offset = 0;
- rq->special_vec.bv_len = len;
+ return scsi_init_io(cmd);
+}
- rq->rq_flags |= RQF_SPECIAL_PAYLOAD;
- scsi_req(rq)->resid_len = len;
+static int sd_setup_write_zeroes_cmnd(struct scsi_cmnd *cmd)
+{
+ struct request *rq = cmd->request;
+ struct scsi_device *sdp = cmd->device;
+ struct scsi_disk *sdkp = scsi_disk(rq->rq_disk);
+ u64 sector = blk_rq_pos(rq) >> (ilog2(sdp->sector_size) - 9);
+ u32 nr_sectors = blk_rq_sectors(rq) >> (ilog2(sdp->sector_size) - 9);
+
+ if (!(rq->cmd_flags & REQ_NOUNMAP)) {
+ switch (sdkp->zeroing_mode) {
+ case SD_ZERO_WS16_UNMAP:
+ return sd_setup_write_same16_cmnd(cmd, true);
+ case SD_ZERO_WS10_UNMAP:
+ return sd_setup_write_same10_cmnd(cmd, true);
+ }
+ }
- ret = scsi_init_io(cmd);
-out:
- if (ret != BLKPREP_OK)
- __free_page(page);
- return ret;
+ if (sdp->no_write_same)
+ return BLKPREP_INVALID;
+ if (sdkp->ws16 || sector > 0xffffffff || nr_sectors > 0xffff)
+ return sd_setup_write_same16_cmnd(cmd, false);
+ return sd_setup_write_same10_cmnd(cmd, false);
}
static void sd_config_write_same(struct scsi_disk *sdkp)
@@ -816,9 +868,20 @@ static void sd_config_write_same(struct scsi_disk *sdkp)
sdkp->max_ws_blocks = 0;
}
+ if (sdkp->lbprz && sdkp->lbpws)
+ sdkp->zeroing_mode = SD_ZERO_WS16_UNMAP;
+ else if (sdkp->lbprz && sdkp->lbpws10)
+ sdkp->zeroing_mode = SD_ZERO_WS10_UNMAP;
+ else if (sdkp->max_ws_blocks)
+ sdkp->zeroing_mode = SD_ZERO_WS;
+ else
+ sdkp->zeroing_mode = SD_ZERO_WRITE;
+
out:
blk_queue_max_write_same_sectors(q, sdkp->max_ws_blocks *
(logical_block_size >> 9));
+ blk_queue_max_write_zeroes_sectors(q, sdkp->max_ws_blocks *
+ (logical_block_size >> 9));
}
/**
@@ -1155,7 +1218,20 @@ static int sd_init_command(struct scsi_cmnd *cmd)
switch (req_op(rq)) {
case REQ_OP_DISCARD:
- return sd_setup_discard_cmnd(cmd);
+ switch (scsi_disk(rq->rq_disk)->provisioning_mode) {
+ case SD_LBP_UNMAP:
+ return sd_setup_unmap_cmnd(cmd);
+ case SD_LBP_WS16:
+ return sd_setup_write_same16_cmnd(cmd, true);
+ case SD_LBP_WS10:
+ return sd_setup_write_same10_cmnd(cmd, true);
+ case SD_LBP_ZERO:
+ return sd_setup_write_same10_cmnd(cmd, false);
+ default:
+ return BLKPREP_INVALID;
+ }
+ case REQ_OP_WRITE_ZEROES:
+ return sd_setup_write_zeroes_cmnd(cmd);
case REQ_OP_WRITE_SAME:
return sd_setup_write_same_cmnd(cmd);
case REQ_OP_FLUSH:
@@ -1795,6 +1871,7 @@ static int sd_done(struct scsi_cmnd *SCpnt)
switch (req_op(req)) {
case REQ_OP_DISCARD:
+ case REQ_OP_WRITE_ZEROES:
case REQ_OP_WRITE_SAME:
case REQ_OP_ZONE_RESET:
if (!result) {
@@ -2768,7 +2845,7 @@ static void sd_read_block_limits(struct scsi_disk *sdkp)
sd_config_discard(sdkp, SD_LBP_WS16);
} else { /* LBP VPD page tells us what to use */
- if (sdkp->lbpu && sdkp->max_unmap_blocks && !sdkp->lbprz)
+ if (sdkp->lbpu && sdkp->max_unmap_blocks)
sd_config_discard(sdkp, SD_LBP_UNMAP);
else if (sdkp->lbpws)
sd_config_discard(sdkp, SD_LBP_WS16);
diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h
index 4dac35e96a75..a2c4b5c35379 100644
--- a/drivers/scsi/sd.h
+++ b/drivers/scsi/sd.h
@@ -59,6 +59,13 @@ enum {
SD_LBP_DISABLE, /* Discard disabled due to failed cmd */
};
+enum {
+ SD_ZERO_WRITE = 0, /* Use WRITE(10/16) command */
+ SD_ZERO_WS, /* Use WRITE SAME(10/16) command */
+ SD_ZERO_WS16_UNMAP, /* Use WRITE SAME(16) with UNMAP */
+ SD_ZERO_WS10_UNMAP, /* Use WRITE SAME(10) with UNMAP */
+};
+
struct scsi_disk {
struct scsi_driver *driver; /* always &sd_template */
struct scsi_device *device;
@@ -89,6 +96,7 @@ struct scsi_disk {
u8 write_prot;
u8 protection_type;/* Data Integrity Field */
u8 provisioning_mode;
+ u8 zeroing_mode;
unsigned ATO : 1; /* state of disk ATO bit */
unsigned cache_override : 1; /* temp override of WCE,RCD */
unsigned WCE : 1; /* state of disk WCE bit */
diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index 92620c8ea8ad..1994f7799fce 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -329,6 +329,7 @@ void sd_zbc_complete(struct scsi_cmnd *cmd,
switch (req_op(rq)) {
case REQ_OP_WRITE:
+ case REQ_OP_WRITE_ZEROES:
case REQ_OP_WRITE_SAME:
case REQ_OP_ZONE_RESET:
diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index 225abaad4d1c..504504beaa5e 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -1300,7 +1300,7 @@ sg_rq_end_io(struct request *rq, int uptodate)
pr_info("%s: device detaching\n", __func__);
sense = req->sense;
- result = rq->errors;
+ result = req->result;
resid = req->resid_len;
SCSI_LOG_TIMEOUT(4, sg_printk(KERN_INFO, sdp,
@@ -1718,7 +1718,7 @@ sg_start_req(Sg_request *srp, unsigned char *cmd)
srp->rq = rq;
rq->end_io_data = srp;
- rq->retries = SG_DEFAULT_RETRIES;
+ req->retries = SG_DEFAULT_RETRIES;
if ((dxfer_len <= 0) || (dxfer_dir == SG_DXFER_NONE))
return 0;
diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c
index e5ef78a6848e..1ea34d6f5437 100644
--- a/drivers/scsi/st.c
+++ b/drivers/scsi/st.c
@@ -480,7 +480,7 @@ static void st_do_stats(struct scsi_tape *STp, struct request *req)
atomic64_add(ktime_to_ns(now), &STp->stats->tot_write_time);
atomic64_add(ktime_to_ns(now), &STp->stats->tot_io_time);
atomic64_inc(&STp->stats->write_cnt);
- if (req->errors) {
+ if (scsi_req(req)->result) {
atomic64_add(atomic_read(&STp->stats->last_write_size)
- STp->buffer->cmdstat.residual,
&STp->stats->write_byte_cnt);
@@ -494,7 +494,7 @@ static void st_do_stats(struct scsi_tape *STp, struct request *req)
atomic64_add(ktime_to_ns(now), &STp->stats->tot_read_time);
atomic64_add(ktime_to_ns(now), &STp->stats->tot_io_time);
atomic64_inc(&STp->stats->read_cnt);
- if (req->errors) {
+ if (scsi_req(req)->result) {
atomic64_add(atomic_read(&STp->stats->last_read_size)
- STp->buffer->cmdstat.residual,
&STp->stats->read_byte_cnt);
@@ -518,7 +518,7 @@ static void st_scsi_execute_end(struct request *req, int uptodate)
struct scsi_tape *STp = SRpnt->stp;
struct bio *tmp;
- STp->buffer->cmdstat.midlevel_result = SRpnt->result = req->errors;
+ STp->buffer->cmdstat.midlevel_result = SRpnt->result = rq->result;
STp->buffer->cmdstat.residual = rq->resid_len;
st_do_stats(STp, req);
@@ -579,7 +579,7 @@ static int st_scsi_execute(struct st_request *SRpnt, const unsigned char *cmd,
memset(rq->cmd, 0, BLK_MAX_CDB);
memcpy(rq->cmd, cmd, rq->cmd_len);
req->timeout = timeout;
- req->retries = retries;
+ rq->retries = retries;
req->end_io_data = SRpnt;
blk_execute_rq_nowait(req->q, NULL, req, 1, st_scsi_execute_end);
diff --git a/drivers/staging/lustre/lustre/include/lustre_disk.h b/drivers/staging/lustre/lustre/include/lustre_disk.h
index 8886458748c1..a676bccabd43 100644
--- a/drivers/staging/lustre/lustre/include/lustre_disk.h
+++ b/drivers/staging/lustre/lustre/include/lustre_disk.h
@@ -133,13 +133,9 @@ struct lustre_sb_info {
struct obd_export *lsi_osd_exp;
char lsi_osd_type[16];
char lsi_fstype[16];
- struct backing_dev_info lsi_bdi; /* each client mountpoint needs
- * own backing_dev_info
- */
};
#define LSI_UMOUNT_FAILOVER 0x00200000
-#define LSI_BDI_INITIALIZED 0x00400000
#define s2lsi(sb) ((struct lustre_sb_info *)((sb)->s_fs_info))
#define s2lsi_nocast(sb) ((sb)->s_fs_info)
diff --git a/drivers/staging/lustre/lustre/llite/llite_lib.c b/drivers/staging/lustre/lustre/llite/llite_lib.c
index b229cbc7bb33..d483c44aafe5 100644
--- a/drivers/staging/lustre/lustre/llite/llite_lib.c
+++ b/drivers/staging/lustre/lustre/llite/llite_lib.c
@@ -863,15 +863,6 @@ void ll_lli_init(struct ll_inode_info *lli)
mutex_init(&lli->lli_layout_mutex);
}
-static inline int ll_bdi_register(struct backing_dev_info *bdi)
-{
- static atomic_t ll_bdi_num = ATOMIC_INIT(0);
-
- bdi->name = "lustre";
- return bdi_register(bdi, NULL, "lustre-%d",
- atomic_inc_return(&ll_bdi_num));
-}
-
int ll_fill_super(struct super_block *sb, struct vfsmount *mnt)
{
struct lustre_profile *lprof = NULL;
@@ -881,6 +872,7 @@ int ll_fill_super(struct super_block *sb, struct vfsmount *mnt)
char *profilenm = get_profile_name(sb);
struct config_llog_instance *cfg;
int err;
+ static atomic_t ll_bdi_num = ATOMIC_INIT(0);
CDEBUG(D_VFSTRACE, "VFS Op: sb %p\n", sb);
@@ -903,16 +895,11 @@ int ll_fill_super(struct super_block *sb, struct vfsmount *mnt)
if (err)
goto out_free;
- err = bdi_init(&lsi->lsi_bdi);
- if (err)
- goto out_free;
- lsi->lsi_flags |= LSI_BDI_INITIALIZED;
- lsi->lsi_bdi.capabilities = 0;
- err = ll_bdi_register(&lsi->lsi_bdi);
+ err = super_setup_bdi_name(sb, "lustre-%d",
+ atomic_inc_return(&ll_bdi_num));
if (err)
goto out_free;
- sb->s_bdi = &lsi->lsi_bdi;
/* kernel >= 2.6.38 store dentry operations in sb->s_d_op. */
sb->s_d_op = &ll_d_ops;
@@ -1033,11 +1020,6 @@ void ll_put_super(struct super_block *sb)
if (profilenm)
class_del_profile(profilenm);
- if (lsi->lsi_flags & LSI_BDI_INITIALIZED) {
- bdi_destroy(&lsi->lsi_bdi);
- lsi->lsi_flags &= ~LSI_BDI_INITIALIZED;
- }
-
ll_free_sbi(sb);
lsi->lsi_llsbi = NULL;
diff --git a/drivers/target/iscsi/iscsi_target_configfs.c b/drivers/target/iscsi/iscsi_target_configfs.c
index 344e8448869c..5798810197ec 100644
--- a/drivers/target/iscsi/iscsi_target_configfs.c
+++ b/drivers/target/iscsi/iscsi_target_configfs.c
@@ -167,10 +167,7 @@ static struct se_tpg_np *lio_target_call_addnptotpg(
struct iscsi_portal_group *tpg;
struct iscsi_tpg_np *tpg_np;
char *str, *str2, *ip_str, *port_str;
- struct sockaddr_storage sockaddr;
- struct sockaddr_in *sock_in;
- struct sockaddr_in6 *sock_in6;
- unsigned long port;
+ struct sockaddr_storage sockaddr = { };
int ret;
char buf[MAX_PORTAL_LEN + 1];
@@ -182,21 +179,19 @@ static struct se_tpg_np *lio_target_call_addnptotpg(
memset(buf, 0, MAX_PORTAL_LEN + 1);
snprintf(buf, MAX_PORTAL_LEN + 1, "%s", name);
- memset(&sockaddr, 0, sizeof(struct sockaddr_storage));
-
str = strstr(buf, "[");
if (str) {
- const char *end;
-
str2 = strstr(str, "]");
if (!str2) {
pr_err("Unable to locate trailing \"]\""
" in IPv6 iSCSI network portal address\n");
return ERR_PTR(-EINVAL);
}
- str++; /* Skip over leading "[" */
+
+ ip_str = str + 1; /* Skip over leading "[" */
*str2 = '\0'; /* Terminate the unbracketed IPv6 address */
str2++; /* Skip over the \0 */
+
port_str = strstr(str2, ":");
if (!port_str) {
pr_err("Unable to locate \":port\""
@@ -205,23 +200,8 @@ static struct se_tpg_np *lio_target_call_addnptotpg(
}
*port_str = '\0'; /* Terminate string for IP */
port_str++; /* Skip over ":" */
-
- ret = kstrtoul(port_str, 0, &port);
- if (ret < 0) {
- pr_err("kstrtoul() failed for port_str: %d\n", ret);
- return ERR_PTR(ret);
- }
- sock_in6 = (struct sockaddr_in6 *)&sockaddr;
- sock_in6->sin6_family = AF_INET6;
- sock_in6->sin6_port = htons((unsigned short)port);
- ret = in6_pton(str, -1,
- (void *)&sock_in6->sin6_addr.in6_u, -1, &end);
- if (ret <= 0) {
- pr_err("in6_pton returned: %d\n", ret);
- return ERR_PTR(-EINVAL);
- }
} else {
- str = ip_str = &buf[0];
+ ip_str = &buf[0];
port_str = strstr(ip_str, ":");
if (!port_str) {
pr_err("Unable to locate \":port\""
@@ -230,17 +210,15 @@ static struct se_tpg_np *lio_target_call_addnptotpg(
}
*port_str = '\0'; /* Terminate string for IP */
port_str++; /* Skip over ":" */
+ }
- ret = kstrtoul(port_str, 0, &port);
- if (ret < 0) {
- pr_err("kstrtoul() failed for port_str: %d\n", ret);
- return ERR_PTR(ret);
- }
- sock_in = (struct sockaddr_in *)&sockaddr;
- sock_in->sin_family = AF_INET;
- sock_in->sin_port = htons((unsigned short)port);
- sock_in->sin_addr.s_addr = in_aton(ip_str);
+ ret = inet_pton_with_scope(&init_net, AF_UNSPEC, ip_str,
+ port_str, &sockaddr);
+ if (ret) {
+ pr_err("malformed ip/port passed: %s\n", name);
+ return ERR_PTR(ret);
}
+
tpg = container_of(se_tpg, struct iscsi_portal_group, tpg_se_tpg);
ret = iscsit_get_tpg(tpg);
if (ret < 0)
diff --git a/drivers/target/target_core_device.c b/drivers/target/target_core_device.c
index c754ae33bf7b..d2f089cfa9ae 100644
--- a/drivers/target/target_core_device.c
+++ b/drivers/target/target_core_device.c
@@ -851,7 +851,7 @@ bool target_configure_unmap_from_queue(struct se_dev_attrib *attrib,
attrib->unmap_granularity = q->limits.discard_granularity / block_size;
attrib->unmap_granularity_alignment = q->limits.discard_alignment /
block_size;
- attrib->unmap_zeroes_data = q->limits.discard_zeroes_data;
+ attrib->unmap_zeroes_data = 0;
return true;
}
EXPORT_SYMBOL(target_configure_unmap_from_queue);
diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c
index 94cda7991e80..a93d94e68ab5 100644
--- a/drivers/target/target_core_pscsi.c
+++ b/drivers/target/target_core_pscsi.c
@@ -1008,7 +1008,7 @@ pscsi_execute_cmd(struct se_cmd *cmd)
req->timeout = PS_TIMEOUT_DISK;
else
req->timeout = PS_TIMEOUT_OTHER;
- req->retries = PS_RETRY;
+ scsi_req(req)->retries = PS_RETRY;
blk_execute_rq_nowait(pdv->pdv_sd->request_queue, NULL, req,
(cmd->sam_task_attr == TCM_HEAD_TAG),
@@ -1050,7 +1050,7 @@ static void pscsi_req_done(struct request *req, int uptodate)
struct se_cmd *cmd = req->end_io_data;
struct pscsi_plugin_task *pt = cmd->priv;
- pt->pscsi_result = req->errors;
+ pt->pscsi_result = scsi_req(req)->result;
pt->pscsi_resid = scsi_req(req)->resid_len;
cmd->scsi_status = status_byte(pt->pscsi_result) << 1;
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index a89f3cfe3c7d..c202930086ed 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -333,10 +333,6 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
goto err_names;
init_rwsem(&v9ses->rename_sem);
- rc = bdi_setup_and_register(&v9ses->bdi, "9p");
- if (rc)
- goto err_names;
-
v9ses->uid = INVALID_UID;
v9ses->dfltuid = V9FS_DEFUID;
v9ses->dfltgid = V9FS_DEFGID;
@@ -345,7 +341,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
if (IS_ERR(v9ses->clnt)) {
rc = PTR_ERR(v9ses->clnt);
p9_debug(P9_DEBUG_ERROR, "problem initializing 9p client\n");
- goto err_bdi;
+ goto err_names;
}
v9ses->flags = V9FS_ACCESS_USER;
@@ -415,8 +411,6 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
err_clnt:
p9_client_destroy(v9ses->clnt);
-err_bdi:
- bdi_destroy(&v9ses->bdi);
err_names:
kfree(v9ses->uname);
kfree(v9ses->aname);
@@ -445,8 +439,6 @@ void v9fs_session_close(struct v9fs_session_info *v9ses)
kfree(v9ses->uname);
kfree(v9ses->aname);
- bdi_destroy(&v9ses->bdi);
-
spin_lock(&v9fs_sessionlist_lock);
list_del(&v9ses->slist);
spin_unlock(&v9fs_sessionlist_lock);
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index 443d12e02043..76eaf49abd3a 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -114,7 +114,6 @@ struct v9fs_session_info {
kuid_t uid; /* if ACCESS_SINGLE, the uid that has access */
struct p9_client *clnt; /* 9p client */
struct list_head slist; /* list of sessions registered with v9fs */
- struct backing_dev_info bdi;
struct rw_semaphore rename_sem;
};
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index de3ed8629196..a0965fb587a5 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -72,10 +72,12 @@ static int v9fs_set_super(struct super_block *s, void *data)
*
*/
-static void
+static int
v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
int flags, void *data)
{
+ int ret;
+
sb->s_maxbytes = MAX_LFS_FILESIZE;
sb->s_blocksize_bits = fls(v9ses->maxdata - 1);
sb->s_blocksize = 1 << sb->s_blocksize_bits;
@@ -85,7 +87,11 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
sb->s_xattr = v9fs_xattr_handlers;
} else
sb->s_op = &v9fs_super_ops;
- sb->s_bdi = &v9ses->bdi;
+
+ ret = super_setup_bdi(sb);
+ if (ret)
+ return ret;
+
if (v9ses->cache)
sb->s_bdi->ra_pages = (VM_MAX_READAHEAD * 1024)/PAGE_SIZE;
@@ -99,6 +105,7 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
#endif
save_mount_options(sb, data);
+ return 0;
}
/**
@@ -138,7 +145,9 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
retval = PTR_ERR(sb);
goto clunk_fid;
}
- v9fs_fill_super(sb, v9ses, flags, data);
+ retval = v9fs_fill_super(sb, v9ses, flags, data);
+ if (retval)
+ goto release_sb;
if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
sb->s_d_op = &v9fs_cached_dentry_operations;
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index a6901360fb81..393672997cc2 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -318,7 +318,6 @@ struct afs_volume {
unsigned short rjservers; /* number of servers discarded due to -ENOMEDIUM */
struct afs_server *servers[8]; /* servers on which volume resides (ordered) */
struct rw_semaphore server_sem; /* lock for accessing current server */
- struct backing_dev_info bdi;
};
/*
diff --git a/fs/afs/super.c b/fs/afs/super.c
index fbdb022b75a2..c79633e5cfd8 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -319,7 +319,10 @@ static int afs_fill_super(struct super_block *sb,
sb->s_blocksize_bits = PAGE_SHIFT;
sb->s_magic = AFS_FS_MAGIC;
sb->s_op = &afs_super_ops;
- sb->s_bdi = &as->volume->bdi;
+ ret = super_setup_bdi(sb);
+ if (ret)
+ return ret;
+ sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
strlcpy(sb->s_id, as->volume->vlocation->vldb.name, sizeof(sb->s_id));
/* allocate the root inode and dentry */
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index 546f9d01710b..db73d6dad02b 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -106,11 +106,6 @@ struct afs_volume *afs_volume_lookup(struct afs_mount_params *params)
volume->cell = params->cell;
volume->vid = vlocation->vldb.vid[params->type];
- volume->bdi.ra_pages = VM_MAX_READAHEAD*1024/PAGE_SIZE;
- ret = bdi_setup_and_register(&volume->bdi, "afs");
- if (ret)
- goto error_bdi;
-
init_rwsem(&volume->server_sem);
/* look up all the applicable server records */
@@ -156,8 +151,6 @@ error:
return ERR_PTR(ret);
error_discard:
- bdi_destroy(&volume->bdi);
-error_bdi:
up_write(&params->cell->vl_sem);
for (loop = volume->nservers - 1; loop >= 0; loop--)
@@ -207,7 +200,6 @@ void afs_put_volume(struct afs_volume *volume)
for (loop = volume->nservers - 1; loop >= 0; loop--)
afs_put_server(volume->servers[loop]);
- bdi_destroy(&volume->bdi);
kfree(volume);
_leave(" [destroyed]");
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 2eca00ec4370..9ccabe3bb7de 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -885,6 +885,8 @@ static void bdev_evict_inode(struct inode *inode)
spin_lock(&bdev_lock);
list_del_init(&bdev->bd_list);
spin_unlock(&bdev_lock);
+ /* Detach inode from wb early as bdi_put() may free bdi->wb */
+ inode_detach_wb(inode);
if (bdev->bd_bdi != &noop_backing_dev_info) {
bdi_put(bdev->bd_bdi);
bdev->bd_bdi = &noop_backing_dev_info;
@@ -1451,7 +1453,6 @@ int revalidate_disk(struct gendisk *disk)
if (disk->fops->revalidate_disk)
ret = disk->fops->revalidate_disk(disk);
- blk_integrity_revalidate(disk);
bdev = bdget_disk(disk, 0);
if (!bdev)
return ret;
@@ -1556,8 +1557,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
bdev->bd_disk = disk;
bdev->bd_queue = disk->queue;
bdev->bd_contains = bdev;
- if (bdev->bd_bdi == &noop_backing_dev_info)
- bdev->bd_bdi = bdi_get(disk->queue->backing_dev_info);
if (!partno) {
ret = -ENXIO;
@@ -1622,6 +1621,9 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
}
bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
}
+
+ if (bdev->bd_bdi == &noop_backing_dev_info)
+ bdev->bd_bdi = bdi_get(disk->queue->backing_dev_info);
} else {
if (bdev->bd_contains == bdev) {
ret = 0;
@@ -1653,8 +1655,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
bdev->bd_disk = NULL;
bdev->bd_part = NULL;
bdev->bd_queue = NULL;
- bdi_put(bdev->bd_bdi);
- bdev->bd_bdi = &noop_backing_dev_info;
if (bdev != bdev->bd_contains)
__blkdev_put(bdev->bd_contains, mode, 1);
bdev->bd_contains = NULL;
@@ -1876,12 +1876,6 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
kill_bdev(bdev);
bdev_write_inode(bdev);
- /*
- * Detaching bdev inode from its wb in __destroy_inode()
- * is too late: the queue which embeds its bdi (along with
- * root wb) can be gone as soon as we put_disk() below.
- */
- inode_detach_wb(bdev->bd_inode);
}
if (bdev->bd_contains == bdev) {
if (disk->fops->release)
@@ -2074,7 +2068,6 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
loff_t len)
{
struct block_device *bdev = I_BDEV(bdev_file_inode(file));
- struct request_queue *q = bdev_get_queue(bdev);
struct address_space *mapping;
loff_t end = start + len - 1;
loff_t isize;
@@ -2110,18 +2103,13 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
case FALLOC_FL_ZERO_RANGE:
case FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE:
error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9,
- GFP_KERNEL, false);
+ GFP_KERNEL, BLKDEV_ZERO_NOUNMAP);
break;
case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE:
- /* Only punch if the device can do zeroing discard. */
- if (!blk_queue_discard(q) || !q->limits.discard_zeroes_data)
- return -EOPNOTSUPP;
- error = blkdev_issue_discard(bdev, start >> 9, len >> 9,
- GFP_KERNEL, 0);
+ error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9,
+ GFP_KERNEL, BLKDEV_ZERO_NOFALLBACK);
break;
case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE | FALLOC_FL_NO_HIDE_STALE:
- if (!blk_queue_discard(q))
- return -EOPNOTSUPP;
error = blkdev_issue_discard(bdev, start >> 9, len >> 9,
GFP_KERNEL, 0);
break;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index c4115901d906..3e21211e99c3 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -810,7 +810,6 @@ struct btrfs_fs_info {
struct btrfs_super_block *super_for_commit;
struct super_block *sb;
struct inode *btree_inode;
- struct backing_dev_info bdi;
struct mutex tree_log_mutex;
struct mutex transaction_kthread_mutex;
struct mutex cleaner_mutex;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index eb1ee7b6f532..061c1d1f774f 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1808,21 +1808,6 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
return ret;
}
-static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
-{
- int err;
-
- err = bdi_setup_and_register(bdi, "btrfs");
- if (err)
- return err;
-
- bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
- bdi->congested_fn = btrfs_congested_fn;
- bdi->congested_data = info;
- bdi->capabilities |= BDI_CAP_CGROUP_WRITEBACK;
- return 0;
-}
-
/*
* called by the kthread helper functions to finally call the bio end_io
* functions. This is where read checksum verification actually happens
@@ -2601,16 +2586,10 @@ int open_ctree(struct super_block *sb,
goto fail;
}
- ret = setup_bdi(fs_info, &fs_info->bdi);
- if (ret) {
- err = ret;
- goto fail_srcu;
- }
-
ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL);
if (ret) {
err = ret;
- goto fail_bdi;
+ goto fail_srcu;
}
fs_info->dirty_metadata_batch = PAGE_SIZE *
(1 + ilog2(nr_cpu_ids));
@@ -2718,7 +2697,6 @@ int open_ctree(struct super_block *sb,
sb->s_blocksize = 4096;
sb->s_blocksize_bits = blksize_bits(4096);
- sb->s_bdi = &fs_info->bdi;
btrfs_init_btree_inode(fs_info);
@@ -2915,9 +2893,12 @@ int open_ctree(struct super_block *sb,
goto fail_sb_buffer;
}
- fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
- fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
- SZ_4M / PAGE_SIZE);
+ sb->s_bdi->congested_fn = btrfs_congested_fn;
+ sb->s_bdi->congested_data = fs_info;
+ sb->s_bdi->capabilities |= BDI_CAP_CGROUP_WRITEBACK;
+ sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
+ sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super);
+ sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE);
sb->s_blocksize = sectorsize;
sb->s_blocksize_bits = blksize_bits(sectorsize);
@@ -3285,8 +3266,6 @@ fail_delalloc_bytes:
percpu_counter_destroy(&fs_info->delalloc_bytes);
fail_dirty_metadata_bytes:
percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
-fail_bdi:
- bdi_destroy(&fs_info->bdi);
fail_srcu:
cleanup_srcu_struct(&fs_info->subvol_srcu);
fail:
@@ -4007,7 +3986,6 @@ void close_ctree(struct btrfs_fs_info *fs_info)
percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
percpu_counter_destroy(&fs_info->delalloc_bytes);
percpu_counter_destroy(&fs_info->bio_counter);
- bdi_destroy(&fs_info->bdi);
cleanup_srcu_struct(&fs_info->subvol_srcu);
btrfs_free_stripe_hash_table(fs_info);
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index a59801dc2a34..afbea61d957e 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1042,9 +1042,12 @@ static void report_reserved_underflow(struct btrfs_fs_info *fs_info,
struct btrfs_qgroup *qgroup,
u64 num_bytes)
{
- btrfs_warn(fs_info,
+#ifdef CONFIG_BTRFS_DEBUG
+ WARN_ON(qgroup->reserved < num_bytes);
+ btrfs_debug(fs_info,
"qgroup %llu reserved space underflow, have: %llu, to free: %llu",
qgroup->qgroupid, qgroup->reserved, num_bytes);
+#endif
qgroup->reserved = 0;
}
/*
@@ -1075,7 +1078,7 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
qgroup->excl += sign * num_bytes;
qgroup->excl_cmpr += sign * num_bytes;
if (sign > 0) {
- if (WARN_ON(qgroup->reserved < num_bytes))
+ if (qgroup->reserved < num_bytes)
report_reserved_underflow(fs_info, qgroup, num_bytes);
else
qgroup->reserved -= num_bytes;
@@ -1100,7 +1103,7 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
WARN_ON(sign < 0 && qgroup->excl < num_bytes);
qgroup->excl += sign * num_bytes;
if (sign > 0) {
- if (WARN_ON(qgroup->reserved < num_bytes))
+ if (qgroup->reserved < num_bytes)
report_reserved_underflow(fs_info, qgroup,
num_bytes);
else
@@ -2469,7 +2472,7 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
qg = unode_aux_to_qgroup(unode);
- if (WARN_ON(qg->reserved < num_bytes))
+ if (qg->reserved < num_bytes)
report_reserved_underflow(fs_info, qg, num_bytes);
else
qg->reserved -= num_bytes;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 9530a333d302..72a053c9a7f0 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1136,6 +1136,13 @@ static int btrfs_fill_super(struct super_block *sb,
#endif
sb->s_flags |= MS_I_VERSION;
sb->s_iflags |= SB_I_CGROUPWB;
+
+ err = super_setup_bdi(sb);
+ if (err) {
+ btrfs_err(fs_info, "super_setup_bdi failed");
+ return err;
+ }
+
err = open_ctree(sb, fs_devices, (char *)data);
if (err) {
btrfs_err(fs_info, "open_ctree failed");
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 1a3e1b40799a..9ecb2fd348cb 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -578,7 +578,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
writeback_stat = atomic_long_inc_return(&fsc->writeback_count);
if (writeback_stat >
CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb))
- set_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC);
+ set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC);
set_page_writeback(page);
err = ceph_osdc_writepages(osdc, ceph_vino(inode),
@@ -700,7 +700,7 @@ static void writepages_finish(struct ceph_osd_request *req)
if (atomic_long_dec_return(&fsc->writeback_count) <
CONGESTION_OFF_THRESH(
fsc->mount_options->congestion_kb))
- clear_bdi_congested(&fsc->backing_dev_info,
+ clear_bdi_congested(inode_to_bdi(inode),
BLK_RW_ASYNC);
if (rc < 0)
@@ -979,7 +979,7 @@ get_more_pages:
if (atomic_long_inc_return(&fsc->writeback_count) >
CONGESTION_ON_THRESH(
fsc->mount_options->congestion_kb)) {
- set_bdi_congested(&fsc->backing_dev_info,
+ set_bdi_congested(inode_to_bdi(inode),
BLK_RW_ASYNC);
}
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index f2ae393e2c31..3ef11bc8d728 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -251,7 +251,7 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
goto out;
snprintf(name, sizeof(name), "../../bdi/%s",
- dev_name(fsc->backing_dev_info.dev));
+ dev_name(fsc->sb->s_bdi->dev));
fsc->debugfs_bdi =
debugfs_create_symlink("bdi",
fsc->client->debugfs_dir,
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index d449e1c03cbd..d3119fe3ab45 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -2071,11 +2071,6 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr)
if (inode_dirty_flags)
__mark_inode_dirty(inode, inode_dirty_flags);
- if (ia_valid & ATTR_MODE) {
- err = posix_acl_chmod(inode, attr->ia_mode);
- if (err)
- goto out_put;
- }
if (mask) {
req->r_inode = inode;
@@ -2089,13 +2084,11 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr)
ceph_cap_string(dirtied), mask);
ceph_mdsc_put_request(req);
- if (mask & CEPH_SETATTR_SIZE)
- __ceph_do_pending_vmtruncate(inode);
- ceph_free_cap_flush(prealloc_cf);
- return err;
-out_put:
- ceph_mdsc_put_request(req);
ceph_free_cap_flush(prealloc_cf);
+
+ if (err >= 0 && (mask & CEPH_SETATTR_SIZE))
+ __ceph_do_pending_vmtruncate(inode);
+
return err;
}
@@ -2114,7 +2107,12 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
if (err != 0)
return err;
- return __ceph_setattr(inode, attr);
+ err = __ceph_setattr(inode, attr);
+
+ if (err >= 0 && (attr->ia_valid & ATTR_MODE))
+ err = posix_acl_chmod(inode, attr->ia_mode);
+
+ return err;
}
/*
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 0ec8d0114e57..a8c81b2052ca 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -579,10 +579,6 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
atomic_long_set(&fsc->writeback_count, 0);
- err = bdi_init(&fsc->backing_dev_info);
- if (err < 0)
- goto fail_client;
-
err = -ENOMEM;
/*
* The number of concurrent works can be high but they don't need
@@ -590,7 +586,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
*/
fsc->wb_wq = alloc_workqueue("ceph-writeback", 0, 1);
if (fsc->wb_wq == NULL)
- goto fail_bdi;
+ goto fail_client;
fsc->pg_inv_wq = alloc_workqueue("ceph-pg-invalid", 0, 1);
if (fsc->pg_inv_wq == NULL)
goto fail_wb_wq;
@@ -624,8 +620,6 @@ fail_pg_inv_wq:
destroy_workqueue(fsc->pg_inv_wq);
fail_wb_wq:
destroy_workqueue(fsc->wb_wq);
-fail_bdi:
- bdi_destroy(&fsc->backing_dev_info);
fail_client:
ceph_destroy_client(fsc->client);
fail:
@@ -643,8 +637,6 @@ static void destroy_fs_client(struct ceph_fs_client *fsc)
destroy_workqueue(fsc->pg_inv_wq);
destroy_workqueue(fsc->trunc_wq);
- bdi_destroy(&fsc->backing_dev_info);
-
mempool_destroy(fsc->wb_pagevec_pool);
destroy_mount_options(fsc->mount_options);
@@ -937,33 +929,32 @@ static int ceph_compare_super(struct super_block *sb, void *data)
*/
static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
-static int ceph_register_bdi(struct super_block *sb,
- struct ceph_fs_client *fsc)
+static int ceph_setup_bdi(struct super_block *sb, struct ceph_fs_client *fsc)
{
int err;
+ err = super_setup_bdi_name(sb, "ceph-%ld",
+ atomic_long_inc_return(&bdi_seq));
+ if (err)
+ return err;
+
/* set ra_pages based on rasize mount option? */
if (fsc->mount_options->rasize >= PAGE_SIZE)
- fsc->backing_dev_info.ra_pages =
+ sb->s_bdi->ra_pages =
(fsc->mount_options->rasize + PAGE_SIZE - 1)
>> PAGE_SHIFT;
else
- fsc->backing_dev_info.ra_pages =
- VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
+ sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
if (fsc->mount_options->rsize > fsc->mount_options->rasize &&
fsc->mount_options->rsize >= PAGE_SIZE)
- fsc->backing_dev_info.io_pages =
+ sb->s_bdi->io_pages =
(fsc->mount_options->rsize + PAGE_SIZE - 1)
>> PAGE_SHIFT;
else if (fsc->mount_options->rsize == 0)
- fsc->backing_dev_info.io_pages = ULONG_MAX;
+ sb->s_bdi->io_pages = ULONG_MAX;
- err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%ld",
- atomic_long_inc_return(&bdi_seq));
- if (!err)
- sb->s_bdi = &fsc->backing_dev_info;
- return err;
+ return 0;
}
static struct dentry *ceph_mount(struct file_system_type *fs_type,
@@ -1018,7 +1009,7 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type,
dout("get_sb got existing client %p\n", fsc);
} else {
dout("get_sb using new client %p\n", fsc);
- err = ceph_register_bdi(sb, fsc);
+ err = ceph_setup_bdi(sb, fsc);
if (err < 0) {
res = ERR_PTR(err);
goto out_splat;
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index fe6b9cfc4013..176186b12457 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -92,8 +92,6 @@ struct ceph_fs_client {
struct workqueue_struct *trunc_wq;
atomic_long_t writeback_count;
- struct backing_dev_info backing_dev_info;
-
#ifdef CONFIG_DEBUG_FS
struct dentry *debugfs_dentry_lru, *debugfs_caps;
struct dentry *debugfs_congestion_kb;
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index 07ed81cf1552..cbd216b57239 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -68,7 +68,6 @@ struct cifs_sb_info {
umode_t mnt_dir_mode;
unsigned int mnt_cifs_flags;
char *mountdata; /* options received at mount time or via DFS refs */
- struct backing_dev_info bdi;
struct delayed_work prune_tlinks;
struct rcu_head rcu;
char *prepath;
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index dd3f5fabfdf6..34fee9fb7e4f 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -138,7 +138,12 @@ cifs_read_super(struct super_block *sb)
sb->s_magic = CIFS_MAGIC_NUMBER;
sb->s_op = &cifs_super_ops;
sb->s_xattr = cifs_xattr_handlers;
- sb->s_bdi = &cifs_sb->bdi;
+ rc = super_setup_bdi(sb);
+ if (rc)
+ goto out_no_root;
+ /* tune readahead according to rsize */
+ sb->s_bdi->ra_pages = cifs_sb->rsize / PAGE_SIZE;
+
sb->s_blocksize = CIFS_MAX_MSGSIZE;
sb->s_blocksize_bits = 14; /* default 2**14 = CIFS_MAX_MSGSIZE */
inode = cifs_root_iget(sb);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index d82467cfb0e2..b3c9d8c310f2 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -3692,10 +3692,6 @@ cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *volume_info)
int referral_walks_count = 0;
#endif
- rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs");
- if (rc)
- return rc;
-
#ifdef CONFIG_CIFS_DFS_UPCALL
try_mount_again:
/* cleanup activities if we're chasing a referral */
@@ -3723,7 +3719,6 @@ try_mount_again:
server = cifs_get_tcp_session(volume_info);
if (IS_ERR(server)) {
rc = PTR_ERR(server);
- bdi_destroy(&cifs_sb->bdi);
goto out;
}
if ((volume_info->max_credits < 20) ||
@@ -3780,9 +3775,6 @@ try_mount_again:
cifs_sb->wsize = server->ops->negotiate_wsize(tcon, volume_info);
cifs_sb->rsize = server->ops->negotiate_rsize(tcon, volume_info);
- /* tune readahead according to rsize */
- cifs_sb->bdi.ra_pages = cifs_sb->rsize / PAGE_SIZE;
-
remote_path_check:
#ifdef CONFIG_CIFS_DFS_UPCALL
/*
@@ -3899,7 +3891,6 @@ mount_fail_check:
cifs_put_smb_ses(ses);
else
cifs_put_tcp_session(server, 0);
- bdi_destroy(&cifs_sb->bdi);
}
out:
@@ -4102,7 +4093,6 @@ cifs_umount(struct cifs_sb_info *cifs_sb)
}
spin_unlock(&cifs_sb->tlink_tree_lock);
- bdi_destroy(&cifs_sb->bdi);
kfree(cifs_sb->mountdata);
kfree(cifs_sb->prepath);
call_rcu(&cifs_sb->rcu, delayed_free);
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 2dea594da199..6058df380cc0 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -183,10 +183,6 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
goto unlock_out;
}
- error = bdi_setup_and_register(&vc->bdi, "coda");
- if (error)
- goto unlock_out;
-
vc->vc_sb = sb;
mutex_unlock(&vc->vc_mutex);
@@ -197,7 +193,10 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
sb->s_magic = CODA_SUPER_MAGIC;
sb->s_op = &coda_super_operations;
sb->s_d_op = &coda_dentry_operations;
- sb->s_bdi = &vc->bdi;
+
+ error = super_setup_bdi(sb);
+ if (error)
+ goto error;
/* get root fid from Venus: this needs the root inode */
error = venus_rootfid(sb, &fid);
@@ -228,7 +227,6 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
error:
mutex_lock(&vc->vc_mutex);
- bdi_destroy(&vc->bdi);
vc->vc_sb = NULL;
sb->s_fs_info = NULL;
unlock_out:
@@ -240,7 +238,6 @@ static void coda_put_super(struct super_block *sb)
{
struct venus_comm *vcp = coda_vcp(sb);
mutex_lock(&vcp->vc_mutex);
- bdi_destroy(&vcp->bdi);
vcp->vc_sb = NULL;
sb->s_fs_info = NULL;
mutex_unlock(&vcp->vc_mutex);
diff --git a/fs/dax.c b/fs/dax.c
index 85abd741253d..6433650be833 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -991,7 +991,7 @@ int __dax_zero_page_range(struct block_device *bdev, sector_t sector,
sector_t start_sector = dax.sector + (offset >> 9);
return blkdev_issue_zeroout(bdev, start_sector,
- length >> 9, GFP_NOFS, true);
+ length >> 9, GFP_NOFS, 0);
} else {
if (dax_map_atomic(bdev, &dax) < 0)
return PTR_ERR(dax.addr);
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 95c1c8d34539..9c351bf757b2 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -349,7 +349,6 @@ struct ecryptfs_mount_crypt_stat {
struct ecryptfs_sb_info {
struct super_block *wsi_sb;
struct ecryptfs_mount_crypt_stat mount_crypt_stat;
- struct backing_dev_info bdi;
};
/* file private data. */
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 151872dcc1f4..9014479d0160 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -519,12 +519,11 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags
goto out;
}
- rc = bdi_setup_and_register(&sbi->bdi, "ecryptfs");
+ rc = super_setup_bdi(s);
if (rc)
goto out1;
ecryptfs_set_superblock_private(s, sbi);
- s->s_bdi = &sbi->bdi;
/* ->kill_sb() will take care of sbi after that point */
sbi = NULL;
@@ -633,7 +632,6 @@ static void ecryptfs_kill_block_super(struct super_block *sb)
if (!sb_info)
return;
ecryptfs_destroy_mount_crypt_stat(&sb_info->mount_crypt_stat);
- bdi_destroy(&sb_info->bdi);
kmem_cache_free(ecryptfs_sb_info_cache, sb_info);
}
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index 2e86086bc940..5dc392404559 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -64,7 +64,6 @@ struct exofs_dev {
* our extension to the in-memory superblock
*/
struct exofs_sb_info {
- struct backing_dev_info bdi; /* register our bdi with VFS */
struct exofs_sb_stats s_ess; /* Written often, pre-allocate*/
int s_timeout; /* timeout for OSD operations */
uint64_t s_nextid; /* highest object ID used */
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 1076a4233b39..819624cfc8da 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -464,7 +464,6 @@ static void exofs_put_super(struct super_block *sb)
sbi->one_comp.obj.partition);
exofs_sysfs_sb_del(sbi);
- bdi_destroy(&sbi->bdi);
exofs_free_sbi(sbi);
sb->s_fs_info = NULL;
}
@@ -809,8 +808,12 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
__sbi_read_stats(sbi);
/* set up operation vectors */
- sbi->bdi.ra_pages = __ra_pages(&sbi->layout);
- sb->s_bdi = &sbi->bdi;
+ ret = super_setup_bdi(sb);
+ if (ret) {
+ EXOFS_DBGMSG("Failed to super_setup_bdi\n");
+ goto free_sbi;
+ }
+ sb->s_bdi->ra_pages = __ra_pages(&sbi->layout);
sb->s_fs_info = sbi;
sb->s_op = &exofs_sops;
sb->s_export_op = &exofs_export_ops;
@@ -836,14 +839,6 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
goto free_sbi;
}
- ret = bdi_setup_and_register(&sbi->bdi, "exofs");
- if (ret) {
- EXOFS_DBGMSG("Failed to bdi_setup_and_register\n");
- dput(sb->s_root);
- sb->s_root = NULL;
- goto free_sbi;
- }
-
exofs_sysfs_dbg_print();
_exofs_print_device("Mounting", opts->dev_name,
ore_comp_dev(&sbi->oc, 0),
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index b681b43c766e..c2d7f3a92679 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -382,9 +382,9 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req)
wake_up(&fc->blocked_waitq);
if (fc->num_background == fc->congestion_threshold &&
- fc->connected && fc->bdi_initialized) {
- clear_bdi_congested(&fc->bdi, BLK_RW_SYNC);
- clear_bdi_congested(&fc->bdi, BLK_RW_ASYNC);
+ fc->connected && fc->sb) {
+ clear_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC);
+ clear_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC);
}
fc->num_background--;
fc->active_background--;
@@ -573,10 +573,9 @@ void fuse_request_send_background_locked(struct fuse_conn *fc,
fc->num_background++;
if (fc->num_background == fc->max_background)
fc->blocked = 1;
- if (fc->num_background == fc->congestion_threshold &&
- fc->bdi_initialized) {
- set_bdi_congested(&fc->bdi, BLK_RW_SYNC);
- set_bdi_congested(&fc->bdi, BLK_RW_ASYNC);
+ if (fc->num_background == fc->congestion_threshold && fc->sb) {
+ set_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC);
+ set_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC);
}
list_add_tail(&req->list, &fc->bg_queue);
flush_bg_queue(fc);
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 32ac2c9b09c0..f33341d9501a 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -527,9 +527,6 @@ struct fuse_conn {
/** Filesystem supports NFS exporting. Only set in INIT */
unsigned export_support:1;
- /** Set if bdi is valid */
- unsigned bdi_initialized:1;
-
/** write-back cache policy (default is write-through) */
unsigned writeback_cache:1;
@@ -631,9 +628,6 @@ struct fuse_conn {
/** Negotiated minor version */
unsigned minor;
- /** Backing dev info */
- struct backing_dev_info bdi;
-
/** Entry on the fuse_conn_list */
struct list_head entry;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 6fe6a88ecb4a..73cf05135252 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -386,12 +386,6 @@ static void fuse_send_destroy(struct fuse_conn *fc)
}
}
-static void fuse_bdi_destroy(struct fuse_conn *fc)
-{
- if (fc->bdi_initialized)
- bdi_destroy(&fc->bdi);
-}
-
static void fuse_put_super(struct super_block *sb)
{
struct fuse_conn *fc = get_fuse_conn_super(sb);
@@ -403,7 +397,6 @@ static void fuse_put_super(struct super_block *sb)
list_del(&fc->entry);
fuse_ctl_remove_conn(fc);
mutex_unlock(&fuse_mutex);
- fuse_bdi_destroy(fc);
fuse_conn_put(fc);
}
@@ -928,7 +921,8 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
fc->no_flock = 1;
}
- fc->bdi.ra_pages = min(fc->bdi.ra_pages, ra_pages);
+ fc->sb->s_bdi->ra_pages =
+ min(fc->sb->s_bdi->ra_pages, ra_pages);
fc->minor = arg->minor;
fc->max_write = arg->minor < 5 ? 4096 : arg->max_write;
fc->max_write = max_t(unsigned, 4096, fc->max_write);
@@ -944,7 +938,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
arg->major = FUSE_KERNEL_VERSION;
arg->minor = FUSE_KERNEL_MINOR_VERSION;
- arg->max_readahead = fc->bdi.ra_pages * PAGE_SIZE;
+ arg->max_readahead = fc->sb->s_bdi->ra_pages * PAGE_SIZE;
arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC |
FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK |
FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ |
@@ -976,27 +970,18 @@ static void fuse_free_conn(struct fuse_conn *fc)
static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
{
int err;
+ char *suffix = "";
- fc->bdi.name = "fuse";
- fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_SIZE;
- /* fuse does it's own writeback accounting */
- fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB | BDI_CAP_STRICTLIMIT;
-
- err = bdi_init(&fc->bdi);
+ if (sb->s_bdev)
+ suffix = "-fuseblk";
+ err = super_setup_bdi_name(sb, "%u:%u%s", MAJOR(fc->dev),
+ MINOR(fc->dev), suffix);
if (err)
return err;
- fc->bdi_initialized = 1;
-
- if (sb->s_bdev) {
- err = bdi_register(&fc->bdi, NULL, "%u:%u-fuseblk",
- MAJOR(fc->dev), MINOR(fc->dev));
- } else {
- err = bdi_register_dev(&fc->bdi, fc->dev);
- }
-
- if (err)
- return err;
+ sb->s_bdi->ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_SIZE;
+ /* fuse does it's own writeback accounting */
+ sb->s_bdi->capabilities = BDI_CAP_NO_ACCT_WB | BDI_CAP_STRICTLIMIT;
/*
* For a single fuse filesystem use max 1% of dirty +
@@ -1010,7 +995,7 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
*
* /sys/class/bdi/<bdi>/max_ratio
*/
- bdi_set_max_ratio(&fc->bdi, 1);
+ bdi_set_max_ratio(sb->s_bdi, 1);
return 0;
}
@@ -1113,8 +1098,6 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
if (err)
goto err_dev_free;
- sb->s_bdi = &fc->bdi;
-
/* Handle umasking inside the fuse code */
if (sb->s_flags & MS_POSIXACL)
fc->dont_mask = 1;
@@ -1182,7 +1165,6 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
err_dev_free:
fuse_dev_free(fud);
err_put_conn:
- fuse_bdi_destroy(fc);
fuse_conn_put(fc);
err_fput:
fput(file);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index b108e7ba81af..ed67548b286c 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -23,6 +23,7 @@
#include <linux/quotaops.h>
#include <linux/lockdep.h>
#include <linux/module.h>
+#include <linux/backing-dev.h>
#include "gfs2.h"
#include "incore.h"
@@ -1222,12 +1223,7 @@ static int set_gfs2_super(struct super_block *s, void *data)
{
s->s_bdev = data;
s->s_dev = s->s_bdev->bd_dev;
-
- /*
- * We set the bdi here to the queue backing, file systems can
- * overwrite this in ->fill_super()
- */
- s->s_bdi = bdev_get_queue(s->s_bdev)->backing_dev_info;
+ s->s_bdi = bdi_get(s->s_bdev->bd_bdi);
return 0;
}
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index d5606099712a..6d0f14c86099 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -554,12 +554,11 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
sb->s_magic = NCP_SUPER_MAGIC;
sb->s_op = &ncp_sops;
sb->s_d_op = &ncp_dentry_operations;
- sb->s_bdi = &server->bdi;
server = NCP_SBP(sb);
memset(server, 0, sizeof(*server));
- error = bdi_setup_and_register(&server->bdi, "ncpfs");
+ error = super_setup_bdi(sb);
if (error)
goto out_fput;
@@ -568,7 +567,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
if (data.info_fd != -1) {
struct socket *info_sock = sockfd_lookup(data.info_fd, &error);
if (!info_sock)
- goto out_bdi;
+ goto out_fput;
server->info_sock = info_sock;
error = -EBADFD;
if (info_sock->type != SOCK_STREAM)
@@ -746,8 +745,6 @@ out_nls:
out_fput2:
if (server->info_sock)
sockfd_put(server->info_sock);
-out_bdi:
- bdi_destroy(&server->bdi);
out_fput:
sockfd_put(sock);
out:
@@ -788,7 +785,6 @@ static void ncp_put_super(struct super_block *sb)
kill_pid(server->m.wdog_pid, SIGTERM, 1);
put_pid(server->m.wdog_pid);
- bdi_destroy(&server->bdi);
kfree(server->priv.data);
kfree(server->auth.object_name);
vfree(server->rxbuf);
diff --git a/fs/ncpfs/ncp_fs_sb.h b/fs/ncpfs/ncp_fs_sb.h
index 55e26fd80886..366fd63cc506 100644
--- a/fs/ncpfs/ncp_fs_sb.h
+++ b/fs/ncpfs/ncp_fs_sb.h
@@ -143,7 +143,6 @@ struct ncp_server {
size_t len;
__u8 data[128];
} unexpected_packet;
- struct backing_dev_info bdi;
};
extern void ncp_tcp_rcv_proc(struct work_struct *work);
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 390ada8741bc..04d15a0045e3 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -761,9 +761,6 @@ static void nfs_server_set_fsinfo(struct nfs_server *server,
server->rsize = NFS_MAX_FILE_IO_SIZE;
server->rpages = (server->rsize + PAGE_SIZE - 1) >> PAGE_SHIFT;
- server->backing_dev_info.name = "nfs";
- server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD;
-
if (server->wsize > max_rpc_payload)
server->wsize = max_rpc_payload;
if (server->wsize > NFS_MAX_FILE_IO_SIZE)
@@ -917,12 +914,6 @@ struct nfs_server *nfs_alloc_server(void)
return NULL;
}
- if (bdi_init(&server->backing_dev_info)) {
- nfs_free_iostats(server->io_stats);
- kfree(server);
- return NULL;
- }
-
ida_init(&server->openowner_id);
ida_init(&server->lockowner_id);
pnfs_init_server(server);
@@ -953,7 +944,6 @@ void nfs_free_server(struct nfs_server *server)
ida_destroy(&server->lockowner_id);
ida_destroy(&server->openowner_id);
nfs_free_iostats(server->io_stats);
- bdi_destroy(&server->backing_dev_info);
kfree(server);
nfs_release_automount_timer();
dprintk("<-- nfs_free_server()\n");
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index aab32fc3d6a8..c1b5fed7c863 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -537,7 +537,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
if (put_dreq(dreq))
nfs_direct_complete(dreq);
- return 0;
+ return requested_bytes;
}
/**
@@ -566,7 +566,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter)
struct inode *inode = mapping->host;
struct nfs_direct_req *dreq;
struct nfs_lock_context *l_ctx;
- ssize_t result = -EINVAL;
+ ssize_t result = -EINVAL, requested;
size_t count = iov_iter_count(iter);
nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count);
@@ -600,14 +600,19 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter)
nfs_start_io_direct(inode);
NFS_I(inode)->read_io += count;
- result = nfs_direct_read_schedule_iovec(dreq, iter, iocb->ki_pos);
+ requested = nfs_direct_read_schedule_iovec(dreq, iter, iocb->ki_pos);
nfs_end_io_direct(inode);
- if (!result) {
+ if (requested > 0) {
result = nfs_direct_wait(dreq);
- if (result > 0)
+ if (result > 0) {
+ requested -= result;
iocb->ki_pos += result;
+ }
+ iov_iter_revert(iter, requested);
+ } else {
+ result = requested;
}
out_release:
@@ -954,7 +959,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
if (put_dreq(dreq))
nfs_direct_write_complete(dreq);
- return 0;
+ return requested_bytes;
}
/**
@@ -979,7 +984,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
*/
ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
{
- ssize_t result = -EINVAL;
+ ssize_t result = -EINVAL, requested;
size_t count;
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
@@ -1022,7 +1027,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
nfs_start_io_direct(inode);
- result = nfs_direct_write_schedule_iovec(dreq, iter, pos);
+ requested = nfs_direct_write_schedule_iovec(dreq, iter, pos);
if (mapping->nrpages) {
invalidate_inode_pages2_range(mapping,
@@ -1031,13 +1036,17 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
nfs_end_io_direct(inode);
- if (!result) {
+ if (requested > 0) {
result = nfs_direct_wait(dreq);
if (result > 0) {
+ requested -= result;
iocb->ki_pos = pos + result;
/* XXX: should check the generic_write_sync retval */
generic_write_sync(iocb, result);
}
+ iov_iter_revert(iter, requested);
+ } else {
+ result = requested;
}
out_release:
nfs_direct_req_release(dreq);
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 7b38fedb7e03..9dc65d7ae754 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -139,7 +139,7 @@ struct nfs_mount_request {
};
struct nfs_mount_info {
- void (*fill_super)(struct super_block *, struct nfs_mount_info *);
+ int (*fill_super)(struct super_block *, struct nfs_mount_info *);
int (*set_security)(struct super_block *, struct dentry *, struct nfs_mount_info *);
struct nfs_parsed_mount_data *parsed;
struct nfs_clone_mount *cloned;
@@ -407,7 +407,7 @@ struct dentry *nfs_fs_mount(struct file_system_type *, int, const char *, void *
struct dentry * nfs_xdev_mount_common(struct file_system_type *, int,
const char *, struct nfs_mount_info *);
void nfs_kill_super(struct super_block *);
-void nfs_fill_super(struct super_block *, struct nfs_mount_info *);
+int nfs_fill_super(struct super_block *, struct nfs_mount_info *);
extern struct rpc_stat nfs_rpcstat;
@@ -458,7 +458,7 @@ extern void nfs_read_prepare(struct rpc_task *task, void *calldata);
extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio);
/* super.c */
-void nfs_clone_super(struct super_block *, struct nfs_mount_info *);
+int nfs_clone_super(struct super_block *, struct nfs_mount_info *);
void nfs_umount_begin(struct super_block *);
int nfs_statfs(struct dentry *, struct kstatfs *);
int nfs_show_options(struct seq_file *, struct dentry *);
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 54e0f9f2dd94..dc69314d455e 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2315,18 +2315,17 @@ inline void nfs_initialise_sb(struct super_block *sb)
sb->s_blocksize = nfs_block_bits(server->wsize,
&sb->s_blocksize_bits);
- sb->s_bdi = &server->backing_dev_info;
-
nfs_super_set_maxbytes(sb, server->maxfilesize);
}
/*
* Finish setting up an NFS2/3 superblock
*/
-void nfs_fill_super(struct super_block *sb, struct nfs_mount_info *mount_info)
+int nfs_fill_super(struct super_block *sb, struct nfs_mount_info *mount_info)
{
struct nfs_parsed_mount_data *data = mount_info->parsed;
struct nfs_server *server = NFS_SB(sb);
+ int ret;
sb->s_blocksize_bits = 0;
sb->s_blocksize = 0;
@@ -2344,13 +2343,21 @@ void nfs_fill_super(struct super_block *sb, struct nfs_mount_info *mount_info)
}
nfs_initialise_sb(sb);
+
+ ret = super_setup_bdi_name(sb, "%u:%u", MAJOR(server->s_dev),
+ MINOR(server->s_dev));
+ if (ret)
+ return ret;
+ sb->s_bdi->ra_pages = server->rpages * NFS_MAX_READAHEAD;
+ return 0;
+
}
EXPORT_SYMBOL_GPL(nfs_fill_super);
/*
* Finish setting up a cloned NFS2/3/4 superblock
*/
-void nfs_clone_super(struct super_block *sb, struct nfs_mount_info *mount_info)
+int nfs_clone_super(struct super_block *sb, struct nfs_mount_info *mount_info)
{
const struct super_block *old_sb = mount_info->cloned->sb;
struct nfs_server *server = NFS_SB(sb);
@@ -2370,6 +2377,10 @@ void nfs_clone_super(struct super_block *sb, struct nfs_mount_info *mount_info)
}
nfs_initialise_sb(sb);
+
+ sb->s_bdi = bdi_get(old_sb->s_bdi);
+
+ return 0;
}
static int nfs_compare_mount_options(const struct super_block *s, const struct nfs_server *b, int flags)
@@ -2522,11 +2533,6 @@ static void nfs_get_cache_cookie(struct super_block *sb,
}
#endif
-static int nfs_bdi_register(struct nfs_server *server)
-{
- return bdi_register_dev(&server->backing_dev_info, server->s_dev);
-}
-
int nfs_set_sb_security(struct super_block *s, struct dentry *mntroot,
struct nfs_mount_info *mount_info)
{
@@ -2594,17 +2600,14 @@ struct dentry *nfs_fs_mount_common(struct nfs_server *server,
nfs_free_server(server);
server = NULL;
} else {
- error = nfs_bdi_register(server);
- if (error) {
- mntroot = ERR_PTR(error);
- goto error_splat_super;
- }
server->super = s;
}
if (!s->s_root) {
/* initial superblock/root creation */
- mount_info->fill_super(s, mount_info);
+ error = mount_info->fill_super(s, mount_info);
+ if (error)
+ goto error_splat_super;
nfs_get_cache_cookie(s, mount_info->parsed, mount_info->cloned);
}
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index abb2c8a3be42..cc341fc7fd44 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -263,16 +263,15 @@ int nfs_congestion_kb;
static void nfs_set_page_writeback(struct page *page)
{
- struct nfs_server *nfss = NFS_SERVER(page_file_mapping(page)->host);
+ struct inode *inode = page_file_mapping(page)->host;
+ struct nfs_server *nfss = NFS_SERVER(inode);
int ret = test_set_page_writeback(page);
WARN_ON_ONCE(ret != 0);
if (atomic_long_inc_return(&nfss->writeback) >
- NFS_CONGESTION_ON_THRESH) {
- set_bdi_congested(&nfss->backing_dev_info,
- BLK_RW_ASYNC);
- }
+ NFS_CONGESTION_ON_THRESH)
+ set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC);
}
static void nfs_end_page_writeback(struct nfs_page *req)
@@ -285,7 +284,7 @@ static void nfs_end_page_writeback(struct nfs_page *req)
end_page_writeback(req->wb_page);
if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH)
- clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC);
+ clear_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC);
}
@@ -1808,7 +1807,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
}
nfss = NFS_SERVER(data->inode);
if (atomic_long_read(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH)
- clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC);
+ clear_bdi_congested(inode_to_bdi(data->inode), BLK_RW_ASYNC);
nfs_init_cinfo(&cinfo, data->inode, data->dreq);
nfs_commit_end(cinfo.mds);
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
index 92b4b41d19d2..fb5213afc854 100644
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -242,10 +242,11 @@ static int nfsd4_scsi_identify_device(struct block_device *bdev,
req->cmd[4] = bufflen & 0xff;
req->cmd_len = COMMAND_SIZE(INQUIRY);
- error = blk_execute_rq(rq->q, NULL, rq, 1);
- if (error) {
+ blk_execute_rq(rq->q, NULL, rq, 1);
+ if (req->result) {
pr_err("pNFS: INQUIRY 0x83 failed with: %x\n",
- rq->errors);
+ req->result);
+ error = -EIO;
goto out_put_request;
}
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index dba2ff8eaa68..452334694a5d 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -358,6 +358,8 @@ nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p,
{
unsigned int len, v, hdr, dlen;
u32 max_blocksize = svc_max_payload(rqstp);
+ struct kvec *head = rqstp->rq_arg.head;
+ struct kvec *tail = rqstp->rq_arg.tail;
p = decode_fh(p, &args->fh);
if (!p)
@@ -367,6 +369,8 @@ nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p,
args->count = ntohl(*p++);
args->stable = ntohl(*p++);
len = args->len = ntohl(*p++);
+ if ((void *)p > head->iov_base + head->iov_len)
+ return 0;
/*
* The count must equal the amount of data passed.
*/
@@ -377,9 +381,8 @@ nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p,
* Check to make sure that we got the right number of
* bytes.
*/
- hdr = (void*)p - rqstp->rq_arg.head[0].iov_base;
- dlen = rqstp->rq_arg.head[0].iov_len + rqstp->rq_arg.page_len
- + rqstp->rq_arg.tail[0].iov_len - hdr;
+ hdr = (void*)p - head->iov_base;
+ dlen = head->iov_len + rqstp->rq_arg.page_len + tail->iov_len - hdr;
/*
* Round the length of the data which was specified up to
* the next multiple of XDR units and then compare that
@@ -396,7 +399,7 @@ nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p,
len = args->len = max_blocksize;
}
rqstp->rq_vec[0].iov_base = (void*)p;
- rqstp->rq_vec[0].iov_len = rqstp->rq_arg.head[0].iov_len - hdr;
+ rqstp->rq_vec[0].iov_len = head->iov_len - hdr;
v = 0;
while (len > rqstp->rq_vec[v].iov_len) {
len -= rqstp->rq_vec[v].iov_len;
@@ -471,6 +474,8 @@ nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p,
/* first copy and check from the first page */
old = (char*)p;
vec = &rqstp->rq_arg.head[0];
+ if ((void *)old > vec->iov_base + vec->iov_len)
+ return 0;
avail = vec->iov_len - (old - (char*)vec->iov_base);
while (len && avail && *old) {
*new++ = *old++;
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 31e1f9593457..59979f0bbd4b 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -747,6 +747,37 @@ static __be32 map_new_errors(u32 vers, __be32 nfserr)
return nfserr;
}
+/*
+ * A write procedure can have a large argument, and a read procedure can
+ * have a large reply, but no NFSv2 or NFSv3 procedure has argument and
+ * reply that can both be larger than a page. The xdr code has taken
+ * advantage of this assumption to be a sloppy about bounds checking in
+ * some cases. Pending a rewrite of the NFSv2/v3 xdr code to fix that
+ * problem, we enforce these assumptions here:
+ */
+static bool nfs_request_too_big(struct svc_rqst *rqstp,
+ struct svc_procedure *proc)
+{
+ /*
+ * The ACL code has more careful bounds-checking and is not
+ * susceptible to this problem:
+ */
+ if (rqstp->rq_prog != NFS_PROGRAM)
+ return false;
+ /*
+ * Ditto NFSv4 (which can in theory have argument and reply both
+ * more than a page):
+ */
+ if (rqstp->rq_vers >= 4)
+ return false;
+ /* The reply will be small, we're OK: */
+ if (proc->pc_xdrressize > 0 &&
+ proc->pc_xdrressize < XDR_QUADLEN(PAGE_SIZE))
+ return false;
+
+ return rqstp->rq_arg.len > PAGE_SIZE;
+}
+
int
nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
{
@@ -759,6 +790,11 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
rqstp->rq_vers, rqstp->rq_proc);
proc = rqstp->rq_procinfo;
+ if (nfs_request_too_big(rqstp, proc)) {
+ dprintk("nfsd: NFSv%d argument too large\n", rqstp->rq_vers);
+ *statp = rpc_garbage_args;
+ return 1;
+ }
/*
* Give the xdr decoder a chance to change this if it wants
* (necessary in the NFSv4.0 compound case)
diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c
index 41b468a6a90f..de07ff625777 100644
--- a/fs/nfsd/nfsxdr.c
+++ b/fs/nfsd/nfsxdr.c
@@ -280,6 +280,7 @@ nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p,
struct nfsd_writeargs *args)
{
unsigned int len, hdr, dlen;
+ struct kvec *head = rqstp->rq_arg.head;
int v;
p = decode_fh(p, &args->fh);
@@ -300,9 +301,10 @@ nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p,
* Check to make sure that we got the right number of
* bytes.
*/
- hdr = (void*)p - rqstp->rq_arg.head[0].iov_base;
- dlen = rqstp->rq_arg.head[0].iov_len + rqstp->rq_arg.page_len
- - hdr;
+ hdr = (void*)p - head->iov_base;
+ if (hdr > head->iov_len)
+ return 0;
+ dlen = head->iov_len + rqstp->rq_arg.page_len - hdr;
/*
* Round the length of the data which was specified up to
@@ -316,7 +318,7 @@ nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p,
return 0;
rqstp->rq_vec[0].iov_base = (void*)p;
- rqstp->rq_vec[0].iov_len = rqstp->rq_arg.head[0].iov_len - hdr;
+ rqstp->rq_vec[0].iov_len = head->iov_len - hdr;
v = 0;
while (len > rqstp->rq_vec[v].iov_len) {
len -= rqstp->rq_vec[v].iov_len;
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index e1872f36147f..926682981d61 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -1068,7 +1068,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent)
sb->s_time_gran = 1;
sb->s_max_links = NILFS_LINK_MAX;
- sb->s_bdi = bdev_get_queue(sb->s_bdev)->backing_dev_info;
+ sb->s_bdi = bdi_get(sb->s_bdev->bd_bdi);
err = load_nilfs(nilfs, sb);
if (err)
diff --git a/fs/orangefs/orangefs-bufmap.c b/fs/orangefs/orangefs-bufmap.c
index 6333cbbdfef7..83b506020718 100644
--- a/fs/orangefs/orangefs-bufmap.c
+++ b/fs/orangefs/orangefs-bufmap.c
@@ -521,13 +521,11 @@ int orangefs_bufmap_copy_from_iovec(struct iov_iter *iter,
size_t n = size;
if (n > PAGE_SIZE)
n = PAGE_SIZE;
- n = copy_page_from_iter(page, 0, n, iter);
- if (!n)
+ if (copy_page_from_iter(page, 0, n, iter) != n)
return -EFAULT;
size -= n;
}
return 0;
-
}
/*
diff --git a/fs/stat.c b/fs/stat.c
index c6c963b2546b..a257b872a53d 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -547,13 +547,13 @@ cp_statx(const struct kstat *stat, struct statx __user *buffer)
/**
* sys_statx - System call to get enhanced stats
* @dfd: Base directory to pathwalk from *or* fd to stat.
- * @filename: File to stat *or* NULL.
+ * @filename: File to stat or "" with AT_EMPTY_PATH
* @flags: AT_* flags to control pathwalk.
* @mask: Parts of statx struct actually required.
* @buffer: Result buffer.
*
- * Note that if filename is NULL, then it does the equivalent of fstat() using
- * dfd to indicate the file of interest.
+ * Note that fstat() can be emulated by setting dfd to the fd of interest,
+ * supplying "" as the filename and setting AT_EMPTY_PATH in the flags.
*/
SYSCALL_DEFINE5(statx,
int, dfd, const char __user *, filename, unsigned, flags,
@@ -568,10 +568,7 @@ SYSCALL_DEFINE5(statx,
if ((flags & AT_STATX_SYNC_TYPE) == AT_STATX_SYNC_TYPE)
return -EINVAL;
- if (filename)
- error = vfs_statx(dfd, filename, flags, &stat, mask);
- else
- error = vfs_statx_fd(dfd, &stat, mask, flags);
+ error = vfs_statx(dfd, filename, flags, &stat, mask);
if (error)
return error;
diff --git a/fs/super.c b/fs/super.c
index b8b6a086c03b..adb0c0de428c 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -446,6 +446,10 @@ void generic_shutdown_super(struct super_block *sb)
hlist_del_init(&sb->s_instances);
spin_unlock(&sb_lock);
up_write(&sb->s_umount);
+ if (sb->s_bdi != &noop_backing_dev_info) {
+ bdi_put(sb->s_bdi);
+ sb->s_bdi = &noop_backing_dev_info;
+ }
}
EXPORT_SYMBOL(generic_shutdown_super);
@@ -1049,12 +1053,8 @@ static int set_bdev_super(struct super_block *s, void *data)
{
s->s_bdev = data;
s->s_dev = s->s_bdev->bd_dev;
+ s->s_bdi = bdi_get(s->s_bdev->bd_bdi);
- /*
- * We set the bdi here to the queue backing, file systems can
- * overwrite this in ->fill_super()
- */
- s->s_bdi = bdev_get_queue(s->s_bdev)->backing_dev_info;
return 0;
}
@@ -1256,6 +1256,49 @@ out:
}
/*
+ * Setup private BDI for given superblock. It gets automatically cleaned up
+ * in generic_shutdown_super().
+ */
+int super_setup_bdi_name(struct super_block *sb, char *fmt, ...)
+{
+ struct backing_dev_info *bdi;
+ int err;
+ va_list args;
+
+ bdi = bdi_alloc(GFP_KERNEL);
+ if (!bdi)
+ return -ENOMEM;
+
+ bdi->name = sb->s_type->name;
+
+ va_start(args, fmt);
+ err = bdi_register_va(bdi, fmt, args);
+ va_end(args);
+ if (err) {
+ bdi_put(bdi);
+ return err;
+ }
+ WARN_ON(sb->s_bdi != &noop_backing_dev_info);
+ sb->s_bdi = bdi;
+
+ return 0;
+}
+EXPORT_SYMBOL(super_setup_bdi_name);
+
+/*
+ * Setup private BDI for given superblock. I gets automatically cleaned up
+ * in generic_shutdown_super().
+ */
+int super_setup_bdi(struct super_block *sb)
+{
+ static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
+
+ return super_setup_bdi_name(sb, "%.28s-%ld", sb->s_type->name,
+ atomic_long_inc_return(&bdi_seq));
+}
+EXPORT_SYMBOL(super_setup_bdi);
+
+/*
* This is an internal function, please use sb_end_{write,pagefault,intwrite}
* instead.
*/
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index b73811bd7676..cf4cc99b75b5 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1827,7 +1827,6 @@ static void ubifs_put_super(struct super_block *sb)
}
ubifs_umount(c);
- bdi_destroy(&c->bdi);
ubi_close_volume(c->ubi);
mutex_unlock(&c->umount_mutex);
}
@@ -2019,29 +2018,25 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
goto out;
}
+ err = ubifs_parse_options(c, data, 0);
+ if (err)
+ goto out_close;
+
/*
* UBIFS provides 'backing_dev_info' in order to disable read-ahead. For
* UBIFS, I/O is not deferred, it is done immediately in readpage,
* which means the user would have to wait not just for their own I/O
* but the read-ahead I/O as well i.e. completely pointless.
*
- * Read-ahead will be disabled because @c->bdi.ra_pages is 0.
+ * Read-ahead will be disabled because @sb->s_bdi->ra_pages is 0. Also
+ * @sb->s_bdi->capabilities are initialized to 0 so there won't be any
+ * writeback happening.
*/
- c->bdi.name = "ubifs",
- c->bdi.capabilities = 0;
- err = bdi_init(&c->bdi);
+ err = super_setup_bdi_name(sb, "ubifs_%d_%d", c->vi.ubi_num,
+ c->vi.vol_id);
if (err)
goto out_close;
- err = bdi_register(&c->bdi, NULL, "ubifs_%d_%d",
- c->vi.ubi_num, c->vi.vol_id);
- if (err)
- goto out_bdi;
-
- err = ubifs_parse_options(c, data, 0);
- if (err)
- goto out_bdi;
- sb->s_bdi = &c->bdi;
sb->s_fs_info = c;
sb->s_magic = UBIFS_SUPER_MAGIC;
sb->s_blocksize = UBIFS_BLOCK_SIZE;
@@ -2080,8 +2075,6 @@ out_umount:
ubifs_umount(c);
out_unlock:
mutex_unlock(&c->umount_mutex);
-out_bdi:
- bdi_destroy(&c->bdi);
out_close:
ubi_close_volume(c->ubi);
out:
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 4d57e488038e..4da10a6d702a 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -972,7 +972,6 @@ struct ubifs_debug_info;
* struct ubifs_info - UBIFS file-system description data structure
* (per-superblock).
* @vfs_sb: VFS @struct super_block object
- * @bdi: backing device info object to make VFS happy and disable read-ahead
*
* @highest_inum: highest used inode number
* @max_sqnum: current global sequence number
@@ -1220,7 +1219,6 @@ struct ubifs_debug_info;
*/
struct ubifs_info {
struct super_block *vfs_sb;
- struct backing_dev_info bdi;
ino_t highest_inum;
unsigned long long max_sqnum;
@@ -1461,7 +1459,6 @@ extern const struct inode_operations ubifs_file_inode_operations;
extern const struct file_operations ubifs_dir_operations;
extern const struct inode_operations ubifs_dir_inode_operations;
extern const struct inode_operations ubifs_symlink_inode_operations;
-extern struct backing_dev_info ubifs_backing_dev_info;
extern struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT];
/* io.c */
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 828532ce0adc..8795e9cd867c 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -81,7 +81,7 @@ xfs_zero_extent(
return blkdev_issue_zeroout(xfs_find_bdev_for_inode(VFS_I(ip)),
block << (mp->m_super->s_blocksize_bits - 9),
count_fsb << (mp->m_super->s_blocksize_bits - 9),
- GFP_NOFS, true);
+ GFP_NOFS, 0);
}
int
diff --git a/include/linux/ata.h b/include/linux/ata.h
index af6859b3a93d..ad7d9ee89ff0 100644
--- a/include/linux/ata.h
+++ b/include/linux/ata.h
@@ -817,11 +817,6 @@ static inline bool ata_id_sct_error_recovery_ctrl(const u16 *id)
return id[ATA_ID_SCT_CMD_XPORT] & (1 << 3) ? true : false;
}
-static inline bool ata_id_sct_write_same(const u16 *id)
-{
- return id[ATA_ID_SCT_CMD_XPORT] & (1 << 2) ? true : false;
-}
-
static inline bool ata_id_sct_long_sector_access(const u16 *id)
{
return id[ATA_ID_SCT_CMD_XPORT] & (1 << 1) ? true : false;
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index ad955817916d..866c433e7d32 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -21,6 +21,7 @@ struct dentry;
*/
enum wb_state {
WB_registered, /* bdi_register() was done */
+ WB_shutting_down, /* wb_shutdown() in progress */
WB_writeback_running, /* Writeback is in progress */
WB_has_dirty_io, /* Dirty inodes on ->b_{dirty|io|more_io} */
};
@@ -54,7 +55,9 @@ struct bdi_writeback_congested {
atomic_t refcnt; /* nr of attached wb's and blkg */
#ifdef CONFIG_CGROUP_WRITEBACK
- struct backing_dev_info *bdi; /* the associated bdi */
+ struct backing_dev_info *__bdi; /* the associated bdi, set to NULL
+ * on bdi unregistration. For memcg-wb
+ * internal use only! */
int blkcg_id; /* ID of the associated blkcg */
struct rb_node rb_node; /* on bdi->cgwb_congestion_tree */
#endif
@@ -143,7 +146,7 @@ struct backing_dev_info {
congested_fn *congested_fn; /* Function pointer if device is md/dm */
void *congested_data; /* Pointer to aux data for congested func */
- char *name;
+ const char *name;
struct kref refcnt; /* Reference counter for the structure */
unsigned int capabilities; /* Device capabilities */
@@ -161,7 +164,6 @@ struct backing_dev_info {
#ifdef CONFIG_CGROUP_WRITEBACK
struct radix_tree_root cgwb_tree; /* radix tree of active cgroup wbs */
struct rb_root cgwb_congested_tree; /* their congested states */
- atomic_t usage_cnt; /* counts both cgwbs and cgwb_contested's */
#else
struct bdi_writeback_congested *wb_congested;
#endif
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index c52a48cb9a66..557d84063934 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -17,8 +17,6 @@
#include <linux/backing-dev-defs.h>
#include <linux/slab.h>
-int __must_check bdi_init(struct backing_dev_info *bdi);
-
static inline struct backing_dev_info *bdi_get(struct backing_dev_info *bdi)
{
kref_get(&bdi->refcnt);
@@ -27,16 +25,18 @@ static inline struct backing_dev_info *bdi_get(struct backing_dev_info *bdi)
void bdi_put(struct backing_dev_info *bdi);
-__printf(3, 4)
-int bdi_register(struct backing_dev_info *bdi, struct device *parent,
- const char *fmt, ...);
-int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev);
+__printf(2, 3)
+int bdi_register(struct backing_dev_info *bdi, const char *fmt, ...);
+int bdi_register_va(struct backing_dev_info *bdi, const char *fmt,
+ va_list args);
int bdi_register_owner(struct backing_dev_info *bdi, struct device *owner);
void bdi_unregister(struct backing_dev_info *bdi);
-int __must_check bdi_setup_and_register(struct backing_dev_info *, char *);
-void bdi_destroy(struct backing_dev_info *bdi);
struct backing_dev_info *bdi_alloc_node(gfp_t gfp_mask, int node_id);
+static inline struct backing_dev_info *bdi_alloc(gfp_t gfp_mask)
+{
+ return bdi_alloc_node(gfp_mask, NUMA_NO_NODE);
+}
void wb_start_writeback(struct bdi_writeback *wb, long nr_pages,
bool range_cyclic, enum wb_reason reason);
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 8e521194f6fc..4931756d86d9 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -383,7 +383,7 @@ extern struct bio_set *bioset_create_nobvec(unsigned int, unsigned int);
extern void bioset_free(struct bio_set *);
extern mempool_t *biovec_create_pool(int pool_entries);
-extern struct bio *bio_alloc_bioset(gfp_t, int, struct bio_set *);
+extern struct bio *bio_alloc_bioset(gfp_t, unsigned int, struct bio_set *);
extern void bio_put(struct bio *);
extern void __bio_clone_fast(struct bio *, struct bio *);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 9382c5da7a2e..f3e5e1de1bdb 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -15,7 +15,7 @@ struct blk_mq_hw_ctx {
unsigned long state; /* BLK_MQ_S_* flags */
} ____cacheline_aligned_in_smp;
- struct work_struct run_work;
+ struct delayed_work run_work;
cpumask_var_t cpumask;
int next_cpu;
int next_cpu_batch;
@@ -51,9 +51,6 @@ struct blk_mq_hw_ctx {
atomic_t nr_active;
- struct delayed_work delayed_run_work;
- struct delayed_work delay_work;
-
struct hlist_node cpuhp_dead;
struct kobject kobj;
@@ -82,7 +79,6 @@ struct blk_mq_tag_set {
struct blk_mq_queue_data {
struct request *rq;
- struct list_head *list;
bool last;
};
@@ -143,6 +139,14 @@ struct blk_mq_ops {
reinit_request_fn *reinit_request;
map_queues_fn *map_queues;
+
+#ifdef CONFIG_BLK_DEBUG_FS
+ /*
+ * Used by the debugfs implementation to show driver-specific
+ * information about a request.
+ */
+ void (*show_rq)(struct seq_file *m, struct request *rq);
+#endif
};
enum {
@@ -153,7 +157,6 @@ enum {
BLK_MQ_F_SHOULD_MERGE = 1 << 0,
BLK_MQ_F_TAG_SHARED = 1 << 1,
BLK_MQ_F_SG_MERGE = 1 << 2,
- BLK_MQ_F_DEFER_ISSUE = 1 << 4,
BLK_MQ_F_BLOCKING = 1 << 5,
BLK_MQ_F_NO_SCHED = 1 << 6,
BLK_MQ_F_ALLOC_POLICY_START_BIT = 8,
@@ -163,6 +166,7 @@ enum {
BLK_MQ_S_TAG_ACTIVE = 1,
BLK_MQ_S_SCHED_RESTART = 2,
BLK_MQ_S_TAG_WAITING = 3,
+ BLK_MQ_S_START_ON_RUN = 4,
BLK_MQ_MAX_DEPTH = 10240,
@@ -230,7 +234,7 @@ void blk_mq_add_to_requeue_list(struct request *rq, bool at_head,
void blk_mq_kick_requeue_list(struct request_queue *q);
void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs);
void blk_mq_abort_requeue_list(struct request_queue *q);
-void blk_mq_complete_request(struct request *rq, int error);
+void blk_mq_complete_request(struct request *rq);
bool blk_mq_queue_stopped(struct request_queue *q);
void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx);
@@ -240,13 +244,14 @@ void blk_mq_start_hw_queues(struct request_queue *q);
void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async);
void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
+void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
void blk_mq_run_hw_queues(struct request_queue *q, bool async);
void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
busy_tag_iter_fn *fn, void *priv);
void blk_mq_freeze_queue(struct request_queue *q);
void blk_mq_unfreeze_queue(struct request_queue *q);
-void blk_mq_freeze_queue_start(struct request_queue *q);
+void blk_freeze_queue_start(struct request_queue *q);
void blk_mq_freeze_queue_wait(struct request_queue *q);
int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
unsigned long timeout);
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index d703acb55d0f..61339bc44400 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -17,6 +17,10 @@ struct io_context;
struct cgroup_subsys_state;
typedef void (bio_end_io_t) (struct bio *);
+struct blk_issue_stat {
+ u64 stat;
+};
+
/*
* main unit of I/O for the block layer and lower layers (ie drivers and
* stacking drivers)
@@ -29,7 +33,7 @@ struct bio {
* top bits REQ_OP. Use
* accessors.
*/
- unsigned short bi_flags; /* status, command, etc */
+ unsigned short bi_flags; /* status, etc and bvec pool number */
unsigned short bi_ioprio;
struct bvec_iter bi_iter;
@@ -58,6 +62,10 @@ struct bio {
*/
struct io_context *bi_ioc;
struct cgroup_subsys_state *bi_css;
+#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
+ void *bi_cg_private;
+ struct blk_issue_stat bi_issue_stat;
+#endif
#endif
union {
#if defined(CONFIG_BLK_DEV_INTEGRITY)
@@ -102,12 +110,9 @@ struct bio {
#define BIO_REFFED 8 /* bio has elevated ->bi_cnt */
#define BIO_THROTTLED 9 /* This bio has already been subjected to
* throttling rules. Don't do it again. */
-
-/*
- * Flags starting here get preserved by bio_reset() - this includes
- * BVEC_POOL_IDX()
- */
-#define BIO_RESET_BITS 10
+#define BIO_TRACE_COMPLETION 10 /* bio_endio() should trace the final completion
+ * of this bio. */
+/* See BVEC_POOL_OFFSET below before adding new flags */
/*
* We support 6 different bvec pools, the last one is magic in that it
@@ -117,13 +122,22 @@ struct bio {
#define BVEC_POOL_MAX (BVEC_POOL_NR - 1)
/*
- * Top 4 bits of bio flags indicate the pool the bvecs came from. We add
+ * Top 3 bits of bio flags indicate the pool the bvecs came from. We add
* 1 to the actual index so that 0 indicates that there are no bvecs to be
* freed.
*/
-#define BVEC_POOL_BITS (4)
+#define BVEC_POOL_BITS (3)
#define BVEC_POOL_OFFSET (16 - BVEC_POOL_BITS)
#define BVEC_POOL_IDX(bio) ((bio)->bi_flags >> BVEC_POOL_OFFSET)
+#if (1<< BVEC_POOL_BITS) < (BVEC_POOL_NR+1)
+# error "BVEC_POOL_BITS is too small"
+#endif
+
+/*
+ * Flags starting here get preserved by bio_reset() - this includes
+ * only BVEC_POOL_IDX()
+ */
+#define BIO_RESET_BITS BVEC_POOL_OFFSET
/*
* Operations and flags common to the bio and request structures.
@@ -160,7 +174,7 @@ enum req_opf {
/* write the same sector many times */
REQ_OP_WRITE_SAME = 7,
/* write the zero filled sector many times */
- REQ_OP_WRITE_ZEROES = 8,
+ REQ_OP_WRITE_ZEROES = 9,
/* SCSI passthrough using struct scsi_request */
REQ_OP_SCSI_IN = 32,
@@ -187,6 +201,10 @@ enum req_flag_bits {
__REQ_PREFLUSH, /* request for cache flush */
__REQ_RAHEAD, /* read ahead, can fail anytime */
__REQ_BACKGROUND, /* background IO */
+
+ /* command specific flags for REQ_OP_WRITE_ZEROES: */
+ __REQ_NOUNMAP, /* do not free blocks when zeroing */
+
__REQ_NR_BITS, /* stops here */
};
@@ -204,6 +222,8 @@ enum req_flag_bits {
#define REQ_RAHEAD (1ULL << __REQ_RAHEAD)
#define REQ_BACKGROUND (1ULL << __REQ_BACKGROUND)
+#define REQ_NOUNMAP (1ULL << __REQ_NOUNMAP)
+
#define REQ_FAILFAST_MASK \
(REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER)
@@ -283,12 +303,6 @@ static inline bool blk_qc_t_is_internal(blk_qc_t cookie)
return (cookie & BLK_QC_T_INTERNAL) != 0;
}
-struct blk_issue_stat {
- u64 time;
-};
-
-#define BLK_RQ_STAT_BATCH 64
-
struct blk_rq_stat {
s64 mean;
u64 min;
@@ -296,7 +310,6 @@ struct blk_rq_stat {
s32 nr_samples;
s32 nr_batch;
u64 batch;
- s64 time;
};
#endif /* __LINUX_BLK_TYPES_H */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 01a696b0a4d3..83d28623645f 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -40,15 +40,20 @@ struct blkcg_gq;
struct blk_flush_queue;
struct pr_ops;
struct rq_wb;
+struct blk_queue_stats;
+struct blk_stat_callback;
#define BLKDEV_MIN_RQ 4
#define BLKDEV_MAX_RQ 128 /* Default maximum */
+/* Must be consisitent with blk_mq_poll_stats_bkt() */
+#define BLK_MQ_POLL_STATS_BKTS 16
+
/*
* Maximum number of blkcg policies allowed to be registered concurrently.
* Defined here to simplify include dependency.
*/
-#define BLKCG_MAX_POLS 2
+#define BLKCG_MAX_POLS 3
typedef void (rq_end_io_fn)(struct request *, int);
@@ -173,6 +178,7 @@ struct request {
struct rb_node rb_node; /* sort/lookup */
struct bio_vec special_vec;
void *completion_data;
+ int error_count; /* for legacy drivers, don't use */
};
/*
@@ -213,16 +219,14 @@ struct request {
unsigned short ioprio;
- void *special; /* opaque pointer available for LLD use */
+ unsigned int timeout;
- int errors;
+ void *special; /* opaque pointer available for LLD use */
unsigned int extra_len; /* length of alignment and padding */
unsigned long deadline;
struct list_head timeout_list;
- unsigned int timeout;
- int retries;
/*
* completion callback.
@@ -337,7 +341,6 @@ struct queue_limits {
unsigned char misaligned;
unsigned char discard_misaligned;
unsigned char cluster;
- unsigned char discard_zeroes_data;
unsigned char raid_partial_stripes_expensive;
enum blk_zoned_model zoned;
};
@@ -388,6 +391,7 @@ struct request_queue {
int nr_rqs[2]; /* # allocated [a]sync rqs */
int nr_rqs_elvpriv; /* # allocated rqs w/ elvpriv */
+ struct blk_queue_stats *stats;
struct rq_wb *rq_wb;
/*
@@ -505,8 +509,6 @@ struct request_queue {
unsigned int nr_sorted;
unsigned int in_flight[2];
- struct blk_rq_stat rq_stats[2];
-
/*
* Number of active block driver functions for which blk_drain_queue()
* must wait. Must be incremented around functions that unlock the
@@ -516,6 +518,10 @@ struct request_queue {
unsigned int rq_timeout;
int poll_nsec;
+
+ struct blk_stat_callback *poll_cb;
+ struct blk_rq_stat poll_stat[BLK_MQ_POLL_STATS_BKTS];
+
struct timer_list timeout;
struct work_struct timeout_work;
struct list_head timeout_list;
@@ -610,6 +616,8 @@ struct request_queue {
#define QUEUE_FLAG_FLUSH_NQ 25 /* flush not queueuable */
#define QUEUE_FLAG_DAX 26 /* device supports DAX */
#define QUEUE_FLAG_STATS 27 /* track rq completion times */
+#define QUEUE_FLAG_POLL_STATS 28 /* collecting stats for hybrid polling */
+#define QUEUE_FLAG_REGISTERED 29 /* queue has been registered to a disk */
#define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \
(1 << QUEUE_FLAG_STACKABLE) | \
@@ -918,6 +926,7 @@ extern int blk_register_queue(struct gendisk *disk);
extern void blk_unregister_queue(struct gendisk *disk);
extern blk_qc_t generic_make_request(struct bio *bio);
extern void blk_rq_init(struct request_queue *q, struct request *rq);
+extern void blk_init_request_from_bio(struct request *req, struct bio *bio);
extern void blk_put_request(struct request *);
extern void __blk_put_request(struct request_queue *, struct request *);
extern struct request *blk_get_request(struct request_queue *, int, gfp_t);
@@ -963,7 +972,7 @@ extern int blk_rq_map_kern(struct request_queue *, struct request *, void *, uns
extern int blk_rq_map_user_iov(struct request_queue *, struct request *,
struct rq_map_data *, const struct iov_iter *,
gfp_t);
-extern int blk_execute_rq(struct request_queue *, struct gendisk *,
+extern void blk_execute_rq(struct request_queue *, struct gendisk *,
struct request *, int);
extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *,
struct request *, int, rq_end_io_fn *);
@@ -1081,20 +1090,6 @@ static inline unsigned int blk_rq_count_bios(struct request *rq)
}
/*
- * blk_rq_set_prio - associate a request with prio from ioc
- * @rq: request of interest
- * @ioc: target iocontext
- *
- * Assocate request prio with ioc prio so request based drivers
- * can leverage priority information.
- */
-static inline void blk_rq_set_prio(struct request *rq, struct io_context *ioc)
-{
- if (ioc)
- rq->ioprio = ioc->ioprio;
-}
-
-/*
* Request issue related functions.
*/
extern struct request *blk_peek_request(struct request_queue *q);
@@ -1120,13 +1115,10 @@ extern void blk_finish_request(struct request *rq, int error);
extern bool blk_end_request(struct request *rq, int error,
unsigned int nr_bytes);
extern void blk_end_request_all(struct request *rq, int error);
-extern bool blk_end_request_cur(struct request *rq, int error);
-extern bool blk_end_request_err(struct request *rq, int error);
extern bool __blk_end_request(struct request *rq, int error,
unsigned int nr_bytes);
extern void __blk_end_request_all(struct request *rq, int error);
extern bool __blk_end_request_cur(struct request *rq, int error);
-extern bool __blk_end_request_err(struct request *rq, int error);
extern void blk_complete_request(struct request *);
extern void __blk_complete_request(struct request *);
@@ -1329,23 +1321,27 @@ static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt,
return bqt->tag_index[tag];
}
+extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *);
+extern int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
+ sector_t nr_sects, gfp_t gfp_mask, struct page *page);
#define BLKDEV_DISCARD_SECURE (1 << 0) /* issue a secure erase */
-#define BLKDEV_DISCARD_ZERO (1 << 1) /* must reliably zero data */
-extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *);
extern int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
sector_t nr_sects, gfp_t gfp_mask, unsigned long flags);
extern int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
sector_t nr_sects, gfp_t gfp_mask, int flags,
struct bio **biop);
-extern int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
- sector_t nr_sects, gfp_t gfp_mask, struct page *page);
+
+#define BLKDEV_ZERO_NOUNMAP (1 << 0) /* do not free blocks */
+#define BLKDEV_ZERO_NOFALLBACK (1 << 1) /* don't write explicit zeroes */
+
extern int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
sector_t nr_sects, gfp_t gfp_mask, struct bio **biop,
- bool discard);
+ unsigned flags);
extern int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
- sector_t nr_sects, gfp_t gfp_mask, bool discard);
+ sector_t nr_sects, gfp_t gfp_mask, unsigned flags);
+
static inline int sb_issue_discard(struct super_block *sb, sector_t block,
sector_t nr_blocks, gfp_t gfp_mask, unsigned long flags)
{
@@ -1359,7 +1355,7 @@ static inline int sb_issue_zeroout(struct super_block *sb, sector_t block,
return blkdev_issue_zeroout(sb->s_bdev,
block << (sb->s_blocksize_bits - 9),
nr_blocks << (sb->s_blocksize_bits - 9),
- gfp_mask, true);
+ gfp_mask, 0);
}
extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm);
@@ -1529,19 +1525,6 @@ static inline int bdev_discard_alignment(struct block_device *bdev)
return q->limits.discard_alignment;
}
-static inline unsigned int queue_discard_zeroes_data(struct request_queue *q)
-{
- if (q->limits.max_discard_sectors && q->limits.discard_zeroes_data == 1)
- return 1;
-
- return 0;
-}
-
-static inline unsigned int bdev_discard_zeroes_data(struct block_device *bdev)
-{
- return queue_discard_zeroes_data(bdev_get_queue(bdev));
-}
-
static inline unsigned int bdev_write_same(struct block_device *bdev)
{
struct request_queue *q = bdev_get_queue(bdev);
@@ -1726,6 +1709,7 @@ int kblockd_schedule_work(struct work_struct *work);
int kblockd_schedule_work_on(int cpu, struct work_struct *work);
int kblockd_schedule_delayed_work(struct delayed_work *dwork, unsigned long delay);
int kblockd_schedule_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay);
+int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay);
#ifdef CONFIG_BLK_CGROUP
/*
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 6a3f850cabab..21745946cae1 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -13,6 +13,7 @@
#include <linux/wait.h>
#include <linux/mutex.h>
#include <linux/rcupdate.h>
+#include <linux/refcount.h>
#include <linux/percpu-refcount.h>
#include <linux/percpu-rwsem.h>
#include <linux/workqueue.h>
@@ -106,9 +107,6 @@ struct cgroup_subsys_state {
/* reference count - access via css_[try]get() and css_put() */
struct percpu_ref refcnt;
- /* PI: the parent css */
- struct cgroup_subsys_state *parent;
-
/* siblings list anchored at the parent's ->children */
struct list_head sibling;
struct list_head children;
@@ -138,6 +136,12 @@ struct cgroup_subsys_state {
/* percpu_ref killing and RCU release */
struct rcu_head rcu_head;
struct work_struct destroy_work;
+
+ /*
+ * PI: the parent css. Placed here for cache proximity to following
+ * fields of the containing structure.
+ */
+ struct cgroup_subsys_state *parent;
};
/*
@@ -156,7 +160,7 @@ struct css_set {
struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
/* reference count */
- atomic_t refcount;
+ refcount_t refcount;
/* the default cgroup associated with this css_set */
struct cgroup *dfl_cgrp;
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index af9c86e958bd..ed2573e149fa 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -17,11 +17,11 @@
#include <linux/seq_file.h>
#include <linux/kernfs.h>
#include <linux/jump_label.h>
-#include <linux/nsproxy.h>
#include <linux/types.h>
#include <linux/ns_common.h>
#include <linux/nsproxy.h>
#include <linux/user_namespace.h>
+#include <linux/refcount.h>
#include <linux/cgroup-defs.h>
@@ -661,7 +661,7 @@ static inline void cgroup_sk_free(struct sock_cgroup_data *skcd) {}
#endif /* CONFIG_CGROUP_DATA */
struct cgroup_namespace {
- atomic_t count;
+ refcount_t count;
struct ns_common ns;
struct user_namespace *user_ns;
struct ucounts *ucounts;
@@ -696,12 +696,12 @@ copy_cgroup_ns(unsigned long flags, struct user_namespace *user_ns,
static inline void get_cgroup_ns(struct cgroup_namespace *ns)
{
if (ns)
- atomic_inc(&ns->count);
+ refcount_inc(&ns->count);
}
static inline void put_cgroup_ns(struct cgroup_namespace *ns)
{
- if (ns && atomic_dec_and_test(&ns->count))
+ if (ns && refcount_dec_and_test(&ns->count))
free_cgroup_ns(ns);
}
diff --git a/include/linux/coda_psdev.h b/include/linux/coda_psdev.h
index 5b8721efa948..31e4e1f1547c 100644
--- a/include/linux/coda_psdev.h
+++ b/include/linux/coda_psdev.h
@@ -15,7 +15,6 @@ struct venus_comm {
struct list_head vc_processing;
int vc_inuse;
struct super_block *vc_sb;
- struct backing_dev_info bdi;
struct mutex vc_mutex;
};
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 611fce58d670..119a3f9604b0 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -42,7 +42,7 @@ static inline void cpuset_dec(void)
extern int cpuset_init(void);
extern void cpuset_init_smp(void);
-extern void cpuset_update_active_cpus(bool cpu_online);
+extern void cpuset_update_active_cpus(void);
extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask);
extern void cpuset_cpus_allowed_fallback(struct task_struct *p);
extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
@@ -155,7 +155,7 @@ static inline bool cpusets_enabled(void) { return false; }
static inline int cpuset_init(void) { return 0; }
static inline void cpuset_init_smp(void) {}
-static inline void cpuset_update_active_cpus(bool cpu_online)
+static inline void cpuset_update_active_cpus(void)
{
partition_sched_domains(1, NULL, NULL);
}
diff --git a/include/linux/dell-led.h b/include/linux/dell-led.h
index 7009b8bec77b..3f033c48071e 100644
--- a/include/linux/dell-led.h
+++ b/include/linux/dell-led.h
@@ -1,10 +1,6 @@
#ifndef __DELL_LED_H__
#define __DELL_LED_H__
-enum {
- DELL_LED_MICMUTE,
-};
-
-int dell_app_wmi_led_set(int whichled, int on);
+int dell_micmute_led_set(int on);
#endif
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index a7e6903866fd..c7ea33e38fb9 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -255,6 +255,12 @@ struct dm_target {
unsigned num_write_same_bios;
/*
+ * The number of WRITE ZEROES bios that will be submitted to the target.
+ * The bio number can be accessed with dm_bio_get_target_bio_nr.
+ */
+ unsigned num_write_zeroes_bios;
+
+ /*
* The minimum number of extra bytes allocated in each io for the
* target to use.
*/
@@ -290,11 +296,6 @@ struct dm_target {
* on max_io_len boundary.
*/
bool split_discard_bios:1;
-
- /*
- * Set if this target does not return zeroes on discarded blocks.
- */
- bool discard_zeroes_data_unsupported:1;
};
/* Each target can link one of these into the table */
diff --git a/include/linux/edac.h b/include/linux/edac.h
index 5b6adf964248..8ae0f45fafd6 100644
--- a/include/linux/edac.h
+++ b/include/linux/edac.h
@@ -28,12 +28,10 @@ struct device;
#define EDAC_OPSTATE_INT 2
extern int edac_op_state;
-extern int edac_err_assert;
-extern atomic_t edac_handlers;
-extern int edac_handler_set(void);
-extern void edac_atomic_assert_error(void);
-extern struct bus_type *edac_get_sysfs_subsys(void);
+struct bus_type *edac_get_sysfs_subsys(void);
+int edac_get_report_status(void);
+void edac_set_report_status(int new);
enum {
EDAC_REPORTING_ENABLED,
@@ -41,28 +39,6 @@ enum {
EDAC_REPORTING_FORCE
};
-extern int edac_report_status;
-#ifdef CONFIG_EDAC
-static inline int get_edac_report_status(void)
-{
- return edac_report_status;
-}
-
-static inline void set_edac_report_status(int new)
-{
- edac_report_status = new;
-}
-#else
-static inline int get_edac_report_status(void)
-{
- return EDAC_REPORTING_DISABLED;
-}
-
-static inline void set_edac_report_status(int new)
-{
-}
-#endif
-
static inline void opstate_init(void)
{
switch (edac_op_state) {
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 22d39e8d4de1..3a216318ae73 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -93,6 +93,8 @@ struct blk_mq_hw_ctx;
struct elevator_mq_ops {
int (*init_sched)(struct request_queue *, struct elevator_type *);
void (*exit_sched)(struct elevator_queue *);
+ int (*init_hctx)(struct blk_mq_hw_ctx *, unsigned int);
+ void (*exit_hctx)(struct blk_mq_hw_ctx *, unsigned int);
bool (*allow_merge)(struct request_queue *, struct request *, struct bio *);
bool (*bio_merge)(struct blk_mq_hw_ctx *, struct bio *);
@@ -104,7 +106,7 @@ struct elevator_mq_ops {
void (*insert_requests)(struct blk_mq_hw_ctx *, struct list_head *, bool);
struct request *(*dispatch_request)(struct blk_mq_hw_ctx *);
bool (*has_work)(struct blk_mq_hw_ctx *);
- void (*completed_request)(struct blk_mq_hw_ctx *, struct request *);
+ void (*completed_request)(struct request *);
void (*started_request)(struct request *);
void (*requeue_request)(struct request *);
struct request *(*former_request)(struct request_queue *, struct request *);
diff --git a/include/linux/extcon.h b/include/linux/extcon.h
index 7010fb01a81a..7e206a9f88db 100644
--- a/include/linux/extcon.h
+++ b/include/linux/extcon.h
@@ -236,11 +236,11 @@ extern int extcon_set_property_capability(struct extcon_dev *edev,
unsigned int id, unsigned int prop);
/*
- * Following APIs are to monitor every action of a notifier.
- * Registrar gets notified for every external port of a connection device.
- * Probably this could be used to debug an action of notifier; however,
- * we do not recommend to use this for normal 'notifiee' device drivers who
- * want to be notified by a specific external port of the notifier.
+ * Following APIs are to monitor the status change of the external connectors.
+ * extcon_register_notifier(*edev, id, *nb) : Register a notifier block
+ * for specific external connector of the extcon.
+ * extcon_register_notifier_all(*edev, *nb) : Register a notifier block
+ * for all supported external connectors of the extcon.
*/
extern int extcon_register_notifier(struct extcon_dev *edev, unsigned int id,
struct notifier_block *nb);
@@ -253,6 +253,17 @@ extern void devm_extcon_unregister_notifier(struct device *dev,
struct extcon_dev *edev, unsigned int id,
struct notifier_block *nb);
+extern int extcon_register_notifier_all(struct extcon_dev *edev,
+ struct notifier_block *nb);
+extern int extcon_unregister_notifier_all(struct extcon_dev *edev,
+ struct notifier_block *nb);
+extern int devm_extcon_register_notifier_all(struct device *dev,
+ struct extcon_dev *edev,
+ struct notifier_block *nb);
+extern void devm_extcon_unregister_notifier_all(struct device *dev,
+ struct extcon_dev *edev,
+ struct notifier_block *nb);
+
/*
* Following API get the extcon device from devicetree.
* This function use phandle of devicetree to get extcon device directly.
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 7251f7bb45e8..30e5c14bd743 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2121,6 +2121,9 @@ extern int vfs_ustat(dev_t, struct kstatfs *);
extern int freeze_super(struct super_block *super);
extern int thaw_super(struct super_block *super);
extern bool our_mnt(struct vfsmount *mnt);
+extern __printf(2, 3)
+int super_setup_bdi_name(struct super_block *sb, char *fmt, ...);
+extern int super_setup_bdi(struct super_block *sb);
extern int current_umask(void);
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 76f39754e7b0..acff9437e5c3 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -159,11 +159,11 @@ struct badblocks;
#if defined(CONFIG_BLK_DEV_INTEGRITY)
struct blk_integrity {
- struct blk_integrity_profile *profile;
- unsigned char flags;
- unsigned char tuple_size;
- unsigned char interval_exp;
- unsigned char tag_size;
+ const struct blk_integrity_profile *profile;
+ unsigned char flags;
+ unsigned char tuple_size;
+ unsigned char interval_exp;
+ unsigned char tag_size;
};
#endif /* CONFIG_BLK_DEV_INTEGRITY */
@@ -722,11 +722,9 @@ static inline void part_nr_sects_write(struct hd_struct *part, sector_t size)
#if defined(CONFIG_BLK_DEV_INTEGRITY)
extern void blk_integrity_add(struct gendisk *);
extern void blk_integrity_del(struct gendisk *);
-extern void blk_integrity_revalidate(struct gendisk *);
#else /* CONFIG_BLK_DEV_INTEGRITY */
static inline void blk_integrity_add(struct gendisk *disk) { }
static inline void blk_integrity_del(struct gendisk *disk) { }
-static inline void blk_integrity_revalidate(struct gendisk *disk) { }
#endif /* CONFIG_BLK_DEV_INTEGRITY */
#else /* CONFIG_BLOCK */
diff --git a/include/linux/hwmon.h b/include/linux/hwmon.h
index 88b673749121..ceb751987c40 100644
--- a/include/linux/hwmon.h
+++ b/include/linux/hwmon.h
@@ -337,7 +337,7 @@ struct hwmon_ops {
int (*read)(struct device *dev, enum hwmon_sensor_types type,
u32 attr, int channel, long *val);
int (*read_string)(struct device *dev, enum hwmon_sensor_types type,
- u32 attr, int channel, char **str);
+ u32 attr, int channel, const char **str);
int (*write)(struct device *dev, enum hwmon_sensor_types type,
u32 attr, int channel, long val);
};
diff --git a/include/linux/ide.h b/include/linux/ide.h
index 2f51c1724b5a..6980ca322074 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -88,7 +88,7 @@ static inline bool ata_pm_request(struct request *rq)
ide_req(rq)->type == ATA_PRIV_PM_RESUME);
}
-/* Error codes returned in rq->errors to the higher part of the driver. */
+/* Error codes returned in result to the higher part of the driver. */
enum {
IDE_DRV_ERROR_GENERAL = 101,
IDE_DRV_ERROR_FILEMARK = 102,
diff --git a/include/linux/inet.h b/include/linux/inet.h
index 4cca05c9678e..636ebe87e6f8 100644
--- a/include/linux/inet.h
+++ b/include/linux/inet.h
@@ -43,6 +43,8 @@
#define _LINUX_INET_H
#include <linux/types.h>
+#include <net/net_namespace.h>
+#include <linux/socket.h>
/*
* These mimic similar macros defined in user-space for inet_ntop(3).
@@ -54,4 +56,8 @@
extern __be32 in_aton(const char *str);
extern int in4_pton(const char *src, int srclen, u8 *dst, int delim, const char **end);
extern int in6_pton(const char *src, int srclen, u8 *dst, int delim, const char **end);
+
+extern int inet_pton_with_scope(struct net *net, unsigned short af,
+ const char *src, const char *port, struct sockaddr_storage *addr);
+
#endif /* _LINUX_INET_H */
diff --git a/include/linux/kobject.h b/include/linux/kobject.h
index e6284591599e..ca85cb80e99a 100644
--- a/include/linux/kobject.h
+++ b/include/linux/kobject.h
@@ -108,6 +108,8 @@ extern int __must_check kobject_rename(struct kobject *, const char *new_name);
extern int __must_check kobject_move(struct kobject *, struct kobject *);
extern struct kobject *kobject_get(struct kobject *kobj);
+extern struct kobject * __must_check kobject_get_unless_zero(
+ struct kobject *kobj);
extern void kobject_put(struct kobject *kobj);
extern const void *kobject_namespace(struct kobject *kobj);
diff --git a/include/linux/leds-pca9532.h b/include/linux/leds-pca9532.h
index d215b4561180..5e240b2b4d58 100644
--- a/include/linux/leds-pca9532.h
+++ b/include/linux/leds-pca9532.h
@@ -22,7 +22,8 @@ enum pca9532_state {
PCA9532_OFF = 0x0,
PCA9532_ON = 0x1,
PCA9532_PWM0 = 0x2,
- PCA9532_PWM1 = 0x3
+ PCA9532_PWM1 = 0x3,
+ PCA9532_KEEP = 0xff,
};
struct pca9532_led {
@@ -44,4 +45,3 @@ struct pca9532_platform_data {
};
#endif /* __LINUX_PCA9532_H */
-
diff --git a/include/linux/leds.h b/include/linux/leds.h
index 38c0bd7ca107..64c56d454f7d 100644
--- a/include/linux/leds.h
+++ b/include/linux/leds.h
@@ -122,10 +122,16 @@ struct led_classdev {
struct mutex led_access;
};
-extern int led_classdev_register(struct device *parent,
- struct led_classdev *led_cdev);
-extern int devm_led_classdev_register(struct device *parent,
- struct led_classdev *led_cdev);
+extern int of_led_classdev_register(struct device *parent,
+ struct device_node *np,
+ struct led_classdev *led_cdev);
+#define led_classdev_register(parent, led_cdev) \
+ of_led_classdev_register(parent, NULL, led_cdev)
+extern int devm_of_led_classdev_register(struct device *parent,
+ struct device_node *np,
+ struct led_classdev *led_cdev);
+#define devm_led_classdev_register(parent, led_cdev) \
+ devm_of_led_classdev_register(parent, NULL, led_cdev)
extern void led_classdev_unregister(struct led_classdev *led_cdev);
extern void devm_led_classdev_unregister(struct device *parent,
struct led_classdev *led_cdev);
diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index ca45e4a088a9..7dfa56ebbc6d 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -56,7 +56,6 @@ typedef int (nvm_get_l2p_tbl_fn)(struct nvm_dev *, u64, u32,
typedef int (nvm_op_bb_tbl_fn)(struct nvm_dev *, struct ppa_addr, u8 *);
typedef int (nvm_op_set_bb_fn)(struct nvm_dev *, struct ppa_addr *, int, int);
typedef int (nvm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *);
-typedef int (nvm_erase_blk_fn)(struct nvm_dev *, struct nvm_rq *);
typedef void *(nvm_create_dma_pool_fn)(struct nvm_dev *, char *);
typedef void (nvm_destroy_dma_pool_fn)(void *);
typedef void *(nvm_dev_dma_alloc_fn)(struct nvm_dev *, void *, gfp_t,
@@ -70,7 +69,6 @@ struct nvm_dev_ops {
nvm_op_set_bb_fn *set_bb_tbl;
nvm_submit_io_fn *submit_io;
- nvm_erase_blk_fn *erase_block;
nvm_create_dma_pool_fn *create_dma_pool;
nvm_destroy_dma_pool_fn *destroy_dma_pool;
@@ -125,7 +123,7 @@ enum {
/* NAND Access Modes */
NVM_IO_SUSPEND = 0x80,
NVM_IO_SLC_MODE = 0x100,
- NVM_IO_SCRAMBLE_DISABLE = 0x200,
+ NVM_IO_SCRAMBLE_ENABLE = 0x200,
/* Block Types */
NVM_BLK_T_FREE = 0x0,
@@ -438,7 +436,8 @@ static inline int ppa_cmp_blk(struct ppa_addr ppa1, struct ppa_addr ppa2)
typedef blk_qc_t (nvm_tgt_make_rq_fn)(struct request_queue *, struct bio *);
typedef sector_t (nvm_tgt_capacity_fn)(void *);
-typedef void *(nvm_tgt_init_fn)(struct nvm_tgt_dev *, struct gendisk *);
+typedef void *(nvm_tgt_init_fn)(struct nvm_tgt_dev *, struct gendisk *,
+ int flags);
typedef void (nvm_tgt_exit_fn)(void *);
typedef int (nvm_tgt_sysfs_init_fn)(struct gendisk *);
typedef void (nvm_tgt_sysfs_exit_fn)(struct gendisk *);
@@ -479,10 +478,10 @@ extern int nvm_set_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr *,
int, int);
extern int nvm_max_phys_sects(struct nvm_tgt_dev *);
extern int nvm_submit_io(struct nvm_tgt_dev *, struct nvm_rq *);
-extern int nvm_set_rqd_ppalist(struct nvm_dev *, struct nvm_rq *,
+extern int nvm_erase_sync(struct nvm_tgt_dev *, struct ppa_addr *, int);
+extern int nvm_set_rqd_ppalist(struct nvm_tgt_dev *, struct nvm_rq *,
const struct ppa_addr *, int, int);
-extern void nvm_free_rqd_ppalist(struct nvm_dev *, struct nvm_rq *);
-extern int nvm_erase_blk(struct nvm_tgt_dev *, struct ppa_addr *, int);
+extern void nvm_free_rqd_ppalist(struct nvm_tgt_dev *, struct nvm_rq *);
extern int nvm_get_l2p_tbl(struct nvm_tgt_dev *, u64, u32, nvm_l2p_update_fn *,
void *);
extern int nvm_get_area(struct nvm_tgt_dev *, sector_t *, sector_t);
diff --git a/include/linux/mailbox/brcm-message.h b/include/linux/mailbox/brcm-message.h
index 6b55c938b401..c20b4843fc2d 100644
--- a/include/linux/mailbox/brcm-message.h
+++ b/include/linux/mailbox/brcm-message.h
@@ -16,6 +16,7 @@
enum brcm_message_type {
BRCM_MESSAGE_UNKNOWN = 0,
+ BRCM_MESSAGE_BATCH,
BRCM_MESSAGE_SPU,
BRCM_MESSAGE_SBA,
BRCM_MESSAGE_MAX,
@@ -23,24 +24,29 @@ enum brcm_message_type {
struct brcm_sba_command {
u64 cmd;
+ u64 *cmd_dma;
+ dma_addr_t cmd_dma_addr;
#define BRCM_SBA_CMD_TYPE_A BIT(0)
#define BRCM_SBA_CMD_TYPE_B BIT(1)
#define BRCM_SBA_CMD_TYPE_C BIT(2)
#define BRCM_SBA_CMD_HAS_RESP BIT(3)
#define BRCM_SBA_CMD_HAS_OUTPUT BIT(4)
u64 flags;
- dma_addr_t input;
- size_t input_len;
dma_addr_t resp;
size_t resp_len;
- dma_addr_t output;
- size_t output_len;
+ dma_addr_t data;
+ size_t data_len;
};
struct brcm_message {
enum brcm_message_type type;
union {
struct {
+ struct brcm_message *msgs;
+ unsigned int msgs_queued;
+ unsigned int msgs_count;
+ } batch;
+ struct {
struct scatterlist *src;
struct scatterlist *dst;
} spu;
diff --git a/include/linux/mfd/motorola-cpcap.h b/include/linux/mfd/motorola-cpcap.h
index b4031c2b2214..53758a7d7c32 100644
--- a/include/linux/mfd/motorola-cpcap.h
+++ b/include/linux/mfd/motorola-cpcap.h
@@ -14,6 +14,9 @@
* published by the Free Software Foundation.
*/
+#include <linux/device.h>
+#include <linux/regmap.h>
+
#define CPCAP_VENDOR_ST 0
#define CPCAP_VENDOR_TI 1
diff --git a/include/linux/mg_disk.h b/include/linux/mg_disk.h
deleted file mode 100644
index e11f4d9f1c2e..000000000000
--- a/include/linux/mg_disk.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * include/linux/mg_disk.c
- *
- * Private data for mflash platform driver
- *
- * (c) 2008 mGine Co.,LTD
- * (c) 2008 unsik Kim <donari75@gmail.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#ifndef __MG_DISK_H__
-#define __MG_DISK_H__
-
-/* name for platform device */
-#define MG_DEV_NAME "mg_disk"
-
-/* names of GPIO resource */
-#define MG_RST_PIN "mg_rst"
-/* except MG_BOOT_DEV, reset-out pin should be assigned */
-#define MG_RSTOUT_PIN "mg_rstout"
-
-/* device attribution */
-/* use mflash as boot device */
-#define MG_BOOT_DEV (1 << 0)
-/* use mflash as storage device */
-#define MG_STORAGE_DEV (1 << 1)
-/* same as MG_STORAGE_DEV, but bootloader already done reset sequence */
-#define MG_STORAGE_DEV_SKIP_RST (1 << 2)
-
-/* private driver data */
-struct mg_drv_data {
- /* disk resource */
- u32 use_polling;
-
- /* device attribution */
- u32 dev_attr;
-
- /* internally used */
- void *host;
-};
-
-#endif
diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h
index eebdc63cf6af..79b176eca04a 100644
--- a/include/linux/mtd/mtd.h
+++ b/include/linux/mtd/mtd.h
@@ -334,11 +334,6 @@ struct mtd_info {
int (*_get_device) (struct mtd_info *mtd);
void (*_put_device) (struct mtd_info *mtd);
- /* Backing device capabilities for this device
- * - provides mmap capabilities
- */
- struct backing_dev_info *backing_dev_info;
-
struct notifier_block reboot_notifier; /* default mode before reboot */
/* ECC status information */
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index b34097c67848..e1502c55741e 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -133,7 +133,6 @@ struct nfs_server {
struct rpc_clnt * client_acl; /* ACL RPC client handle */
struct nlm_host *nlm_host; /* NLM client handle */
struct nfs_iostats __percpu *io_stats; /* I/O statistics */
- struct backing_dev_info backing_dev_info;
atomic_long_t writeback; /* number of writeback pages */
int flags; /* various flags */
unsigned int caps; /* server capabilities */
diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h
index f21471f7ee40..0db37158a61d 100644
--- a/include/linux/nvme-fc-driver.h
+++ b/include/linux/nvme-fc-driver.h
@@ -137,9 +137,9 @@ enum nvmefc_fcp_datadir {
* transferred. Should equal payload_length on success.
* @rcv_rsplen: length, in bytes, of the FCP RSP IU received.
* @status: Completion status of the FCP operation. must be 0 upon success,
- * NVME_SC_FC_xxx value upon failure. Note: this is NOT a
- * reflection of the NVME CQE completion status. Only the status
- * of the FCP operation at the NVME-FC level.
+ * negative errno value upon failure (ex: -EIO). Note: this is
+ * NOT a reflection of the NVME CQE completion status. Only the
+ * status of the FCP operation at the NVME-FC level.
*/
struct nvmefc_fcp_req {
void *cmdaddr;
@@ -533,9 +533,6 @@ enum {
* rsp as well
*/
NVMET_FCOP_RSP = 4, /* send rsp frame */
- NVMET_FCOP_ABORT = 5, /* abort exchange via ABTS */
- NVMET_FCOP_BA_ACC = 6, /* send BA_ACC */
- NVMET_FCOP_BA_RJT = 7, /* send BA_RJT */
};
/**
@@ -572,8 +569,6 @@ enum {
* upon compeletion of the operation. The nvmet-fc layer will also set a
* private pointer for its own use in the done routine.
*
- * Note: the LLDD must never fail a NVMET_FCOP_ABORT request !!
- *
* Values set by the NVMET-FC layer prior to calling the LLDD fcp_op
* entrypoint.
* @op: Indicates the FCP IU operation to perform (see NVMET_FCOP_xxx)
@@ -655,6 +650,22 @@ enum {
* on. The transport should pick a cpu to schedule the work
* on.
*/
+ NVMET_FCTGTFEAT_CMD_IN_ISR = (1 << 2),
+ /* Bit 2: When 0, the LLDD is calling the cmd rcv handler
+ * in a non-isr context, allowing the transport to finish
+ * op completion in the calling context. When 1, the LLDD
+ * is calling the cmd rcv handler in an ISR context,
+ * requiring the transport to transition to a workqueue
+ * for op completion.
+ */
+ NVMET_FCTGTFEAT_OPDONE_IN_ISR = (1 << 3),
+ /* Bit 3: When 0, the LLDD is calling the op done handler
+ * in a non-isr context, allowing the transport to finish
+ * op completion in the calling context. When 1, the LLDD
+ * is calling the op done handler in an ISR context,
+ * requiring the transport to transition to a workqueue
+ * for op completion.
+ */
};
@@ -725,12 +736,12 @@ struct nvmet_fc_target_port {
* be freed/released.
* Entrypoint is Mandatory.
*
- * @fcp_op: Called to perform a data transfer, transmit a response, or
- * abort an FCP opertion. The nvmefc_tgt_fcp_req structure is the same
- * LLDD-supplied exchange structure specified in the
- * nvmet_fc_rcv_fcp_req() call made when the FCP CMD IU was received.
- * The op field in the structure shall indicate the operation for
- * the LLDD to perform relative to the io.
+ * @fcp_op: Called to perform a data transfer or transmit a response.
+ * The nvmefc_tgt_fcp_req structure is the same LLDD-supplied
+ * exchange structure specified in the nvmet_fc_rcv_fcp_req() call
+ * made when the FCP CMD IU was received. The op field in the
+ * structure shall indicate the operation for the LLDD to perform
+ * relative to the io.
* NVMET_FCOP_READDATA operation: the LLDD is to send the
* payload data (described by sglist) to the host in 1 or
* more FC sequences (preferrably 1). Note: the fc-nvme layer
@@ -752,29 +763,31 @@ struct nvmet_fc_target_port {
* successfully, the LLDD is to update the nvmefc_tgt_fcp_req
* transferred_length field and may subsequently transmit the
* FCP_RSP iu payload (described by rspbuf, rspdma, rsplen).
- * The LLDD is to await FCP_CONF reception to confirm the RSP
- * reception by the host. The LLDD may retramsit the FCP_RSP iu
- * if necessary per FC-NVME. Upon reception of FCP_CONF, or upon
- * FCP_CONF failure, the LLDD is to set the nvmefc_tgt_fcp_req
- * fcp_error field and consider the operation complete..
+ * If FCP_CONF is supported, the LLDD is to await FCP_CONF
+ * reception to confirm the RSP reception by the host. The LLDD
+ * may retramsit the FCP_RSP iu if necessary per FC-NVME. Upon
+ * transmission of the FCP_RSP iu if FCP_CONF is not supported,
+ * or upon success/failure of FCP_CONF if it is supported, the
+ * LLDD is to set the nvmefc_tgt_fcp_req fcp_error field and
+ * consider the operation complete.
* NVMET_FCOP_RSP: the LLDD is to transmit the FCP_RSP iu payload
- * (described by rspbuf, rspdma, rsplen). The LLDD is to await
- * FCP_CONF reception to confirm the RSP reception by the host.
- * The LLDD may retramsit the FCP_RSP iu if necessary per FC-NVME.
- * Upon reception of FCP_CONF, or upon FCP_CONF failure, the
+ * (described by rspbuf, rspdma, rsplen). If FCP_CONF is
+ * supported, the LLDD is to await FCP_CONF reception to confirm
+ * the RSP reception by the host. The LLDD may retramsit the
+ * FCP_RSP iu if FCP_CONF is not received per FC-NVME. Upon
+ * transmission of the FCP_RSP iu if FCP_CONF is not supported,
+ * or upon success/failure of FCP_CONF if it is supported, the
* LLDD is to set the nvmefc_tgt_fcp_req fcp_error field and
- * consider the operation complete..
- * NVMET_FCOP_ABORT: the LLDD is to terminate the exchange
- * corresponding to the fcp operation. The LLDD shall send
- * ABTS and follow FC exchange abort-multi rules, including
- * ABTS retries and possible logout.
+ * consider the operation complete.
* Upon completing the indicated operation, the LLDD is to set the
* status fields for the operation (tranferred_length and fcp_error
- * status) in the request, then all the "done" routine
- * indicated in the fcp request. Upon return from the "done"
- * routine for either a NVMET_FCOP_RSP or NVMET_FCOP_ABORT operation
- * the fc-nvme layer will not longer reference the fcp request,
- * allowing the LLDD to free/release the fcp request.
+ * status) in the request, then call the "done" routine
+ * indicated in the fcp request. After the operation completes,
+ * regardless of whether the FCP_RSP iu was successfully transmit,
+ * the LLDD-supplied exchange structure must remain valid until the
+ * transport calls the fcp_req_release() callback to return ownership
+ * of the exchange structure back to the LLDD so that it may be used
+ * for another fcp command.
* Note: when calling the done routine for READDATA or WRITEDATA
* operations, the fc-nvme layer may immediate convert, in the same
* thread and before returning to the LLDD, the fcp operation to
@@ -786,6 +799,22 @@ struct nvmet_fc_target_port {
* Returns 0 on success, -<errno> on failure (Ex: -EIO)
* Entrypoint is Mandatory.
*
+ * @fcp_abort: Called by the transport to abort an active command.
+ * The command may be in-between operations (nothing active in LLDD)
+ * or may have an active WRITEDATA operation pending. The LLDD is to
+ * initiate the ABTS process for the command and return from the
+ * callback. The ABTS does not need to be complete on the command.
+ * The fcp_abort callback inherently cannot fail. After the
+ * fcp_abort() callback completes, the transport will wait for any
+ * outstanding operation (if there was one) to complete, then will
+ * call the fcp_req_release() callback to return the command's
+ * exchange context back to the LLDD.
+ *
+ * @fcp_req_release: Called by the transport to return a nvmefc_tgt_fcp_req
+ * to the LLDD after all operations on the fcp operation are complete.
+ * This may be due to the command completing or upon completion of
+ * abort cleanup.
+ *
* @max_hw_queues: indicates the maximum number of hw queues the LLDD
* supports for cpu affinitization.
* Value is Mandatory. Must be at least 1.
@@ -820,7 +849,11 @@ struct nvmet_fc_target_template {
int (*xmt_ls_rsp)(struct nvmet_fc_target_port *tgtport,
struct nvmefc_tgt_ls_req *tls_req);
int (*fcp_op)(struct nvmet_fc_target_port *tgtport,
- struct nvmefc_tgt_fcp_req *);
+ struct nvmefc_tgt_fcp_req *fcpreq);
+ void (*fcp_abort)(struct nvmet_fc_target_port *tgtport,
+ struct nvmefc_tgt_fcp_req *fcpreq);
+ void (*fcp_req_release)(struct nvmet_fc_target_port *tgtport,
+ struct nvmefc_tgt_fcp_req *fcpreq);
u32 max_hw_queues;
u16 max_sgl_segments;
@@ -848,4 +881,7 @@ int nvmet_fc_rcv_fcp_req(struct nvmet_fc_target_port *tgtport,
struct nvmefc_tgt_fcp_req *fcpreq,
void *cmdiubuf, u32 cmdiubuf_len);
+void nvmet_fc_rcv_fcp_abort(struct nvmet_fc_target_port *tgtport,
+ struct nvmefc_tgt_fcp_req *fcpreq);
+
#endif /* _NVME_FC_DRIVER_H */
diff --git a/include/linux/nvme-fc.h b/include/linux/nvme-fc.h
index 4b45226bd604..e997c4a49a88 100644
--- a/include/linux/nvme-fc.h
+++ b/include/linux/nvme-fc.h
@@ -16,8 +16,7 @@
*/
/*
- * This file contains definitions relative to FC-NVME r1.11 and a few
- * newer items
+ * This file contains definitions relative to FC-NVME r1.14 (16-020vB).
*/
#ifndef _NVME_FC_H
@@ -47,8 +46,15 @@ struct nvme_fc_cmd_iu {
#define NVME_FC_SIZEOF_ZEROS_RSP 12
+enum {
+ FCNVME_SC_SUCCESS = 0,
+ FCNVME_SC_INVALID_FIELD = 1,
+ FCNVME_SC_INVALID_CONNID = 2,
+};
+
struct nvme_fc_ersp_iu {
- __u8 rsvd0[2];
+ __u8 status_code;
+ __u8 rsvd1;
__be16 iu_len;
__be32 rsn;
__be32 xfrd_len;
@@ -58,7 +64,7 @@ struct nvme_fc_ersp_iu {
};
-/* FC-NVME r1.03/16-119v0 NVME Link Services */
+/* FC-NVME Link Services */
enum {
FCNVME_LS_RSVD = 0,
FCNVME_LS_RJT = 1,
@@ -68,7 +74,7 @@ enum {
FCNVME_LS_DISCONNECT = 5,
};
-/* FC-NVME r1.03/16-119v0 NVME Link Service Descriptors */
+/* FC-NVME Link Service Descriptors */
enum {
FCNVME_LSDESC_RSVD = 0x0,
FCNVME_LSDESC_RQST = 0x1,
@@ -92,7 +98,6 @@ static inline __be32 fcnvme_lsdesc_len(size_t sz)
return cpu_to_be32(sz - (2 * sizeof(u32)));
}
-
struct fcnvme_ls_rqst_w0 {
u8 ls_cmd; /* FCNVME_LS_xxx */
u8 zeros[3];
@@ -106,8 +111,53 @@ struct fcnvme_lsdesc_rqst {
__be32 rsvd12;
};
+/* FC-NVME LS RJT reason_code values */
+enum fcnvme_ls_rjt_reason {
+ FCNVME_RJT_RC_NONE = 0,
+ /* no reason - not to be sent */
+
+ FCNVME_RJT_RC_INVAL = 0x01,
+ /* invalid NVMe_LS command code */
+
+ FCNVME_RJT_RC_LOGIC = 0x03,
+ /* logical error */
+
+ FCNVME_RJT_RC_UNAB = 0x09,
+ /* unable to perform command request */
+
+ FCNVME_RJT_RC_UNSUP = 0x0b,
+ /* command not supported */
+
+ FCNVME_RJT_RC_INPROG = 0x0e,
+ /* command already in progress */
+ FCNVME_RJT_RC_INV_ASSOC = 0x40,
+ /* Invalid Association ID*/
+ FCNVME_RJT_RC_INV_CONN = 0x41,
+ /* Invalid Connection ID*/
+
+ FCNVME_RJT_RC_VENDOR = 0xff,
+ /* vendor specific error */
+};
+
+/* FC-NVME LS RJT reason_explanation values */
+enum fcnvme_ls_rjt_explan {
+ FCNVME_RJT_EXP_NONE = 0x00,
+ /* No additional explanation */
+
+ FCNVME_RJT_EXP_OXID_RXID = 0x17,
+ /* invalid OX_ID-RX_ID combination */
+
+ FCNVME_RJT_EXP_INSUF_RES = 0x29,
+ /* insufficient resources */
+
+ FCNVME_RJT_EXP_UNAB_DATA = 0x2a,
+ /* unable to supply requested data */
+
+ FCNVME_RJT_EXP_INV_LEN = 0x2d,
+ /* Invalid payload length */
+};
/* FCNVME_LSDESC_RJT */
struct fcnvme_lsdesc_rjt {
@@ -119,15 +169,15 @@ struct fcnvme_lsdesc_rjt {
* Reject reason and explanaction codes are generic
* to ELs's from LS-3.
*/
- u8 reason_code;
- u8 reason_explanation;
+ u8 reason_code; /* fcnvme_ls_rjt_reason */
+ u8 reason_explanation; /* fcnvme_ls_rjt_explan */
u8 vendor;
__be32 rsvd12;
};
-#define FCNVME_ASSOC_HOSTID_LEN 64
+#define FCNVME_ASSOC_HOSTID_LEN 16
#define FCNVME_ASSOC_HOSTNQN_LEN 256
#define FCNVME_ASSOC_SUBNQN_LEN 256
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 9061780b141f..b625bacf37ef 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -245,6 +245,7 @@ enum {
NVME_CTRL_ONCS_WRITE_ZEROES = 1 << 3,
NVME_CTRL_VWC_PRESENT = 1 << 0,
NVME_CTRL_OACS_SEC_SUPP = 1 << 0,
+ NVME_CTRL_OACS_DBBUF_SUPP = 1 << 7,
};
struct nvme_lbaf {
@@ -603,6 +604,7 @@ enum nvme_admin_opcode {
nvme_admin_download_fw = 0x11,
nvme_admin_ns_attach = 0x15,
nvme_admin_keep_alive = 0x18,
+ nvme_admin_dbbuf = 0x7C,
nvme_admin_format_nvm = 0x80,
nvme_admin_security_send = 0x81,
nvme_admin_security_recv = 0x82,
@@ -874,6 +876,16 @@ struct nvmf_property_get_command {
__u8 resv4[16];
};
+struct nvme_dbbuf {
+ __u8 opcode;
+ __u8 flags;
+ __u16 command_id;
+ __u32 rsvd1[5];
+ __le64 prp1;
+ __le64 prp2;
+ __u32 rsvd12[6];
+};
+
struct nvme_command {
union {
struct nvme_common_command common;
@@ -893,6 +905,7 @@ struct nvme_command {
struct nvmf_connect_command connect;
struct nvmf_property_set_command prop_set;
struct nvmf_property_get_command prop_get;
+ struct nvme_dbbuf dbbuf;
};
};
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 43a774873aa9..fb3857337151 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -852,6 +852,7 @@ void phy_change_work(struct work_struct *work);
void phy_mac_interrupt(struct phy_device *phydev, int new_link);
void phy_start_machine(struct phy_device *phydev);
void phy_stop_machine(struct phy_device *phydev);
+void phy_trigger_machine(struct phy_device *phydev, bool sync);
int phy_ethtool_sset(struct phy_device *phydev, struct ethtool_cmd *cmd);
int phy_ethtool_gset(struct phy_device *phydev, struct ethtool_cmd *cmd);
int phy_ethtool_ksettings_get(struct phy_device *phydev,
diff --git a/include/linux/power/bq24190_charger.h b/include/linux/power/bq24190_charger.h
deleted file mode 100644
index 9f0283721cbc..000000000000
--- a/include/linux/power/bq24190_charger.h
+++ /dev/null
@@ -1,16 +0,0 @@
-/*
- * Platform data for the TI bq24190 battery charger driver.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#ifndef _BQ24190_CHARGER_H_
-#define _BQ24190_CHARGER_H_
-
-struct bq24190_platform_data {
- unsigned int gpio_int; /* GPIO pin that's connected to INT# */
-};
-
-#endif
diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
index d4e0a204c118..a1904aadbc45 100644
--- a/include/linux/sbitmap.h
+++ b/include/linux/sbitmap.h
@@ -176,6 +176,25 @@ void sbitmap_resize(struct sbitmap *sb, unsigned int depth);
int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint, bool round_robin);
/**
+ * sbitmap_get_shallow() - Try to allocate a free bit from a &struct sbitmap,
+ * limiting the depth used from each word.
+ * @sb: Bitmap to allocate from.
+ * @alloc_hint: Hint for where to start searching for a free bit.
+ * @shallow_depth: The maximum number of bits to allocate from a single word.
+ *
+ * This rather specific operation allows for having multiple users with
+ * different allocation limits. E.g., there can be a high-priority class that
+ * uses sbitmap_get() and a low-priority class that uses sbitmap_get_shallow()
+ * with a @shallow_depth of (1 << (@sb->shift - 1)). Then, the low-priority
+ * class can only allocate half of the total bits in the bitmap, preventing it
+ * from starving out the high-priority class.
+ *
+ * Return: Non-negative allocated bit number if successful, -1 otherwise.
+ */
+int sbitmap_get_shallow(struct sbitmap *sb, unsigned int alloc_hint,
+ unsigned long shallow_depth);
+
+/**
* sbitmap_any_bit_set() - Check for a set bit in a &struct sbitmap.
* @sb: Bitmap to check.
*
@@ -326,6 +345,19 @@ void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth);
int __sbitmap_queue_get(struct sbitmap_queue *sbq);
/**
+ * __sbitmap_queue_get_shallow() - Try to allocate a free bit from a &struct
+ * sbitmap_queue, limiting the depth used from each word, with preemption
+ * already disabled.
+ * @sbq: Bitmap queue to allocate from.
+ * @shallow_depth: The maximum number of bits to allocate from a single word.
+ * See sbitmap_get_shallow().
+ *
+ * Return: Non-negative allocated bit number if successful, -1 otherwise.
+ */
+int __sbitmap_queue_get_shallow(struct sbitmap_queue *sbq,
+ unsigned int shallow_depth);
+
+/**
* sbitmap_queue_get() - Try to allocate a free bit from a &struct
* sbitmap_queue.
* @sbq: Bitmap queue to allocate from.
@@ -346,6 +378,29 @@ static inline int sbitmap_queue_get(struct sbitmap_queue *sbq,
}
/**
+ * sbitmap_queue_get_shallow() - Try to allocate a free bit from a &struct
+ * sbitmap_queue, limiting the depth used from each word.
+ * @sbq: Bitmap queue to allocate from.
+ * @cpu: Output parameter; will contain the CPU we ran on (e.g., to be passed to
+ * sbitmap_queue_clear()).
+ * @shallow_depth: The maximum number of bits to allocate from a single word.
+ * See sbitmap_get_shallow().
+ *
+ * Return: Non-negative allocated bit number if successful, -1 otherwise.
+ */
+static inline int sbitmap_queue_get_shallow(struct sbitmap_queue *sbq,
+ unsigned int *cpu,
+ unsigned int shallow_depth)
+{
+ int nr;
+
+ *cpu = get_cpu();
+ nr = __sbitmap_queue_get_shallow(sbq, shallow_depth);
+ put_cpu();
+ return nr;
+}
+
+/**
* sbitmap_queue_clear() - Free an allocated bit and wake up waiters on a
* &struct sbitmap_queue.
* @sbq: Bitmap to free from.
diff --git a/include/linux/t10-pi.h b/include/linux/t10-pi.h
index 9fba9dd33544..9375d23a24e7 100644
--- a/include/linux/t10-pi.h
+++ b/include/linux/t10-pi.h
@@ -34,9 +34,9 @@ struct t10_pi_tuple {
};
-extern struct blk_integrity_profile t10_pi_type1_crc;
-extern struct blk_integrity_profile t10_pi_type1_ip;
-extern struct blk_integrity_profile t10_pi_type3_crc;
-extern struct blk_integrity_profile t10_pi_type3_ip;
+extern const struct blk_integrity_profile t10_pi_type1_crc;
+extern const struct blk_integrity_profile t10_pi_type1_ip;
+extern const struct blk_integrity_profile t10_pi_type3_crc;
+extern const struct blk_integrity_profile t10_pi_type3_ip;
#endif
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index a3c0cbd7c888..d5815794416c 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -237,6 +237,7 @@ static inline void inode_attach_wb(struct inode *inode, struct page *page)
static inline void inode_detach_wb(struct inode *inode)
{
if (inode->i_wb) {
+ WARN_ON_ONCE(!(inode->i_state & I_CLEAR));
wb_put(inode->i_wb);
inode->i_wb = NULL;
}
diff --git a/include/scsi/scsi_request.h b/include/scsi/scsi_request.h
index ba0aeb980f7e..f0c76f9dc285 100644
--- a/include/scsi/scsi_request.h
+++ b/include/scsi/scsi_request.h
@@ -9,8 +9,10 @@ struct scsi_request {
unsigned char __cmd[BLK_MAX_CDB];
unsigned char *cmd;
unsigned short cmd_len;
+ int result;
unsigned int sense_len;
unsigned int resid_len; /* residual count */
+ int retries;
void *sense;
};
diff --git a/include/trace/events/block.h b/include/trace/events/block.h
index a88ed13446ff..d0dbe60d8a6d 100644
--- a/include/trace/events/block.h
+++ b/include/trace/events/block.h
@@ -61,7 +61,16 @@ DEFINE_EVENT(block_buffer, block_dirty_buffer,
TP_ARGS(bh)
);
-DECLARE_EVENT_CLASS(block_rq_with_error,
+/**
+ * block_rq_requeue - place block IO request back on a queue
+ * @q: queue holding operation
+ * @rq: block IO operation request
+ *
+ * The block operation request @rq is being placed back into queue
+ * @q. For some reason the request was not completed and needs to be
+ * put back in the queue.
+ */
+TRACE_EVENT(block_rq_requeue,
TP_PROTO(struct request_queue *q, struct request *rq),
@@ -71,7 +80,6 @@ DECLARE_EVENT_CLASS(block_rq_with_error,
__field( dev_t, dev )
__field( sector_t, sector )
__field( unsigned int, nr_sector )
- __field( int, errors )
__array( char, rwbs, RWBS_LEN )
__dynamic_array( char, cmd, 1 )
),
@@ -80,7 +88,6 @@ DECLARE_EVENT_CLASS(block_rq_with_error,
__entry->dev = rq->rq_disk ? disk_devt(rq->rq_disk) : 0;
__entry->sector = blk_rq_trace_sector(rq);
__entry->nr_sector = blk_rq_trace_nr_sectors(rq);
- __entry->errors = rq->errors;
blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, blk_rq_bytes(rq));
__get_str(cmd)[0] = '\0';
@@ -90,46 +97,13 @@ DECLARE_EVENT_CLASS(block_rq_with_error,
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->rwbs, __get_str(cmd),
(unsigned long long)__entry->sector,
- __entry->nr_sector, __entry->errors)
-);
-
-/**
- * block_rq_abort - abort block operation request
- * @q: queue containing the block operation request
- * @rq: block IO operation request
- *
- * Called immediately after pending block IO operation request @rq in
- * queue @q is aborted. The fields in the operation request @rq
- * can be examined to determine which device and sectors the pending
- * operation would access.
- */
-DEFINE_EVENT(block_rq_with_error, block_rq_abort,
-
- TP_PROTO(struct request_queue *q, struct request *rq),
-
- TP_ARGS(q, rq)
-);
-
-/**
- * block_rq_requeue - place block IO request back on a queue
- * @q: queue holding operation
- * @rq: block IO operation request
- *
- * The block operation request @rq is being placed back into queue
- * @q. For some reason the request was not completed and needs to be
- * put back in the queue.
- */
-DEFINE_EVENT(block_rq_with_error, block_rq_requeue,
-
- TP_PROTO(struct request_queue *q, struct request *rq),
-
- TP_ARGS(q, rq)
+ __entry->nr_sector, 0)
);
/**
* block_rq_complete - block IO operation completed by device driver
- * @q: queue containing the block operation request
* @rq: block operations request
+ * @error: status code
* @nr_bytes: number of completed bytes
*
* The block_rq_complete tracepoint event indicates that some portion
@@ -140,16 +114,15 @@ DEFINE_EVENT(block_rq_with_error, block_rq_requeue,
*/
TRACE_EVENT(block_rq_complete,
- TP_PROTO(struct request_queue *q, struct request *rq,
- unsigned int nr_bytes),
+ TP_PROTO(struct request *rq, int error, unsigned int nr_bytes),
- TP_ARGS(q, rq, nr_bytes),
+ TP_ARGS(rq, error, nr_bytes),
TP_STRUCT__entry(
__field( dev_t, dev )
__field( sector_t, sector )
__field( unsigned int, nr_sector )
- __field( int, errors )
+ __field( int, error )
__array( char, rwbs, RWBS_LEN )
__dynamic_array( char, cmd, 1 )
),
@@ -158,7 +131,7 @@ TRACE_EVENT(block_rq_complete,
__entry->dev = rq->rq_disk ? disk_devt(rq->rq_disk) : 0;
__entry->sector = blk_rq_pos(rq);
__entry->nr_sector = nr_bytes >> 9;
- __entry->errors = rq->errors;
+ __entry->error = error;
blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, nr_bytes);
__get_str(cmd)[0] = '\0';
@@ -168,7 +141,7 @@ TRACE_EVENT(block_rq_complete,
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->rwbs, __get_str(cmd),
(unsigned long long)__entry->sector,
- __entry->nr_sector, __entry->errors)
+ __entry->nr_sector, __entry->error)
);
DECLARE_EVENT_CLASS(block_rq,
diff --git a/include/uapi/linux/lightnvm.h b/include/uapi/linux/lightnvm.h
index fd19f36b3129..c8aec4b9e73b 100644
--- a/include/uapi/linux/lightnvm.h
+++ b/include/uapi/linux/lightnvm.h
@@ -85,6 +85,10 @@ struct nvm_ioctl_create_conf {
};
};
+enum {
+ NVM_TARGET_FACTORY = 1 << 0, /* Init target in factory mode */
+};
+
struct nvm_ioctl_create {
char dev[DISK_NAME_LEN]; /* open-channel SSD device */
char tgttype[NVM_TTYPE_NAME_MAX]; /* target type name */
diff --git a/include/uapi/linux/nbd-netlink.h b/include/uapi/linux/nbd-netlink.h
new file mode 100644
index 000000000000..6f7ca3d63a65
--- /dev/null
+++ b/include/uapi/linux/nbd-netlink.h
@@ -0,0 +1,98 @@
+/*
+ * Copyright (C) 2017 Facebook. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+#ifndef _UAPILINUX_NBD_NETLINK_H
+#define _UAPILINUX_NBD_NETLINK_H
+
+#define NBD_GENL_FAMILY_NAME "nbd"
+#define NBD_GENL_VERSION 0x1
+#define NBD_GENL_MCAST_GROUP_NAME "nbd_mc_group"
+
+/* Configuration policy attributes, used for CONNECT */
+enum {
+ NBD_ATTR_UNSPEC,
+ NBD_ATTR_INDEX,
+ NBD_ATTR_SIZE_BYTES,
+ NBD_ATTR_BLOCK_SIZE_BYTES,
+ NBD_ATTR_TIMEOUT,
+ NBD_ATTR_SERVER_FLAGS,
+ NBD_ATTR_CLIENT_FLAGS,
+ NBD_ATTR_SOCKETS,
+ NBD_ATTR_DEAD_CONN_TIMEOUT,
+ NBD_ATTR_DEVICE_LIST,
+ __NBD_ATTR_MAX,
+};
+#define NBD_ATTR_MAX (__NBD_ATTR_MAX - 1)
+
+/*
+ * This is the format for multiple devices with NBD_ATTR_DEVICE_LIST
+ *
+ * [NBD_ATTR_DEVICE_LIST]
+ * [NBD_DEVICE_ITEM]
+ * [NBD_DEVICE_INDEX]
+ * [NBD_DEVICE_CONNECTED]
+ */
+enum {
+ NBD_DEVICE_ITEM_UNSPEC,
+ NBD_DEVICE_ITEM,
+ __NBD_DEVICE_ITEM_MAX,
+};
+#define NBD_DEVICE_ITEM_MAX (__NBD_DEVICE_ITEM_MAX - 1)
+
+enum {
+ NBD_DEVICE_UNSPEC,
+ NBD_DEVICE_INDEX,
+ NBD_DEVICE_CONNECTED,
+ __NBD_DEVICE_MAX,
+};
+#define NBD_DEVICE_ATTR_MAX (__NBD_DEVICE_MAX - 1)
+
+/*
+ * This is the format for multiple sockets with NBD_ATTR_SOCKETS
+ *
+ * [NBD_ATTR_SOCKETS]
+ * [NBD_SOCK_ITEM]
+ * [NBD_SOCK_FD]
+ * [NBD_SOCK_ITEM]
+ * [NBD_SOCK_FD]
+ */
+enum {
+ NBD_SOCK_ITEM_UNSPEC,
+ NBD_SOCK_ITEM,
+ __NBD_SOCK_ITEM_MAX,
+};
+#define NBD_SOCK_ITEM_MAX (__NBD_SOCK_ITEM_MAX - 1)
+
+enum {
+ NBD_SOCK_UNSPEC,
+ NBD_SOCK_FD,
+ __NBD_SOCK_MAX,
+};
+#define NBD_SOCK_MAX (__NBD_SOCK_MAX - 1)
+
+enum {
+ NBD_CMD_UNSPEC,
+ NBD_CMD_CONNECT,
+ NBD_CMD_DISCONNECT,
+ NBD_CMD_RECONFIGURE,
+ NBD_CMD_LINK_DEAD,
+ NBD_CMD_STATUS,
+ __NBD_CMD_MAX,
+};
+#define NBD_CMD_MAX (__NBD_CMD_MAX - 1)
+
+#endif /* _UAPILINUX_NBD_NETLINK_H */
diff --git a/include/uapi/linux/nbd.h b/include/uapi/linux/nbd.h
index c91c642ea900..155e33f81913 100644
--- a/include/uapi/linux/nbd.h
+++ b/include/uapi/linux/nbd.h
@@ -37,7 +37,7 @@ enum {
NBD_CMD_TRIM = 4
};
-/* values for flags field */
+/* values for flags field, these are server interaction specific. */
#define NBD_FLAG_HAS_FLAGS (1 << 0) /* nbd-server supports flags */
#define NBD_FLAG_READ_ONLY (1 << 1) /* device is read-only */
#define NBD_FLAG_SEND_FLUSH (1 << 2) /* can flush writeback cache */
@@ -45,6 +45,10 @@ enum {
#define NBD_FLAG_SEND_TRIM (1 << 5) /* send trim/discard */
#define NBD_FLAG_CAN_MULTI_CONN (1 << 8) /* Server supports multiple connections per export. */
+/* These are client behavior specific flags. */
+#define NBD_CFLAG_DESTROY_ON_DISCONNECT (1 << 0) /* delete the nbd device on
+ disconnect. */
+
/* userspace doesn't need the nbd_device structure */
/* These are sent over the network in the request/reply magic fields */
diff --git a/include/uapi/linux/stat.h b/include/uapi/linux/stat.h
index d538897b8e08..17b10304c393 100644
--- a/include/uapi/linux/stat.h
+++ b/include/uapi/linux/stat.h
@@ -48,17 +48,13 @@
* tv_sec holds the number of seconds before (negative) or after (positive)
* 00:00:00 1st January 1970 UTC.
*
- * tv_nsec holds a number of nanoseconds before (0..-999,999,999 if tv_sec is
- * negative) or after (0..999,999,999 if tv_sec is positive) the tv_sec time.
- *
- * Note that if both tv_sec and tv_nsec are non-zero, then the two values must
- * either be both positive or both negative.
+ * tv_nsec holds a number of nanoseconds (0..999,999,999) after the tv_sec time.
*
* __reserved is held in case we need a yet finer resolution.
*/
struct statx_timestamp {
__s64 tv_sec;
- __s32 tv_nsec;
+ __u32 tv_nsec;
__s32 __reserved;
};
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 9203bfb05603..00f4d6bf048f 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -5,6 +5,7 @@
#include <linux/kernfs.h>
#include <linux/workqueue.h>
#include <linux/list.h>
+#include <linux/refcount.h>
/*
* A cgroup can be associated with multiple css_sets as different tasks may
@@ -134,7 +135,7 @@ static inline void put_css_set(struct css_set *cset)
* can see it. Similar to atomic_dec_and_lock(), but for an
* rwlock
*/
- if (atomic_add_unless(&cset->refcount, -1, 1))
+ if (refcount_dec_not_one(&cset->refcount))
return;
spin_lock_irqsave(&css_set_lock, flags);
@@ -147,7 +148,7 @@ static inline void put_css_set(struct css_set *cset)
*/
static inline void get_css_set(struct css_set *cset)
{
- atomic_inc(&cset->refcount);
+ refcount_inc(&cset->refcount);
}
bool cgroup_ssid_enabled(int ssid);
@@ -163,7 +164,7 @@ int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
void cgroup_free_root(struct cgroup_root *root);
void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts);
-int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask);
+int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags);
int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask);
struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags,
struct cgroup_root *root, unsigned long magic,
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 1dc22f6b49f5..85d75152402d 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -346,7 +346,7 @@ static int cgroup_task_count(const struct cgroup *cgrp)
spin_lock_irq(&css_set_lock);
list_for_each_entry(link, &cgrp->cset_links, cset_link)
- count += atomic_read(&link->cset->refcount);
+ count += refcount_read(&link->cset->refcount);
spin_unlock_irq(&css_set_lock);
return count;
}
@@ -1072,6 +1072,7 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
struct cgroup_subsys *ss;
struct dentry *dentry;
int i, ret;
+ bool new_root = false;
cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
@@ -1181,10 +1182,11 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
ret = -ENOMEM;
goto out_unlock;
}
+ new_root = true;
init_cgroup_root(root, &opts);
- ret = cgroup_setup_root(root, opts.subsys_mask);
+ ret = cgroup_setup_root(root, opts.subsys_mask, PERCPU_REF_INIT_DEAD);
if (ret)
cgroup_free_root(root);
@@ -1201,6 +1203,18 @@ out_free:
CGROUP_SUPER_MAGIC, ns);
/*
+ * There's a race window after we release cgroup_mutex and before
+ * allocating a superblock. Make sure a concurrent process won't
+ * be able to re-use the root during this window by delaying the
+ * initialization of root refcnt.
+ */
+ if (new_root) {
+ mutex_lock(&cgroup_mutex);
+ percpu_ref_reinit(&root->cgrp.self.refcnt);
+ mutex_unlock(&cgroup_mutex);
+ }
+
+ /*
* If @pinned_sb, we're reusing an existing root and holding an
* extra ref on its sb. Mount is complete. Put the extra ref.
*/
@@ -1286,7 +1300,7 @@ static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
u64 count;
rcu_read_lock();
- count = atomic_read(&task_css_set(current)->refcount);
+ count = refcount_read(&task_css_set(current)->refcount);
rcu_read_unlock();
return count;
}
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 687f5e0194ef..c3c9a0e1b3c9 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -189,7 +189,7 @@ static u16 have_canfork_callback __read_mostly;
/* cgroup namespace for init task */
struct cgroup_namespace init_cgroup_ns = {
- .count = { .counter = 2, },
+ .count = REFCOUNT_INIT(2),
.user_ns = &init_user_ns,
.ns.ops = &cgroupns_operations,
.ns.inum = PROC_CGROUP_INIT_INO,
@@ -436,7 +436,12 @@ out_unlock:
return css;
}
-static void cgroup_get(struct cgroup *cgrp)
+static void __maybe_unused cgroup_get(struct cgroup *cgrp)
+{
+ css_get(&cgrp->self);
+}
+
+static void cgroup_get_live(struct cgroup *cgrp)
{
WARN_ON_ONCE(cgroup_is_dead(cgrp));
css_get(&cgrp->self);
@@ -554,7 +559,7 @@ EXPORT_SYMBOL_GPL(of_css);
* haven't been created.
*/
struct css_set init_css_set = {
- .refcount = ATOMIC_INIT(1),
+ .refcount = REFCOUNT_INIT(1),
.tasks = LIST_HEAD_INIT(init_css_set.tasks),
.mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks),
.task_iters = LIST_HEAD_INIT(init_css_set.task_iters),
@@ -724,7 +729,7 @@ void put_css_set_locked(struct css_set *cset)
lockdep_assert_held(&css_set_lock);
- if (!atomic_dec_and_test(&cset->refcount))
+ if (!refcount_dec_and_test(&cset->refcount))
return;
/* This css_set is dead. unlink it and release cgroup and css refs */
@@ -932,7 +937,7 @@ static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
list_add_tail(&link->cgrp_link, &cset->cgrp_links);
if (cgroup_parent(cgrp))
- cgroup_get(cgrp);
+ cgroup_get_live(cgrp);
}
/**
@@ -977,7 +982,7 @@ static struct css_set *find_css_set(struct css_set *old_cset,
return NULL;
}
- atomic_set(&cset->refcount, 1);
+ refcount_set(&cset->refcount, 1);
INIT_LIST_HEAD(&cset->tasks);
INIT_LIST_HEAD(&cset->mg_tasks);
INIT_LIST_HEAD(&cset->task_iters);
@@ -1640,7 +1645,7 @@ void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts)
set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
}
-int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
+int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags)
{
LIST_HEAD(tmp_links);
struct cgroup *root_cgrp = &root->cgrp;
@@ -1656,8 +1661,8 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
root_cgrp->id = ret;
root_cgrp->ancestor_ids[0] = ret;
- ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, 0,
- GFP_KERNEL);
+ ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release,
+ ref_flags, GFP_KERNEL);
if (ret)
goto out;
@@ -1802,7 +1807,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
return ERR_PTR(-EINVAL);
}
cgrp_dfl_visible = true;
- cgroup_get(&cgrp_dfl_root.cgrp);
+ cgroup_get_live(&cgrp_dfl_root.cgrp);
dentry = cgroup_do_mount(&cgroup2_fs_type, flags, &cgrp_dfl_root,
CGROUP2_SUPER_MAGIC, ns);
@@ -2576,7 +2581,7 @@ restart:
if (!css || !percpu_ref_is_dying(&css->refcnt))
continue;
- cgroup_get(dsct);
+ cgroup_get_live(dsct);
prepare_to_wait(&dsct->offline_waitq, &wait,
TASK_UNINTERRUPTIBLE);
@@ -3947,7 +3952,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css,
{
lockdep_assert_held(&cgroup_mutex);
- cgroup_get(cgrp);
+ cgroup_get_live(cgrp);
memset(css, 0, sizeof(*css));
css->cgroup = cgrp;
@@ -4123,7 +4128,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
/* allocation complete, commit to creation */
list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children);
atomic_inc(&root->nr_cgrps);
- cgroup_get(parent);
+ cgroup_get_live(parent);
/*
* @cgrp is now fully operational. If something fails after this
@@ -4513,7 +4518,7 @@ int __init cgroup_init(void)
hash_add(css_set_table, &init_css_set.hlist,
css_set_hash(init_css_set.subsys));
- BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));
+ BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0, 0));
mutex_unlock(&cgroup_mutex);
@@ -4947,7 +4952,7 @@ struct cgroup *cgroup_get_from_path(const char *path)
if (kn) {
if (kernfs_type(kn) == KERNFS_DIR) {
cgrp = kn->priv;
- cgroup_get(cgrp);
+ cgroup_get_live(cgrp);
} else {
cgrp = ERR_PTR(-ENOTDIR);
}
@@ -5027,6 +5032,11 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
/* Socket clone path */
if (skcd->val) {
+ /*
+ * We might be cloning a socket which is left in an empty
+ * cgroup and the cgroup might have already been rmdir'd.
+ * Don't use cgroup_get_live().
+ */
cgroup_get(sock_cgroup_ptr(skcd));
return;
}
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 0f41292be0fb..f6501f4f6040 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -2121,10 +2121,8 @@ int __init cpuset_init(void)
{
int err = 0;
- if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL))
- BUG();
- if (!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL))
- BUG();
+ BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL));
+ BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL));
cpumask_setall(top_cpuset.cpus_allowed);
nodes_setall(top_cpuset.mems_allowed);
@@ -2139,8 +2137,7 @@ int __init cpuset_init(void)
if (err < 0)
return err;
- if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL))
- BUG();
+ BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL));
return 0;
}
@@ -2354,7 +2351,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
rebuild_sched_domains();
}
-void cpuset_update_active_cpus(bool cpu_online)
+void cpuset_update_active_cpus(void)
{
/*
* We're inside cpu hotplug critical region which usually nests
diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c
index 96d38dab6fb2..66129eb4371d 100644
--- a/kernel/cgroup/namespace.c
+++ b/kernel/cgroup/namespace.c
@@ -31,7 +31,7 @@ static struct cgroup_namespace *alloc_cgroup_ns(void)
kfree(new_ns);
return ERR_PTR(ret);
}
- atomic_set(&new_ns->count, 1);
+ refcount_set(&new_ns->count, 1);
new_ns->ns.ops = &cgroupns_operations;
return new_ns;
}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3b31fc05a0f1..430b0460db89 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5732,7 +5732,7 @@ static void cpuset_cpu_active(void)
* cpuset configurations.
*/
}
- cpuset_update_active_cpus(true);
+ cpuset_update_active_cpus();
}
static int cpuset_cpu_inactive(unsigned int cpu)
@@ -5755,7 +5755,7 @@ static int cpuset_cpu_inactive(unsigned int cpu)
if (overflow)
return -EBUSY;
- cpuset_update_active_cpus(false);
+ cpuset_update_active_cpus();
} else {
num_cpus_frozen++;
partition_sched_domains(1, NULL, NULL);
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index f3778e2b46c8..aea3135c5d90 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -34,6 +34,18 @@ void disable_sched_clock_irqtime(void)
sched_clock_irqtime = 0;
}
+static void irqtime_account_delta(struct irqtime *irqtime, u64 delta,
+ enum cpu_usage_stat idx)
+{
+ u64 *cpustat = kcpustat_this_cpu->cpustat;
+
+ u64_stats_update_begin(&irqtime->sync);
+ cpustat[idx] += delta;
+ irqtime->total += delta;
+ irqtime->tick_delta += delta;
+ u64_stats_update_end(&irqtime->sync);
+}
+
/*
* Called before incrementing preempt_count on {soft,}irq_enter
* and before decrementing preempt_count on {soft,}irq_exit.
@@ -41,7 +53,6 @@ void disable_sched_clock_irqtime(void)
void irqtime_account_irq(struct task_struct *curr)
{
struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
- u64 *cpustat = kcpustat_this_cpu->cpustat;
s64 delta;
int cpu;
@@ -52,22 +63,16 @@ void irqtime_account_irq(struct task_struct *curr)
delta = sched_clock_cpu(cpu) - irqtime->irq_start_time;
irqtime->irq_start_time += delta;
- u64_stats_update_begin(&irqtime->sync);
/*
* We do not account for softirq time from ksoftirqd here.
* We want to continue accounting softirq time to ksoftirqd thread
* in that case, so as not to confuse scheduler with a special task
* that do not consume any time, but still wants to run.
*/
- if (hardirq_count()) {
- cpustat[CPUTIME_IRQ] += delta;
- irqtime->tick_delta += delta;
- } else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) {
- cpustat[CPUTIME_SOFTIRQ] += delta;
- irqtime->tick_delta += delta;
- }
-
- u64_stats_update_end(&irqtime->sync);
+ if (hardirq_count())
+ irqtime_account_delta(irqtime, delta, CPUTIME_IRQ);
+ else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
+ irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ);
}
EXPORT_SYMBOL_GPL(irqtime_account_irq);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 5cbf92214ad8..767aab3505a8 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1869,6 +1869,7 @@ static inline void nohz_balance_exit_idle(unsigned int cpu) { }
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
struct irqtime {
+ u64 total;
u64 tick_delta;
u64 irq_start_time;
struct u64_stats_sync sync;
@@ -1876,16 +1877,20 @@ struct irqtime {
DECLARE_PER_CPU(struct irqtime, cpu_irqtime);
+/*
+ * Returns the irqtime minus the softirq time computed by ksoftirqd.
+ * Otherwise ksoftirqd's sum_exec_runtime is substracted its own runtime
+ * and never move forward.
+ */
static inline u64 irq_time_read(int cpu)
{
struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu);
- u64 *cpustat = kcpustat_cpu(cpu).cpustat;
unsigned int seq;
u64 total;
do {
seq = __u64_stats_fetch_begin(&irqtime->sync);
- total = cpustat[CPUTIME_SOFTIRQ] + cpustat[CPUTIME_IRQ];
+ total = irqtime->total;
} while (__u64_stats_fetch_retry(&irqtime->sync, seq));
return total;
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index b2058a7f94bd..bd8ae8d5ae9c 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -690,8 +690,8 @@ void blk_trace_shutdown(struct request_queue *q)
/**
* blk_add_trace_rq - Add a trace for a request oriented action
- * @q: queue the io is for
* @rq: the source request
+ * @error: return status to log
* @nr_bytes: number of completed bytes
* @what: the action
*
@@ -699,10 +699,10 @@ void blk_trace_shutdown(struct request_queue *q)
* Records an action against a request. Will log the bio offset + size.
*
**/
-static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
+static void blk_add_trace_rq(struct request *rq, int error,
unsigned int nr_bytes, u32 what)
{
- struct blk_trace *bt = q->blk_trace;
+ struct blk_trace *bt = rq->q->blk_trace;
if (likely(!bt))
return;
@@ -713,40 +713,32 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
what |= BLK_TC_ACT(BLK_TC_FS);
__blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, req_op(rq),
- rq->cmd_flags, what, rq->errors, 0, NULL);
-}
-
-static void blk_add_trace_rq_abort(void *ignore,
- struct request_queue *q, struct request *rq)
-{
- blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_ABORT);
+ rq->cmd_flags, what, error, 0, NULL);
}
static void blk_add_trace_rq_insert(void *ignore,
struct request_queue *q, struct request *rq)
{
- blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_INSERT);
+ blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_INSERT);
}
static void blk_add_trace_rq_issue(void *ignore,
struct request_queue *q, struct request *rq)
{
- blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_ISSUE);
+ blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_ISSUE);
}
static void blk_add_trace_rq_requeue(void *ignore,
struct request_queue *q,
struct request *rq)
{
- blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_REQUEUE);
+ blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_REQUEUE);
}
-static void blk_add_trace_rq_complete(void *ignore,
- struct request_queue *q,
- struct request *rq,
- unsigned int nr_bytes)
+static void blk_add_trace_rq_complete(void *ignore, struct request *rq,
+ int error, unsigned int nr_bytes)
{
- blk_add_trace_rq(q, rq, nr_bytes, BLK_TA_COMPLETE);
+ blk_add_trace_rq(rq, error, nr_bytes, BLK_TA_COMPLETE);
}
/**
@@ -941,7 +933,7 @@ static void blk_add_trace_rq_remap(void *ignore,
r.sector_from = cpu_to_be64(from);
__blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq),
- rq_data_dir(rq), 0, BLK_TA_REMAP, !!rq->errors,
+ rq_data_dir(rq), 0, BLK_TA_REMAP, 0,
sizeof(r), &r);
}
@@ -966,7 +958,7 @@ void blk_add_driver_data(struct request_queue *q,
return;
__blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0, 0,
- BLK_TA_DRV_DATA, rq->errors, len, data);
+ BLK_TA_DRV_DATA, 0, len, data);
}
EXPORT_SYMBOL_GPL(blk_add_driver_data);
@@ -974,8 +966,6 @@ static void blk_register_tracepoints(void)
{
int ret;
- ret = register_trace_block_rq_abort(blk_add_trace_rq_abort, NULL);
- WARN_ON(ret);
ret = register_trace_block_rq_insert(blk_add_trace_rq_insert, NULL);
WARN_ON(ret);
ret = register_trace_block_rq_issue(blk_add_trace_rq_issue, NULL);
@@ -1028,7 +1018,6 @@ static void blk_unregister_tracepoints(void)
unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL);
unregister_trace_block_rq_issue(blk_add_trace_rq_issue, NULL);
unregister_trace_block_rq_insert(blk_add_trace_rq_insert, NULL);
- unregister_trace_block_rq_abort(blk_add_trace_rq_abort, NULL);
tracepoint_synchronize_unregister();
}
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index c0168b7da1ea..bbf46da28e9a 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -3209,9 +3209,8 @@ static int init_worker_pool(struct worker_pool *pool)
INIT_LIST_HEAD(&pool->idle_list);
hash_init(pool->busy_hash);
- init_timer_deferrable(&pool->idle_timer);
- pool->idle_timer.function = idle_worker_timeout;
- pool->idle_timer.data = (unsigned long)pool;
+ setup_deferrable_timer(&pool->idle_timer, idle_worker_timeout,
+ (unsigned long)pool);
setup_timer(&pool->mayday_timer, pool_mayday_timeout,
(unsigned long)pool);
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 60abc44385b7..cc001a542cb5 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -798,7 +798,7 @@ void iov_iter_revert(struct iov_iter *i, size_t unroll)
while (1) {
size_t n = off - pipe->bufs[idx].offset;
if (unroll < n) {
- off -= (n - unroll);
+ off -= unroll;
break;
}
unroll -= n;
diff --git a/lib/kobject.c b/lib/kobject.c
index 445dcaeb0f56..763d70a18941 100644
--- a/lib/kobject.c
+++ b/lib/kobject.c
@@ -601,12 +601,15 @@ struct kobject *kobject_get(struct kobject *kobj)
}
EXPORT_SYMBOL(kobject_get);
-static struct kobject * __must_check kobject_get_unless_zero(struct kobject *kobj)
+struct kobject * __must_check kobject_get_unless_zero(struct kobject *kobj)
{
+ if (!kobj)
+ return NULL;
if (!kref_get_unless_zero(&kobj->kref))
kobj = NULL;
return kobj;
}
+EXPORT_SYMBOL(kobject_get_unless_zero);
/*
* kobject_cleanup - free kobject resources.
diff --git a/lib/sbitmap.c b/lib/sbitmap.c
index 60e800e0b5a0..80aa8d5463fa 100644
--- a/lib/sbitmap.c
+++ b/lib/sbitmap.c
@@ -79,15 +79,15 @@ void sbitmap_resize(struct sbitmap *sb, unsigned int depth)
}
EXPORT_SYMBOL_GPL(sbitmap_resize);
-static int __sbitmap_get_word(struct sbitmap_word *word, unsigned int hint,
- bool wrap)
+static int __sbitmap_get_word(unsigned long *word, unsigned long depth,
+ unsigned int hint, bool wrap)
{
unsigned int orig_hint = hint;
int nr;
while (1) {
- nr = find_next_zero_bit(&word->word, word->depth, hint);
- if (unlikely(nr >= word->depth)) {
+ nr = find_next_zero_bit(word, depth, hint);
+ if (unlikely(nr >= depth)) {
/*
* We started with an offset, and we didn't reset the
* offset to 0 in a failure case, so start from 0 to
@@ -100,11 +100,11 @@ static int __sbitmap_get_word(struct sbitmap_word *word, unsigned int hint,
return -1;
}
- if (!test_and_set_bit(nr, &word->word))
+ if (!test_and_set_bit(nr, word))
break;
hint = nr + 1;
- if (hint >= word->depth - 1)
+ if (hint >= depth - 1)
hint = 0;
}
@@ -119,7 +119,8 @@ int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint, bool round_robin)
index = SB_NR_TO_INDEX(sb, alloc_hint);
for (i = 0; i < sb->map_nr; i++) {
- nr = __sbitmap_get_word(&sb->map[index],
+ nr = __sbitmap_get_word(&sb->map[index].word,
+ sb->map[index].depth,
SB_NR_TO_BIT(sb, alloc_hint),
!round_robin);
if (nr != -1) {
@@ -141,6 +142,37 @@ int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint, bool round_robin)
}
EXPORT_SYMBOL_GPL(sbitmap_get);
+int sbitmap_get_shallow(struct sbitmap *sb, unsigned int alloc_hint,
+ unsigned long shallow_depth)
+{
+ unsigned int i, index;
+ int nr = -1;
+
+ index = SB_NR_TO_INDEX(sb, alloc_hint);
+
+ for (i = 0; i < sb->map_nr; i++) {
+ nr = __sbitmap_get_word(&sb->map[index].word,
+ min(sb->map[index].depth, shallow_depth),
+ SB_NR_TO_BIT(sb, alloc_hint), true);
+ if (nr != -1) {
+ nr += index << sb->shift;
+ break;
+ }
+
+ /* Jump to next index. */
+ index++;
+ alloc_hint = index << sb->shift;
+
+ if (index >= sb->map_nr) {
+ index = 0;
+ alloc_hint = 0;
+ }
+ }
+
+ return nr;
+}
+EXPORT_SYMBOL_GPL(sbitmap_get_shallow);
+
bool sbitmap_any_bit_set(const struct sbitmap *sb)
{
unsigned int i;
@@ -342,6 +374,35 @@ int __sbitmap_queue_get(struct sbitmap_queue *sbq)
}
EXPORT_SYMBOL_GPL(__sbitmap_queue_get);
+int __sbitmap_queue_get_shallow(struct sbitmap_queue *sbq,
+ unsigned int shallow_depth)
+{
+ unsigned int hint, depth;
+ int nr;
+
+ hint = this_cpu_read(*sbq->alloc_hint);
+ depth = READ_ONCE(sbq->sb.depth);
+ if (unlikely(hint >= depth)) {
+ hint = depth ? prandom_u32() % depth : 0;
+ this_cpu_write(*sbq->alloc_hint, hint);
+ }
+ nr = sbitmap_get_shallow(&sbq->sb, hint, shallow_depth);
+
+ if (nr == -1) {
+ /* If the map is full, a hint won't do us much good. */
+ this_cpu_write(*sbq->alloc_hint, 0);
+ } else if (nr == hint || unlikely(sbq->round_robin)) {
+ /* Only update the hint if we used it. */
+ hint = nr + 1;
+ if (hint >= depth - 1)
+ hint = 0;
+ this_cpu_write(*sbq->alloc_hint, hint);
+ }
+
+ return nr;
+}
+EXPORT_SYMBOL_GPL(__sbitmap_queue_get_shallow);
+
static struct sbq_wait_state *sbq_wake_ptr(struct sbitmap_queue *sbq)
{
int i, wake_index;
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index c6f2a37028c2..f028a9a472fd 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -12,8 +12,6 @@
#include <linux/device.h>
#include <trace/events/writeback.h>
-static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
-
struct backing_dev_info noop_backing_dev_info = {
.name = "noop",
.capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
@@ -242,6 +240,8 @@ static __init int bdi_class_init(void)
}
postcore_initcall(bdi_class_init);
+static int bdi_init(struct backing_dev_info *bdi);
+
static int __init default_bdi_init(void)
{
int err;
@@ -294,6 +294,8 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
memset(wb, 0, sizeof(*wb));
+ if (wb != &bdi->wb)
+ bdi_get(bdi);
wb->bdi = bdi;
wb->last_old_flush = jiffies;
INIT_LIST_HEAD(&wb->b_dirty);
@@ -314,8 +316,10 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
wb->dirty_sleep = jiffies;
wb->congested = wb_congested_get_create(bdi, blkcg_id, gfp);
- if (!wb->congested)
- return -ENOMEM;
+ if (!wb->congested) {
+ err = -ENOMEM;
+ goto out_put_bdi;
+ }
err = fprop_local_init_percpu(&wb->completions, gfp);
if (err)
@@ -335,9 +339,14 @@ out_destroy_stat:
fprop_local_destroy_percpu(&wb->completions);
out_put_cong:
wb_congested_put(wb->congested);
+out_put_bdi:
+ if (wb != &bdi->wb)
+ bdi_put(bdi);
return err;
}
+static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb);
+
/*
* Remove bdi from the global list and shutdown any threads we have running
*/
@@ -347,10 +356,18 @@ static void wb_shutdown(struct bdi_writeback *wb)
spin_lock_bh(&wb->work_lock);
if (!test_and_clear_bit(WB_registered, &wb->state)) {
spin_unlock_bh(&wb->work_lock);
+ /*
+ * Wait for wb shutdown to finish if someone else is just
+ * running wb_shutdown(). Otherwise we could proceed to wb /
+ * bdi destruction before wb_shutdown() is finished.
+ */
+ wait_on_bit(&wb->state, WB_shutting_down, TASK_UNINTERRUPTIBLE);
return;
}
+ set_bit(WB_shutting_down, &wb->state);
spin_unlock_bh(&wb->work_lock);
+ cgwb_remove_from_bdi_list(wb);
/*
* Drain work list and shutdown the delayed_work. !WB_registered
* tells wb_workfn() that @wb is dying and its work_list needs to
@@ -359,6 +376,12 @@ static void wb_shutdown(struct bdi_writeback *wb)
mod_delayed_work(bdi_wq, &wb->dwork, 0);
flush_delayed_work(&wb->dwork);
WARN_ON(!list_empty(&wb->work_list));
+ /*
+ * Make sure bit gets cleared after shutdown is finished. Matches with
+ * the barrier provided by test_and_clear_bit() above.
+ */
+ smp_wmb();
+ clear_bit(WB_shutting_down, &wb->state);
}
static void wb_exit(struct bdi_writeback *wb)
@@ -372,6 +395,8 @@ static void wb_exit(struct bdi_writeback *wb)
fprop_local_destroy_percpu(&wb->completions);
wb_congested_put(wb->congested);
+ if (wb != &wb->bdi->wb)
+ bdi_put(wb->bdi);
}
#ifdef CONFIG_CGROUP_WRITEBACK
@@ -381,11 +406,9 @@ static void wb_exit(struct bdi_writeback *wb)
/*
* cgwb_lock protects bdi->cgwb_tree, bdi->cgwb_congested_tree,
* blkcg->cgwb_list, and memcg->cgwb_list. bdi->cgwb_tree is also RCU
- * protected. cgwb_release_wait is used to wait for the completion of cgwb
- * releases from bdi destruction path.
+ * protected.
*/
static DEFINE_SPINLOCK(cgwb_lock);
-static DECLARE_WAIT_QUEUE_HEAD(cgwb_release_wait);
/**
* wb_congested_get_create - get or create a wb_congested
@@ -438,7 +461,7 @@ retry:
return NULL;
atomic_set(&new_congested->refcnt, 0);
- new_congested->bdi = bdi;
+ new_congested->__bdi = bdi;
new_congested->blkcg_id = blkcg_id;
goto retry;
@@ -466,10 +489,10 @@ void wb_congested_put(struct bdi_writeback_congested *congested)
}
/* bdi might already have been destroyed leaving @congested unlinked */
- if (congested->bdi) {
+ if (congested->__bdi) {
rb_erase(&congested->rb_node,
- &congested->bdi->cgwb_congested_tree);
- congested->bdi = NULL;
+ &congested->__bdi->cgwb_congested_tree);
+ congested->__bdi = NULL;
}
spin_unlock_irqrestore(&cgwb_lock, flags);
@@ -480,11 +503,6 @@ static void cgwb_release_workfn(struct work_struct *work)
{
struct bdi_writeback *wb = container_of(work, struct bdi_writeback,
release_work);
- struct backing_dev_info *bdi = wb->bdi;
-
- spin_lock_irq(&cgwb_lock);
- list_del_rcu(&wb->bdi_node);
- spin_unlock_irq(&cgwb_lock);
wb_shutdown(wb);
@@ -495,9 +513,6 @@ static void cgwb_release_workfn(struct work_struct *work)
percpu_ref_exit(&wb->refcnt);
wb_exit(wb);
kfree_rcu(wb, rcu);
-
- if (atomic_dec_and_test(&bdi->usage_cnt))
- wake_up_all(&cgwb_release_wait);
}
static void cgwb_release(struct percpu_ref *refcnt)
@@ -517,6 +532,13 @@ static void cgwb_kill(struct bdi_writeback *wb)
percpu_ref_kill(&wb->refcnt);
}
+static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb)
+{
+ spin_lock_irq(&cgwb_lock);
+ list_del_rcu(&wb->bdi_node);
+ spin_unlock_irq(&cgwb_lock);
+}
+
static int cgwb_create(struct backing_dev_info *bdi,
struct cgroup_subsys_state *memcg_css, gfp_t gfp)
{
@@ -580,7 +602,6 @@ static int cgwb_create(struct backing_dev_info *bdi,
/* we might have raced another instance of this function */
ret = radix_tree_insert(&bdi->cgwb_tree, memcg_css->id, wb);
if (!ret) {
- atomic_inc(&bdi->usage_cnt);
list_add_tail_rcu(&wb->bdi_node, &bdi->wb_list);
list_add(&wb->memcg_node, memcg_cgwb_list);
list_add(&wb->blkcg_node, blkcg_cgwb_list);
@@ -670,7 +691,6 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi)
INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC);
bdi->cgwb_congested_tree = RB_ROOT;
- atomic_set(&bdi->usage_cnt, 1);
ret = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL);
if (!ret) {
@@ -680,29 +700,26 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi)
return ret;
}
-static void cgwb_bdi_destroy(struct backing_dev_info *bdi)
+static void cgwb_bdi_unregister(struct backing_dev_info *bdi)
{
struct radix_tree_iter iter;
void **slot;
+ struct bdi_writeback *wb;
WARN_ON(test_bit(WB_registered, &bdi->wb.state));
spin_lock_irq(&cgwb_lock);
radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0)
cgwb_kill(*slot);
- spin_unlock_irq(&cgwb_lock);
- /*
- * All cgwb's must be shutdown and released before returning. Drain
- * the usage counter to wait for all cgwb's ever created on @bdi.
- */
- atomic_dec(&bdi->usage_cnt);
- wait_event(cgwb_release_wait, !atomic_read(&bdi->usage_cnt));
- /*
- * Grab back our reference so that we hold it when @bdi gets
- * re-registered.
- */
- atomic_inc(&bdi->usage_cnt);
+ while (!list_empty(&bdi->wb_list)) {
+ wb = list_first_entry(&bdi->wb_list, struct bdi_writeback,
+ bdi_node);
+ spin_unlock_irq(&cgwb_lock);
+ wb_shutdown(wb);
+ spin_lock_irq(&cgwb_lock);
+ }
+ spin_unlock_irq(&cgwb_lock);
}
/**
@@ -752,11 +769,18 @@ static void cgwb_bdi_exit(struct backing_dev_info *bdi)
rb_entry(rbn, struct bdi_writeback_congested, rb_node);
rb_erase(rbn, &bdi->cgwb_congested_tree);
- congested->bdi = NULL; /* mark @congested unlinked */
+ congested->__bdi = NULL; /* mark @congested unlinked */
}
spin_unlock_irq(&cgwb_lock);
}
+static void cgwb_bdi_register(struct backing_dev_info *bdi)
+{
+ spin_lock_irq(&cgwb_lock);
+ list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list);
+ spin_unlock_irq(&cgwb_lock);
+}
+
#else /* CONFIG_CGROUP_WRITEBACK */
static int cgwb_bdi_init(struct backing_dev_info *bdi)
@@ -777,16 +801,26 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi)
return 0;
}
-static void cgwb_bdi_destroy(struct backing_dev_info *bdi) { }
+static void cgwb_bdi_unregister(struct backing_dev_info *bdi) { }
static void cgwb_bdi_exit(struct backing_dev_info *bdi)
{
wb_congested_put(bdi->wb_congested);
}
+static void cgwb_bdi_register(struct backing_dev_info *bdi)
+{
+ list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list);
+}
+
+static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb)
+{
+ list_del_rcu(&wb->bdi_node);
+}
+
#endif /* CONFIG_CGROUP_WRITEBACK */
-int bdi_init(struct backing_dev_info *bdi)
+static int bdi_init(struct backing_dev_info *bdi)
{
int ret;
@@ -802,11 +836,8 @@ int bdi_init(struct backing_dev_info *bdi)
ret = cgwb_bdi_init(bdi);
- list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list);
-
return ret;
}
-EXPORT_SYMBOL(bdi_init);
struct backing_dev_info *bdi_alloc_node(gfp_t gfp_mask, int node_id)
{
@@ -823,22 +854,20 @@ struct backing_dev_info *bdi_alloc_node(gfp_t gfp_mask, int node_id)
}
return bdi;
}
+EXPORT_SYMBOL(bdi_alloc_node);
-int bdi_register(struct backing_dev_info *bdi, struct device *parent,
- const char *fmt, ...)
+int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args)
{
- va_list args;
struct device *dev;
if (bdi->dev) /* The driver needs to use separate queues per device */
return 0;
- va_start(args, fmt);
- dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args);
- va_end(args);
+ dev = device_create_vargs(bdi_class, NULL, MKDEV(0, 0), bdi, fmt, args);
if (IS_ERR(dev))
return PTR_ERR(dev);
+ cgwb_bdi_register(bdi);
bdi->dev = dev;
bdi_debug_register(bdi, dev_name(dev));
@@ -851,20 +880,25 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
trace_writeback_bdi_register(bdi);
return 0;
}
-EXPORT_SYMBOL(bdi_register);
+EXPORT_SYMBOL(bdi_register_va);
-int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev)
+int bdi_register(struct backing_dev_info *bdi, const char *fmt, ...)
{
- return bdi_register(bdi, NULL, "%u:%u", MAJOR(dev), MINOR(dev));
+ va_list args;
+ int ret;
+
+ va_start(args, fmt);
+ ret = bdi_register_va(bdi, fmt, args);
+ va_end(args);
+ return ret;
}
-EXPORT_SYMBOL(bdi_register_dev);
+EXPORT_SYMBOL(bdi_register);
int bdi_register_owner(struct backing_dev_info *bdi, struct device *owner)
{
int rc;
- rc = bdi_register(bdi, NULL, "%u:%u", MAJOR(owner->devt),
- MINOR(owner->devt));
+ rc = bdi_register(bdi, "%u:%u", MAJOR(owner->devt), MINOR(owner->devt));
if (rc)
return rc;
/* Leaking owner reference... */
@@ -892,7 +926,7 @@ void bdi_unregister(struct backing_dev_info *bdi)
/* make sure nobody finds us on the bdi_list anymore */
bdi_remove_from_list(bdi);
wb_shutdown(&bdi->wb);
- cgwb_bdi_destroy(bdi);
+ cgwb_bdi_unregister(bdi);
if (bdi->dev) {
bdi_debug_unregister(bdi);
@@ -906,19 +940,16 @@ void bdi_unregister(struct backing_dev_info *bdi)
}
}
-static void bdi_exit(struct backing_dev_info *bdi)
-{
- WARN_ON_ONCE(bdi->dev);
- wb_exit(&bdi->wb);
- cgwb_bdi_exit(bdi);
-}
-
static void release_bdi(struct kref *ref)
{
struct backing_dev_info *bdi =
container_of(ref, struct backing_dev_info, refcnt);
- bdi_exit(bdi);
+ if (test_bit(WB_registered, &bdi->wb.state))
+ bdi_unregister(bdi);
+ WARN_ON_ONCE(bdi->dev);
+ wb_exit(&bdi->wb);
+ cgwb_bdi_exit(bdi);
kfree(bdi);
}
@@ -926,38 +957,7 @@ void bdi_put(struct backing_dev_info *bdi)
{
kref_put(&bdi->refcnt, release_bdi);
}
-
-void bdi_destroy(struct backing_dev_info *bdi)
-{
- bdi_unregister(bdi);
- bdi_exit(bdi);
-}
-EXPORT_SYMBOL(bdi_destroy);
-
-/*
- * For use from filesystems to quickly init and register a bdi associated
- * with dirty writeback
- */
-int bdi_setup_and_register(struct backing_dev_info *bdi, char *name)
-{
- int err;
-
- bdi->name = name;
- bdi->capabilities = 0;
- err = bdi_init(bdi);
- if (err)
- return err;
-
- err = bdi_register(bdi, NULL, "%.28s-%ld", name,
- atomic_long_inc_return(&bdi_seq));
- if (err) {
- bdi_destroy(bdi);
- return err;
- }
-
- return 0;
-}
-EXPORT_SYMBOL(bdi_setup_and_register);
+EXPORT_SYMBOL(bdi_put);
static wait_queue_head_t congestion_wqh[2] = {
__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
diff --git a/net/9p/client.c b/net/9p/client.c
index 3ce672af1596..8e5c6a8d0a37 100644
--- a/net/9p/client.c
+++ b/net/9p/client.c
@@ -2101,6 +2101,10 @@ int p9_client_readdir(struct p9_fid *fid, char *data, u32 count, u64 offset)
trace_9p_protocol_dump(clnt, req->rc);
goto free_and_error;
}
+ if (rsize < count) {
+ pr_err("bogus RREADDIR count (%d > %d)\n", count, rsize);
+ count = rsize;
+ }
p9_debug(P9_DEBUG_9P, "<<< RREADDIR count %d\n", count);
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index 90f49a194249..430b53e7d941 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -123,6 +123,7 @@ static void br_dev_uninit(struct net_device *dev)
{
struct net_bridge *br = netdev_priv(dev);
+ br_multicast_dev_del(br);
br_multicast_uninit_stats(br);
br_vlan_flush(br);
free_percpu(br->stats);
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index 56a2a72e7738..a8d0ed282a10 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -311,7 +311,6 @@ void br_dev_delete(struct net_device *dev, struct list_head *head)
br_fdb_delete_by_port(br, NULL, 0, 1);
- br_multicast_dev_del(br);
cancel_delayed_work_sync(&br->gc_work);
br_sysfs_delbr(br->dev);
diff --git a/net/core/dev.c b/net/core/dev.c
index 533a6d6f6092..9b5875388c23 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2450,6 +2450,9 @@ void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
{
unsigned long flags;
+ if (unlikely(!skb))
+ return;
+
if (likely(atomic_read(&skb->users) == 1)) {
smp_rmb();
atomic_set(&skb->users, 0);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index f86bf69cfb8d..f1d04592ace0 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1576,6 +1576,8 @@ done:
skb_set_tail_pointer(skb, len);
}
+ if (!skb->sk || skb->destructor == sock_edemux)
+ skb_condense(skb);
return 0;
}
EXPORT_SYMBOL(___pskb_trim);
diff --git a/net/core/utils.c b/net/core/utils.c
index 6592d7bbed39..32c467cf52d6 100644
--- a/net/core/utils.c
+++ b/net/core/utils.c
@@ -26,9 +26,11 @@
#include <linux/percpu.h>
#include <linux/init.h>
#include <linux/ratelimit.h>
+#include <linux/socket.h>
#include <net/sock.h>
#include <net/net_ratelimit.h>
+#include <net/ipv6.h>
#include <asm/byteorder.h>
#include <linux/uaccess.h>
@@ -300,6 +302,107 @@ out:
}
EXPORT_SYMBOL(in6_pton);
+static int inet4_pton(const char *src, u16 port_num,
+ struct sockaddr_storage *addr)
+{
+ struct sockaddr_in *addr4 = (struct sockaddr_in *)addr;
+ int srclen = strlen(src);
+
+ if (srclen > INET_ADDRSTRLEN)
+ return -EINVAL;
+
+ if (in4_pton(src, srclen, (u8 *)&addr4->sin_addr.s_addr,
+ '\n', NULL) == 0)
+ return -EINVAL;
+
+ addr4->sin_family = AF_INET;
+ addr4->sin_port = htons(port_num);
+
+ return 0;
+}
+
+static int inet6_pton(struct net *net, const char *src, u16 port_num,
+ struct sockaddr_storage *addr)
+{
+ struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)addr;
+ const char *scope_delim;
+ int srclen = strlen(src);
+
+ if (srclen > INET6_ADDRSTRLEN)
+ return -EINVAL;
+
+ if (in6_pton(src, srclen, (u8 *)&addr6->sin6_addr.s6_addr,
+ '%', &scope_delim) == 0)
+ return -EINVAL;
+
+ if (ipv6_addr_type(&addr6->sin6_addr) & IPV6_ADDR_LINKLOCAL &&
+ src + srclen != scope_delim && *scope_delim == '%') {
+ struct net_device *dev;
+ char scope_id[16];
+ size_t scope_len = min_t(size_t, sizeof(scope_id) - 1,
+ src + srclen - scope_delim - 1);
+
+ memcpy(scope_id, scope_delim + 1, scope_len);
+ scope_id[scope_len] = '\0';
+
+ dev = dev_get_by_name(net, scope_id);
+ if (dev) {
+ addr6->sin6_scope_id = dev->ifindex;
+ dev_put(dev);
+ } else if (kstrtouint(scope_id, 0, &addr6->sin6_scope_id)) {
+ return -EINVAL;
+ }
+ }
+
+ addr6->sin6_family = AF_INET6;
+ addr6->sin6_port = htons(port_num);
+
+ return 0;
+}
+
+/**
+ * inet_pton_with_scope - convert an IPv4/IPv6 and port to socket address
+ * @net: net namespace (used for scope handling)
+ * @af: address family, AF_INET, AF_INET6 or AF_UNSPEC for either
+ * @src: the start of the address string
+ * @port: the start of the port string (or NULL for none)
+ * @addr: output socket address
+ *
+ * Return zero on success, return errno when any error occurs.
+ */
+int inet_pton_with_scope(struct net *net, __kernel_sa_family_t af,
+ const char *src, const char *port, struct sockaddr_storage *addr)
+{
+ u16 port_num;
+ int ret = -EINVAL;
+
+ if (port) {
+ if (kstrtou16(port, 0, &port_num))
+ return -EINVAL;
+ } else {
+ port_num = 0;
+ }
+
+ switch (af) {
+ case AF_INET:
+ ret = inet4_pton(src, port_num, addr);
+ break;
+ case AF_INET6:
+ ret = inet6_pton(net, src, port_num, addr);
+ break;
+ case AF_UNSPEC:
+ ret = inet4_pton(src, port_num, addr);
+ if (ret)
+ ret = inet6_pton(net, src, port_num, addr);
+ break;
+ default:
+ pr_err("unexpected address family %d\n", af);
+ };
+
+ return ret;
+}
+EXPORT_SYMBOL(inet_pton_with_scope);
+
void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb,
__be32 from, __be32 to, bool pseudohdr)
{
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 6b1fc6e4278e..13a9a3297eae 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1343,6 +1343,9 @@ struct sk_buff **inet_gro_receive(struct sk_buff **head, struct sk_buff *skb)
if (*(u8 *)iph != 0x45)
goto out_unlock;
+ if (ip_is_fragment(iph))
+ goto out_unlock;
+
if (unlikely(ip_fast_csum((u8 *)iph, 5)))
goto out_unlock;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index acd69cfe2951..d9724889ff09 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2359,7 +2359,8 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
}
/* L3 master device is the loopback for that domain */
- dev_out = l3mdev_master_dev_rcu(dev_out) ? : net->loopback_dev;
+ dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(res)) ? :
+ net->loopback_dev;
fl4->flowi4_oif = dev_out->ifindex;
flags |= RTCF_LOCAL;
goto make_route;
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 79c4817abc94..6e3c512054a6 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -168,12 +168,8 @@ void tcp_assign_congestion_control(struct sock *sk)
}
out:
rcu_read_unlock();
+ memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
- /* Clear out private data before diag gets it and
- * the ca has not been initialized.
- */
- if (ca->get_info)
- memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
if (ca->flags & TCP_CONG_NEEDS_ECN)
INET_ECN_xmit(sk);
else
@@ -200,11 +196,10 @@ static void tcp_reinit_congestion_control(struct sock *sk,
tcp_cleanup_congestion_control(sk);
icsk->icsk_ca_ops = ca;
icsk->icsk_ca_setsockopt = 1;
+ memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
- if (sk->sk_state != TCP_CLOSE) {
- memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
+ if (sk->sk_state != TCP_CLOSE)
tcp_init_congestion_control(sk);
- }
}
/* Manage refcounts on socket close. */
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index c3c082ed3879..a85d863c4419 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1267,7 +1267,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
* eventually). The difference is that pulled data not copied, but
* immediately discarded.
*/
-static void __pskb_trim_head(struct sk_buff *skb, int len)
+static int __pskb_trim_head(struct sk_buff *skb, int len)
{
struct skb_shared_info *shinfo;
int i, k, eat;
@@ -1277,7 +1277,7 @@ static void __pskb_trim_head(struct sk_buff *skb, int len)
__skb_pull(skb, eat);
len -= eat;
if (!len)
- return;
+ return 0;
}
eat = len;
k = 0;
@@ -1303,23 +1303,28 @@ static void __pskb_trim_head(struct sk_buff *skb, int len)
skb_reset_tail_pointer(skb);
skb->data_len -= len;
skb->len = skb->data_len;
+ return len;
}
/* Remove acked data from a packet in the transmit queue. */
int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
{
+ u32 delta_truesize;
+
if (skb_unclone(skb, GFP_ATOMIC))
return -ENOMEM;
- __pskb_trim_head(skb, len);
+ delta_truesize = __pskb_trim_head(skb, len);
TCP_SKB_CB(skb)->seq += len;
skb->ip_summed = CHECKSUM_PARTIAL;
- skb->truesize -= len;
- sk->sk_wmem_queued -= len;
- sk_mem_uncharge(sk, len);
- sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
+ if (delta_truesize) {
+ skb->truesize -= delta_truesize;
+ sk->sk_wmem_queued -= delta_truesize;
+ sk_mem_uncharge(sk, delta_truesize);
+ sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
+ }
/* Any change of skb->len requires recalculation of tso factor. */
if (tcp_skb_pcount(skb) > 1)
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index b2be1d9757ef..781250151d40 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -29,6 +29,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
u16 mac_len = skb->mac_len;
int udp_offset, outer_hlen;
__wsum partial;
+ bool need_ipsec;
if (unlikely(!pskb_may_pull(skb, tnl_hlen)))
goto out;
@@ -62,8 +63,10 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
ufo = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP);
+ need_ipsec = skb_dst(skb) && dst_xfrm(skb_dst(skb));
/* Try to offload checksum if possible */
offload_csum = !!(need_csum &&
+ !need_ipsec &&
(skb->dev->features &
(is_ipv6 ? (NETIF_F_HW_CSUM | NETIF_F_IPV6_CSUM) :
(NETIF_F_HW_CSUM | NETIF_F_IP_CSUM))));
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 80ce478c4851..0ea96c4d334d 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -3271,14 +3271,24 @@ static void addrconf_gre_config(struct net_device *dev)
static int fixup_permanent_addr(struct inet6_dev *idev,
struct inet6_ifaddr *ifp)
{
- if (!ifp->rt) {
- struct rt6_info *rt;
+ /* rt6i_ref == 0 means the host route was removed from the
+ * FIB, for example, if 'lo' device is taken down. In that
+ * case regenerate the host route.
+ */
+ if (!ifp->rt || !atomic_read(&ifp->rt->rt6i_ref)) {
+ struct rt6_info *rt, *prev;
rt = addrconf_dst_alloc(idev, &ifp->addr, false);
if (unlikely(IS_ERR(rt)))
return PTR_ERR(rt);
+ /* ifp->rt can be accessed outside of rtnl */
+ spin_lock(&ifp->lock);
+ prev = ifp->rt;
ifp->rt = rt;
+ spin_unlock(&ifp->lock);
+
+ ip6_rt_put(prev);
}
if (!(ifp->flags & IFA_F_NOPREFIXROUTE)) {
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index a9a9553ee63d..e82e59f22dfc 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -933,8 +933,6 @@ static int __init inet6_init(void)
if (err)
goto igmp_fail;
- ipv6_stub = &ipv6_stub_impl;
-
err = ipv6_netfilter_init();
if (err)
goto netfilter_fail;
@@ -1010,6 +1008,10 @@ static int __init inet6_init(void)
if (err)
goto sysctl_fail;
#endif
+
+ /* ensure that ipv6 stubs are visible only after ipv6 is ready */
+ wmb();
+ ipv6_stub = &ipv6_stub_impl;
out:
return err;
diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c
index 25192a3b0cd7..d32e2110aff2 100644
--- a/net/ipv6/exthdrs.c
+++ b/net/ipv6/exthdrs.c
@@ -909,6 +909,8 @@ static void ipv6_push_rthdr(struct sk_buff *skb, u8 *proto,
{
switch (opt->type) {
case IPV6_SRCRT_TYPE_0:
+ case IPV6_SRCRT_STRICT:
+ case IPV6_SRCRT_TYPE_2:
ipv6_push_rthdr0(skb, proto, opt, addr_p, saddr);
break;
case IPV6_SRCRT_TYPE_4:
@@ -1163,6 +1165,8 @@ struct in6_addr *fl6_update_dst(struct flowi6 *fl6,
switch (opt->srcrt->type) {
case IPV6_SRCRT_TYPE_0:
+ case IPV6_SRCRT_STRICT:
+ case IPV6_SRCRT_TYPE_2:
fl6->daddr = *((struct rt0_hdr *)opt->srcrt)->addr;
break;
case IPV6_SRCRT_TYPE_4:
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 75fac933c209..a9692ec0cd6d 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -1037,7 +1037,7 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield,
struct ip6_tnl *t = netdev_priv(dev);
struct net *net = t->net;
struct net_device_stats *stats = &t->dev->stats;
- struct ipv6hdr *ipv6h = ipv6_hdr(skb);
+ struct ipv6hdr *ipv6h;
struct ipv6_tel_txoption opt;
struct dst_entry *dst = NULL, *ndst = NULL;
struct net_device *tdev;
@@ -1057,26 +1057,28 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield,
/* NBMA tunnel */
if (ipv6_addr_any(&t->parms.raddr)) {
- struct in6_addr *addr6;
- struct neighbour *neigh;
- int addr_type;
+ if (skb->protocol == htons(ETH_P_IPV6)) {
+ struct in6_addr *addr6;
+ struct neighbour *neigh;
+ int addr_type;
- if (!skb_dst(skb))
- goto tx_err_link_failure;
+ if (!skb_dst(skb))
+ goto tx_err_link_failure;
- neigh = dst_neigh_lookup(skb_dst(skb),
- &ipv6_hdr(skb)->daddr);
- if (!neigh)
- goto tx_err_link_failure;
+ neigh = dst_neigh_lookup(skb_dst(skb),
+ &ipv6_hdr(skb)->daddr);
+ if (!neigh)
+ goto tx_err_link_failure;
- addr6 = (struct in6_addr *)&neigh->primary_key;
- addr_type = ipv6_addr_type(addr6);
+ addr6 = (struct in6_addr *)&neigh->primary_key;
+ addr_type = ipv6_addr_type(addr6);
- if (addr_type == IPV6_ADDR_ANY)
- addr6 = &ipv6_hdr(skb)->daddr;
+ if (addr_type == IPV6_ADDR_ANY)
+ addr6 = &ipv6_hdr(skb)->daddr;
- memcpy(&fl6->daddr, addr6, sizeof(fl6->daddr));
- neigh_release(neigh);
+ memcpy(&fl6->daddr, addr6, sizeof(fl6->daddr));
+ neigh_release(neigh);
+ }
} else if (!(t->parms.flags &
(IP6_TNL_F_USE_ORIG_TCLASS | IP6_TNL_F_USE_ORIG_FWMARK))) {
/* enable the cache only only if the routing decision does
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 7ebac630d3c6..cb1766724a4c 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1749,7 +1749,8 @@ static int ndisc_netdev_event(struct notifier_block *this, unsigned long event,
idev = in6_dev_get(dev);
if (!idev)
break;
- if (idev->cnf.ndisc_notify)
+ if (idev->cnf.ndisc_notify ||
+ net->ipv6.devconf_all->ndisc_notify)
ndisc_send_unsol_na(dev);
in6_dev_put(idev);
break;
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index f174e76e6505..0da6a12b5472 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -1178,8 +1178,7 @@ static int rawv6_ioctl(struct sock *sk, int cmd, unsigned long arg)
spin_lock_bh(&sk->sk_receive_queue.lock);
skb = skb_peek(&sk->sk_receive_queue);
if (skb)
- amount = skb_tail_pointer(skb) -
- skb_transport_header(skb);
+ amount = skb->len;
spin_unlock_bh(&sk->sk_receive_queue.lock);
return put_user(amount, (int __user *)arg);
}
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 8489beff5c25..ea81ccf3c7d6 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -3836,6 +3836,8 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
case PACKET_HDRLEN:
if (len > sizeof(int))
len = sizeof(int);
+ if (len < sizeof(int))
+ return -EINVAL;
if (copy_from_user(&val, optval, len))
return -EFAULT;
switch (val) {
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 7130e73bd42c..bdce99f9407a 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -866,6 +866,14 @@ static void tipc_sk_proto_rcv(struct tipc_sock *tsk, struct sk_buff *skb,
if (!tsk_peer_msg(tsk, hdr))
goto exit;
+ if (unlikely(msg_errcode(hdr))) {
+ tipc_set_sk_state(sk, TIPC_DISCONNECTING);
+ tipc_node_remove_conn(sock_net(sk), tsk_peer_node(tsk),
+ tsk_peer_port(tsk));
+ sk->sk_state_change(sk);
+ goto exit;
+ }
+
tsk->probe_unacked = false;
if (mtyp == CONN_PROBE) {
@@ -1083,7 +1091,7 @@ static int __tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dlen)
}
} while (sent < dlen && !rc);
- return rc ? rc : sent;
+ return sent ? sent : rc;
}
/**
@@ -1259,7 +1267,10 @@ static int tipc_wait_for_rcvmsg(struct socket *sock, long *timeop)
struct sock *sk = sock->sk;
DEFINE_WAIT(wait);
long timeo = *timeop;
- int err;
+ int err = sock_error(sk);
+
+ if (err)
+ return err;
for (;;) {
prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
@@ -1281,6 +1292,10 @@ static int tipc_wait_for_rcvmsg(struct socket *sock, long *timeop)
err = sock_intr_errno(timeo);
if (signal_pending(current))
break;
+
+ err = sock_error(sk);
+ if (err)
+ break;
}
finish_wait(sk_sleep(sk), &wait);
*timeop = timeo;
@@ -1484,7 +1499,7 @@ restart:
if (unlikely(flags & MSG_PEEK))
goto exit;
- tsk->rcv_unacked += tsk_inc(tsk, hlen + sz);
+ tsk->rcv_unacked += tsk_inc(tsk, hlen + msg_data_sz(msg));
if (unlikely(tsk->rcv_unacked >= (tsk->rcv_win / 4)))
tipc_sk_send_ack(tsk);
tsk_advance_rx_queue(sk);
@@ -1551,6 +1566,8 @@ static bool filter_connect(struct tipc_sock *tsk, struct sk_buff *skb)
struct sock *sk = &tsk->sk;
struct net *net = sock_net(sk);
struct tipc_msg *hdr = buf_msg(skb);
+ u32 pport = msg_origport(hdr);
+ u32 pnode = msg_orignode(hdr);
if (unlikely(msg_mcast(hdr)))
return false;
@@ -1558,18 +1575,28 @@ static bool filter_connect(struct tipc_sock *tsk, struct sk_buff *skb)
switch (sk->sk_state) {
case TIPC_CONNECTING:
/* Accept only ACK or NACK message */
- if (unlikely(!msg_connected(hdr)))
- return false;
+ if (unlikely(!msg_connected(hdr))) {
+ if (pport != tsk_peer_port(tsk) ||
+ pnode != tsk_peer_node(tsk))
+ return false;
+
+ tipc_set_sk_state(sk, TIPC_DISCONNECTING);
+ sk->sk_err = ECONNREFUSED;
+ sk->sk_state_change(sk);
+ return true;
+ }
if (unlikely(msg_errcode(hdr))) {
tipc_set_sk_state(sk, TIPC_DISCONNECTING);
sk->sk_err = ECONNREFUSED;
+ sk->sk_state_change(sk);
return true;
}
if (unlikely(!msg_isdata(hdr))) {
tipc_set_sk_state(sk, TIPC_DISCONNECTING);
sk->sk_err = EINVAL;
+ sk->sk_state_change(sk);
return true;
}
@@ -1581,8 +1608,7 @@ static bool filter_connect(struct tipc_sock *tsk, struct sk_buff *skb)
return true;
/* If empty 'ACK-' message, wake up sleeping connect() */
- if (waitqueue_active(sk_sleep(sk)))
- wake_up_interruptible(sk_sleep(sk));
+ sk->sk_data_ready(sk);
/* 'ACK-' message is neither accepted nor rejected: */
msg_set_dest_droppable(hdr, 1);
diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c
index 46bdb4fbed0b..e23570b647ae 100644
--- a/net/xfrm/xfrm_input.c
+++ b/net/xfrm/xfrm_input.c
@@ -395,7 +395,7 @@ resume:
if (xo)
xfrm_gro = xo->flags & XFRM_GRO;
- err = x->inner_mode->afinfo->transport_finish(skb, async);
+ err = x->inner_mode->afinfo->transport_finish(skb, xfrm_gro || async);
if (xfrm_gro) {
skb_dst_drop(skb);
gro_cells_receive(&gro_cells, skb);
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 236cbbc0ab9c..dfc77b9c5e5a 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -1006,6 +1006,10 @@ int xfrm_policy_flush(struct net *net, u8 type, bool task_valid)
err = -ESRCH;
out:
spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
+
+ if (cnt)
+ xfrm_garbage_collect(net);
+
return err;
}
EXPORT_SYMBOL(xfrm_policy_flush);
diff --git a/sound/core/seq/seq_lock.c b/sound/core/seq/seq_lock.c
index 3b693e924db7..12ba83367b1b 100644
--- a/sound/core/seq/seq_lock.c
+++ b/sound/core/seq/seq_lock.c
@@ -28,19 +28,16 @@
/* wait until all locks are released */
void snd_use_lock_sync_helper(snd_use_lock_t *lockp, const char *file, int line)
{
- int max_count = 5 * HZ;
+ int warn_count = 5 * HZ;
if (atomic_read(lockp) < 0) {
pr_warn("ALSA: seq_lock: lock trouble [counter = %d] in %s:%d\n", atomic_read(lockp), file, line);
return;
}
while (atomic_read(lockp) > 0) {
- if (max_count == 0) {
- pr_warn("ALSA: seq_lock: timeout [%d left] in %s:%d\n", atomic_read(lockp), file, line);
- break;
- }
+ if (warn_count-- == 0)
+ pr_warn("ALSA: seq_lock: waiting [%d left] in %s:%d\n", atomic_read(lockp), file, line);
schedule_timeout_uninterruptible(1);
- max_count--;
}
}
diff --git a/sound/firewire/lib.h b/sound/firewire/lib.h
index f6769312ebfc..c3768cd494a5 100644
--- a/sound/firewire/lib.h
+++ b/sound/firewire/lib.h
@@ -45,7 +45,7 @@ struct snd_fw_async_midi_port {
struct snd_rawmidi_substream *substream;
snd_fw_async_midi_port_fill fill;
- unsigned int consume_bytes;
+ int consume_bytes;
};
int snd_fw_async_midi_port_init(struct snd_fw_async_midi_port *port,
diff --git a/sound/firewire/oxfw/oxfw.c b/sound/firewire/oxfw/oxfw.c
index 74d7fb6efce6..413ab6313bb6 100644
--- a/sound/firewire/oxfw/oxfw.c
+++ b/sound/firewire/oxfw/oxfw.c
@@ -227,11 +227,11 @@ static void do_registration(struct work_struct *work)
if (err < 0)
goto error;
- err = detect_quirks(oxfw);
+ err = snd_oxfw_stream_discover(oxfw);
if (err < 0)
goto error;
- err = snd_oxfw_stream_discover(oxfw);
+ err = detect_quirks(oxfw);
if (err < 0)
goto error;
diff --git a/sound/pci/hda/dell_wmi_helper.c b/sound/pci/hda/dell_wmi_helper.c
index 19d41da79f93..7efa7bd7acb2 100644
--- a/sound/pci/hda/dell_wmi_helper.c
+++ b/sound/pci/hda/dell_wmi_helper.c
@@ -2,11 +2,11 @@
* to be included from codec driver
*/
-#if IS_ENABLED(CONFIG_LEDS_DELL_NETBOOKS)
+#if IS_ENABLED(CONFIG_DELL_LAPTOP)
#include <linux/dell-led.h>
static int dell_led_value;
-static int (*dell_led_set_func)(int, int);
+static int (*dell_micmute_led_set_func)(int);
static void (*dell_old_cap_hook)(struct hda_codec *,
struct snd_kcontrol *,
struct snd_ctl_elem_value *);
@@ -18,7 +18,7 @@ static void update_dell_wmi_micmute_led(struct hda_codec *codec,
if (dell_old_cap_hook)
dell_old_cap_hook(codec, kcontrol, ucontrol);
- if (!ucontrol || !dell_led_set_func)
+ if (!ucontrol || !dell_micmute_led_set_func)
return;
if (strcmp("Capture Switch", ucontrol->id.name) == 0 && ucontrol->id.index == 0) {
/* TODO: How do I verify if it's a mono or stereo here? */
@@ -26,8 +26,8 @@ static void update_dell_wmi_micmute_led(struct hda_codec *codec,
if (val == dell_led_value)
return;
dell_led_value = val;
- if (dell_led_set_func)
- dell_led_set_func(DELL_LED_MICMUTE, dell_led_value);
+ if (dell_micmute_led_set_func)
+ dell_micmute_led_set_func(dell_led_value);
}
}
@@ -39,15 +39,15 @@ static void alc_fixup_dell_wmi(struct hda_codec *codec,
bool removefunc = false;
if (action == HDA_FIXUP_ACT_PROBE) {
- if (!dell_led_set_func)
- dell_led_set_func = symbol_request(dell_app_wmi_led_set);
- if (!dell_led_set_func) {
- codec_warn(codec, "Failed to find dell wmi symbol dell_app_wmi_led_set\n");
+ if (!dell_micmute_led_set_func)
+ dell_micmute_led_set_func = symbol_request(dell_micmute_led_set);
+ if (!dell_micmute_led_set_func) {
+ codec_warn(codec, "Failed to find dell wmi symbol dell_micmute_led_set\n");
return;
}
removefunc = true;
- if (dell_led_set_func(DELL_LED_MICMUTE, false) >= 0) {
+ if (dell_micmute_led_set_func(false) >= 0) {
dell_led_value = 0;
if (spec->gen.num_adc_nids > 1 && !spec->gen.dyn_adc_switch)
codec_dbg(codec, "Skipping micmute LED control due to several ADCs");
@@ -60,17 +60,17 @@ static void alc_fixup_dell_wmi(struct hda_codec *codec,
}
- if (dell_led_set_func && (action == HDA_FIXUP_ACT_FREE || removefunc)) {
- symbol_put(dell_app_wmi_led_set);
- dell_led_set_func = NULL;
+ if (dell_micmute_led_set_func && (action == HDA_FIXUP_ACT_FREE || removefunc)) {
+ symbol_put(dell_micmute_led_set);
+ dell_micmute_led_set_func = NULL;
dell_old_cap_hook = NULL;
}
}
-#else /* CONFIG_LEDS_DELL_NETBOOKS */
+#else /* CONFIG_DELL_LAPTOP */
static void alc_fixup_dell_wmi(struct hda_codec *codec,
const struct hda_fixup *fix, int action)
{
}
-#endif /* CONFIG_LEDS_DELL_NETBOOKS */
+#endif /* CONFIG_DELL_LAPTOP */
diff --git a/sound/soc/intel/boards/bytcr_rt5640.c b/sound/soc/intel/boards/bytcr_rt5640.c
index 5c7219fb3aa8..9e2a3404a836 100644
--- a/sound/soc/intel/boards/bytcr_rt5640.c
+++ b/sound/soc/intel/boards/bytcr_rt5640.c
@@ -621,7 +621,7 @@ static struct snd_soc_dai_link byt_rt5640_dais[] = {
.codec_dai_name = "snd-soc-dummy-dai",
.codec_name = "snd-soc-dummy",
.platform_name = "sst-mfld-platform",
- .ignore_suspend = 1,
+ .nonatomic = true,
.dynamic = 1,
.dpcm_playback = 1,
.dpcm_capture = 1,
@@ -634,7 +634,6 @@ static struct snd_soc_dai_link byt_rt5640_dais[] = {
.codec_dai_name = "snd-soc-dummy-dai",
.codec_name = "snd-soc-dummy",
.platform_name = "sst-mfld-platform",
- .ignore_suspend = 1,
.nonatomic = true,
.dynamic = 1,
.dpcm_playback = 1,
@@ -661,6 +660,7 @@ static struct snd_soc_dai_link byt_rt5640_dais[] = {
| SND_SOC_DAIFMT_CBS_CFS,
.be_hw_params_fixup = byt_rt5640_codec_fixup,
.ignore_suspend = 1,
+ .nonatomic = true,
.dpcm_playback = 1,
.dpcm_capture = 1,
.init = byt_rt5640_init,
diff --git a/sound/soc/intel/boards/bytcr_rt5651.c b/sound/soc/intel/boards/bytcr_rt5651.c
index 3186f015939f..8164bec63bf1 100644
--- a/sound/soc/intel/boards/bytcr_rt5651.c
+++ b/sound/soc/intel/boards/bytcr_rt5651.c
@@ -235,7 +235,6 @@ static struct snd_soc_dai_link byt_rt5651_dais[] = {
.codec_dai_name = "snd-soc-dummy-dai",
.codec_name = "snd-soc-dummy",
.platform_name = "sst-mfld-platform",
- .ignore_suspend = 1,
.nonatomic = true,
.dynamic = 1,
.dpcm_playback = 1,
@@ -249,7 +248,6 @@ static struct snd_soc_dai_link byt_rt5651_dais[] = {
.codec_dai_name = "snd-soc-dummy-dai",
.codec_name = "snd-soc-dummy",
.platform_name = "sst-mfld-platform",
- .ignore_suspend = 1,
.nonatomic = true,
.dynamic = 1,
.dpcm_playback = 1,
diff --git a/sound/soc/soc-topology.c b/sound/soc/soc-topology.c
index 3e9b1c0bb1ce..058bc99c6c34 100644
--- a/sound/soc/soc-topology.c
+++ b/sound/soc/soc-topology.c
@@ -933,6 +933,7 @@ static int soc_tplg_denum_create_texts(struct soc_enum *se,
}
}
+ se->texts = (const char * const *)se->dobj.control.dtexts;
return 0;
err:
diff --git a/sound/soc/sti/uniperif.h b/sound/soc/sti/uniperif.h
index d487dd2ef016..cfcb0ea9d99d 100644
--- a/sound/soc/sti/uniperif.h
+++ b/sound/soc/sti/uniperif.h
@@ -1299,6 +1299,7 @@ struct uniperif {
int ver; /* IP version, used by register access macros */
struct regmap_field *clk_sel;
struct regmap_field *valid_sel;
+ spinlock_t irq_lock; /* use to prevent race condition with IRQ */
/* capabilities */
const struct snd_pcm_hardware *hw;
diff --git a/sound/soc/sti/uniperif_player.c b/sound/soc/sti/uniperif_player.c
index 60ae31a303ab..d7e8dd46d2cc 100644
--- a/sound/soc/sti/uniperif_player.c
+++ b/sound/soc/sti/uniperif_player.c
@@ -65,10 +65,13 @@ static irqreturn_t uni_player_irq_handler(int irq, void *dev_id)
unsigned int status;
unsigned int tmp;
- if (player->state == UNIPERIF_STATE_STOPPED) {
- /* Unexpected IRQ: do nothing */
- return IRQ_NONE;
- }
+ spin_lock(&player->irq_lock);
+ if (!player->substream)
+ goto irq_spin_unlock;
+
+ snd_pcm_stream_lock(player->substream);
+ if (player->state == UNIPERIF_STATE_STOPPED)
+ goto stream_unlock;
/* Get interrupt status & clear them immediately */
status = GET_UNIPERIF_ITS(player);
@@ -88,9 +91,7 @@ static irqreturn_t uni_player_irq_handler(int irq, void *dev_id)
SET_UNIPERIF_ITM_BCLR_FIFO_ERROR(player);
/* Stop the player */
- snd_pcm_stream_lock(player->substream);
snd_pcm_stop(player->substream, SNDRV_PCM_STATE_XRUN);
- snd_pcm_stream_unlock(player->substream);
}
ret = IRQ_HANDLED;
@@ -104,9 +105,7 @@ static irqreturn_t uni_player_irq_handler(int irq, void *dev_id)
SET_UNIPERIF_ITM_BCLR_DMA_ERROR(player);
/* Stop the player */
- snd_pcm_stream_lock(player->substream);
snd_pcm_stop(player->substream, SNDRV_PCM_STATE_XRUN);
- snd_pcm_stream_unlock(player->substream);
ret = IRQ_HANDLED;
}
@@ -116,7 +115,8 @@ static irqreturn_t uni_player_irq_handler(int irq, void *dev_id)
if (!player->underflow_enabled) {
dev_err(player->dev,
"unexpected Underflow recovering\n");
- return -EPERM;
+ ret = -EPERM;
+ goto stream_unlock;
}
/* Read the underflow recovery duration */
tmp = GET_UNIPERIF_STATUS_1_UNDERFLOW_DURATION(player);
@@ -138,13 +138,16 @@ static irqreturn_t uni_player_irq_handler(int irq, void *dev_id)
dev_err(player->dev, "Underflow recovery failed\n");
/* Stop the player */
- snd_pcm_stream_lock(player->substream);
snd_pcm_stop(player->substream, SNDRV_PCM_STATE_XRUN);
- snd_pcm_stream_unlock(player->substream);
ret = IRQ_HANDLED;
}
+stream_unlock:
+ snd_pcm_stream_unlock(player->substream);
+irq_spin_unlock:
+ spin_unlock(&player->irq_lock);
+
return ret;
}
@@ -588,6 +591,7 @@ static int uni_player_ctl_iec958_put(struct snd_kcontrol *kcontrol,
struct sti_uniperiph_data *priv = snd_soc_dai_get_drvdata(dai);
struct uniperif *player = priv->dai_data.uni;
struct snd_aes_iec958 *iec958 = &player->stream_settings.iec958;
+ unsigned long flags;
mutex_lock(&player->ctrl_lock);
iec958->status[0] = ucontrol->value.iec958.status[0];
@@ -596,12 +600,14 @@ static int uni_player_ctl_iec958_put(struct snd_kcontrol *kcontrol,
iec958->status[3] = ucontrol->value.iec958.status[3];
mutex_unlock(&player->ctrl_lock);
+ spin_lock_irqsave(&player->irq_lock, flags);
if (player->substream && player->substream->runtime)
uni_player_set_channel_status(player,
player->substream->runtime);
else
uni_player_set_channel_status(player, NULL);
+ spin_unlock_irqrestore(&player->irq_lock, flags);
return 0;
}
@@ -686,9 +692,12 @@ static int uni_player_startup(struct snd_pcm_substream *substream,
{
struct sti_uniperiph_data *priv = snd_soc_dai_get_drvdata(dai);
struct uniperif *player = priv->dai_data.uni;
+ unsigned long flags;
int ret;
+ spin_lock_irqsave(&player->irq_lock, flags);
player->substream = substream;
+ spin_unlock_irqrestore(&player->irq_lock, flags);
player->clk_adj = 0;
@@ -986,12 +995,15 @@ static void uni_player_shutdown(struct snd_pcm_substream *substream,
{
struct sti_uniperiph_data *priv = snd_soc_dai_get_drvdata(dai);
struct uniperif *player = priv->dai_data.uni;
+ unsigned long flags;
+ spin_lock_irqsave(&player->irq_lock, flags);
if (player->state != UNIPERIF_STATE_STOPPED)
/* Stop the player */
uni_player_stop(player);
player->substream = NULL;
+ spin_unlock_irqrestore(&player->irq_lock, flags);
}
static int uni_player_parse_dt_audio_glue(struct platform_device *pdev,
@@ -1096,6 +1108,7 @@ int uni_player_init(struct platform_device *pdev,
}
mutex_init(&player->ctrl_lock);
+ spin_lock_init(&player->irq_lock);
/* Ensure that disabled by default */
SET_UNIPERIF_CONFIG_BACK_STALL_REQ_DISABLE(player);
diff --git a/sound/soc/sti/uniperif_reader.c b/sound/soc/sti/uniperif_reader.c
index 93a8df6ed880..ee0055e60852 100644
--- a/sound/soc/sti/uniperif_reader.c
+++ b/sound/soc/sti/uniperif_reader.c
@@ -46,10 +46,15 @@ static irqreturn_t uni_reader_irq_handler(int irq, void *dev_id)
struct uniperif *reader = dev_id;
unsigned int status;
+ spin_lock(&reader->irq_lock);
+ if (!reader->substream)
+ goto irq_spin_unlock;
+
+ snd_pcm_stream_lock(reader->substream);
if (reader->state == UNIPERIF_STATE_STOPPED) {
/* Unexpected IRQ: do nothing */
dev_warn(reader->dev, "unexpected IRQ\n");
- return IRQ_HANDLED;
+ goto stream_unlock;
}
/* Get interrupt status & clear them immediately */
@@ -60,13 +65,16 @@ static irqreturn_t uni_reader_irq_handler(int irq, void *dev_id)
if (unlikely(status & UNIPERIF_ITS_FIFO_ERROR_MASK(reader))) {
dev_err(reader->dev, "FIFO error detected\n");
- snd_pcm_stream_lock(reader->substream);
snd_pcm_stop(reader->substream, SNDRV_PCM_STATE_XRUN);
- snd_pcm_stream_unlock(reader->substream);
- return IRQ_HANDLED;
+ ret = IRQ_HANDLED;
}
+stream_unlock:
+ snd_pcm_stream_unlock(reader->substream);
+irq_spin_unlock:
+ spin_unlock(&reader->irq_lock);
+
return ret;
}
@@ -347,9 +355,12 @@ static int uni_reader_startup(struct snd_pcm_substream *substream,
{
struct sti_uniperiph_data *priv = snd_soc_dai_get_drvdata(dai);
struct uniperif *reader = priv->dai_data.uni;
+ unsigned long flags;
int ret;
+ spin_lock_irqsave(&reader->irq_lock, flags);
reader->substream = substream;
+ spin_unlock_irqrestore(&reader->irq_lock, flags);
if (!UNIPERIF_TYPE_IS_TDM(reader))
return 0;
@@ -375,12 +386,15 @@ static void uni_reader_shutdown(struct snd_pcm_substream *substream,
{
struct sti_uniperiph_data *priv = snd_soc_dai_get_drvdata(dai);
struct uniperif *reader = priv->dai_data.uni;
+ unsigned long flags;
+ spin_lock_irqsave(&reader->irq_lock, flags);
if (reader->state != UNIPERIF_STATE_STOPPED) {
/* Stop the reader */
uni_reader_stop(reader);
}
reader->substream = NULL;
+ spin_unlock_irqrestore(&reader->irq_lock, flags);
}
static const struct snd_soc_dai_ops uni_reader_dai_ops = {
@@ -415,6 +429,8 @@ int uni_reader_init(struct platform_device *pdev,
return -EBUSY;
}
+ spin_lock_init(&reader->irq_lock);
+
return 0;
}
EXPORT_SYMBOL_GPL(uni_reader_init);