559 files changed, 19662 insertions, 9281 deletions
diff --git a/Documentation/ABI/testing/sysfs-bus-rbd b/Documentation/ABI/testing/sysfs-bus-rbd
index 501adc2a9ec7..2ddd680929d8 100644
--- a/Documentation/ABI/testing/sysfs-bus-rbd
+++ b/Documentation/ABI/testing/sysfs-bus-rbd
@@ -94,5 +94,5 @@ current_snap
 
 parent
 
-	Information identifying the pool, image, and snapshot id for
-	the parent image in a layered rbd image (format 2 only).
+	Information identifying the chain of parent images in a layered rbd
+	image.  Entries are separated by empty lines.
diff --git a/Documentation/ABI/testing/sysfs-fs-xfs b/Documentation/ABI/testing/sysfs-fs-xfs
new file mode 100644
index 000000000000..ea0cc8c42093
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-fs-xfs
@@ -0,0 +1,39 @@
+What:		/sys/fs/xfs/<disk>/log/log_head_lsn
+Date:		July 2014
+KernelVersion:	3.17
+Contact:	xfs@oss.sgi.com
+Description:
+		The log sequence number (LSN) of the current head of the
+		log. The LSN is exported in "cycle:basic block" format.
+Users:		xfstests
+
+What:		/sys/fs/xfs/<disk>/log/log_tail_lsn
+Date:		July 2014
+KernelVersion:	3.17
+Contact:	xfs@oss.sgi.com
+Description:
+		The log sequence number (LSN) of the current tail of the
+		log. The LSN is exported in "cycle:basic block" format.
+
+What:		/sys/fs/xfs/<disk>/log/reserve_grant_head
+Date:		July 2014
+KernelVersion:	3.17
+Contact:	xfs@oss.sgi.com
+Description:
+		The current state of the log reserve grant head. It
+		represents the total log reservation of all currently
+		outstanding transactions. The grant head is exported in
+		"cycle:bytes" format.
+Users:		xfstests
+
+What:		/sys/fs/xfs/<disk>/log/write_grant_head
+Date:		July 2014
+KernelVersion:	3.17
+Contact:	xfs@oss.sgi.com
+Description:
+		The current state of the log write grant head. It
+		represents the total log reservation of all currently
+		oustanding transactions, including regrants due to
+		rolling transactions. The grant head is exported in
+		"cycle:bytes" format.
+Users:		xfstests
diff --git a/Documentation/device-mapper/switch.txt b/Documentation/device-mapper/switch.txt
index 2fa749387be8..8897d0494838 100644
--- a/Documentation/device-mapper/switch.txt
+++ b/Documentation/device-mapper/switch.txt
@@ -106,6 +106,11 @@ which paths.
     The path number in the range 0 ... (<num_paths> - 1).
     Expressed in hexadecimal (WITHOUT any prefix like 0x).
 
+R<n>,<m>
+    This parameter allows repetitive patterns to be loaded quickly. <n> and <m>
+    are hexadecimal numbers. The last <n> mappings are repeated in the next <m>
+    slots.
+
 Status
 ======
 
@@ -124,3 +129,10 @@ Create a switch device with 64kB region size:
 Set mappings for the first 7 entries to point to devices switch0, switch1,
 switch2, switch0, switch1, switch2, switch1:
     dmsetup message switch 0 set_region_mappings 0:0 :1 :2 :0 :1 :2 :1
+
+Set repetitive mapping. This command:
+    dmsetup message switch 0 set_region_mappings 1000:1 :2 R2,10
+is equivalent to:
+    dmsetup message switch 0 set_region_mappings 1000:1 :2 :1 :2 :1 :2 :1 :2 \
+	:1 :2 :1 :2 :1 :2 :1 :2 :1 :2
+
diff --git a/Documentation/devicetree/bindings/dma/fsl-imx-sdma.txt b/Documentation/devicetree/bindings/dma/fsl-imx-sdma.txt
index e577196a12c0..4659fd952301 100644
--- a/Documentation/devicetree/bindings/dma/fsl-imx-sdma.txt
+++ b/Documentation/devicetree/bindings/dma/fsl-imx-sdma.txt
@@ -47,6 +47,7 @@ The full ID of peripheral types can be found below.
 	20	ASRC
 	21	ESAI
 	22	SSI Dual FIFO	(needs firmware ver >= 2)
+	23	Shared ASRC
 
 The third cell specifies the transfer priority as below.
 
diff --git a/Documentation/devicetree/bindings/dma/mpc512x-dma.txt b/Documentation/devicetree/bindings/dma/mpc512x-dma.txt
new file mode 100644
index 000000000000..a6511df165c5
--- /dev/null
+++ b/Documentation/devicetree/bindings/dma/mpc512x-dma.txt
@@ -0,0 +1,29 @@
+* Freescale MPC512x and MPC8308 DMA Controller
+
+The DMA controller in Freescale MPC512x and MPC8308 SoCs can move
+blocks of memory contents between memory and peripherals or
+from memory to memory.
+
+Refer to "Generic DMA Controller and DMA request bindings" in
+the dma/dma.txt file for a more detailed description of binding.
+
+Required properties:
+- compatible: should be "fsl,mpc5121-dma" or "fsl,mpc8308-dma";
+- reg: should contain the DMA controller registers location and length;
+- interrupt for the DMA controller: syntax of interrupt client node
+	is described in interrupt-controller/interrupts.txt file.
+- #dma-cells: the length of the DMA specifier, must be <1>.
+	Each channel of this DMA controller has a peripheral request line,
+	the assignment is fixed in hardware. This one cell
+	in dmas property of a client device represents the channel number.
+
+Example:
+
+	dma0: dma@14000 {
+		compatible = "fsl,mpc5121-dma";
+		reg = <0x14000 0x1800>;
+		interrupts = <65 0x8>;
+		#dma-cells = <1>;
+	};
+
+DMA clients must use the format described in dma/dma.txt file.
diff --git a/Documentation/devicetree/bindings/dma/nbpfaxi.txt b/Documentation/devicetree/bindings/dma/nbpfaxi.txt
new file mode 100644
index 000000000000..d5e2522b9ec1
--- /dev/null
+++ b/Documentation/devicetree/bindings/dma/nbpfaxi.txt
@@ -0,0 +1,61 @@
+* Renesas "Type-AXI" NBPFAXI* DMA controllers
+
+* DMA controller
+
+Required properties
+
+- compatible:	must be one of
+		"renesas,nbpfaxi64dmac1b4"
+		"renesas,nbpfaxi64dmac1b8"
+		"renesas,nbpfaxi64dmac1b16"
+		"renesas,nbpfaxi64dmac4b4"
+		"renesas,nbpfaxi64dmac4b8"
+		"renesas,nbpfaxi64dmac4b16"
+		"renesas,nbpfaxi64dmac8b4"
+		"renesas,nbpfaxi64dmac8b8"
+		"renesas,nbpfaxi64dmac8b16"
+- #dma-cells:	must be 2: the first integer is a terminal number, to which this
+		slave is connected, the second one is flags. Flags is a bitmask
+		with the following bits defined:
+
+#define NBPF_SLAVE_RQ_HIGH	1
+#define NBPF_SLAVE_RQ_LOW	2
+#define NBPF_SLAVE_RQ_LEVEL	4
+
+Optional properties:
+
+You can use dma-channels and dma-requests as described in dma.txt, although they
+won't be used, this information is derived from the compatibility string.
+
+Example:
+
+	dma: dma-controller@48000000 {
+		compatible = "renesas,nbpfaxi64dmac8b4";
+		reg = <0x48000000 0x400>;
+		interrupts = <0 12 0x4
+			      0 13 0x4
+			      0 14 0x4
+			      0 15 0x4
+			      0 16 0x4
+			      0 17 0x4
+			      0 18 0x4
+			      0 19 0x4>;
+		#dma-cells = <2>;
+		dma-channels = <8>;
+		dma-requests = <8>;
+	};
+
+* DMA client
+
+Required properties:
+
+dmas and dma-names are required, as described in dma.txt.
+
+Example:
+
+#include <dt-bindings/dma/nbpfaxi.h>
+
+...
+		dmas = <&dma 0 (NBPF_SLAVE_RQ_HIGH | NBPF_SLAVE_RQ_LEVEL)
+			&dma 1 (NBPF_SLAVE_RQ_HIGH | NBPF_SLAVE_RQ_LEVEL)>;
+		dma-names = "rx", "tx";
diff --git a/Documentation/devicetree/bindings/dma/rcar-audmapp.txt b/Documentation/devicetree/bindings/dma/rcar-audmapp.txt
new file mode 100644
index 000000000000..9f1d750d76de
--- /dev/null
+++ b/Documentation/devicetree/bindings/dma/rcar-audmapp.txt
@@ -0,0 +1,29 @@
+* R-Car Audio DMAC peri peri Device Tree bindings
+
+Required properties:
+- compatible:	should be "renesas,rcar-audmapp"
+- #dma-cells:	should be <1>, see "dmas" property below
+
+Example:
+	audmapp: audio-dma-pp@0xec740000 {
+		compatible = "renesas,rcar-audmapp";
+		#dma-cells = <1>;
+
+		reg = <0 0xec740000 0 0x200>;
+	};
+
+
+* DMA client
+
+Required properties:
+- dmas:		a list of <[DMA multiplexer phandle] [SRS/DRS value]> pairs,
+		where SRS/DRS values are fixed handles, specified in the SoC
+		manual as the value that would be written into the PDMACHCR.
+- dma-names:	a list of DMA channel names, one per "dmas" entry
+
+Example:
+
+	dmas = <&audmapp 0x2d00
+		&audmapp 0x3700>;
+	dma-names =  "src0_ssiu0",
+		     "dvc0_ssiu0";
diff --git a/Documentation/devicetree/bindings/dma/renesas,rcar-dmac.txt b/Documentation/devicetree/bindings/dma/renesas,rcar-dmac.txt
new file mode 100644
index 000000000000..df0f48bcf75a
--- /dev/null
+++ b/Documentation/devicetree/bindings/dma/renesas,rcar-dmac.txt
@@ -0,0 +1,98 @@
+* Renesas R-Car DMA Controller Device Tree bindings
+
+Renesas R-Car Generation 2 SoCs have have multiple multi-channel DMA
+controller instances named DMAC capable of serving multiple clients. Channels
+can be dedicated to specific clients or shared between a large number of
+clients.
+
+DMA clients are connected to the DMAC ports referenced by an 8-bit identifier
+called MID/RID.
+
+Each DMA client is connected to one dedicated port of the DMAC, identified by
+an 8-bit port number called the MID/RID. A DMA controller can thus serve up to
+256 clients in total. When the number of hardware channels is lower than the
+number of clients to be served, channels must be shared between multiple DMA
+clients. The association of DMA clients to DMAC channels is fully dynamic and
+not described in these device tree bindings.
+
+Required Properties:
+
+- compatible: must contain "renesas,rcar-dmac"
+
+- reg: base address and length of the registers block for the DMAC
+
+- interrupts: interrupt specifiers for the DMAC, one for each entry in
+  interrupt-names.
+- interrupt-names: one entry per channel, named "ch%u", where %u is the
+  channel number ranging from zero to the number of channels minus one.
+
+- clock-names: "fck" for the functional clock
+- clocks: a list of phandle + clock-specifier pairs, one for each entry
+  in clock-names.
+- clock-names: must contain "fck" for the functional clock.
+
+- #dma-cells: must be <1>, the cell specifies the MID/RID of the DMAC port
+  connected to the DMA client
+- dma-channels: number of DMA channels
+
+Example: R8A7790 (R-Car H2) SYS-DMACs
+
+	dmac0: dma-controller@e6700000 {
+		compatible = "renesas,rcar-dmac";
+		reg = <0 0xe6700000 0 0x20000>;
+		interrupts = <0 197 IRQ_TYPE_LEVEL_HIGH
+			      0 200 IRQ_TYPE_LEVEL_HIGH
+			      0 201 IRQ_TYPE_LEVEL_HIGH
+			      0 202 IRQ_TYPE_LEVEL_HIGH
+			      0 203 IRQ_TYPE_LEVEL_HIGH
+			      0 204 IRQ_TYPE_LEVEL_HIGH
+			      0 205 IRQ_TYPE_LEVEL_HIGH
+			      0 206 IRQ_TYPE_LEVEL_HIGH
+			      0 207 IRQ_TYPE_LEVEL_HIGH
+			      0 208 IRQ_TYPE_LEVEL_HIGH
+			      0 209 IRQ_TYPE_LEVEL_HIGH
+			      0 210 IRQ_TYPE_LEVEL_HIGH
+			      0 211 IRQ_TYPE_LEVEL_HIGH
+			      0 212 IRQ_TYPE_LEVEL_HIGH
+			      0 213 IRQ_TYPE_LEVEL_HIGH
+			      0 214 IRQ_TYPE_LEVEL_HIGH>;
+		interrupt-names = "error",
+				"ch0", "ch1", "ch2", "ch3",
+				"ch4", "ch5", "ch6", "ch7",
+				"ch8", "ch9", "ch10", "ch11",
+				"ch12", "ch13", "ch14";
+		clocks = <&mstp2_clks R8A7790_CLK_SYS_DMAC0>;
+		clock-names = "fck";
+		#dma-cells = <1>;
+		dma-channels = <15>;
+	};
+
+	dmac1: dma-controller@e6720000 {
+		compatible = "renesas,rcar-dmac";
+		reg = <0 0xe6720000 0 0x20000>;
+		interrupts = <0 220 IRQ_TYPE_LEVEL_HIGH
+			      0 216 IRQ_TYPE_LEVEL_HIGH
+			      0 217 IRQ_TYPE_LEVEL_HIGH
+			      0 218 IRQ_TYPE_LEVEL_HIGH
+			      0 219 IRQ_TYPE_LEVEL_HIGH
+			      0 308 IRQ_TYPE_LEVEL_HIGH
+			      0 309 IRQ_TYPE_LEVEL_HIGH
+			      0 310 IRQ_TYPE_LEVEL_HIGH
+			      0 311 IRQ_TYPE_LEVEL_HIGH
+			      0 312 IRQ_TYPE_LEVEL_HIGH
+			      0 313 IRQ_TYPE_LEVEL_HIGH
+			      0 314 IRQ_TYPE_LEVEL_HIGH
+			      0 315 IRQ_TYPE_LEVEL_HIGH
+			      0 316 IRQ_TYPE_LEVEL_HIGH
+			      0 317 IRQ_TYPE_LEVEL_HIGH
+			      0 318 IRQ_TYPE_LEVEL_HIGH>;
+		interrupt-names = "error",
+				"ch0", "ch1", "ch2", "ch3",
+				"ch4", "ch5", "ch6", "ch7",
+				"ch8", "ch9", "ch10", "ch11",
+				"ch12", "ch13", "ch14";
+		clocks = <&mstp2_clks R8A7790_CLK_SYS_DMAC1>;
+		clock-names = "fck";
+		#dma-cells = <1>;
+		dma-channels = <15>;
+	};
diff --git a/Documentation/devicetree/bindings/dma/ste-dma40.txt b/Documentation/devicetree/bindings/dma/ste-dma40.txt
index 1f5729f10621..95800ab37bb0 100644
--- a/Documentation/devicetree/bindings/dma/ste-dma40.txt
+++ b/Documentation/devicetree/bindings/dma/ste-dma40.txt
@@ -35,9 +35,11 @@ Required properties:
 
 Each dmas request consists of 4 cells:
   1. A phandle pointing to the DMA controller
-  2. Device Type
+  2. Device signal number, the signal line for single and burst requests
+     connected from the device to the DMA40 engine
   3. The DMA request line number (only when 'use fixed channel' is set)
-  4. A 32bit mask specifying; mode, direction and endianness [NB: This list will grow]
+  4. A 32bit mask specifying; mode, direction and endianness
+     [NB: This list will grow]
         0x00000001: Mode:
                 Logical channel when unset
                 Physical channel when set
@@ -54,6 +56,74 @@ Each dmas request consists of 4 cells:
                 Normal priority when unset
                 High priority when set
 
+Existing signal numbers for the DB8500 ASIC. Unless specified, the signals are
+bidirectional, i.e. the same for RX and TX operations:
+
+0:  SPI controller 0
+1:  SD/MMC controller 0 (unused)
+2:  SD/MMC controller 1 (unused)
+3:  SD/MMC controller 2 (unused)
+4:  I2C port 1
+5:  I2C port 3
+6:  I2C port 2
+7:  I2C port 4
+8:  Synchronous Serial Port SSP0
+9:  Synchronous Serial Port SSP1
+10: Multi-Channel Display Engine MCDE RX
+11: UART port 2
+12: UART port 1
+13: UART port 0
+14: Multirate Serial Port MSP2
+15: I2C port 0
+16: USB OTG in/out endpoints 7 & 15
+17: USB OTG in/out endpoints 6 & 14
+18: USB OTG in/out endpoints 5 & 13
+19: USB OTG in/out endpoints 4 & 12
+20: SLIMbus or HSI channel 0
+21: SLIMbus or HSI channel 1
+22: SLIMbus or HSI channel 2
+23: SLIMbus or HSI channel 3
+24: Multimedia DSP SXA0
+25: Multimedia DSP SXA1
+26: Multimedia DSP SXA2
+27: Multimedia DSP SXA3
+28: SD/MM controller 2
+29: SD/MM controller 0
+30: MSP port 1 on DB8500 v1, MSP port 3 on DB8500 v2
+31: MSP port 0 or SLIMbus channel 0
+32: SD/MM controller 1
+33: SPI controller 2
+34: i2c3 RX2 TX2
+35: SPI controller 1
+36: USB OTG in/out endpoints 3 & 11
+37: USB OTG in/out endpoints 2 & 10
+38: USB OTG in/out endpoints 1 & 9
+39: USB OTG in/out endpoints 8
+40: SPI controller 3
+41: SD/MM controller 3
+42: SD/MM controller 4
+43: SD/MM controller 5
+44: Multimedia DSP SXA4
+45: Multimedia DSP SXA5
+46: SLIMbus channel 8 or Multimedia DSP SXA6
+47: SLIMbus channel 9 or Multimedia DSP SXA7
+48: Crypto Accelerator 1
+49: Crypto Accelerator 1 TX or Hash Accelerator 1 TX
+50: Hash Accelerator 1 TX
+51: memcpy TX (to be used by the DMA driver for memcpy operations)
+52: SLIMbus or HSI channel 4
+53: SLIMbus or HSI channel 5
+54: SLIMbus or HSI channel 6
+55: SLIMbus or HSI channel 7
+56: memcpy (to be used by the DMA driver for memcpy operations)
+57: memcpy (to be used by the DMA driver for memcpy operations)
+58: memcpy (to be used by the DMA driver for memcpy operations)
+59: memcpy (to be used by the DMA driver for memcpy operations)
+60: memcpy (to be used by the DMA driver for memcpy operations)
+61: Crypto Accelerator 0
+62: Crypto Accelerator 0 TX or Hash Accelerator 0 TX
+63: Hash Accelerator 0 TX
+
 Example:
 
 	uart@80120000 {
diff --git a/Documentation/devicetree/bindings/dma/sun6i-dma.txt b/Documentation/devicetree/bindings/dma/sun6i-dma.txt
new file mode 100644
index 000000000000..3e145c1675b1
--- /dev/null
+++ b/Documentation/devicetree/bindings/dma/sun6i-dma.txt
@@ -0,0 +1,45 @@
+Allwinner A31 DMA Controller
+
+This driver follows the generic DMA bindings defined in dma.txt.
+
+Required properties:
+
+- compatible:	Must be "allwinner,sun6i-a31-dma"
+- reg:		Should contain the registers base address and length
+- interrupts:	Should contain a reference to the interrupt used by this device
+- clocks:	Should contain a reference to the parent AHB clock
+- resets:	Should contain a reference to the reset controller asserting
+		this device in reset
+- #dma-cells :	Should be 1, a single cell holding a line request number
+
+Example:
+	dma: dma-controller@01c02000 {
+		compatible = "allwinner,sun6i-a31-dma";
+		reg = <0x01c02000 0x1000>;
+		interrupts = <0 50 4>;
+		clocks = <&ahb1_gates 6>;
+		resets = <&ahb1_rst 6>;
+		#dma-cells = <1>;
+	};
+
+Clients:
+
+DMA clients connected to the A31 DMA controller must use the format
+described in the dma.txt file, using a two-cell specifier for each
+channel: a phandle plus one integer cells.
+The two cells in order are:
+
+1. A phandle pointing to the DMA controller.
+2. The port ID as specified in the datasheet
+
+Example:
+spi2: spi@01c6a000 {
+	compatible = "allwinner,sun6i-a31-spi";
+	reg = <0x01c6a000 0x1000>;
+	interrupts = <0 67 4>;
+	clocks = <&ahb1_gates 22>, <&spi2_clk>;
+	clock-names = "ahb", "mod";
+	dmas = <&dma 25>, <&dma 25>;
+	dma-names = "rx", "tx";
+	resets = <&ahb1_rst 22>;
+};
diff --git a/Documentation/devicetree/bindings/mmc/exynos-dw-mshc.txt b/Documentation/devicetree/bindings/mmc/exynos-dw-mshc.txt
index 532b1d440abc..6cd3525d0e09 100644
--- a/Documentation/devicetree/bindings/mmc/exynos-dw-mshc.txt
+++ b/Documentation/devicetree/bindings/mmc/exynos-dw-mshc.txt
@@ -46,13 +46,14 @@ Required Properties:
       - if CIU clock divider value is 0 (that is divide by 1), both tx and rx
         phase shift clocks should be 0.
 
-Required properties for a slot:
+Required properties for a slot (Deprecated - Recommend to use one slot per host):
 
 * gpios: specifies a list of gpios used for command, clock and data bus. The
   first gpio is the command line and the second gpio is the clock line. The
   rest of the gpios (depending on the bus-width property) are the data lines in
   no particular order. The format of the gpio specifier depends on the gpio
   controller.
+(Deprecated - Refer to Documentation/devicetree/binding/pinctrl/samsung-pinctrl.txt)
 
 Example:
 
@@ -69,21 +70,13 @@ Example:
 
 	dwmmc0@12200000 {
 		num-slots = <1>;
-		supports-highspeed;
+		cap-mmc-highspeed;
+		cap-sd-highspeed;
 		broken-cd;
 		fifo-depth = <0x80>;
 		card-detect-delay = <200>;
 		samsung,dw-mshc-ciu-div = <3>;
 		samsung,dw-mshc-sdr-timing = <2 3>;
 		samsung,dw-mshc-ddr-timing = <1 2>;
-
-		slot@0 {
-			reg = <0>;
-			bus-width = <8>;
-			gpios = <&gpc0 0 2 0 3>, <&gpc0 1 2 0 3>,
-				<&gpc1 0 2 3 3>, <&gpc1 1 2 3 3>,
-				<&gpc1 2 2 3 3>, <&gpc1 3 2 3 3>,
-				<&gpc0 3 2 3 3>, <&gpc0 4 2 3 3>,
-				<&gpc0 5 2 3 3>, <&gpc0 6 2 3 3>;
-		};
+		bus-width = <8>;
 	};
diff --git a/Documentation/devicetree/bindings/mmc/k3-dw-mshc.txt b/Documentation/devicetree/bindings/mmc/k3-dw-mshc.txt
index e5bc49f764d1..3b3544931437 100644
--- a/Documentation/devicetree/bindings/mmc/k3-dw-mshc.txt
+++ b/Documentation/devicetree/bindings/mmc/k3-dw-mshc.txt
@@ -34,13 +34,11 @@ Example:
 		num-slots = <1>;
 		vmmc-supply = <&ldo12>;
 		fifo-depth = <0x100>;
-		supports-highspeed;
 		pinctrl-names = "default";
 		pinctrl-0 = <&sd_pmx_pins &sd_cfg_func1 &sd_cfg_func2>;
-		slot@0 {
-			reg = <0>;
-			bus-width = <4>;
-			disable-wp;
-			cd-gpios = <&gpio10 3 0>;
-		};
+		bus-width = <4>;
+		disable-wp;
+		cd-gpios = <&gpio10 3 0>;
+		cap-mmc-highspeed;
+		cap-sd-highspeed;
 	};
diff --git a/Documentation/devicetree/bindings/mmc/mmc.txt b/Documentation/devicetree/bindings/mmc/mmc.txt
index 3c18001dfd5d..431716e37a39 100644
--- a/Documentation/devicetree/bindings/mmc/mmc.txt
+++ b/Documentation/devicetree/bindings/mmc/mmc.txt
@@ -34,8 +34,8 @@ Optional properties:
 - cap-power-off-card: powering off the card is safe
 - cap-sdio-irq: enable SDIO IRQ signalling on this interface
 - full-pwr-cycle: full power cycle of the card is supported
-- mmc-highspeed-ddr-1_8v: eMMC high-speed DDR mode(1.8V I/O) is supported
-- mmc-highspeed-ddr-1_2v: eMMC high-speed DDR mode(1.2V I/O) is supported
+- mmc-ddr-1_8v: eMMC high-speed DDR mode(1.8V I/O) is supported
+- mmc-ddr-1_2v: eMMC high-speed DDR mode(1.2V I/O) is supported
 - mmc-hs200-1_8v: eMMC HS200 mode(1.8V I/O) is supported
 - mmc-hs200-1_2v: eMMC HS200 mode(1.2V I/O) is supported
 - mmc-hs400-1_8v: eMMC HS400 mode(1.8V I/O) is supported
diff --git a/Documentation/devicetree/bindings/mmc/renesas,mmcif.txt b/Documentation/devicetree/bindings/mmc/renesas,mmcif.txt
new file mode 100644
index 000000000000..299081f94abd
--- /dev/null
+++ b/Documentation/devicetree/bindings/mmc/renesas,mmcif.txt
@@ -0,0 +1,32 @@
+* Renesas Multi Media Card Interface (MMCIF) Controller
+
+This file documents differences between the core properties in mmc.txt
+and the properties used by the MMCIF device.
+
+
+Required properties:
+
+- compatible: must contain one of the following
+	- "renesas,mmcif-r8a7740" for the MMCIF found in r8a7740 SoCs
+	- "renesas,mmcif-r8a7790" for the MMCIF found in r8a7790 SoCs
+	- "renesas,mmcif-r8a7791" for the MMCIF found in r8a7791 SoCs
+	- "renesas,sh-mmcif" for the generic MMCIF
+
+- clocks: reference to the functional clock
+
+- dmas: reference to the DMA channels, one per channel name listed in the
+  dma-names property.
+- dma-names: must contain "tx" for the transmit DMA channel and "rx" for the
+  receive DMA channel.
+
+
+Example: R8A7790 (R-Car H2) MMCIF0
+
+	mmcif0: mmc@ee200000 {
+		compatible = "renesas,mmcif-r8a7790", "renesas,sh-mmcif";
+		reg = <0 0xee200000 0 0x80>;
+		interrupts = <0 169 IRQ_TYPE_LEVEL_HIGH>;
+		clocks = <&mstp3_clks R8A7790_CLK_MMCIF0>;
+		dmas = <&dmac0 0xd1>, <&dmac0 0xd2>;
+		dma-names = "tx", "rx";
+	};
diff --git a/Documentation/devicetree/bindings/mmc/sdhci-msm.txt b/Documentation/devicetree/bindings/mmc/sdhci-msm.txt
index 81b33b5b20fc..485483a63d8c 100644
--- a/Documentation/devicetree/bindings/mmc/sdhci-msm.txt
+++ b/Documentation/devicetree/bindings/mmc/sdhci-msm.txt
@@ -27,8 +27,8 @@ Example:
 		bus-width = <8>;
 		non-removable;
 
-		vmmc = <&pm8941_l20>;
-		vqmmc = <&pm8941_s3>;
+		vmmc-supply = <&pm8941_l20>;
+		vqmmc-supply = <&pm8941_s3>;
 
 		pinctrl-names = "default";
 		pinctrl-0 = <&sdc1_clk &sdc1_cmd &sdc1_data>;
@@ -44,8 +44,8 @@ Example:
 		bus-width = <4>;
 		cd-gpios = <&msmgpio 62 0x1>;
 
-		vmmc = <&pm8941_l21>;
-		vqmmc = <&pm8941_l13>;
+		vmmc-supply = <&pm8941_l21>;
+		vqmmc-supply = <&pm8941_l13>;
 
 		pinctrl-names = "default";
 		pinctrl-0 = <&sdc2_clk &sdc2_cmd &sdc2_data>;
diff --git a/Documentation/devicetree/bindings/mmc/sdhci-st.txt b/Documentation/devicetree/bindings/mmc/sdhci-st.txt
new file mode 100644
index 000000000000..7527db447a35
--- /dev/null
+++ b/Documentation/devicetree/bindings/mmc/sdhci-st.txt
@@ -0,0 +1,33 @@
+* STMicroelectronics sdhci-st MMC/SD controller
+
+This file documents the differences between the core properties in
+Documentation/devicetree/bindings/mmc/mmc.txt and the properties
+used by the sdhci-st driver.
+
+Required properties:
+- compatible :  Must be "st,sdhci"
+- clock-names : Should be "mmc"
+                See: Documentation/devicetree/bindings/resource-names.txt
+- clocks :      Phandle of the clock used by the sdhci controler
+                See: Documentation/devicetree/bindings/clock/clock-bindings.txt
+
+Optional properties:
+- non-removable: non-removable slot
+                 See: Documentation/devicetree/bindings/mmc/mmc.txt
+- bus-width: Number of data lines
+                 See: Documentation/devicetree/bindings/mmc/mmc.txt
+
+Example:
+
+mmc0: sdhci@fe81e000 {
+	compatible	= "st,sdhci";
+	status		= "disabled";
+	reg		= <0xfe81e000 0x1000>;
+	interrupts	= <GIC_SPI 127 IRQ_TYPE_NONE>;
+	interrupt-names	= "mmcirq";
+	pinctrl-names	= "default";
+	pinctrl-0	= <&pinctrl_mmc0>;
+	clock-names	= "mmc";
+	clocks		= <&clk_s_a1_ls 1>;
+	bus-width       = <8>
+};
diff --git a/Documentation/devicetree/bindings/mmc/synopsys-dw-mshc.txt b/Documentation/devicetree/bindings/mmc/synopsys-dw-mshc.txt
index 2d4a7258a10d..346c6095a615 100644
--- a/Documentation/devicetree/bindings/mmc/synopsys-dw-mshc.txt
+++ b/Documentation/devicetree/bindings/mmc/synopsys-dw-mshc.txt
@@ -67,7 +67,8 @@ Optional properties:
 * card-detect-delay: Delay in milli-seconds before detecting card after card
   insert event. The default value is 0.
 
-* supports-highspeed: Enables support for high speed cards (up to 50MHz)
+* supports-highspeed (DEPRECATED): Enables support for high speed cards (up to 50MHz)
+			   (use "cap-mmc-highspeed" or "cap-sd-highspeed" instead)
 
 * broken-cd: as documented in mmc core bindings.
 
@@ -98,14 +99,11 @@ board specific portions as listed below.
 		clock-frequency = <400000000>;
 		clock-freq-min-max = <400000 200000000>;
 		num-slots = <1>;
-		supports-highspeed;
 		broken-cd;
 		fifo-depth = <0x80>;
 		card-detect-delay = <200>;
 		vmmc-supply = <&buck8>;
-
-		slot@0 {
-			reg = <0>;
-			bus-width = <8>;
-		};
+		bus-width = <8>;
+		cap-mmc-highspeed;
+		cap-sd-highspeed;
 	};
diff --git a/Documentation/devicetree/bindings/mmc/ti-omap-hsmmc.txt b/Documentation/devicetree/bindings/mmc/ti-omap-hsmmc.txt
index ce8056116fb0..76bf087bc889 100644
--- a/Documentation/devicetree/bindings/mmc/ti-omap-hsmmc.txt
+++ b/Documentation/devicetree/bindings/mmc/ti-omap-hsmmc.txt
@@ -12,6 +12,7 @@ Required properties:
  Should be "ti,omap3-hsmmc", for OMAP3 controllers
  Should be "ti,omap3-pre-es3-hsmmc" for OMAP3 controllers pre ES3.0
  Should be "ti,omap4-hsmmc", for OMAP4 controllers
+ Should be "ti,am33xx-hsmmc", for AM335x controllers
 - ti,hwmods: Must be "mmc<n>", n is controller instance starting 1
 
 Optional properties:
@@ -56,3 +57,56 @@ Examples:
 			&edma 25>;
 		dma-names = "tx", "rx";
 	};
+
+[workaround for missing swakeup on am33xx]
+
+This SOC is missing the swakeup line, it will not detect SDIO irq
+while in suspend.
+
+                             ------
+                             | PRCM |
+                              ------
+                               ^ |
+                       swakeup | | fclk
+                               | v
+       ------                -------               -----
+      | card | -- CIRQ -->  | hsmmc | -- IRQ -->  | CPU |
+       ------                -------               -----
+
+In suspend the fclk is off and the module is disfunctional. Even register reads
+will fail. A small logic in the host will request fclk restore, when an
+external event is detected. Once the clock is restored, the host detects the
+event normally. Since am33xx doesn't have this line it never wakes from
+suspend.
+
+The workaround is to reconfigure the dat1 line as a GPIO upon suspend. To make
+this work, we need to set the named pinctrl states "default" and "idle".
+Prepare idle to remux dat1 as a gpio, and default to remux it back as sdio
+dat1. The MMC driver will then toggle between idle and default state during
+runtime.
+
+In summary:
+1. select matching 'compatible' section, see example below.
+2. specify pinctrl states "default" and "idle", "sleep" is optional.
+3. specify the gpio irq used for detecting sdio irq in suspend
+
+If configuration is incomplete, a warning message is emitted "falling back to
+polling". Also check the "sdio irq mode" in /sys/kernel/debug/mmc0/regs. Mind
+not every application needs SDIO irq, e.g. MMC cards.
+
+	mmc1: mmc@48060100 {
+		compatible = "ti,am33xx-hsmmc";
+		...
+		pinctrl-names = "default", "idle", "sleep"
+		pinctrl-0 = <&mmc1_pins>;
+		pinctrl-1 = <&mmc1_idle>;
+		pinctrl-2 = <&mmc1_sleep>;
+		...
+		interrupts-extended = <&intc 64 &gpio2 28 0>;
+	};
+
+	mmc1_idle : pinmux_cirq_pin {
+		pinctrl-single,pins = <
+		        0x0f8 0x3f      /* GPIO2_28 */
+		>;
+	};
diff --git a/Documentation/devicetree/bindings/mmc/tmio_mmc.txt b/Documentation/devicetree/bindings/mmc/tmio_mmc.txt
index 6a2a1160a70d..fa0f327cde01 100644
--- a/Documentation/devicetree/bindings/mmc/tmio_mmc.txt
+++ b/Documentation/devicetree/bindings/mmc/tmio_mmc.txt
@@ -18,6 +18,7 @@ Required properties:
 		"renesas,sdhi-r8a7778" - SDHI IP on R8A7778 SoC
 		"renesas,sdhi-r8a7779" - SDHI IP on R8A7779 SoC
 		"renesas,sdhi-r8a7790" - SDHI IP on R8A7790 SoC
+		"renesas,sdhi-r8a7791" - SDHI IP on R8A7791 SoC
 
 Optional properties:
 - toshiba,mmc-wrprotect-disable: write-protect detection is unavailable
diff --git a/Documentation/devicetree/bindings/net/apm-xgene-enet.txt b/Documentation/devicetree/bindings/net/apm-xgene-enet.txt
new file mode 100644
index 000000000000..ebcad25efd0a
--- /dev/null
+++ b/Documentation/devicetree/bindings/net/apm-xgene-enet.txt
@@ -0,0 +1,66 @@
+APM X-Gene SoC Ethernet nodes
+
+Ethernet nodes are defined to describe on-chip ethernet interfaces in
+APM X-Gene SoC.
+
+Required properties:
+- compatible: Should be "apm,xgene-enet"
+- reg: Address and length of the register set for the device. It contains the
+  information of registers in the same order as described by reg-names
+- reg-names: Should contain the register set names
+  - "enet_csr": Ethernet control and status register address space
+  - "ring_csr": Descriptor ring control and status register address space
+  - "ring_cmd": Descriptor ring command register address space
+- interrupts: Ethernet main interrupt
+- clocks: Reference to the clock entry.
+- local-mac-address: MAC address assigned to this device
+- phy-connection-type: Interface type between ethernet device and PHY device
+- phy-handle: Reference to a PHY node connected to this device
+
+- mdio: Device tree subnode with the following required properties:
+  - compatible: Must be "apm,xgene-mdio".
+  - #address-cells: Must be <1>.
+  - #size-cells: Must be <0>.
+
+  For the phy on the mdio bus, there must be a node with the following fields:
+  - compatible: PHY identifier.  Please refer ./phy.txt for the format.
+  - reg: The ID number for the phy.
+
+Optional properties:
+- status: Should be "ok" or "disabled" for enabled/disabled. Default is "ok".
+
+Example:
+	menetclk: menetclk {
+		compatible = "apm,xgene-device-clock";
+		clock-output-names = "menetclk";
+		status = "ok";
+	};
+
+	menet: ethernet@17020000 {
+		compatible = "apm,xgene-enet";
+		status = "disabled";
+		reg = <0x0 0x17020000 0x0 0xd100>,
+		      <0x0 0X17030000 0x0 0X400>,
+		      <0x0 0X10000000 0x0 0X200>;
+		reg-names = "enet_csr", "ring_csr", "ring_cmd";
+		interrupts = <0x0 0x3c 0x4>;
+		clocks = <&menetclk 0>;
+		local-mac-address = [00 01 73 00 00 01];
+		phy-connection-type = "rgmii";
+		phy-handle = <&menetphy>;
+		mdio {
+			compatible = "apm,xgene-mdio";
+			#address-cells = <1>;
+			#size-cells = <0>;
+			menetphy: menetphy@3 {
+				compatible = "ethernet-phy-id001c.c915";
+				reg = <0x3>;
+			};
+
+		};
+	};
+
+/* Board-specific peripheral configurations */
+&menet {
+        status = "ok";
+};
diff --git a/Documentation/devicetree/bindings/net/fsl-fec.txt b/Documentation/devicetree/bindings/net/fsl-fec.txt
index 6bc84adb10c0..8a2c7b55ec16 100644
--- a/Documentation/devicetree/bindings/net/fsl-fec.txt
+++ b/Documentation/devicetree/bindings/net/fsl-fec.txt
@@ -12,7 +12,14 @@ Optional properties:
   only if property "phy-reset-gpios" is available.  Missing the property
   will have the duration be 1 millisecond.  Numbers greater than 1000 are
   invalid and 1 millisecond will be used instead.
-- phy-supply: regulator that powers the Ethernet PHY.
+- phy-supply : regulator that powers the Ethernet PHY.
+- phy-handle : phandle to the PHY device connected to this device.
+- fixed-link : Assume a fixed link. See fixed-link.txt in the same directory.
+  Use instead of phy-handle.
+
+Optional subnodes:
+- mdio : specifies the mdio bus in the FEC, used as a container for phy nodes
+  according to phy.txt in the same directory
 
 Example:
 
@@ -25,3 +32,23 @@ ethernet@83fec000 {
 	local-mac-address = [00 04 9F 01 1B B9];
 	phy-supply = <&reg_fec_supply>;
 };
+
+Example with phy specified:
+
+ethernet@83fec000 {
+	compatible = "fsl,imx51-fec", "fsl,imx27-fec";
+	reg = <0x83fec000 0x4000>;
+	interrupts = <87>;
+	phy-mode = "mii";
+	phy-reset-gpios = <&gpio2 14 0>; /* GPIO2_14 */
+	local-mac-address = [00 04 9F 01 1B B9];
+	phy-supply = <&reg_fec_supply>;
+	phy-handle = <&ethphy>;
+	mdio {
+		ethphy: ethernet-phy@6 {
+			compatible = "ethernet-phy-ieee802.3-c22";
+			reg = <6>;
+			max-speed = <100>;
+		};
+	};
+};
diff --git a/Documentation/devicetree/bindings/thermal/exynos-thermal.txt b/Documentation/devicetree/bindings/thermal/exynos-thermal.txt
index c94909215c07..ae738f562acc 100644
--- a/Documentation/devicetree/bindings/thermal/exynos-thermal.txt
+++ b/Documentation/devicetree/bindings/thermal/exynos-thermal.txt
@@ -3,6 +3,7 @@
 ** Required properties:
 
 - compatible : One of the following:
+	       "samsung,exynos3250-tmu"
 	       "samsung,exynos4412-tmu"
 	       "samsung,exynos4210-tmu"
 	       "samsung,exynos5250-tmu"
diff --git a/Documentation/devicetree/bindings/thermal/rcar-thermal.txt b/Documentation/devicetree/bindings/thermal/rcar-thermal.txt
index 28ef498a66e5..0ef00be44b01 100644
--- a/Documentation/devicetree/bindings/thermal/rcar-thermal.txt
+++ b/Documentation/devicetree/bindings/thermal/rcar-thermal.txt
@@ -1,7 +1,13 @@
 * Renesas R-Car Thermal
 
 Required properties:
-- compatible		: "renesas,rcar-thermal"
+- compatible		: "renesas,thermal-<soctype>", "renesas,rcar-thermal"
+			  as fallback.
+			  Examples with soctypes are:
+			    - "renesas,thermal-r8a73a4" (R-Mobile AP6)
+			    - "renesas,thermal-r8a7779" (R-Car H1)
+			    - "renesas,thermal-r8a7790" (R-Car H2)
+			    - "renesas,thermal-r8a7791" (R-Car M2)
 - reg			: Address range of the thermal registers.
 			  The 1st reg will be recognized as common register
 			  if it has "interrupts".
@@ -12,18 +18,18 @@ Option properties:
 
 Example (non interrupt support):
 
-thermal@e61f0100 {
-	compatible = "renesas,rcar-thermal";
-	reg = <0xe61f0100 0x38>;
+thermal@ffc48000 {
+	compatible = "renesas,thermal-r8a7779", "renesas,rcar-thermal";
+	reg = <0xffc48000 0x38>;
 };
 
 Example (interrupt support):
 
 thermal@e61f0000 {
-	compatible = "renesas,rcar-thermal";
+	compatible = "renesas,thermal-r8a73a4", "renesas,rcar-thermal";
 	reg = <0xe61f0000 0x14
 		0xe61f0100 0x38
 		0xe61f0200 0x38
 		0xe61f0300 0x38>;
-	interrupts = <0 69 4>;
+	interrupts = <0 69 IRQ_TYPE_LEVEL_HIGH>;
 };
diff --git a/Documentation/devicetree/bindings/thermal/st-thermal.txt b/Documentation/devicetree/bindings/thermal/st-thermal.txt
new file mode 100644
index 000000000000..3b9251b4a145
--- /dev/null
+++ b/Documentation/devicetree/bindings/thermal/st-thermal.txt
@@ -0,0 +1,42 @@
+Binding for Thermal Sensor driver for STMicroelectronics STi series of SoCs.
+
+Required parameters:
+-------------------
+
+compatible : 	st,<SoC>-<module>-thermal; should be one of:
+		  "st,stih415-sas-thermal",
+		  "st,stih415-mpe-thermal",
+		  "st,stih416-sas-thermal"
+		  "st,stih416-mpe-thermal"
+		  "st,stid127-thermal" or
+		  "st,stih407-thermal"
+		according to the SoC type (stih415, stih416, stid127, stih407)
+		and module type (sas or mpe). On stid127 & stih407 there is only
+		one die/module, so there is no module type in the compatible
+		string.
+clock-names : 	Should be "thermal".
+		  See: Documentation/devicetree/bindings/resource-names.txt
+clocks : 	Phandle of the clock used by the thermal sensor.
+		  See: Documentation/devicetree/bindings/clock/clock-bindings.txt
+
+Optional parameters:
+-------------------
+
+reg : 		For non-sysconf based sensors, this should be the physical base
+		address and length of the sensor's registers.
+interrupts :	Standard way to define interrupt number.
+		Interrupt is mandatory to be defined when compatible is
+		"stih416-mpe-thermal".
+		  NB: For thermal sensor's for which no interrupt has been
+		  defined, a polling delay of 1000ms will be used to read the
+		  temperature from device.
+
+Example:
+
+	temp1@fdfe8000 {
+		compatible	= "st,stih416-mpe-thermal";
+		reg		= <0xfdfe8000 0x10>;
+		clock-names	= "thermal";
+		clocks		= <&clk_m_mpethsens>;
+		interrupts	= <GIC_SPI 23 IRQ_TYPE_NONE>;
+	};
diff --git a/Documentation/dmaengine.txt b/Documentation/dmaengine.txt
index 879b6e31e2da..573e28ce9751 100644
--- a/Documentation/dmaengine.txt
+++ b/Documentation/dmaengine.txt
@@ -84,31 +84,32 @@ The slave DMA usage consists of following steps:
    the given transaction.
 
    Interface:
-	struct dma_async_tx_descriptor *(*chan->device->device_prep_slave_sg)(
+	struct dma_async_tx_descriptor *dmaengine_prep_slave_sg(
 		struct dma_chan *chan, struct scatterlist *sgl,
 		unsigned int sg_len, enum dma_data_direction direction,
 		unsigned long flags);
 
-	struct dma_async_tx_descriptor *(*chan->device->device_prep_dma_cyclic)(
+	struct dma_async_tx_descriptor *dmaengine_prep_dma_cyclic(
 		struct dma_chan *chan, dma_addr_t buf_addr, size_t buf_len,
 		size_t period_len, enum dma_data_direction direction);
 
-	struct dma_async_tx_descriptor *(*device_prep_interleaved_dma)(
+	struct dma_async_tx_descriptor *dmaengine_prep_interleaved_dma(
 		struct dma_chan *chan, struct dma_interleaved_template *xt,
 		unsigned long flags);
 
    The peripheral driver is expected to have mapped the scatterlist for
    the DMA operation prior to calling device_prep_slave_sg, and must
    keep the scatterlist mapped until the DMA operation has completed.
-   The scatterlist must be mapped using the DMA struct device.  So,
-   normal setup should look like this:
+   The scatterlist must be mapped using the DMA struct device.
+   If a mapping needs to be synchronized later, dma_sync_*_for_*() must be
+   called using the DMA struct device, too.
+   So, normal setup should look like this:
 
 	nr_sg = dma_map_sg(chan->device->dev, sgl, sg_len);
 	if (nr_sg == 0)
 		/* error */
 
-	desc = chan->device->device_prep_slave_sg(chan, sgl, nr_sg,
-			direction, flags);
+	desc = dmaengine_prep_slave_sg(chan, sgl, nr_sg, direction, flags);
 
    Once a descriptor has been obtained, the callback information can be
    added and the descriptor must then be submitted.  Some DMA engine
@@ -188,7 +189,7 @@ Further APIs:
    description of this API.
 
    This can be used in conjunction with dma_async_is_complete() and
-   the cookie returned from 'descriptor->submit()' to check for
+   the cookie returned from dmaengine_submit() to check for
    completion of a specific DMA transaction.
 
    Note:
diff --git a/Documentation/filesystems/nfs/Exporting b/Documentation/filesystems/nfs/Exporting
index e543b1a619cc..c8f036a9b13f 100644
--- a/Documentation/filesystems/nfs/Exporting
+++ b/Documentation/filesystems/nfs/Exporting
@@ -66,23 +66,31 @@ b/ A per-superblock list "s_anon" of dentries which are the roots of
 
 c/ Helper routines to allocate anonymous dentries, and to help attach
    loose directory dentries at lookup time. They are:
-    d_alloc_anon(inode) will return a dentry for the given inode.
+    d_obtain_alias(inode) will return a dentry for the given inode.
       If the inode already has a dentry, one of those is returned.
       If it doesn't, a new anonymous (IS_ROOT and
         DCACHE_DISCONNECTED) dentry is allocated and attached.
       In the case of a directory, care is taken that only one dentry
       can ever be attached.
-    d_splice_alias(inode, dentry) will make sure that there is a
-      dentry with the same name and parent as the given dentry, and
-      which refers to the given inode.
-      If the inode is a directory and already has a dentry, then that
-      dentry is d_moved over the given dentry.
-      If the passed dentry gets attached, care is taken that this is
-      mutually exclusive to a d_alloc_anon operation.
-      If the passed dentry is used, NULL is returned, else the used
-      dentry is returned.  This corresponds to the calling pattern of
-      ->lookup.
-  
+    d_splice_alias(inode, dentry) or d_materialise_unique(dentry, inode)
+      will introduce a new dentry into the tree; either the passed-in
+      dentry or a preexisting alias for the given inode (such as an
+      anonymous one created by d_obtain_alias), if appropriate.  The two
+      functions differ in their handling of directories with preexisting
+      aliases:
+        d_splice_alias will use any existing IS_ROOT dentry, but it will
+	  return -EIO rather than try to move a dentry with a different
+	  parent.  This is appropriate for local filesystems, which
+	  should never see such an alias unless the filesystem is
+	  corrupted somehow (for example, if two on-disk directory
+	  entries refer to the same directory.)
+	d_materialise_unique will attempt to move any dentry.  This is
+	  appropriate for distributed filesystems, where finding a
+	  directory other than where we last cached it may be a normal
+	  consequence of concurrent operations on other hosts.
+      Both functions return NULL when the passed-in dentry is used,
+      following the calling convention of ->lookup.
+
  
 Filesystem Issues
 -----------------
@@ -120,12 +128,12 @@ struct which has the following members:
 
   fh_to_dentry (mandatory)
     Given a filehandle fragment, this should find the implied object and
-    create a dentry for it (possibly with d_alloc_anon).
+    create a dentry for it (possibly with d_obtain_alias).
 
   fh_to_parent (optional but strongly recommended)
     Given a filehandle fragment, this should find the parent of the
-    implied object and create a dentry for it (possibly with d_alloc_anon).
-    May fail if the filehandle fragment is too small.
+    implied object and create a dentry for it (possibly with
+    d_obtain_alias).  May fail if the filehandle fragment is too small.
 
   get_parent (optional but strongly recommended)
     When given a dentry for a directory, this should return  a dentry for
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index a1d0d7a30165..61d65cc65c54 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -1053,7 +1053,8 @@ struct dentry_operations {
 	If the 'rcu_walk' parameter is true, then the caller is doing a
 	pathwalk in RCU-walk mode.  Sleeping is not permitted in this mode,
 	and the caller can be asked to leave it and call again by returning
-	-ECHILD.
+	-ECHILD.  -EISDIR may also be returned to tell pathwalk to
+	ignore d_automount or any mounts.
 
 	This function is only used if DCACHE_MANAGE_TRANSIT is set on the
 	dentry being transited from.
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index a8eb6afce6a4..5ae8608ca9f5 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2200,6 +2200,21 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 			and restore using xsave. The kernel will fallback to
 			enabling legacy floating-point and sse state.
 
+	noxsaveopt	[X86] Disables xsaveopt used in saving x86 extended
+			register states. The kernel will fall back to use
+			xsave to save the states. By using this parameter,
+			performance of saving the states is degraded because
+			xsave doesn't support modified optimization while
+			xsaveopt supports it on xsaveopt enabled systems.
+
+	noxsaves	[X86] Disables xsaves and xrstors used in saving and
+			restoring x86 extended register state in compacted
+			form of xsave area. The kernel will fall back to use
+			xsaveopt and xrstor to save and restore the states
+			in standard form of xsave area. By using this
+			parameter, xsave area per process might occupy more
+			memory on xsaves enabled systems.
+
 	eagerfpu=	[X86]
 			on	enable eager fpu restore
 			off	disable eager fpu restore
diff --git a/MAINTAINERS b/MAINTAINERS
index 7e2eb4c646e3..2f85f55c8fb8 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -719,6 +719,14 @@ S:	Maintained
 F:	drivers/net/appletalk/
 F:	net/appletalk/
 
+APPLIED MICRO (APM) X-GENE SOC ETHERNET DRIVER
+M:	Iyappan Subramanian <isubramanian@apm.com>
+M:	Keyur Chudgar <kchudgar@apm.com>
+M:	Ravi Patel <rapatel@apm.com>
+S:	Supported
+F:	drivers/net/ethernet/apm/xgene/
+F:	Documentation/devicetree/bindings/net/apm-xgene-enet.txt
+
 APTINA CAMERA SENSOR PLL
 M:	Laurent Pinchart <Laurent.pinchart@ideasonboard.com>
 L:	linux-media@vger.kernel.org
diff --git a/arch/arm/common/edma.c b/arch/arm/common/edma.c
index 485be42519b9..88099175fc56 100644
--- a/arch/arm/common/edma.c
+++ b/arch/arm/common/edma.c
@@ -1414,6 +1414,34 @@ void edma_clear_event(unsigned channel)
 }
 EXPORT_SYMBOL(edma_clear_event);
 
+/*
+ * edma_assign_channel_eventq - move given channel to desired eventq
+ * Arguments:
+ *	channel - channel number
+ *	eventq_no - queue to move the channel
+ *
+ * Can be used to move a channel to a selected event queue.
+ */
+void edma_assign_channel_eventq(unsigned channel, enum dma_event_q eventq_no)
+{
+	unsigned ctlr;
+
+	ctlr = EDMA_CTLR(channel);
+	channel = EDMA_CHAN_SLOT(channel);
+
+	if (channel >= edma_cc[ctlr]->num_channels)
+		return;
+
+	/* default to low priority queue */
+	if (eventq_no == EVENTQ_DEFAULT)
+		eventq_no = edma_cc[ctlr]->default_queue;
+	if (eventq_no >= edma_cc[ctlr]->num_tc)
+		return;
+
+	map_dmach_queue(ctlr, channel, eventq_no);
+}
+EXPORT_SYMBOL(edma_assign_channel_eventq);
+
 static int edma_setup_from_hw(struct device *dev, struct edma_soc_info *pdata,
 			      struct edma *edma_cc)
 {
@@ -1470,7 +1498,8 @@ static int edma_setup_from_hw(struct device *dev, struct edma_soc_info *pdata,
 	queue_priority_map[i][1] = -1;
 
 	pdata->queue_priority_mapping = queue_priority_map;
-	pdata->default_queue = 0;
+	/* Default queue has the lowest priority */
+	pdata->default_queue = i - 1;
 
 	return 0;
 }
diff --git a/arch/arm64/boot/dts/apm-mustang.dts b/arch/arm64/boot/dts/apm-mustang.dts
index 6541962f5d70..b2f56229aa5e 100644
--- a/arch/arm64/boot/dts/apm-mustang.dts
+++ b/arch/arm64/boot/dts/apm-mustang.dts
@@ -28,3 +28,7 @@
 &serial0 {
 	status = "ok";
 };
+
+&menet {
+	status = "ok";
+};
diff --git a/arch/arm64/boot/dts/apm-storm.dtsi b/arch/arm64/boot/dts/apm-storm.dtsi
index 40aa96ce13c4..c0aceef7f5b3 100644
--- a/arch/arm64/boot/dts/apm-storm.dtsi
+++ b/arch/arm64/boot/dts/apm-storm.dtsi
@@ -167,14 +167,13 @@
 				clock-output-names = "ethclk";
 			};
 
-			eth8clk: eth8clk {
+			menetclk: menetclk {
 				compatible = "apm,xgene-device-clock";
 				#clock-cells = <1>;
 				clocks = <&ethclk 0>;
-				clock-names = "eth8clk";
 				reg = <0x0 0x1702C000 0x0 0x1000>;
 				reg-names = "csr-reg";
-				clock-output-names = "eth8clk";
+				clock-output-names = "menetclk";
 			};
 
 			sataphy1clk: sataphy1clk@1f21c000 {
@@ -397,5 +396,30 @@
 			#clock-cells = <1>;
 			clocks = <&rtcclk 0>;
 		};
+
+		menet: ethernet@17020000 {
+			compatible = "apm,xgene-enet";
+			status = "disabled";
+			reg = <0x0 0x17020000 0x0 0xd100>,
+			      <0x0 0X17030000 0x0 0X400>,
+			      <0x0 0X10000000 0x0 0X200>;
+			reg-names = "enet_csr", "ring_csr", "ring_cmd";
+			interrupts = <0x0 0x3c 0x4>;
+			dma-coherent;
+			clocks = <&menetclk 0>;
+			local-mac-address = [00 01 73 00 00 01];
+			phy-connection-type = "rgmii";
+			phy-handle = <&menetphy>;
+			mdio {
+				compatible = "apm,xgene-mdio";
+				#address-cells = <1>;
+				#size-cells = <0>;
+				menetphy: menetphy@3 {
+					compatible = "ethernet-phy-id001c.c915";
+					reg = <0x3>;
+				};
+
+			};
+		};
 	};
 };
diff --git a/arch/metag/kernel/cachepart.c b/arch/metag/kernel/cachepart.c
index 0a2385fa2a1d..04b7d4f8429a 100644
--- a/arch/metag/kernel/cachepart.c
+++ b/arch/metag/kernel/cachepart.c
@@ -55,7 +55,7 @@ unsigned int get_global_icache_size(void)
 	return (get_icache_size() * ((temp >> SYSC_xCPARTG_AND_S) + 1)) >> 4;
 }
 
-static unsigned int get_thread_cache_size(unsigned int cache, int thread_id)
+static int get_thread_cache_size(unsigned int cache, int thread_id)
 {
 	unsigned int cache_size;
 	unsigned int t_cache_part;
@@ -94,7 +94,7 @@ static unsigned int get_thread_cache_size(unsigned int cache, int thread_id)
 
 void check_for_cache_aliasing(int thread_id)
 {
-	unsigned int thread_cache_size;
+	int thread_cache_size;
 	unsigned int cache_type;
 	for (cache_type = ICACHE; cache_type <= DCACHE; cache_type++) {
 		thread_cache_size =
diff --git a/arch/metag/mm/hugetlbpage.c b/arch/metag/mm/hugetlbpage.c
index 3c52fa6d0f8e..3c32075d2945 100644
--- a/arch/metag/mm/hugetlbpage.c
+++ b/arch/metag/mm/hugetlbpage.c
@@ -173,7 +173,7 @@ new_search:
 				mm->context.part_huge = 0;
 			return addr;
 		}
-		if (vma && (vma->vm_flags & MAP_HUGETLB)) {
+		if (vma->vm_flags & MAP_HUGETLB) {
 			/* space after a huge vma in 2nd level page table? */
 			if (vma->vm_end & HUGEPT_MASK) {
 				after_huge = 1;
diff --git a/arch/powerpc/boot/dts/mpc5121.dtsi b/arch/powerpc/boot/dts/mpc5121.dtsi
index 2c0e1552d20b..7f9d14f5c4da 100644
--- a/arch/powerpc/boot/dts/mpc5121.dtsi
+++ b/arch/powerpc/boot/dts/mpc5121.dtsi
@@ -498,6 +498,7 @@
 			compatible = "fsl,mpc5121-dma";
 			reg = <0x14000 0x1800>;
 			interrupts = <65 0x8>;
+			#dma-cells = <1>;
 		};
 	};
 
diff --git a/arch/sh/drivers/dma/dma-sh.c b/arch/sh/drivers/dma/dma-sh.c
index b22565623142..afde2a7d3eb3 100644
--- a/arch/sh/drivers/dma/dma-sh.c
+++ b/arch/sh/drivers/dma/dma-sh.c
@@ -25,7 +25,7 @@
  * Define the default configuration for dual address memory-memory transfer.
  * The 0x400 value represents auto-request, external->external.
  */
-#define RS_DUAL	(DM_INC | SM_INC | 0x400 | TS_INDEX2VAL(XMIT_SZ_32BIT))
+#define RS_DUAL	(DM_INC | SM_INC | RS_AUTO | TS_INDEX2VAL(XMIT_SZ_32BIT))
 
 static unsigned long dma_find_base(unsigned int chan)
 {
diff --git a/arch/sh/include/asm/dma-register.h b/arch/sh/include/asm/dma-register.h
index 51cd78feacff..c757b47e6b64 100644
--- a/arch/sh/include/asm/dma-register.h
+++ b/arch/sh/include/asm/dma-register.h
@@ -13,17 +13,17 @@
 #ifndef DMA_REGISTER_H
 #define DMA_REGISTER_H
 
-/* DMA register */
-#define SAR	0x00
-#define DAR	0x04
-#define TCR	0x08
-#define CHCR	0x0C
-#define DMAOR	0x40
+/* DMA registers */
+#define SAR	0x00	/* Source Address Register */
+#define DAR	0x04	/* Destination Address Register */
+#define TCR	0x08	/* Transfer Count Register */
+#define CHCR	0x0C	/* Channel Control Register */
+#define DMAOR	0x40	/* DMA Operation Register */
 
 /* DMAOR definitions */
-#define DMAOR_AE	0x00000004
+#define DMAOR_AE	0x00000004	/* Address Error Flag */
 #define DMAOR_NMIF	0x00000002
-#define DMAOR_DME	0x00000001
+#define DMAOR_DME	0x00000001	/* DMA Master Enable */
 
 /* Definitions for the SuperH DMAC */
 #define REQ_L	0x00000000
@@ -34,18 +34,20 @@
 #define ACK_W	0x00020000
 #define ACK_H	0x00000000
 #define ACK_L	0x00010000
-#define DM_INC	0x00004000
-#define DM_DEC	0x00008000
-#define DM_FIX	0x0000c000
-#define SM_INC	0x00001000
-#define SM_DEC	0x00002000
-#define SM_FIX	0x00003000
+#define DM_INC	0x00004000	/* Destination addresses are incremented */
+#define DM_DEC	0x00008000	/* Destination addresses are decremented */
+#define DM_FIX	0x0000c000	/* Destination address is fixed */
+#define SM_INC	0x00001000	/* Source addresses are incremented */
+#define SM_DEC	0x00002000	/* Source addresses are decremented */
+#define SM_FIX	0x00003000	/* Source address is fixed */
 #define RS_IN	0x00000200
 #define RS_OUT	0x00000300
+#define RS_AUTO	0x00000400	/* Auto Request */
+#define RS_ERS	0x00000800	/* DMA extended resource selector */
 #define TS_BLK	0x00000040
 #define TM_BUR	0x00000020
-#define CHCR_DE	0x00000001
-#define CHCR_TE	0x00000002
-#define CHCR_IE	0x00000004
+#define CHCR_DE	0x00000001	/* DMA Enable */
+#define CHCR_TE	0x00000002	/* Transfer End Flag */
+#define CHCR_IE	0x00000004	/* Interrupt Enable */
 
 #endif
diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7722.c b/arch/sh/kernel/cpu/sh4a/setup-sh7722.c
index 57f83a92a505..7aa733307afc 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7722.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7722.c
@@ -30,62 +30,62 @@ static const struct sh_dmae_slave_config sh7722_dmae_slaves[] = {
 	{
 		.slave_id	= SHDMA_SLAVE_SCIF0_TX,
 		.addr		= 0xffe0000c,
-		.chcr		= DM_FIX | SM_INC | 0x800 | TS_INDEX2VAL(XMIT_SZ_8BIT),
+		.chcr		= DM_FIX | SM_INC | RS_ERS | TS_INDEX2VAL(XMIT_SZ_8BIT),
 		.mid_rid	= 0x21,
 	}, {
 		.slave_id	= SHDMA_SLAVE_SCIF0_RX,
 		.addr		= 0xffe00014,
-		.chcr		= DM_INC | SM_FIX | 0x800 | TS_INDEX2VAL(XMIT_SZ_8BIT),
+		.chcr		= DM_INC | SM_FIX | RS_ERS | TS_INDEX2VAL(XMIT_SZ_8BIT),
 		.mid_rid	= 0x22,
 	}, {
 		.slave_id	= SHDMA_SLAVE_SCIF1_TX,
 		.addr		= 0xffe1000c,
-		.chcr		= DM_FIX | SM_INC | 0x800 | TS_INDEX2VAL(XMIT_SZ_8BIT),
+		.chcr		= DM_FIX | SM_INC | RS_ERS | TS_INDEX2VAL(XMIT_SZ_8BIT),
 		.mid_rid	= 0x25,
 	}, {
 		.slave_id	= SHDMA_SLAVE_SCIF1_RX,
 		.addr		= 0xffe10014,
-		.chcr		= DM_INC | SM_FIX | 0x800 | TS_INDEX2VAL(XMIT_SZ_8BIT),
+		.chcr		= DM_INC | SM_FIX | RS_ERS | TS_INDEX2VAL(XMIT_SZ_8BIT),
 		.mid_rid	= 0x26,
 	}, {
 		.slave_id	= SHDMA_SLAVE_SCIF2_TX,
 		.addr		= 0xffe2000c,
-		.chcr		= DM_FIX | SM_INC | 0x800 | TS_INDEX2VAL(XMIT_SZ_8BIT),
+		.chcr		= DM_FIX | SM_INC | RS_ERS | TS_INDEX2VAL(XMIT_SZ_8BIT),
 		.mid_rid	= 0x29,
 	}, {
 		.slave_id	= SHDMA_SLAVE_SCIF2_RX,
 		.addr		= 0xffe20014,
-		.chcr		= DM_INC | SM_FIX | 0x800 | TS_INDEX2VAL(XMIT_SZ_8BIT),
+		.chcr		= DM_INC | SM_FIX | RS_ERS | TS_INDEX2VAL(XMIT_SZ_8BIT),
 		.mid_rid	= 0x2a,
 	}, {
 		.slave_id	= SHDMA_SLAVE_SIUA_TX,
 		.addr		= 0xa454c098,
-		.chcr		= DM_FIX | SM_INC | 0x800 | TS_INDEX2VAL(XMIT_SZ_32BIT),
+		.chcr		= DM_FIX | SM_INC | RS_ERS | TS_INDEX2VAL(XMIT_SZ_32BIT),
 		.mid_rid	= 0xb1,
 	}, {
 		.slave_id	= SHDMA_SLAVE_SIUA_RX,
 		.addr		= 0xa454c090,
-		.chcr		= DM_INC | SM_FIX | 0x800 | TS_INDEX2VAL(XMIT_SZ_32BIT),
+		.chcr		= DM_INC | SM_FIX | RS_ERS | TS_INDEX2VAL(XMIT_SZ_32BIT),
 		.mid_rid	= 0xb2,
 	}, {
 		.slave_id	= SHDMA_SLAVE_SIUB_TX,
 		.addr		= 0xa454c09c,
-		.chcr		= DM_FIX | SM_INC | 0x800 | TS_INDEX2VAL(XMIT_SZ_32BIT),
+		.chcr		= DM_FIX | SM_INC | RS_ERS | TS_INDEX2VAL(XMIT_SZ_32BIT),
 		.mid_rid	= 0xb5,
 	}, {
 		.slave_id	= SHDMA_SLAVE_SIUB_RX,
 		.addr		= 0xa454c094,
-		.chcr		= DM_INC | SM_FIX | 0x800 | TS_INDEX2VAL(XMIT_SZ_32BIT),
+		.chcr		= DM_INC | SM_FIX | RS_ERS | TS_INDEX2VAL(XMIT_SZ_32BIT),
 		.mid_rid	= 0xb6,
 	}, {
 		.slave_id	= SHDMA_SLAVE_SDHI0_TX,
 		.addr		= 0x04ce0030,
-		.chcr		= DM_FIX | SM_INC | 0x800 | TS_INDEX2VAL(XMIT_SZ_16BIT),
+		.chcr		= DM_FIX | SM_INC | RS_ERS | TS_INDEX2VAL(XMIT_SZ_16BIT),
 		.mid_rid	= 0xc1,
 	}, {
 		.slave_id	= SHDMA_SLAVE_SDHI0_RX,
 		.addr		= 0x04ce0030,
-		.chcr		= DM_INC | SM_FIX | 0x800 | TS_INDEX2VAL(XMIT_SZ_16BIT),
+		.chcr		= DM_INC | SM_FIX | RS_ERS | TS_INDEX2VAL(XMIT_SZ_16BIT),
 		.mid_rid	= 0xc2,
 	},
 };
diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
index b9e84b1d3aa7..ea5780b3c7f6 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
@@ -36,122 +36,122 @@ static const struct sh_dmae_slave_config sh7724_dmae_slaves[] = {
 	{
 		.slave_id	= SHDMA_SLAVE_SCIF0_TX,
 		.addr		= 0xffe0000c,
-		.chcr		= DM_FIX | SM_INC | 0x800 | TS_INDEX2VAL(XMIT_SZ_8BIT),
+		.chcr		= DM_FIX | SM_INC | RS_ERS | TS_INDEX2VAL(XMIT_SZ_8BIT),
 		.mid_rid	= 0x21,
 	}, {
 		.slave_id	= SHDMA_SLAVE_SCIF0_RX,
 		.addr		= 0xffe00014,
-		.chcr		= DM_INC | SM_FIX | 0x800 | TS_INDEX2VAL(XMIT_SZ_8BIT),
+		.chcr		= DM_INC | SM_FIX | RS_ERS | TS_INDEX2VAL(XMIT_SZ_8BIT),
 		.mid_rid	= 0x22,
 	}, {
 		.slave_id	= SHDMA_SLAVE_SCIF1_TX,
 		.addr		= 0xffe1000c,
-		.chcr		= DM_FIX | SM_INC | 0x800 | TS_INDEX2VAL(XMIT_SZ_8BIT),
+		.chcr		= DM_FIX | SM_INC | RS_ERS | TS_INDEX2VAL(XMIT_SZ_8BIT),
 		.mid_rid	= 0x25,
 	}, {
 		.slave_id	= SHDMA_SLAVE_SCIF1_RX,
 		.addr		= 0xffe10014,
-		.chcr		= DM_INC | SM_FIX | 0x800 | TS_INDEX2VAL(XMIT_SZ_8BIT),
+		.chcr		= DM_INC | SM_FIX | RS_ERS | TS_INDEX2VAL(XMIT_SZ_8BIT),
 		.mid_rid	= 0x26,
 	}, {
 		.slave_id	= SHDMA_SLAVE_SCIF2_TX,
 		.addr		= 0xffe2000c,
-		.chcr		= DM_FIX | SM_INC | 0x800 | TS_INDEX2VAL(XMIT_SZ_8BIT),
+		.chcr		= DM_FIX | SM_INC | RS_ERS | TS_INDEX2VAL(XMIT_SZ_8BIT),
 		.mid_rid	= 0x29,
 	}, {
 		.slave_id	= SHDMA_SLAVE_SCIF2_RX,
 		.addr		= 0xffe20014,
-		.chcr		= DM_INC | SM_FIX | 0x800 | TS_INDEX2VAL(XMIT_SZ_8BIT),
+		.chcr		= DM_INC | SM_FIX | RS_ERS | TS_INDEX2VAL(XMIT_SZ_8BIT),
 		.mid_rid	= 0x2a,
 	}, {
 		.slave_id	= SHDMA_SLAVE_SCIF3_TX,
 		.addr		= 0xa4e30020,
-		.chcr		= DM_FIX | SM_INC | 0x800 | TS_INDEX2VAL(XMIT_SZ_8BIT),
+		.chcr		= DM_FIX | SM_INC | RS_ERS | TS_INDEX2VAL(XMIT_SZ_8BIT),
 		.mid_rid	= 0x2d,
 	}, {
 		.slave_id	= SHDMA_SLAVE_SCIF3_RX,
 		.addr		= 0xa4e30024,
-		.chcr		= DM_INC | SM_FIX | 0x800 | TS_INDEX2VAL(XMIT_SZ_8BIT),
+		.chcr		= DM_INC | SM_FIX | RS_ERS | TS_INDEX2VAL(XMIT_SZ_8BIT),
 		.mid_rid	= 0x2e,
 	}, {
 		.slave_id	= SHDMA_SLAVE_SCIF4_TX,
 		.addr		= 0xa4e40020,
-		.chcr		= DM_FIX | SM_INC | 0x800 | TS_INDEX2VAL(XMIT_SZ_8BIT),
+		.chcr		= DM_FIX | SM_INC | RS_ERS | TS_INDEX2VAL(XMIT_SZ_8BIT),
 		.mid_rid	= 0x31,
 	}, {
 		.slave_id	= SHDMA_SLAVE_SCIF4_RX,
 		.addr		= 0xa4e40024,
-		.chcr		= DM_INC | SM_FIX | 0x800 | TS_INDEX2VAL(XMIT_SZ_8BIT),
+		.chcr		= DM_INC | SM_FIX | RS_ERS | TS_INDEX2VAL(XMIT_SZ_8BIT),
 		.mid_rid	= 0x32,
 	}, {
 		.slave_id	= SHDMA_SLAVE_SCIF5_TX,
 		.addr		= 0xa4e50020,
-		.chcr		= DM_FIX | SM_INC | 0x800 | TS_INDEX2VAL(XMIT_SZ_8BIT),
+		.chcr		= DM_FIX | SM_INC | RS_ERS | TS_INDEX2VAL(XMIT_SZ_8BIT),
 		.mid_rid	= 0x35,
 	}, {
 		.slave_id	= SHDMA_SLAVE_SCIF5_RX,
 		.addr		= 0xa4e50024,
-		.chcr		= DM_INC | SM_FIX | 0x800 | TS_INDEX2VAL(XMIT_SZ_8BIT),
+		.chcr		= DM_INC | SM_FIX | RS_ERS | TS_INDEX2VAL(XMIT_SZ_8BIT),
 		.mid_rid	= 0x36,
 	}, {
 		.slave_id	= SHDMA_SLAVE_USB0D0_TX,
 		.addr		= 0xA4D80100,
-		.chcr		= DM_FIX | SM_INC | 0x800 | TS_INDEX2VAL(XMIT_SZ_32BIT),
+		.chcr		= DM_FIX | SM_INC | RS_ERS | TS_INDEX2VAL(XMIT_SZ_32BIT),
 		.mid_rid	= 0x73,
 	}, {
 		.slave_id	= SHDMA_SLAVE_USB0D0_RX,
 		.addr		= 0xA4D80100,
-		.chcr		= DM_INC | SM_FIX | 0x800 | TS_INDEX2VAL(XMIT_SZ_32BIT),
+		.chcr		= DM_INC | SM_FIX | RS_ERS | TS_INDEX2VAL(XMIT_SZ_32BIT),
 		.mid_rid	= 0x73,
 	}, {
 		.slave_id	= SHDMA_SLAVE_USB0D1_TX,
 		.addr		= 0xA4D80120,
-		.chcr		= DM_FIX | SM_INC | 0x800 | TS_INDEX2VAL(XMIT_SZ_32BIT),
+		.chcr		= DM_FIX | SM_INC | RS_ERS | TS_INDEX2VAL(XMIT_SZ_32BIT),
 		.mid_rid	= 0x77,
 	}, {
 		.slave_id	= SHDMA_SLAVE_USB0D1_RX,
 		.addr		= 0xA4D80120,
-		.chcr		= DM_INC | SM_FIX | 0x800 | TS_INDEX2VAL(XMIT_SZ_32BIT),
+		.chcr		= DM_INC | SM_FIX | RS_ERS | TS_INDEX2VAL(XMIT_SZ_32BIT),
 		.mid_rid	= 0x77,
 	}, {
 		.slave_id	= SHDMA_SLAVE_USB1D0_TX,
 		.addr		= 0xA4D90100,
-		.chcr		= DM_FIX | SM_INC | 0x800 | TS_INDEX2VAL(XMIT_SZ_32BIT),
+		.chcr		= DM_FIX | SM_INC | RS_ERS | TS_INDEX2VAL(XMIT_SZ_32BIT),
 		.mid_rid	= 0xab,
 	}, {
 		.slave_id	= SHDMA_SLAVE_USB1D0_RX,
 		.addr		= 0xA4D90100,
-		.chcr		= DM_INC | SM_FIX | 0x800 | TS_INDEX2VAL(XMIT_SZ_32BIT),
+		.chcr		= DM_INC | SM_FIX | RS_ERS | TS_INDEX2VAL(XMIT_SZ_32BIT),
 		.mid_rid	= 0xab,
 	}, {
 		.slave_id	= SHDMA_SLAVE_USB1D1_TX,
 		.addr		= 0xA4D90120,
-		.chcr		= DM_FIX | SM_INC | 0x800 | TS_INDEX2VAL(XMIT_SZ_32BIT),
+		.chcr		= DM_FIX | SM_INC | RS_ERS | TS_INDEX2VAL(XMIT_SZ_32BIT),
 		.mid_rid	= 0xaf,
 	}, {
 		.slave_id	= SHDMA_SLAVE_USB1D1_RX,
 		.addr		= 0xA4D90120,
-		.chcr		= DM_INC | SM_FIX | 0x800 | TS_INDEX2VAL(XMIT_SZ_32BIT),
+		.chcr		= DM_INC | SM_FIX | RS_ERS | TS_INDEX2VAL(XMIT_SZ_32BIT),
 		.mid_rid	= 0xaf,
 	}, {
 		.slave_id	= SHDMA_SLAVE_SDHI0_TX,
 		.addr		= 0x04ce0030,
-		.chcr		= DM_FIX | SM_INC | 0x800 | TS_INDEX2VAL(XMIT_SZ_16BIT),
+		.chcr		= DM_FIX | SM_INC | RS_ERS | TS_INDEX2VAL(XMIT_SZ_16BIT),
 		.mid_rid	= 0xc1,
 	}, {
 		.slave_id	= SHDMA_SLAVE_SDHI0_RX,
 		.addr		= 0x04ce0030,
-		.chcr		= DM_INC | SM_FIX | 0x800 | TS_INDEX2VAL(XMIT_SZ_16BIT),
+		.chcr		= DM_INC | SM_FIX | RS_ERS | TS_INDEX2VAL(XMIT_SZ_16BIT),
 		.mid_rid	= 0xc2,
 	}, {
 		.slave_id	= SHDMA_SLAVE_SDHI1_TX,
 		.addr		= 0x04cf0030,
-		.chcr		= DM_FIX | SM_INC | 0x800 | TS_INDEX2VAL(XMIT_SZ_16BIT),
+		.chcr		= DM_FIX | SM_INC | RS_ERS | TS_INDEX2VAL(XMIT_SZ_16BIT),
 		.mid_rid	= 0xc9,
 	}, {
 		.slave_id	= SHDMA_SLAVE_SDHI1_RX,
 		.addr		= 0x04cf0030,
-		.chcr		= DM_INC | SM_FIX | 0x800 | TS_INDEX2VAL(XMIT_SZ_16BIT),
+		.chcr		= DM_INC | SM_FIX | RS_ERS | TS_INDEX2VAL(XMIT_SZ_16BIT),
 		.mid_rid	= 0xca,
 	},
 };
diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7757.c b/arch/sh/kernel/cpu/sh4a/setup-sh7757.c
index 7b24ec4b409a..18bcd70cd813 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7757.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7757.c
@@ -123,28 +123,28 @@ static const struct sh_dmae_slave_config sh7757_dmae0_slaves[] = {
 	{
 		.slave_id	= SHDMA_SLAVE_SDHI_TX,
 		.addr		= 0x1fe50030,
-		.chcr		= SM_INC | 0x800 | 0x40000000 |
+		.chcr		= SM_INC | RS_ERS | 0x40000000 |
 				  TS_INDEX2VAL(XMIT_SZ_16BIT),
 		.mid_rid	= 0xc5,
 	},
 	{
 		.slave_id	= SHDMA_SLAVE_SDHI_RX,
 		.addr		= 0x1fe50030,
-		.chcr		= DM_INC | 0x800 | 0x40000000 |
+		.chcr		= DM_INC | RS_ERS | 0x40000000 |
 				  TS_INDEX2VAL(XMIT_SZ_16BIT),
 		.mid_rid	= 0xc6,
 	},
 	{
 		.slave_id	= SHDMA_SLAVE_MMCIF_TX,
 		.addr		= 0x1fcb0034,
-		.chcr		= SM_INC | 0x800 | 0x40000000 |
+		.chcr		= SM_INC | RS_ERS | 0x40000000 |
 				  TS_INDEX2VAL(XMIT_SZ_32BIT),
 		.mid_rid	= 0xd3,
 	},
 	{
 		.slave_id	= SHDMA_SLAVE_MMCIF_RX,
 		.addr		= 0x1fcb0034,
-		.chcr		= DM_INC | 0x800 | 0x40000000 |
+		.chcr		= DM_INC | RS_ERS | 0x40000000 |
 				  TS_INDEX2VAL(XMIT_SZ_32BIT),
 		.mid_rid	= 0xd7,
 	},
@@ -154,56 +154,56 @@ static const struct sh_dmae_slave_config sh7757_dmae1_slaves[] = {
 	{
 		.slave_id	= SHDMA_SLAVE_SCIF2_TX,
 		.addr		= 0x1f4b000c,
-		.chcr		= SM_INC | 0x800 | 0x40000000 |
+		.chcr		= SM_INC | RS_ERS | 0x40000000 |
 				  TS_INDEX2VAL(XMIT_SZ_8BIT),
 		.mid_rid	= 0x21,
 	},
 	{
 		.slave_id	= SHDMA_SLAVE_SCIF2_RX,
 		.addr		= 0x1f4b0014,
-		.chcr		= DM_INC | 0x800 | 0x40000000 |
+		.chcr		= DM_INC | RS_ERS | 0x40000000 |
 				  TS_INDEX2VAL(XMIT_SZ_8BIT),
 		.mid_rid	= 0x22,
 	},
 	{
 		.slave_id	= SHDMA_SLAVE_SCIF3_TX,
 		.addr		= 0x1f4c000c,
-		.chcr		= SM_INC | 0x800 | 0x40000000 |
+		.chcr		= SM_INC | RS_ERS | 0x40000000 |
 				  TS_INDEX2VAL(XMIT_SZ_8BIT),
 		.mid_rid	= 0x29,
 	},
 	{
 		.slave_id	= SHDMA_SLAVE_SCIF3_RX,
 		.addr		= 0x1f4c0014,
-		.chcr		= DM_INC | 0x800 | 0x40000000 |
+		.chcr		= DM_INC | RS_ERS | 0x40000000 |
 				  TS_INDEX2VAL(XMIT_SZ_8BIT),
 		.mid_rid	= 0x2a,
 	},
 	{
 		.slave_id	= SHDMA_SLAVE_SCIF4_TX,
 		.addr		= 0x1f4d000c,
-		.chcr		= SM_INC | 0x800 | 0x40000000 |
+		.chcr		= SM_INC | RS_ERS | 0x40000000 |
 				  TS_INDEX2VAL(XMIT_SZ_8BIT),
 		.mid_rid	= 0x41,
 	},
 	{
 		.slave_id	= SHDMA_SLAVE_SCIF4_RX,
 		.addr		= 0x1f4d0014,
-		.chcr		= DM_INC | 0x800 | 0x40000000 |
+		.chcr		= DM_INC | RS_ERS | 0x40000000 |
 				  TS_INDEX2VAL(XMIT_SZ_8BIT),
 		.mid_rid	= 0x42,
 	},
 	{
 		.slave_id	= SHDMA_SLAVE_RSPI_TX,
 		.addr		= 0xfe480004,
-		.chcr		= SM_INC | 0x800 | 0x40000000 |
+		.chcr		= SM_INC | RS_ERS | 0x40000000 |
 				  TS_INDEX2VAL(XMIT_SZ_16BIT),
 		.mid_rid	= 0xc1,
 	},
 	{
 		.slave_id	= SHDMA_SLAVE_RSPI_RX,
 		.addr		= 0xfe480004,
-		.chcr		= DM_INC | 0x800 | 0x40000000 |
+		.chcr		= DM_INC | RS_ERS | 0x40000000 |
 				  TS_INDEX2VAL(XMIT_SZ_16BIT),
 		.mid_rid	= 0xc2,
 	},
@@ -213,70 +213,70 @@ static const struct sh_dmae_slave_config sh7757_dmae2_slaves[] = {
 	{
 		.slave_id	= SHDMA_SLAVE_RIIC0_TX,
 		.addr		= 0x1e500012,
-		.chcr		= SM_INC | 0x800 | 0x40000000 |
+		.chcr		= SM_INC | RS_ERS | 0x40000000 |
 				  TS_INDEX2VAL(XMIT_SZ_8BIT),
 		.mid_rid	= 0x21,
 	},
 	{
 		.slave_id	= SHDMA_SLAVE_RIIC0_RX,
 		.addr		= 0x1e500013,
-		.chcr		= DM_INC | 0x800 | 0x40000000 |
+		.chcr		= DM_INC | RS_ERS | 0x40000000 |
 				  TS_INDEX2VAL(XMIT_SZ_8BIT),
 		.mid_rid	= 0x22,
 	},
 	{
 		.slave_id	= SHDMA_SLAVE_RIIC1_TX,
 		.addr		= 0x1e510012,
-		.chcr		= SM_INC | 0x800 | 0x40000000 |
+		.chcr		= SM_INC | RS_ERS | 0x40000000 |
 				  TS_INDEX2VAL(XMIT_SZ_8BIT),
 		.mid_rid	= 0x29,
 	},
 	{
 		.slave_id	= SHDMA_SLAVE_RIIC1_RX,
 		.addr		= 0x1e510013,
-		.chcr		= DM_INC | 0x800 | 0x40000000 |
+		.chcr		= DM_INC | RS_ERS | 0x40000000 |
 				  TS_INDEX2VAL(XMIT_SZ_8BIT),
 		.mid_rid	= 0x2a,
 	},
 	{
 		.slave_id	= SHDMA_SLAVE_RIIC2_TX,
 		.addr		= 0x1e520012,
-		.chcr		= SM_INC | 0x800 | 0x40000000 |
+		.chcr		= SM_INC | RS_ERS | 0x40000000 |
 				  TS_INDEX2VAL(XMIT_SZ_8BIT),
 		.mid_rid	= 0xa1,
 	},
 	{
 		.slave_id	= SHDMA_SLAVE_RIIC2_RX,
 		.addr		= 0x1e520013,
-		.chcr		= DM_INC | 0x800 | 0x40000000 |
+		.chcr		= DM_INC | RS_ERS | 0x40000000 |
 				  TS_INDEX2VAL(XMIT_SZ_8BIT),
 		.mid_rid	= 0xa2,
 	},
 	{
 		.slave_id	= SHDMA_SLAVE_RIIC3_TX,
 		.addr		= 0x1e530012,
-		.chcr		= SM_INC | 0x800 | 0x40000000 |
+		.chcr		= SM_INC | RS_ERS | 0x40000000 |
 				  TS_INDEX2VAL(XMIT_SZ_8BIT),
 		.mid_rid	= 0xa9,
 	},
 	{
 		.slave_id	= SHDMA_SLAVE_RIIC3_RX,
 		.addr		= 0x1e530013,
-		.chcr		= DM_INC | 0x800 | 0x40000000 |
+		.chcr		= DM_INC | RS_ERS | 0x40000000 |
 				  TS_INDEX2VAL(XMIT_SZ_8BIT),
 		.mid_rid	= 0xaf,
 	},
 	{
 		.slave_id	= SHDMA_SLAVE_RIIC4_TX,
 		.addr		= 0x1e540012,
-		.chcr		= SM_INC | 0x800 | 0x40000000 |
+		.chcr		= SM_INC | RS_ERS | 0x40000000 |
 				  TS_INDEX2VAL(XMIT_SZ_8BIT),
 		.mid_rid	= 0xc5,
 	},
 	{
 		.slave_id	= SHDMA_SLAVE_RIIC4_RX,
 		.addr		= 0x1e540013,
-		.chcr		= DM_INC | 0x800 | 0x40000000 |
+		.chcr		= DM_INC | RS_ERS | 0x40000000 |
 				  TS_INDEX2VAL(XMIT_SZ_8BIT),
 		.mid_rid	= 0xc6,
 	},
@@ -286,70 +286,70 @@ static const struct sh_dmae_slave_config sh7757_dmae3_slaves[] = {
 	{
 		.slave_id	= SHDMA_SLAVE_RIIC5_TX,
 		.addr		= 0x1e550012,
-		.chcr		= SM_INC | 0x800 | 0x40000000 |
+		.chcr		= SM_INC | RS_ERS | 0x40000000 |
 				  TS_INDEX2VAL(XMIT_SZ_8BIT),
 		.mid_rid	= 0x21,
 	},
 	{
 		.slave_id	= SHDMA_SLAVE_RIIC5_RX,
 		.addr		= 0x1e550013,
-		.chcr		= DM_INC | 0x800 | 0x40000000 |
+		.chcr		= DM_INC | RS_ERS | 0x40000000 |
 				  TS_INDEX2VAL(XMIT_SZ_8BIT),
 		.mid_rid	= 0x22,
 	},
 	{
 		.slave_id	= SHDMA_SLAVE_RIIC6_TX,
 		.addr		= 0x1e560012,
-		.chcr		= SM_INC | 0x800 | 0x40000000 |
+		.chcr		= SM_INC | RS_ERS | 0x40000000 |
 				  TS_INDEX2VAL(XMIT_SZ_8BIT),
 		.mid_rid	= 0x29,
 	},
 	{
 		.slave_id	= SHDMA_SLAVE_RIIC6_RX,
 		.addr		= 0x1e560013,
-		.chcr		= DM_INC | 0x800 | 0x40000000 |
+		.chcr		= DM_INC | RS_ERS | 0x40000000 |
 				  TS_INDEX2VAL(XMIT_SZ_8BIT),
 		.mid_rid	= 0x2a,
 	},
 	{
 		.slave_id	= SHDMA_SLAVE_RIIC7_TX,
 		.addr		= 0x1e570012,
-		.chcr		= SM_INC | 0x800 | 0x40000000 |
+		.chcr		= SM_INC | RS_ERS | 0x40000000 |
 				  TS_INDEX2VAL(XMIT_SZ_8BIT),
 		.mid_rid	= 0x41,
 	},
 	{
 		.slave_id	= SHDMA_SLAVE_RIIC7_RX,
 		.addr		= 0x1e570013,
-		.chcr		= DM_INC | 0x800 | 0x40000000 |
+		.chcr		= DM_INC | RS_ERS | 0x40000000 |
 				  TS_INDEX2VAL(XMIT_SZ_8BIT),
 		.mid_rid	= 0x42,
 	},
 	{
 		.slave_id	= SHDMA_SLAVE_RIIC8_TX,
 		.addr		= 0x1e580012,
-		.chcr		= SM_INC | 0x800 | 0x40000000 |
+		.chcr		= SM_INC | RS_ERS | 0x40000000 |
 				  TS_INDEX2VAL(XMIT_SZ_8BIT),
 		.mid_rid	= 0x45,
 	},
 	{
 		.slave_id	= SHDMA_SLAVE_RIIC8_RX,
 		.addr		= 0x1e580013,
-		.chcr		= DM_INC | 0x800 | 0x40000000 |
+		.chcr		= DM_INC | RS_ERS | 0x40000000 |
 				  TS_INDEX2VAL(XMIT_SZ_8BIT),
 		.mid_rid	= 0x46,
 	},
 	{
 		.slave_id	= SHDMA_SLAVE_RIIC9_TX,
 		.addr		= 0x1e590012,
-		.chcr		= SM_INC | 0x800 | 0x40000000 |
+		.chcr		= SM_INC | RS_ERS | 0x40000000 |
 				  TS_INDEX2VAL(XMIT_SZ_8BIT),
 		.mid_rid	= 0x51,
 	},
 	{
 		.slave_id	= SHDMA_SLAVE_RIIC9_RX,
 		.addr		= 0x1e590013,
-		.chcr		= DM_INC | 0x800 | 0x40000000 |
+		.chcr		= DM_INC | RS_ERS | 0x40000000 |
 				  TS_INDEX2VAL(XMIT_SZ_8BIT),
 		.mid_rid	= 0x52,
 	},
diff --git a/arch/sparc/include/uapi/asm/unistd.h b/arch/sparc/include/uapi/asm/unistd.h
index 42f2bca1d338..886cab456e1b 100644
--- a/arch/sparc/include/uapi/asm/unistd.h
+++ b/arch/sparc/include/uapi/asm/unistd.h
@@ -411,8 +411,10 @@
 #define __NR_sched_setattr	343
 #define __NR_sched_getattr	344
 #define __NR_renameat2		345
+#define __NR_seccomp		346
+#define __NR_getrandom		347
 
-#define NR_syscalls		346
+#define NR_syscalls		348
 
 /* Bitmask values returned from kern_features system call.  */
 #define KERN_FEATURE_MIXED_MODE_STACK	0x00000001
diff --git a/arch/sparc/kernel/nmi.c b/arch/sparc/kernel/nmi.c
index 337094556916..5b1151dcba13 100644
--- a/arch/sparc/kernel/nmi.c
+++ b/arch/sparc/kernel/nmi.c
@@ -130,7 +130,6 @@ static inline unsigned int get_nmi_count(int cpu)
 
 static __init void nmi_cpu_busy(void *data)
 {
-	local_irq_enable_in_hardirq();
 	while (endflag == 0)
 		mb();
 }
diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c
index 8efd33753ad3..d35c490a91cb 100644
--- a/arch/sparc/kernel/perf_event.c
+++ b/arch/sparc/kernel/perf_event.c
@@ -1671,9 +1671,12 @@ static bool __init supported_pmu(void)
 
 static int __init init_hw_perf_events(void)
 {
+	int err;
+
 	pr_info("Performance events: ");
 
-	if (!supported_pmu()) {
+	err = pcr_arch_init();
+	if (err || !supported_pmu()) {
 		pr_cont("No support for PMU type '%s'\n", sparc_pmu_type);
 		return 0;
 	}
@@ -1685,7 +1688,7 @@ static int __init init_hw_perf_events(void)
 
 	return 0;
 }
-early_initcall(init_hw_perf_events);
+pure_initcall(init_hw_perf_events);
 
 void perf_callchain_kernel(struct perf_callchain_entry *entry,
 			   struct pt_regs *regs)
diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c
index 027e09986194..0be7bf978cb1 100644
--- a/arch/sparc/kernel/process_64.c
+++ b/arch/sparc/kernel/process_64.c
@@ -312,6 +312,9 @@ static void __global_pmu_self(int this_cpu)
 	struct global_pmu_snapshot *pp;
 	int i, num;
 
+	if (!pcr_ops)
+		return;
+
 	pp = &global_cpu_snapshot[this_cpu].pmu;
 
 	num = 1;
diff --git a/arch/sparc/kernel/smp_32.c b/arch/sparc/kernel/smp_32.c
index 7958242d63c5..b3a5d81b20f0 100644
--- a/arch/sparc/kernel/smp_32.c
+++ b/arch/sparc/kernel/smp_32.c
@@ -68,7 +68,7 @@ void smp_store_cpu_info(int id)
 	mid = cpu_get_hwmid(cpu_node);
 
 	if (mid < 0) {
-		printk(KERN_NOTICE "No MID found for CPU%d at node 0x%08d", id, cpu_node);
+		printk(KERN_NOTICE "No MID found for CPU%d at node 0x%08x", id, cpu_node);
 		mid = 0;
 	}
 	cpu_data(id).mid = mid;
diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c
index 41aa2478f3ca..f7ba87543e5f 100644
--- a/arch/sparc/kernel/smp_64.c
+++ b/arch/sparc/kernel/smp_64.c
@@ -1383,7 +1383,6 @@ void __cpu_die(unsigned int cpu)
 
 void __init smp_cpus_done(unsigned int max_cpus)
 {
-	pcr_arch_init();
 }
 
 void smp_send_reschedule(int cpu)
diff --git a/arch/sparc/kernel/systbls_32.S b/arch/sparc/kernel/systbls_32.S
index 85fe9b1087cd..217893e18d78 100644
--- a/arch/sparc/kernel/systbls_32.S
+++ b/arch/sparc/kernel/systbls_32.S
@@ -86,4 +86,4 @@ sys_call_table:
 /*330*/	.long sys_fanotify_mark, sys_prlimit64, sys_name_to_handle_at, sys_open_by_handle_at, sys_clock_adjtime
 /*335*/	.long sys_syncfs, sys_sendmmsg, sys_setns, sys_process_vm_readv, sys_process_vm_writev
 /*340*/	.long sys_ni_syscall, sys_kcmp, sys_finit_module, sys_sched_setattr, sys_sched_getattr
-/*345*/	.long sys_renameat2
+/*345*/	.long sys_renameat2, sys_seccomp, sys_getrandom
diff --git a/arch/sparc/kernel/systbls_64.S b/arch/sparc/kernel/systbls_64.S
index 33ecba2826ea..d93b49d1b420 100644
--- a/arch/sparc/kernel/systbls_64.S
+++ b/arch/sparc/kernel/systbls_64.S
@@ -87,7 +87,7 @@ sys_call_table32:
 /*330*/	.word compat_sys_fanotify_mark, sys_prlimit64, sys_name_to_handle_at, compat_sys_open_by_handle_at, compat_sys_clock_adjtime
 	.word sys_syncfs, compat_sys_sendmmsg, sys_setns, compat_sys_process_vm_readv, compat_sys_process_vm_writev
 /*340*/	.word sys_kern_features, sys_kcmp, sys_finit_module, sys_sched_setattr, sys_sched_getattr
-	.word sys32_renameat2
+	.word sys32_renameat2, sys_seccomp, sys_getrandom
 
 #endif /* CONFIG_COMPAT */
 
@@ -166,4 +166,4 @@ sys_call_table:
 /*330*/	.word sys_fanotify_mark, sys_prlimit64, sys_name_to_handle_at, sys_open_by_handle_at, sys_clock_adjtime
 	.word sys_syncfs, sys_sendmmsg, sys_setns, sys_process_vm_readv, sys_process_vm_writev
 /*340*/	.word sys_kern_features, sys_kcmp, sys_finit_module, sys_sched_setattr, sys_sched_getattr
-	.word sys_renameat2
+	.word sys_renameat2, sys_seccomp, sys_getrandom
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 4aafd322e21e..5d0bf1aa9dcb 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -434,6 +434,7 @@ config X86_INTEL_CE
 	bool "CE4100 TV platform"
 	depends on PCI
 	depends on PCI_GODIRECT
+	depends on X86_IO_APIC
 	depends on X86_32
 	depends on X86_EXTENDED_PLATFORM
 	select X86_REBOOTFIXUPS
@@ -840,6 +841,7 @@ config X86_IO_APIC
 	def_bool y
 	depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_IOAPIC || PCI_MSI
 	select GENERIC_IRQ_LEGACY_ALLOC_HWIRQ
+	select IRQ_DOMAIN
 
 config X86_REROUTE_FOR_BROKEN_BOOT_IRQS
 	bool "Reroute for broken boot IRQs"
@@ -1541,7 +1543,8 @@ config EFI
 
 config EFI_STUB
        bool "EFI stub support"
-       depends on EFI
+       depends on EFI && !X86_USE_3DNOW
+       select RELOCATABLE
        ---help---
           This kernel feature allows a bzImage to be loaded directly
 	  by EFI firmware without the use of a bootloader.
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 0a3f9c9f98d5..473bdbee378a 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -161,6 +161,20 @@ static inline int alternatives_text_reserved(void *start, void *end)
 	asm volatile (ALTERNATIVE(oldinstr, newinstr, feature)		\
 		: : "i" (0), ## input)
 
+/*
+ * This is similar to alternative_input. But it has two features and
+ * respective instructions.
+ *
+ * If CPU has feature2, newinstr2 is used.
+ * Otherwise, if CPU has feature1, newinstr1 is used.
+ * Otherwise, oldinstr is used.
+ */
+#define alternative_input_2(oldinstr, newinstr1, feature1, newinstr2,	     \
+			   feature2, input...)				     \
+	asm volatile(ALTERNATIVE_2(oldinstr, newinstr1, feature1,	     \
+		newinstr2, feature2)					     \
+		: : "i" (0), ## input)
+
 /* Like alternative_input, but with a single output argument */
 #define alternative_io(oldinstr, newinstr, feature, output, input...)	\
 	asm volatile (ALTERNATIVE(oldinstr, newinstr, feature)		\
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 79752f2bdec5..465b309af254 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -85,14 +85,6 @@ static inline bool apic_from_smp_config(void)
 #include <asm/paravirt.h>
 #endif
 
-#ifdef CONFIG_X86_64
-extern int is_vsmp_box(void);
-#else
-static inline int is_vsmp_box(void)
-{
-	return 0;
-}
-#endif
 extern int setup_profiling_timer(unsigned int);
 
 static inline void native_apic_mem_write(u32 reg, u32 v)
@@ -300,7 +292,6 @@ struct apic {
 
 	int dest_logical;
 	unsigned long (*check_apicid_used)(physid_mask_t *map, int apicid);
-	unsigned long (*check_apicid_present)(int apicid);
 
 	void (*vector_allocation_domain)(int cpu, struct cpumask *retmask,
 					 const struct cpumask *mask);
@@ -309,21 +300,11 @@ struct apic {
 	void (*ioapic_phys_id_map)(physid_mask_t *phys_map, physid_mask_t *retmap);
 
 	void (*setup_apic_routing)(void);
-	int (*multi_timer_check)(int apic, int irq);
 	int (*cpu_present_to_apicid)(int mps_cpu);
 	void (*apicid_to_cpu_present)(int phys_apicid, physid_mask_t *retmap);
-	void (*setup_portio_remap)(void);
 	int (*check_phys_apicid_present)(int phys_apicid);
-	void (*enable_apic_mode)(void);
 	int (*phys_pkg_id)(int cpuid_apic, int index_msb);
 
-	/*
-	 * When one of the next two hooks returns 1 the apic
-	 * is switched to this. Essentially they are additional
-	 * probe functions:
-	 */
-	int (*mps_oem_check)(struct mpc_table *mpc, char *oem, char *productid);
-
 	unsigned int (*get_apic_id)(unsigned long x);
 	unsigned long (*set_apic_id)(unsigned int id);
 	unsigned long apic_id_mask;
@@ -343,11 +324,7 @@ struct apic {
 	/* wakeup_secondary_cpu */
 	int (*wakeup_secondary_cpu)(int apicid, unsigned long start_eip);
 
-	int trampoline_phys_low;
-	int trampoline_phys_high;
-
 	bool wait_for_init_deassert;
-	void (*smp_callin_clear_local_apic)(void);
 	void (*inquire_remote_apic)(int apicid);
 
 	/* apic ops */
@@ -378,14 +355,6 @@ struct apic {
 	 * won't be applied properly during early boot in this case.
 	 */
 	int (*x86_32_early_logical_apicid)(int cpu);
-
-	/*
-	 * Optional method called from setup_local_APIC() after logical
-	 * apicid is guaranteed to be known to initialize apicid -> node
-	 * mapping if NUMA initialization hasn't done so already.  Don't
-	 * add new users.
-	 */
-	int (*x86_32_numa_cpu_node)(int cpu);
 #endif
 };
 
@@ -496,14 +465,12 @@ static inline unsigned default_get_apic_id(unsigned long x)
 }
 
 /*
- * Warm reset vector default position:
+ * Warm reset vector position:
  */
-#define DEFAULT_TRAMPOLINE_PHYS_LOW		0x467
-#define DEFAULT_TRAMPOLINE_PHYS_HIGH		0x469
+#define TRAMPOLINE_PHYS_LOW		0x467
+#define TRAMPOLINE_PHYS_HIGH		0x469
 
 #ifdef CONFIG_X86_64
-extern int default_acpi_madt_oem_check(char *, char *);
-
 extern void apic_send_IPI_self(int vector);
 
 DECLARE_PER_CPU(int, x2apic_extra_bits);
@@ -552,6 +519,8 @@ static inline int default_apic_id_valid(int apicid)
 	return (apicid < 255);
 }
 
+extern int default_acpi_madt_oem_check(char *, char *);
+
 extern void default_setup_apic_routing(void);
 
 extern struct apic apic_noop;
@@ -635,11 +604,6 @@ static inline unsigned long default_check_apicid_used(physid_mask_t *map, int ap
 	return physid_isset(apicid, *map);
 }
 
-static inline unsigned long default_check_apicid_present(int bit)
-{
-	return physid_isset(bit, phys_cpu_present_map);
-}
-
 static inline void default_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
 {
 	*retmap = *phys_map;
diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h
index e3b85422cf12..412ececa00b9 100644
--- a/arch/x86/include/asm/fpu-internal.h
+++ b/arch/x86/include/asm/fpu-internal.h
@@ -508,9 +508,12 @@ static inline void user_fpu_begin(void)
 
 static inline void __save_fpu(struct task_struct *tsk)
 {
-	if (use_xsave())
-		xsave_state(&tsk->thread.fpu.state->xsave, -1);
-	else
+	if (use_xsave()) {
+		if (unlikely(system_state == SYSTEM_BOOTING))
+			xsave_state_booting(&tsk->thread.fpu.state->xsave, -1);
+		else
+			xsave_state(&tsk->thread.fpu.state->xsave, -1);
+	} else
 		fpu_fxsave(&tsk->thread.fpu);
 }
 
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 230853da4ec0..0f5fb6b6567e 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -40,9 +40,6 @@ typedef struct {
 
 DECLARE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
 
-/* We can have at most NR_VECTORS irqs routed to a cpu at a time */
-#define MAX_HARDIRQS_PER_CPU NR_VECTORS
-
 #define __ARCH_IRQ_STAT
 
 #define inc_irq_stat(member)	this_cpu_inc(irq_stat.member)
diff --git a/arch/x86/include/asm/i8259.h b/arch/x86/include/asm/i8259.h
index a20365953bf8..ccffa53750a8 100644
--- a/arch/x86/include/asm/i8259.h
+++ b/arch/x86/include/asm/i8259.h
@@ -67,4 +67,9 @@ struct legacy_pic {
 extern struct legacy_pic *legacy_pic;
 extern struct legacy_pic null_legacy_pic;
 
+static inline int nr_legacy_irqs(void)
+{
+	return legacy_pic->nr_legacy_irqs;
+}
+
 #endif /* _ASM_X86_I8259_H */
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index 90f97b4b9347..0aeed5ca356e 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -98,6 +98,8 @@ struct IR_IO_APIC_route_entry {
 #define IOAPIC_AUTO     -1
 #define IOAPIC_EDGE     0
 #define IOAPIC_LEVEL    1
+#define	IOAPIC_MAP_ALLOC		0x1
+#define	IOAPIC_MAP_CHECK		0x2
 
 #ifdef CONFIG_X86_IO_APIC
 
@@ -118,9 +120,6 @@ extern int mp_irq_entries;
 /* MP IRQ source entries */
 extern struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES];
 
-/* non-0 if default (table-less) MP configuration */
-extern int mpc_default_type;
-
 /* Older SiS APIC requires we rewrite the index register */
 extern int sis_apic_bug;
 
@@ -133,9 +132,6 @@ extern int noioapicquirk;
 /* -1 if "noapic" boot option passed */
 extern int noioapicreroute;
 
-/* 1 if the timer IRQ uses the '8259A Virtual Wire' mode */
-extern int timer_through_8259;
-
 /*
  * If we use the IO-APIC for IRQ routing, disable automatic
  * assignment of PCI IRQ's.
@@ -145,24 +141,17 @@ extern int timer_through_8259;
 
 struct io_apic_irq_attr;
 struct irq_cfg;
-extern int io_apic_set_pci_routing(struct device *dev, int irq,
-		 struct io_apic_irq_attr *irq_attr);
-void setup_IO_APIC_irq_extra(u32 gsi);
 extern void ioapic_insert_resources(void);
 
 extern int native_setup_ioapic_entry(int, struct IO_APIC_route_entry *,
 				     unsigned int, int,
 				     struct io_apic_irq_attr *);
-extern int native_setup_ioapic_entry(int, struct IO_APIC_route_entry *,
-				     unsigned int, int,
-				     struct io_apic_irq_attr *);
 extern void eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg);
 
 extern void native_compose_msi_msg(struct pci_dev *pdev,
 				   unsigned int irq, unsigned int dest,
 				   struct msi_msg *msg, u8 hpet_id);
 extern void native_eoi_ioapic_pin(int apic, int pin, int vector);
-int io_apic_setup_irq_pin_once(unsigned int irq, int node, struct io_apic_irq_attr *attr);
 
 extern int save_ioapic_entries(void);
 extern void mask_ioapic_entries(void);
@@ -171,15 +160,40 @@ extern int restore_ioapic_entries(void);
 extern void setup_ioapic_ids_from_mpc(void);
 extern void setup_ioapic_ids_from_mpc_nocheck(void);
 
+enum ioapic_domain_type {
+	IOAPIC_DOMAIN_INVALID,
+	IOAPIC_DOMAIN_LEGACY,
+	IOAPIC_DOMAIN_STRICT,
+	IOAPIC_DOMAIN_DYNAMIC,
+};
+
+struct device_node;
+struct irq_domain;
+struct irq_domain_ops;
+
+struct ioapic_domain_cfg {
+	enum ioapic_domain_type		type;
+	const struct irq_domain_ops	*ops;
+	struct device_node		*dev;
+};
+
 struct mp_ioapic_gsi{
 	u32 gsi_base;
 	u32 gsi_end;
 };
-extern struct mp_ioapic_gsi  mp_gsi_routing[];
 extern u32 gsi_top;
-int mp_find_ioapic(u32 gsi);
-int mp_find_ioapic_pin(int ioapic, u32 gsi);
-void __init mp_register_ioapic(int id, u32 address, u32 gsi_base);
+
+extern int mp_find_ioapic(u32 gsi);
+extern int mp_find_ioapic_pin(int ioapic, u32 gsi);
+extern u32 mp_pin_to_gsi(int ioapic, int pin);
+extern int mp_map_gsi_to_irq(u32 gsi, unsigned int flags);
+extern void mp_unmap_irq(int irq);
+extern void __init mp_register_ioapic(int id, u32 address, u32 gsi_base,
+				      struct ioapic_domain_cfg *cfg);
+extern int mp_irqdomain_map(struct irq_domain *domain, unsigned int virq,
+			    irq_hw_number_t hwirq);
+extern void mp_irqdomain_unmap(struct irq_domain *domain, unsigned int virq);
+extern int mp_set_gsi_attr(u32 gsi, int trigger, int polarity, int node);
 extern void __init pre_init_apic_IRQ0(void);
 
 extern void mp_save_irq(struct mpc_intsrc *m);
@@ -217,14 +231,12 @@ extern void io_apic_eoi(unsigned int apic, unsigned int vector);
 
 #define io_apic_assign_pci_irqs 0
 #define setup_ioapic_ids_from_mpc x86_init_noop
-static const int timer_through_8259 = 0;
 static inline void ioapic_insert_resources(void) { }
 #define gsi_top (NR_IRQS_LEGACY)
 static inline int mp_find_ioapic(u32 gsi) { return 0; }
-
-struct io_apic_irq_attr;
-static inline int io_apic_set_pci_routing(struct device *dev, int irq,
-		 struct io_apic_irq_attr *irq_attr) { return 0; }
+static inline u32 mp_pin_to_gsi(int ioapic, int pin) { return UINT_MAX; }
+static inline int mp_map_gsi_to_irq(u32 gsi, unsigned int flags) { return gsi; }
+static inline void mp_unmap_irq(int irq) { }
 
 static inline int save_ioapic_entries(void)
 {
diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
index f5a617956735..b07233b64578 100644
--- a/arch/x86/include/asm/mpspec.h
+++ b/arch/x86/include/asm/mpspec.h
@@ -40,8 +40,6 @@ extern int mp_bus_id_to_type[MAX_MP_BUSSES];
 extern DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
 
 extern unsigned int boot_cpu_physical_apicid;
-extern unsigned int max_physical_apicid;
-extern int mpc_default_type;
 extern unsigned long mp_lapic_addr;
 
 #ifdef CONFIG_X86_LOCAL_APIC
@@ -88,15 +86,6 @@ static inline void early_reserve_e820_mpc_new(void) { }
 #endif
 
 int generic_processor_info(int apicid, int version);
-#ifdef CONFIG_ACPI
-extern void mp_register_ioapic(int id, u32 address, u32 gsi_base);
-extern void mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger,
-				   u32 gsi);
-extern void mp_config_acpi_legacy_irqs(void);
-struct device;
-extern int mp_register_gsi(struct device *dev, u32 gsi, int edge_level,
-				 int active_high_low);
-#endif /* CONFIG_ACPI */
 
 #define PHYSID_ARRAY_SIZE	BITS_TO_LONGS(MAX_LOCAL_APIC)
 
@@ -161,8 +150,4 @@ static inline void physid_set_mask_of_physid(int physid, physid_mask_t *map)
 
 extern physid_mask_t phys_cpu_present_map;
 
-extern int generic_mps_oem_check(struct mpc_table *, char *, char *);
-
-extern int default_acpi_madt_oem_check(char *, char *);
-
 #endif /* _ASM_X86_MPSPEC_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index ee30b9f0b91c..eb71ec794732 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -385,8 +385,8 @@ struct bndcsr_struct {
 
 struct xsave_hdr_struct {
 	u64 xstate_bv;
-	u64 reserved1[2];
-	u64 reserved2[5];
+	u64 xcomp_bv;
+	u64 reserved[6];
 } __attribute__((packed));
 
 struct xsave_struct {
diff --git a/arch/x86/include/asm/prom.h b/arch/x86/include/asm/prom.h
index fbeb06ed0eaa..1d081ac1cd69 100644
--- a/arch/x86/include/asm/prom.h
+++ b/arch/x86/include/asm/prom.h
@@ -26,12 +26,10 @@
 extern int of_ioapic;
 extern u64 initial_dtb;
 extern void add_dtb(u64 data);
-extern void x86_add_irq_domains(void);
 void x86_of_pci_init(void);
 void x86_dtb_init(void);
 #else
 static inline void add_dtb(u64 data) { }
-static inline void x86_add_irq_domains(void) { }
 static inline void x86_of_pci_init(void) { }
 static inline void x86_dtb_init(void) { }
 #define of_ioapic 0
diff --git a/arch/x86/include/asm/smpboot_hooks.h b/arch/x86/include/asm/smpboot_hooks.h
index 49adfd7bb4a4..0da7409f0bec 100644
--- a/arch/x86/include/asm/smpboot_hooks.h
+++ b/arch/x86/include/asm/smpboot_hooks.h
@@ -17,11 +17,11 @@ static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip)
 	spin_unlock_irqrestore(&rtc_lock, flags);
 	local_flush_tlb();
 	pr_debug("1.\n");
-	*((volatile unsigned short *)phys_to_virt(apic->trampoline_phys_high)) =
-								 start_eip >> 4;
+	*((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) =
+							start_eip >> 4;
 	pr_debug("2.\n");
-	*((volatile unsigned short *)phys_to_virt(apic->trampoline_phys_low)) =
-							 start_eip & 0xf;
+	*((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) =
+							start_eip & 0xf;
 	pr_debug("3.\n");
 }
 
@@ -42,7 +42,7 @@ static inline void smpboot_restore_warm_reset_vector(void)
 	CMOS_WRITE(0, 0xf);
 	spin_unlock_irqrestore(&rtc_lock, flags);
 
-	*((volatile u32 *)phys_to_virt(apic->trampoline_phys_low)) = 0;
+	*((volatile u32 *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0;
 }
 
 static inline void __init smpboot_setup_io_apic(void)
diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h
index d949ef28c48b..7e7a79ada658 100644
--- a/arch/x86/include/asm/xsave.h
+++ b/arch/x86/include/asm/xsave.h
@@ -52,24 +52,170 @@ extern void xsave_init(void);
 extern void update_regset_xstate_info(unsigned int size, u64 xstate_mask);
 extern int init_fpu(struct task_struct *child);
 
-static inline int fpu_xrstor_checking(struct xsave_struct *fx)
+/* These macros all use (%edi)/(%rdi) as the single memory argument. */
+#define XSAVE		".byte " REX_PREFIX "0x0f,0xae,0x27"
+#define XSAVEOPT	".byte " REX_PREFIX "0x0f,0xae,0x37"
+#define XSAVES		".byte " REX_PREFIX "0x0f,0xc7,0x2f"
+#define XRSTOR		".byte " REX_PREFIX "0x0f,0xae,0x2f"
+#define XRSTORS		".byte " REX_PREFIX "0x0f,0xc7,0x1f"
+
+#define xstate_fault	".section .fixup,\"ax\"\n"	\
+			"3:  movl $-1,%[err]\n"		\
+			"    jmp  2b\n"			\
+			".previous\n"			\
+			_ASM_EXTABLE(1b, 3b)		\
+			: [err] "=r" (err)
+
+/*
+ * This function is called only during boot time when x86 caps are not set
+ * up and alternative can not be used yet.
+ */
+static inline int xsave_state_booting(struct xsave_struct *fx, u64 mask)
 {
-	int err;
+	u32 lmask = mask;
+	u32 hmask = mask >> 32;
+	int err = 0;
+
+	WARN_ON(system_state != SYSTEM_BOOTING);
+
+	if (boot_cpu_has(X86_FEATURE_XSAVES))
+		asm volatile("1:"XSAVES"\n\t"
+			"2:\n\t"
+			: : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
+			:   "memory");
+	else
+		asm volatile("1:"XSAVE"\n\t"
+			"2:\n\t"
+			: : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
+			:   "memory");
+
+	asm volatile(xstate_fault
+		     : "0" (0)
+		     : "memory");
+
+	return err;
+}
+
+/*
+ * This function is called only during boot time when x86 caps are not set
+ * up and alternative can not be used yet.
+ */
+static inline int xrstor_state_booting(struct xsave_struct *fx, u64 mask)
+{
+	u32 lmask = mask;
+	u32 hmask = mask >> 32;
+	int err = 0;
+
+	WARN_ON(system_state != SYSTEM_BOOTING);
 
-	asm volatile("1: .byte " REX_PREFIX "0x0f,0xae,0x2f\n\t"
-		     "2:\n"
-		     ".section .fixup,\"ax\"\n"
-		     "3:  movl $-1,%[err]\n"
-		     "    jmp  2b\n"
-		     ".previous\n"
-		     _ASM_EXTABLE(1b, 3b)
-		     : [err] "=r" (err)
-		     : "D" (fx), "m" (*fx), "a" (-1), "d" (-1), "0" (0)
+	if (boot_cpu_has(X86_FEATURE_XSAVES))
+		asm volatile("1:"XRSTORS"\n\t"
+			"2:\n\t"
+			: : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
+			:   "memory");
+	else
+		asm volatile("1:"XRSTOR"\n\t"
+			"2:\n\t"
+			: : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
+			:   "memory");
+
+	asm volatile(xstate_fault
+		     : "0" (0)
+		     : "memory");
+
+	return err;
+}
+
+/*
+ * Save processor xstate to xsave area.
+ */
+static inline int xsave_state(struct xsave_struct *fx, u64 mask)
+{
+	u32 lmask = mask;
+	u32 hmask = mask >> 32;
+	int err = 0;
+
+	/*
+	 * If xsaves is enabled, xsaves replaces xsaveopt because
+	 * it supports compact format and supervisor states in addition to
+	 * modified optimization in xsaveopt.
+	 *
+	 * Otherwise, if xsaveopt is enabled, xsaveopt replaces xsave
+	 * because xsaveopt supports modified optimization which is not
+	 * supported by xsave.
+	 *
+	 * If none of xsaves and xsaveopt is enabled, use xsave.
+	 */
+	alternative_input_2(
+		"1:"XSAVE,
+		"1:"XSAVEOPT,
+		X86_FEATURE_XSAVEOPT,
+		"1:"XSAVES,
+		X86_FEATURE_XSAVES,
+		[fx] "D" (fx), "a" (lmask), "d" (hmask) :
+		"memory");
+	asm volatile("2:\n\t"
+		     xstate_fault
+		     : "0" (0)
 		     : "memory");
 
 	return err;
 }
 
+/*
+ * Restore processor xstate from xsave area.
+ */
+static inline int xrstor_state(struct xsave_struct *fx, u64 mask)
+{
+	int err = 0;
+	u32 lmask = mask;
+	u32 hmask = mask >> 32;
+
+	/*
+	 * Use xrstors to restore context if it is enabled. xrstors supports
+	 * compacted format of xsave area which is not supported by xrstor.
+	 */
+	alternative_input(
+		"1: " XRSTOR,
+		"1: " XRSTORS,
+		X86_FEATURE_XSAVES,
+		"D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
+		: "memory");
+
+	asm volatile("2:\n"
+		     xstate_fault
+		     : "0" (0)
+		     : "memory");
+
+	return err;
+}
+
+/*
+ * Save xstate context for old process during context switch.
+ */
+static inline void fpu_xsave(struct fpu *fpu)
+{
+	xsave_state(&fpu->state->xsave, -1);
+}
+
+/*
+ * Restore xstate context for new process during context switch.
+ */
+static inline int fpu_xrstor_checking(struct xsave_struct *fx)
+{
+	return xrstor_state(fx, -1);
+}
+
+/*
+ * Save xstate to user space xsave area.
+ *
+ * We don't use modified optimization because xrstor/xrstors might track
+ * a different application.
+ *
+ * We don't use compacted format xsave area for
+ * backward compatibility for old applications which don't understand
+ * compacted format of xsave area.
+ */
 static inline int xsave_user(struct xsave_struct __user *buf)
 {
 	int err;
@@ -83,69 +229,34 @@ static inline int xsave_user(struct xsave_struct __user *buf)
 		return -EFAULT;
 
 	__asm__ __volatile__(ASM_STAC "\n"
-			     "1: .byte " REX_PREFIX "0x0f,0xae,0x27\n"
+			     "1:"XSAVE"\n"
 			     "2: " ASM_CLAC "\n"
-			     ".section .fixup,\"ax\"\n"
-			     "3:  movl $-1,%[err]\n"
-			     "    jmp  2b\n"
-			     ".previous\n"
-			     _ASM_EXTABLE(1b,3b)
-			     : [err] "=r" (err)
+			     xstate_fault
 			     : "D" (buf), "a" (-1), "d" (-1), "0" (0)
 			     : "memory");
 	return err;
 }
 
+/*
+ * Restore xstate from user space xsave area.
+ */
 static inline int xrestore_user(struct xsave_struct __user *buf, u64 mask)
 {
-	int err;
+	int err = 0;
 	struct xsave_struct *xstate = ((__force struct xsave_struct *)buf);
 	u32 lmask = mask;
 	u32 hmask = mask >> 32;
 
 	__asm__ __volatile__(ASM_STAC "\n"
-			     "1: .byte " REX_PREFIX "0x0f,0xae,0x2f\n"
+			     "1:"XRSTOR"\n"
 			     "2: " ASM_CLAC "\n"
-			     ".section .fixup,\"ax\"\n"
-			     "3:  movl $-1,%[err]\n"
-			     "    jmp  2b\n"
-			     ".previous\n"
-			     _ASM_EXTABLE(1b,3b)
-			     : [err] "=r" (err)
+			     xstate_fault
 			     : "D" (xstate), "a" (lmask), "d" (hmask), "0" (0)
 			     : "memory");	/* memory required? */
 	return err;
 }
 
-static inline void xrstor_state(struct xsave_struct *fx, u64 mask)
-{
-	u32 lmask = mask;
-	u32 hmask = mask >> 32;
-
-	asm volatile(".byte " REX_PREFIX "0x0f,0xae,0x2f\n\t"
-		     : : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
-		     :   "memory");
-}
-
-static inline void xsave_state(struct xsave_struct *fx, u64 mask)
-{
-	u32 lmask = mask;
-	u32 hmask = mask >> 32;
+void *get_xsave_addr(struct xsave_struct *xsave, int xstate);
+void setup_xstate_comp(void);
 
-	asm volatile(".byte " REX_PREFIX "0x0f,0xae,0x27\n\t"
-		     : : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
-		     :   "memory");
-}
-
-static inline void fpu_xsave(struct fpu *fpu)
-{
-	/* This, however, we can work around by forcing the compiler to select
-	   an addressing mode that doesn't require extended registers. */
-	alternative_input(
-		".byte " REX_PREFIX "0x0f,0xae,0x27",
-		".byte " REX_PREFIX "0x0f,0xae,0x37",
-		X86_FEATURE_XSAVEOPT,
-		[fx] "D" (&fpu->state->xsave), "a" (-1), "d" (-1) :
-		"memory");
-}
 #endif
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index a531f6564ed0..b436fc735aa4 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -31,6 +31,7 @@
 #include <linux/module.h>
 #include <linux/dmi.h>
 #include <linux/irq.h>
+#include <linux/irqdomain.h>
 #include <linux/slab.h>
 #include <linux/bootmem.h>
 #include <linux/ioport.h>
@@ -43,6 +44,7 @@
 #include <asm/io.h>
 #include <asm/mpspec.h>
 #include <asm/smp.h>
+#include <asm/i8259.h>
 
 #include "sleep.h" /* To include x86_acpi_suspend_lowlevel */
 static int __initdata acpi_force = 0;
@@ -93,44 +95,7 @@ static u32 isa_irq_to_gsi[NR_IRQS_LEGACY] __read_mostly = {
 	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
 };
 
-static unsigned int gsi_to_irq(unsigned int gsi)
-{
-	unsigned int irq = gsi + NR_IRQS_LEGACY;
-	unsigned int i;
-
-	for (i = 0; i < NR_IRQS_LEGACY; i++) {
-		if (isa_irq_to_gsi[i] == gsi) {
-			return i;
-		}
-	}
-
-	/* Provide an identity mapping of gsi == irq
-	 * except on truly weird platforms that have
-	 * non isa irqs in the first 16 gsis.
-	 */
-	if (gsi >= NR_IRQS_LEGACY)
-		irq = gsi;
-	else
-		irq = gsi_top + gsi;
-
-	return irq;
-}
-
-static u32 irq_to_gsi(int irq)
-{
-	unsigned int gsi;
-
-	if (irq < NR_IRQS_LEGACY)
-		gsi = isa_irq_to_gsi[irq];
-	else if (irq < gsi_top)
-		gsi = irq;
-	else if (irq < (gsi_top + NR_IRQS_LEGACY))
-		gsi = irq - gsi_top;
-	else
-		gsi = 0xffffffff;
-
-	return gsi;
-}
+#define	ACPI_INVALID_GSI		INT_MIN
 
 /*
  * This is just a simple wrapper around early_ioremap(),
@@ -341,11 +306,145 @@ acpi_parse_lapic_nmi(struct acpi_subtable_header * header, const unsigned long e
 #endif				/*CONFIG_X86_LOCAL_APIC */
 
 #ifdef CONFIG_X86_IO_APIC
+#define MP_ISA_BUS		0
+
+static void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger,
+					  u32 gsi)
+{
+	int ioapic;
+	int pin;
+	struct mpc_intsrc mp_irq;
+
+	/*
+	 * Convert 'gsi' to 'ioapic.pin'.
+	 */
+	ioapic = mp_find_ioapic(gsi);
+	if (ioapic < 0)
+		return;
+	pin = mp_find_ioapic_pin(ioapic, gsi);
+
+	/*
+	 * TBD: This check is for faulty timer entries, where the override
+	 *      erroneously sets the trigger to level, resulting in a HUGE
+	 *      increase of timer interrupts!
+	 */
+	if ((bus_irq == 0) && (trigger == 3))
+		trigger = 1;
+
+	mp_irq.type = MP_INTSRC;
+	mp_irq.irqtype = mp_INT;
+	mp_irq.irqflag = (trigger << 2) | polarity;
+	mp_irq.srcbus = MP_ISA_BUS;
+	mp_irq.srcbusirq = bus_irq;	/* IRQ */
+	mp_irq.dstapic = mpc_ioapic_id(ioapic); /* APIC ID */
+	mp_irq.dstirq = pin;	/* INTIN# */
+
+	mp_save_irq(&mp_irq);
+
+	/*
+	 * Reset default identity mapping if gsi is also an legacy IRQ,
+	 * otherwise there will be more than one entry with the same GSI
+	 * and acpi_isa_irq_to_gsi() may give wrong result.
+	 */
+	if (gsi < nr_legacy_irqs() && isa_irq_to_gsi[gsi] == gsi)
+		isa_irq_to_gsi[gsi] = ACPI_INVALID_GSI;
+	isa_irq_to_gsi[bus_irq] = gsi;
+}
+
+static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger,
+			int polarity)
+{
+#ifdef CONFIG_X86_MPPARSE
+	struct mpc_intsrc mp_irq;
+	struct pci_dev *pdev;
+	unsigned char number;
+	unsigned int devfn;
+	int ioapic;
+	u8 pin;
+
+	if (!acpi_ioapic)
+		return 0;
+	if (!dev || !dev_is_pci(dev))
+		return 0;
+
+	pdev = to_pci_dev(dev);
+	number = pdev->bus->number;
+	devfn = pdev->devfn;
+	pin = pdev->pin;
+	/* print the entry should happen on mptable identically */
+	mp_irq.type = MP_INTSRC;
+	mp_irq.irqtype = mp_INT;
+	mp_irq.irqflag = (trigger == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) |
+				(polarity == ACPI_ACTIVE_HIGH ? 1 : 3);
+	mp_irq.srcbus = number;
+	mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3);
+	ioapic = mp_find_ioapic(gsi);
+	mp_irq.dstapic = mpc_ioapic_id(ioapic);
+	mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi);
+
+	mp_save_irq(&mp_irq);
+#endif
+	return 0;
+}
+
+static int mp_register_gsi(struct device *dev, u32 gsi, int trigger,
+			   int polarity)
+{
+	int irq, node;
+
+	if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
+		return gsi;
+
+	/* Don't set up the ACPI SCI because it's already set up */
+	if (acpi_gbl_FADT.sci_interrupt == gsi)
+		return gsi;
+
+	trigger = trigger == ACPI_EDGE_SENSITIVE ? 0 : 1;
+	polarity = polarity == ACPI_ACTIVE_HIGH ? 0 : 1;
+	node = dev ? dev_to_node(dev) : NUMA_NO_NODE;
+	if (mp_set_gsi_attr(gsi, trigger, polarity, node)) {
+		pr_warn("Failed to set pin attr for GSI%d\n", gsi);
+		return -1;
+	}
+
+	irq = mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC);
+	if (irq < 0)
+		return irq;
+
+	if (enable_update_mptable)
+		mp_config_acpi_gsi(dev, gsi, trigger, polarity);
+
+	return irq;
+}
+
+static void mp_unregister_gsi(u32 gsi)
+{
+	int irq;
+
+	if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
+		return;
+
+	if (acpi_gbl_FADT.sci_interrupt == gsi)
+		return;
+
+	irq = mp_map_gsi_to_irq(gsi, 0);
+	if (irq > 0)
+		mp_unmap_irq(irq);
+}
+
+static struct irq_domain_ops acpi_irqdomain_ops = {
+	.map = mp_irqdomain_map,
+	.unmap = mp_irqdomain_unmap,
+};
 
 static int __init
 acpi_parse_ioapic(struct acpi_subtable_header * header, const unsigned long end)
 {
 	struct acpi_madt_io_apic *ioapic = NULL;
+	struct ioapic_domain_cfg cfg = {
+		.type = IOAPIC_DOMAIN_DYNAMIC,
+		.ops = &acpi_irqdomain_ops,
+	};
 
 	ioapic = (struct acpi_madt_io_apic *)header;
 
@@ -354,8 +453,12 @@ acpi_parse_ioapic(struct acpi_subtable_header * header, const unsigned long end)
 
 	acpi_table_print_madt_entry(header);
 
-	mp_register_ioapic(ioapic->id,
-			   ioapic->address, ioapic->global_irq_base);
+	/* Statically assign IRQ numbers for IOAPICs hosting legacy IRQs */
+	if (ioapic->global_irq_base < nr_legacy_irqs())
+		cfg.type = IOAPIC_DOMAIN_LEGACY;
+
+	mp_register_ioapic(ioapic->id, ioapic->address, ioapic->global_irq_base,
+			   &cfg);
 
 	return 0;
 }
@@ -378,11 +481,6 @@ static void __init acpi_sci_ioapic_setup(u8 bus_irq, u16 polarity, u16 trigger,
 	if (acpi_sci_flags & ACPI_MADT_POLARITY_MASK)
 		polarity = acpi_sci_flags & ACPI_MADT_POLARITY_MASK;
 
-	/*
-	 * mp_config_acpi_legacy_irqs() already setup IRQs < 16
-	 * If GSI is < 16, this will update its flags,
-	 * else it will create a new mp_irqs[] entry.
-	 */
 	mp_override_legacy_irq(bus_irq, polarity, trigger, gsi);
 
 	/*
@@ -504,25 +602,28 @@ void __init acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger)
 	outb(new >> 8, 0x4d1);
 }
 
-int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
+int acpi_gsi_to_irq(u32 gsi, unsigned int *irqp)
 {
-	*irq = gsi_to_irq(gsi);
+	int irq = mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC | IOAPIC_MAP_CHECK);
 
-#ifdef CONFIG_X86_IO_APIC
-	if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC)
-		setup_IO_APIC_irq_extra(gsi);
-#endif
+	if (irq >= 0) {
+		*irqp = irq;
+		return 0;
+	}
 
-	return 0;
+	return -1;
 }
 EXPORT_SYMBOL_GPL(acpi_gsi_to_irq);
 
 int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi)
 {
-	if (isa_irq >= 16)
-		return -1;
-	*gsi = irq_to_gsi(isa_irq);
-	return 0;
+	if (isa_irq < nr_legacy_irqs() &&
+	    isa_irq_to_gsi[isa_irq] != ACPI_INVALID_GSI) {
+		*gsi = isa_irq_to_gsi[isa_irq];
+		return 0;
+	}
+
+	return -1;
 }
 
 static int acpi_register_gsi_pic(struct device *dev, u32 gsi,
@@ -542,15 +643,25 @@ static int acpi_register_gsi_pic(struct device *dev, u32 gsi,
 static int acpi_register_gsi_ioapic(struct device *dev, u32 gsi,
 				    int trigger, int polarity)
 {
+	int irq = gsi;
+
 #ifdef CONFIG_X86_IO_APIC
-	gsi = mp_register_gsi(dev, gsi, trigger, polarity);
+	irq = mp_register_gsi(dev, gsi, trigger, polarity);
 #endif
 
-	return gsi;
+	return irq;
+}
+
+static void acpi_unregister_gsi_ioapic(u32 gsi)
+{
+#ifdef CONFIG_X86_IO_APIC
+	mp_unregister_gsi(gsi);
+#endif
 }
 
 int (*__acpi_register_gsi)(struct device *dev, u32 gsi,
 			   int trigger, int polarity) = acpi_register_gsi_pic;
+void (*__acpi_unregister_gsi)(u32 gsi) = NULL;
 
 #ifdef CONFIG_ACPI_SLEEP
 int (*acpi_suspend_lowlevel)(void) = x86_acpi_suspend_lowlevel;
@@ -564,32 +675,22 @@ int (*acpi_suspend_lowlevel)(void);
  */
 int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
 {
-	unsigned int irq;
-	unsigned int plat_gsi = gsi;
-
-	plat_gsi = (*__acpi_register_gsi)(dev, gsi, trigger, polarity);
-	irq = gsi_to_irq(plat_gsi);
-
-	return irq;
+	return __acpi_register_gsi(dev, gsi, trigger, polarity);
 }
 EXPORT_SYMBOL_GPL(acpi_register_gsi);
 
 void acpi_unregister_gsi(u32 gsi)
 {
+	if (__acpi_unregister_gsi)
+		__acpi_unregister_gsi(gsi);
 }
 EXPORT_SYMBOL_GPL(acpi_unregister_gsi);
 
-void __init acpi_set_irq_model_pic(void)
-{
-	acpi_irq_model = ACPI_IRQ_MODEL_PIC;
-	__acpi_register_gsi = acpi_register_gsi_pic;
-	acpi_ioapic = 0;
-}
-
-void __init acpi_set_irq_model_ioapic(void)
+static void __init acpi_set_irq_model_ioapic(void)
 {
 	acpi_irq_model = ACPI_IRQ_MODEL_IOAPIC;
 	__acpi_register_gsi = acpi_register_gsi_ioapic;
+	__acpi_unregister_gsi = acpi_unregister_gsi_ioapic;
 	acpi_ioapic = 1;
 }
 
@@ -825,9 +926,8 @@ static int __init early_acpi_parse_madt_lapic_addr_ovr(void)
 	 * and (optionally) overriden by a LAPIC_ADDR_OVR entry (64-bit value).
 	 */
 
-	count =
-	    acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_OVERRIDE,
-				  acpi_parse_lapic_addr_ovr, 0);
+	count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_OVERRIDE,
+				      acpi_parse_lapic_addr_ovr, 0);
 	if (count < 0) {
 		printk(KERN_ERR PREFIX
 		       "Error parsing LAPIC address override entry\n");
@@ -852,9 +952,8 @@ static int __init acpi_parse_madt_lapic_entries(void)
 	 * and (optionally) overriden by a LAPIC_ADDR_OVR entry (64-bit value).
 	 */
 
-	count =
-	    acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_OVERRIDE,
-				  acpi_parse_lapic_addr_ovr, 0);
+	count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_OVERRIDE,
+				      acpi_parse_lapic_addr_ovr, 0);
 	if (count < 0) {
 		printk(KERN_ERR PREFIX
 		       "Error parsing LAPIC address override entry\n");
@@ -882,11 +981,10 @@ static int __init acpi_parse_madt_lapic_entries(void)
 		return count;
 	}
 
-	x2count =
-	    acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC_NMI,
-				  acpi_parse_x2apic_nmi, 0);
-	count =
-	    acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_NMI, acpi_parse_lapic_nmi, 0);
+	x2count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC_NMI,
+					acpi_parse_x2apic_nmi, 0);
+	count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_NMI,
+				      acpi_parse_lapic_nmi, 0);
 	if (count < 0 || x2count < 0) {
 		printk(KERN_ERR PREFIX "Error parsing LAPIC NMI entry\n");
 		/* TBD: Cleanup to allow fallback to MPS */
@@ -897,44 +995,7 @@ static int __init acpi_parse_madt_lapic_entries(void)
 #endif				/* CONFIG_X86_LOCAL_APIC */
 
 #ifdef	CONFIG_X86_IO_APIC
-#define MP_ISA_BUS		0
-
-void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
-{
-	int ioapic;
-	int pin;
-	struct mpc_intsrc mp_irq;
-
-	/*
-	 * Convert 'gsi' to 'ioapic.pin'.
-	 */
-	ioapic = mp_find_ioapic(gsi);
-	if (ioapic < 0)
-		return;
-	pin = mp_find_ioapic_pin(ioapic, gsi);
-
-	/*
-	 * TBD: This check is for faulty timer entries, where the override
-	 *      erroneously sets the trigger to level, resulting in a HUGE
-	 *      increase of timer interrupts!
-	 */
-	if ((bus_irq == 0) && (trigger == 3))
-		trigger = 1;
-
-	mp_irq.type = MP_INTSRC;
-	mp_irq.irqtype = mp_INT;
-	mp_irq.irqflag = (trigger << 2) | polarity;
-	mp_irq.srcbus = MP_ISA_BUS;
-	mp_irq.srcbusirq = bus_irq;	/* IRQ */
-	mp_irq.dstapic = mpc_ioapic_id(ioapic); /* APIC ID */
-	mp_irq.dstirq = pin;	/* INTIN# */
-
-	mp_save_irq(&mp_irq);
-
-	isa_irq_to_gsi[bus_irq] = gsi;
-}
-
-void __init mp_config_acpi_legacy_irqs(void)
+static void __init mp_config_acpi_legacy_irqs(void)
 {
 	int i;
 	struct mpc_intsrc mp_irq;
@@ -952,7 +1013,7 @@ void __init mp_config_acpi_legacy_irqs(void)
 	 * Use the default configuration for the IRQs 0-15.  Unless
 	 * overridden by (MADT) interrupt source override entries.
 	 */
-	for (i = 0; i < 16; i++) {
+	for (i = 0; i < nr_legacy_irqs(); i++) {
 		int ioapic, pin;
 		unsigned int dstapic;
 		int idx;
@@ -1000,84 +1061,6 @@ void __init mp_config_acpi_legacy_irqs(void)
 	}
 }
 
-static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger,
-			int polarity)
-{
-#ifdef CONFIG_X86_MPPARSE
-	struct mpc_intsrc mp_irq;
-	struct pci_dev *pdev;
-	unsigned char number;
-	unsigned int devfn;
-	int ioapic;
-	u8 pin;
-
-	if (!acpi_ioapic)
-		return 0;
-	if (!dev || !dev_is_pci(dev))
-		return 0;
-
-	pdev = to_pci_dev(dev);
-	number = pdev->bus->number;
-	devfn = pdev->devfn;
-	pin = pdev->pin;
-	/* print the entry should happen on mptable identically */
-	mp_irq.type = MP_INTSRC;
-	mp_irq.irqtype = mp_INT;
-	mp_irq.irqflag = (trigger == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) |
-				(polarity == ACPI_ACTIVE_HIGH ? 1 : 3);
-	mp_irq.srcbus = number;
-	mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3);
-	ioapic = mp_find_ioapic(gsi);
-	mp_irq.dstapic = mpc_ioapic_id(ioapic);
-	mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi);
-
-	mp_save_irq(&mp_irq);
-#endif
-	return 0;
-}
-
-int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
-{
-	int ioapic;
-	int ioapic_pin;
-	struct io_apic_irq_attr irq_attr;
-	int ret;
-
-	if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
-		return gsi;
-
-	/* Don't set up the ACPI SCI because it's already set up */
-	if (acpi_gbl_FADT.sci_interrupt == gsi)
-		return gsi;
-
-	ioapic = mp_find_ioapic(gsi);
-	if (ioapic < 0) {
-		printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
-		return gsi;
-	}
-
-	ioapic_pin = mp_find_ioapic_pin(ioapic, gsi);
-
-	if (ioapic_pin > MP_MAX_IOAPIC_PIN) {
-		printk(KERN_ERR "Invalid reference to IOAPIC pin "
-		       "%d-%d\n", mpc_ioapic_id(ioapic),
-		       ioapic_pin);
-		return gsi;
-	}
-
-	if (enable_update_mptable)
-		mp_config_acpi_gsi(dev, gsi, trigger, polarity);
-
-	set_io_apic_irq_attr(&irq_attr, ioapic, ioapic_pin,
-			     trigger == ACPI_EDGE_SENSITIVE ? 0 : 1,
-			     polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
-	ret = io_apic_set_pci_routing(dev, gsi_to_irq(gsi), &irq_attr);
-	if (ret < 0)
-		gsi = INT_MIN;
-
-	return gsi;
-}
-
 /*
  * Parse IOAPIC related entries in MADT
  * returns 0 on success, < 0 on error
@@ -1107,9 +1090,8 @@ static int __init acpi_parse_madt_ioapic_entries(void)
 		return -ENODEV;
 	}
 
-	count =
-	    acpi_table_parse_madt(ACPI_MADT_TYPE_IO_APIC, acpi_parse_ioapic,
-				  MAX_IO_APICS);
+	count = acpi_table_parse_madt(ACPI_MADT_TYPE_IO_APIC, acpi_parse_ioapic,
+				      MAX_IO_APICS);
 	if (!count) {
 		printk(KERN_ERR PREFIX "No IOAPIC entries present\n");
 		return -ENODEV;
@@ -1118,9 +1100,8 @@ static int __init acpi_parse_madt_ioapic_entries(void)
 		return count;
 	}
 
-	count =
-	    acpi_table_parse_madt(ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, acpi_parse_int_src_ovr,
-				  nr_irqs);
+	count = acpi_table_parse_madt(ACPI_MADT_TYPE_INTERRUPT_OVERRIDE,
+				      acpi_parse_int_src_ovr, nr_irqs);
 	if (count < 0) {
 		printk(KERN_ERR PREFIX
 		       "Error parsing interrupt source overrides entry\n");
@@ -1139,9 +1120,8 @@ static int __init acpi_parse_madt_ioapic_entries(void)
 	/* Fill in identity legacy mappings where no override */
 	mp_config_acpi_legacy_irqs();
 
-	count =
-	    acpi_table_parse_madt(ACPI_MADT_TYPE_NMI_SOURCE, acpi_parse_nmi_src,
-				  nr_irqs);
+	count = acpi_table_parse_madt(ACPI_MADT_TYPE_NMI_SOURCE,
+				      acpi_parse_nmi_src, nr_irqs);
 	if (count < 0) {
 		printk(KERN_ERR PREFIX "Error parsing NMI SRC entry\n");
 		/* TBD: Cleanup to allow fallback to MPS */
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index ad28db7e6bde..67760275544b 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -67,7 +67,7 @@ EXPORT_SYMBOL_GPL(boot_cpu_physical_apicid);
 /*
  * The highest APIC ID seen during enumeration.
  */
-unsigned int max_physical_apicid;
+static unsigned int max_physical_apicid;
 
 /*
  * Bitmask of physically existing CPUs:
@@ -1342,17 +1342,6 @@ void setup_local_APIC(void)
 	/* always use the value from LDR */
 	early_per_cpu(x86_cpu_to_logical_apicid, cpu) =
 		logical_smp_processor_id();
-
-	/*
-	 * Some NUMA implementations (NUMAQ) don't initialize apicid to
-	 * node mapping during NUMA init.  Now that logical apicid is
-	 * guaranteed to be known, give it another chance.  This is already
-	 * a bit too late - percpu allocation has already happened without
-	 * proper NUMA affinity.
-	 */
-	if (apic->x86_32_numa_cpu_node)
-		set_apicid_to_node(early_per_cpu(x86_cpu_to_apicid, cpu),
-				   apic->x86_32_numa_cpu_node(cpu));
 #endif
 
 	/*
@@ -2053,8 +2042,6 @@ void __init connect_bsp_APIC(void)
 		imcr_pic_to_apic();
 	}
 #endif
-	if (apic->enable_apic_mode)
-		apic->enable_apic_mode();
 }
 
 /**
@@ -2451,51 +2438,6 @@ static void apic_pm_activate(void) { }
 
 #ifdef CONFIG_X86_64
 
-static int apic_cluster_num(void)
-{
-	int i, clusters, zeros;
-	unsigned id;
-	u16 *bios_cpu_apicid;
-	DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS);
-
-	bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
-	bitmap_zero(clustermap, NUM_APIC_CLUSTERS);
-
-	for (i = 0; i < nr_cpu_ids; i++) {
-		/* are we being called early in kernel startup? */
-		if (bios_cpu_apicid) {
-			id = bios_cpu_apicid[i];
-		} else if (i < nr_cpu_ids) {
-			if (cpu_present(i))
-				id = per_cpu(x86_bios_cpu_apicid, i);
-			else
-				continue;
-		} else
-			break;
-
-		if (id != BAD_APICID)
-			__set_bit(APIC_CLUSTERID(id), clustermap);
-	}
-
-	/* Problem:  Partially populated chassis may not have CPUs in some of
-	 * the APIC clusters they have been allocated.  Only present CPUs have
-	 * x86_bios_cpu_apicid entries, thus causing zeroes in the bitmap.
-	 * Since clusters are allocated sequentially, count zeros only if
-	 * they are bounded by ones.
-	 */
-	clusters = 0;
-	zeros = 0;
-	for (i = 0; i < NUM_APIC_CLUSTERS; i++) {
-		if (test_bit(i, clustermap)) {
-			clusters += 1 + zeros;
-			zeros = 0;
-		} else
-			++zeros;
-	}
-
-	return clusters;
-}
-
 static int multi_checked;
 static int multi;
 
@@ -2540,20 +2482,7 @@ static void dmi_check_multi(void)
 int apic_is_clustered_box(void)
 {
 	dmi_check_multi();
-	if (multi)
-		return 1;
-
-	if (!is_vsmp_box())
-		return 0;
-
-	/*
-	 * ScaleMP vSMPowered boxes have one cluster per board and TSCs are
-	 * not guaranteed to be synced between boards
-	 */
-	if (apic_cluster_num() > 1)
-		return 1;
-
-	return 0;
+	return multi;
 }
 #endif
 
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index 7c1b29479513..de918c410eae 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -168,21 +168,16 @@ static struct apic apic_flat =  {
 	.disable_esr			= 0,
 	.dest_logical			= APIC_DEST_LOGICAL,
 	.check_apicid_used		= NULL,
-	.check_apicid_present		= NULL,
 
 	.vector_allocation_domain	= flat_vector_allocation_domain,
 	.init_apic_ldr			= flat_init_apic_ldr,
 
 	.ioapic_phys_id_map		= NULL,
 	.setup_apic_routing		= NULL,
-	.multi_timer_check		= NULL,
 	.cpu_present_to_apicid		= default_cpu_present_to_apicid,
 	.apicid_to_cpu_present		= NULL,
-	.setup_portio_remap		= NULL,
 	.check_phys_apicid_present	= default_check_phys_apicid_present,
-	.enable_apic_mode		= NULL,
 	.phys_pkg_id			= flat_phys_pkg_id,
-	.mps_oem_check			= NULL,
 
 	.get_apic_id			= flat_get_apic_id,
 	.set_apic_id			= set_apic_id,
@@ -196,10 +191,7 @@ static struct apic apic_flat =  {
 	.send_IPI_all			= flat_send_IPI_all,
 	.send_IPI_self			= apic_send_IPI_self,
 
-	.trampoline_phys_low		= DEFAULT_TRAMPOLINE_PHYS_LOW,
-	.trampoline_phys_high		= DEFAULT_TRAMPOLINE_PHYS_HIGH,
 	.wait_for_init_deassert		= false,
-	.smp_callin_clear_local_apic	= NULL,
 	.inquire_remote_apic		= default_inquire_remote_apic,
 
 	.read				= native_apic_mem_read,
@@ -283,7 +275,6 @@ static struct apic apic_physflat =  {
 	.disable_esr			= 0,
 	.dest_logical			= 0,
 	.check_apicid_used		= NULL,
-	.check_apicid_present		= NULL,
 
 	.vector_allocation_domain	= default_vector_allocation_domain,
 	/* not needed, but shouldn't hurt: */
@@ -291,14 +282,10 @@ static struct apic apic_physflat =  {
 
 	.ioapic_phys_id_map		= NULL,
 	.setup_apic_routing		= NULL,
-	.multi_timer_check		= NULL,
 	.cpu_present_to_apicid		= default_cpu_present_to_apicid,
 	.apicid_to_cpu_present		= NULL,
-	.setup_portio_remap		= NULL,
 	.check_phys_apicid_present	= default_check_phys_apicid_present,
-	.enable_apic_mode		= NULL,
 	.phys_pkg_id			= flat_phys_pkg_id,
-	.mps_oem_check			= NULL,
 
 	.get_apic_id			= flat_get_apic_id,
 	.set_apic_id			= set_apic_id,
@@ -312,10 +299,7 @@ static struct apic apic_physflat =  {
 	.send_IPI_all			= physflat_send_IPI_all,
 	.send_IPI_self			= apic_send_IPI_self,
 
-	.trampoline_phys_low		= DEFAULT_TRAMPOLINE_PHYS_LOW,
-	.trampoline_phys_high		= DEFAULT_TRAMPOLINE_PHYS_HIGH,
 	.wait_for_init_deassert		= false,
-	.smp_callin_clear_local_apic	= NULL,
 	.inquire_remote_apic		= default_inquire_remote_apic,
 
 	.read				= native_apic_mem_read,
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
index 8c7c98249c20..b205cdbdbe6a 100644
--- a/arch/x86/kernel/apic/apic_noop.c
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -89,16 +89,6 @@ static const struct cpumask *noop_target_cpus(void)
 	return cpumask_of(0);
 }
 
-static unsigned long noop_check_apicid_used(physid_mask_t *map, int apicid)
-{
-	return physid_isset(apicid, *map);
-}
-
-static unsigned long noop_check_apicid_present(int bit)
-{
-	return physid_isset(bit, phys_cpu_present_map);
-}
-
 static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask,
 					  const struct cpumask *mask)
 {
@@ -133,27 +123,21 @@ struct apic apic_noop = {
 	.target_cpus			= noop_target_cpus,
 	.disable_esr			= 0,
 	.dest_logical			= APIC_DEST_LOGICAL,
-	.check_apicid_used		= noop_check_apicid_used,
-	.check_apicid_present		= noop_check_apicid_present,
+	.check_apicid_used		= default_check_apicid_used,
 
 	.vector_allocation_domain	= noop_vector_allocation_domain,
 	.init_apic_ldr			= noop_init_apic_ldr,
 
 	.ioapic_phys_id_map		= default_ioapic_phys_id_map,
 	.setup_apic_routing		= NULL,
-	.multi_timer_check		= NULL,
 
 	.cpu_present_to_apicid		= default_cpu_present_to_apicid,
 	.apicid_to_cpu_present		= physid_set_mask_of_physid,
 
-	.setup_portio_remap		= NULL,
 	.check_phys_apicid_present	= default_check_phys_apicid_present,
-	.enable_apic_mode		= NULL,
 
 	.phys_pkg_id			= noop_phys_pkg_id,
 
-	.mps_oem_check			= NULL,
-
 	.get_apic_id			= noop_get_apic_id,
 	.set_apic_id			= NULL,
 	.apic_id_mask			= 0x0F << 24,
@@ -168,12 +152,7 @@ struct apic apic_noop = {
 
 	.wakeup_secondary_cpu		= noop_wakeup_secondary_cpu,
 
-	/* should be safe */
-	.trampoline_phys_low		= DEFAULT_TRAMPOLINE_PHYS_LOW,
-	.trampoline_phys_high		= DEFAULT_TRAMPOLINE_PHYS_HIGH,
-
 	.wait_for_init_deassert		= false,
-	.smp_callin_clear_local_apic	= NULL,
 	.inquire_remote_apic		= NULL,
 
 	.read				= noop_apic_read,
diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
index a5b45df8bc88..ae915391ebec 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -217,21 +217,16 @@ static const struct apic apic_numachip __refconst = {
 	.disable_esr			= 0,
 	.dest_logical			= 0,
 	.check_apicid_used		= NULL,
-	.check_apicid_present		= NULL,
 
 	.vector_allocation_domain	= default_vector_allocation_domain,
 	.init_apic_ldr			= flat_init_apic_ldr,
 
 	.ioapic_phys_id_map		= NULL,
 	.setup_apic_routing		= NULL,
-	.multi_timer_check		= NULL,
 	.cpu_present_to_apicid		= default_cpu_present_to_apicid,
 	.apicid_to_cpu_present		= NULL,
-	.setup_portio_remap		= NULL,
 	.check_phys_apicid_present	= default_check_phys_apicid_present,
-	.enable_apic_mode		= NULL,
 	.phys_pkg_id			= numachip_phys_pkg_id,
-	.mps_oem_check			= NULL,
 
 	.get_apic_id			= get_apic_id,
 	.set_apic_id			= set_apic_id,
@@ -246,10 +241,7 @@ static const struct apic apic_numachip __refconst = {
 	.send_IPI_self			= numachip_send_IPI_self,
 
 	.wakeup_secondary_cpu		= numachip_wakeup_secondary,
-	.trampoline_phys_low		= DEFAULT_TRAMPOLINE_PHYS_LOW,
-	.trampoline_phys_high		= DEFAULT_TRAMPOLINE_PHYS_HIGH,
 	.wait_for_init_deassert		= false,
-	.smp_callin_clear_local_apic	= NULL,
 	.inquire_remote_apic		= NULL, /* REMRD not supported */
 
 	.read				= native_apic_mem_read,
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index e4840aa7a255..c4a8d63f8220 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -31,11 +31,6 @@ static unsigned long bigsmp_check_apicid_used(physid_mask_t *map, int apicid)
 	return 0;
 }
 
-static unsigned long bigsmp_check_apicid_present(int bit)
-{
-	return 1;
-}
-
 static int bigsmp_early_logical_apicid(int cpu)
 {
 	/* on bigsmp, logical apicid is the same as physical */
@@ -168,21 +163,16 @@ static struct apic apic_bigsmp = {
 	.disable_esr			= 1,
 	.dest_logical			= 0,
 	.check_apicid_used		= bigsmp_check_apicid_used,
-	.check_apicid_present		= bigsmp_check_apicid_present,
 
 	.vector_allocation_domain	= default_vector_allocation_domain,
 	.init_apic_ldr			= bigsmp_init_apic_ldr,
 
 	.ioapic_phys_id_map		= bigsmp_ioapic_phys_id_map,
 	.setup_apic_routing		= bigsmp_setup_apic_routing,
-	.multi_timer_check		= NULL,
 	.cpu_present_to_apicid		= bigsmp_cpu_present_to_apicid,
 	.apicid_to_cpu_present		= physid_set_mask_of_physid,
-	.setup_portio_remap		= NULL,
 	.check_phys_apicid_present	= bigsmp_check_phys_apicid_present,
-	.enable_apic_mode		= NULL,
 	.phys_pkg_id			= bigsmp_phys_pkg_id,
-	.mps_oem_check			= NULL,
 
 	.get_apic_id			= bigsmp_get_apic_id,
 	.set_apic_id			= NULL,
@@ -196,11 +186,7 @@ static struct apic apic_bigsmp = {
 	.send_IPI_all			= bigsmp_send_IPI_all,
 	.send_IPI_self			= default_send_IPI_self,
 
-	.trampoline_phys_low		= DEFAULT_TRAMPOLINE_PHYS_LOW,
-	.trampoline_phys_high		= DEFAULT_TRAMPOLINE_PHYS_HIGH,
-
 	.wait_for_init_deassert		= true,
-	.smp_callin_clear_local_apic	= NULL,
 	.inquire_remote_apic		= default_inquire_remote_apic,
 
 	.read				= native_apic_mem_read,
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 81e08eff05ee..29290f554e79 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -31,6 +31,7 @@
 #include <linux/acpi.h>
 #include <linux/module.h>
 #include <linux/syscore_ops.h>
+#include <linux/irqdomain.h>
 #include <linux/msi.h>
 #include <linux/htirq.h>
 #include <linux/freezer.h>
@@ -62,6 +63,16 @@
 
 #define __apicdebuginit(type) static type __init
 
+#define	for_each_ioapic(idx)		\
+	for ((idx) = 0; (idx) < nr_ioapics; (idx)++)
+#define	for_each_ioapic_reverse(idx)	\
+	for ((idx) = nr_ioapics - 1; (idx) >= 0; (idx)--)
+#define	for_each_pin(idx, pin)		\
+	for ((pin) = 0; (pin) < ioapics[(idx)].nr_registers; (pin)++)
+#define	for_each_ioapic_pin(idx, pin)	\
+	for_each_ioapic((idx))		\
+		for_each_pin((idx), (pin))
+
 #define for_each_irq_pin(entry, head) \
 	for (entry = head; entry; entry = entry->next)
 
@@ -73,6 +84,17 @@ int sis_apic_bug = -1;
 
 static DEFINE_RAW_SPINLOCK(ioapic_lock);
 static DEFINE_RAW_SPINLOCK(vector_lock);
+static DEFINE_MUTEX(ioapic_mutex);
+static unsigned int ioapic_dynirq_base;
+static int ioapic_initialized;
+
+struct mp_pin_info {
+	int trigger;
+	int polarity;
+	int node;
+	int set;
+	u32 count;
+};
 
 static struct ioapic {
 	/*
@@ -87,7 +109,9 @@ static struct ioapic {
 	struct mpc_ioapic mp_config;
 	/* IO APIC gsi routing info */
 	struct mp_ioapic_gsi  gsi_config;
-	DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);
+	struct ioapic_domain_cfg irqdomain_cfg;
+	struct irq_domain *irqdomain;
+	struct mp_pin_info *pin_info;
 } ioapics[MAX_IO_APICS];
 
 #define mpc_ioapic_ver(ioapic_idx)	ioapics[ioapic_idx].mp_config.apicver
@@ -107,6 +131,41 @@ struct mp_ioapic_gsi *mp_ioapic_gsi_routing(int ioapic_idx)
 	return &ioapics[ioapic_idx].gsi_config;
 }
 
+static inline int mp_ioapic_pin_count(int ioapic)
+{
+	struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(ioapic);
+
+	return gsi_cfg->gsi_end - gsi_cfg->gsi_base + 1;
+}
+
+u32 mp_pin_to_gsi(int ioapic, int pin)
+{
+	return mp_ioapic_gsi_routing(ioapic)->gsi_base + pin;
+}
+
+/*
+ * Initialize all legacy IRQs and all pins on the first IOAPIC
+ * if we have legacy interrupt controller. Kernel boot option "pirq="
+ * may rely on non-legacy pins on the first IOAPIC.
+ */
+static inline int mp_init_irq_at_boot(int ioapic, int irq)
+{
+	if (!nr_legacy_irqs())
+		return 0;
+
+	return ioapic == 0 || (irq >= 0 && irq < nr_legacy_irqs());
+}
+
+static inline struct mp_pin_info *mp_pin_info(int ioapic_idx, int pin)
+{
+	return ioapics[ioapic_idx].pin_info + pin;
+}
+
+static inline struct irq_domain *mp_ioapic_irqdomain(int ioapic)
+{
+	return ioapics[ioapic].irqdomain;
+}
+
 int nr_ioapics;
 
 /* The one past the highest gsi number used */
@@ -118,9 +177,6 @@ struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES];
 /* # of MP IRQ source entries */
 int mp_irq_entries;
 
-/* GSI interrupts */
-static int nr_irqs_gsi = NR_IRQS_LEGACY;
-
 #ifdef CONFIG_EISA
 int mp_bus_id_to_type[MAX_MP_BUSSES];
 #endif
@@ -149,8 +205,7 @@ static int __init parse_noapic(char *str)
 }
 early_param("noapic", parse_noapic);
 
-static int io_apic_setup_irq_pin(unsigned int irq, int node,
-				 struct io_apic_irq_attr *attr);
+static struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node);
 
 /* Will be called in mpparse/acpi/sfi codes for saving IRQ info */
 void mp_save_irq(struct mpc_intsrc *m)
@@ -182,19 +237,15 @@ static struct irq_pin_list *alloc_irq_pin_list(int node)
 	return kzalloc_node(sizeof(struct irq_pin_list), GFP_KERNEL, node);
 }
 
-
-/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
-static struct irq_cfg irq_cfgx[NR_IRQS_LEGACY];
-
 int __init arch_early_irq_init(void)
 {
 	struct irq_cfg *cfg;
-	int count, node, i;
+	int i, node = cpu_to_node(0);
 
-	if (!legacy_pic->nr_legacy_irqs)
+	if (!nr_legacy_irqs())
 		io_apic_irqs = ~0UL;
 
-	for (i = 0; i < nr_ioapics; i++) {
+	for_each_ioapic(i) {
 		ioapics[i].saved_registers =
 			kzalloc(sizeof(struct IO_APIC_route_entry) *
 				ioapics[i].nr_registers, GFP_KERNEL);
@@ -202,28 +253,20 @@ int __init arch_early_irq_init(void)
 			pr_err("IOAPIC %d: suspend/resume impossible!\n", i);
 	}
 
-	cfg = irq_cfgx;
-	count = ARRAY_SIZE(irq_cfgx);
-	node = cpu_to_node(0);
-
-	for (i = 0; i < count; i++) {
-		irq_set_chip_data(i, &cfg[i]);
-		zalloc_cpumask_var_node(&cfg[i].domain, GFP_KERNEL, node);
-		zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_KERNEL, node);
-		/*
-		 * For legacy IRQ's, start with assigning irq0 to irq15 to
-		 * IRQ0_VECTOR to IRQ15_VECTOR for all cpu's.
-		 */
-		if (i < legacy_pic->nr_legacy_irqs) {
-			cfg[i].vector = IRQ0_VECTOR + i;
-			cpumask_setall(cfg[i].domain);
-		}
+	/*
+	 * For legacy IRQ's, start with assigning irq0 to irq15 to
+	 * IRQ0_VECTOR to IRQ15_VECTOR for all cpu's.
+	 */
+	for (i = 0; i < nr_legacy_irqs(); i++) {
+		cfg = alloc_irq_and_cfg_at(i, node);
+		cfg->vector = IRQ0_VECTOR + i;
+		cpumask_setall(cfg->domain);
 	}
 
 	return 0;
 }
 
-static struct irq_cfg *irq_cfg(unsigned int irq)
+static inline struct irq_cfg *irq_cfg(unsigned int irq)
 {
 	return irq_get_chip_data(irq);
 }
@@ -265,7 +308,7 @@ static struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node)
 	if (res < 0) {
 		if (res != -EEXIST)
 			return NULL;
-		cfg = irq_get_chip_data(at);
+		cfg = irq_cfg(at);
 		if (cfg)
 			return cfg;
 	}
@@ -425,6 +468,21 @@ static int __add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pi
 	return 0;
 }
 
+static void __remove_pin_from_irq(struct irq_cfg *cfg, int apic, int pin)
+{
+	struct irq_pin_list **last, *entry;
+
+	last = &cfg->irq_2_pin;
+	for_each_irq_pin(entry, cfg->irq_2_pin)
+		if (entry->apic == apic && entry->pin == pin) {
+			*last = entry->next;
+			kfree(entry);
+			return;
+		} else {
+			last = &entry->next;
+		}
+}
+
 static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin)
 {
 	if (__add_pin_to_irq_node(cfg, node, apic, pin))
@@ -627,9 +685,8 @@ static void clear_IO_APIC (void)
 {
 	int apic, pin;
 
-	for (apic = 0; apic < nr_ioapics; apic++)
-		for (pin = 0; pin < ioapics[apic].nr_registers; pin++)
-			clear_IO_APIC_pin(apic, pin);
+	for_each_ioapic_pin(apic, pin)
+		clear_IO_APIC_pin(apic, pin);
 }
 
 #ifdef CONFIG_X86_32
@@ -678,13 +735,13 @@ int save_ioapic_entries(void)
 	int apic, pin;
 	int err = 0;
 
-	for (apic = 0; apic < nr_ioapics; apic++) {
+	for_each_ioapic(apic) {
 		if (!ioapics[apic].saved_registers) {
 			err = -ENOMEM;
 			continue;
 		}
 
-		for (pin = 0; pin < ioapics[apic].nr_registers; pin++)
+		for_each_pin(apic, pin)
 			ioapics[apic].saved_registers[pin] =
 				ioapic_read_entry(apic, pin);
 	}
@@ -699,11 +756,11 @@ void mask_ioapic_entries(void)
 {
 	int apic, pin;
 
-	for (apic = 0; apic < nr_ioapics; apic++) {
+	for_each_ioapic(apic) {
 		if (!ioapics[apic].saved_registers)
 			continue;
 
-		for (pin = 0; pin < ioapics[apic].nr_registers; pin++) {
+		for_each_pin(apic, pin) {
 			struct IO_APIC_route_entry entry;
 
 			entry = ioapics[apic].saved_registers[pin];
@@ -722,11 +779,11 @@ int restore_ioapic_entries(void)
 {
 	int apic, pin;
 
-	for (apic = 0; apic < nr_ioapics; apic++) {
+	for_each_ioapic(apic) {
 		if (!ioapics[apic].saved_registers)
 			continue;
 
-		for (pin = 0; pin < ioapics[apic].nr_registers; pin++)
+		for_each_pin(apic, pin)
 			ioapic_write_entry(apic, pin,
 					   ioapics[apic].saved_registers[pin]);
 	}
@@ -785,7 +842,7 @@ static int __init find_isa_irq_apic(int irq, int type)
 	if (i < mp_irq_entries) {
 		int ioapic_idx;
 
-		for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++)
+		for_each_ioapic(ioapic_idx)
 			if (mpc_ioapic_id(ioapic_idx) == mp_irqs[i].dstapic)
 				return ioapic_idx;
 	}
@@ -799,7 +856,7 @@ static int __init find_isa_irq_apic(int irq, int type)
  */
 static int EISA_ELCR(unsigned int irq)
 {
-	if (irq < legacy_pic->nr_legacy_irqs) {
+	if (irq < nr_legacy_irqs()) {
 		unsigned int port = 0x4d0 + (irq >> 3);
 		return (inb(port) >> (irq & 7)) & 1;
 	}
@@ -939,29 +996,101 @@ static int irq_trigger(int idx)
 	return trigger;
 }
 
-static int pin_2_irq(int idx, int apic, int pin)
+static int alloc_irq_from_domain(struct irq_domain *domain, u32 gsi, int pin)
+{
+	int irq = -1;
+	int ioapic = (int)(long)domain->host_data;
+	int type = ioapics[ioapic].irqdomain_cfg.type;
+
+	switch (type) {
+	case IOAPIC_DOMAIN_LEGACY:
+		/*
+		 * Dynamically allocate IRQ number for non-ISA IRQs in the first 16
+		 * GSIs on some weird platforms.
+		 */
+		if (gsi < nr_legacy_irqs())
+			irq = irq_create_mapping(domain, pin);
+		else if (irq_create_strict_mappings(domain, gsi, pin, 1) == 0)
+			irq = gsi;
+		break;
+	case IOAPIC_DOMAIN_STRICT:
+		if (irq_create_strict_mappings(domain, gsi, pin, 1) == 0)
+			irq = gsi;
+		break;
+	case IOAPIC_DOMAIN_DYNAMIC:
+		irq = irq_create_mapping(domain, pin);
+		break;
+	default:
+		WARN(1, "ioapic: unknown irqdomain type %d\n", type);
+		break;
+	}
+
+	return irq > 0 ? irq : -1;
+}
+
+static int mp_map_pin_to_irq(u32 gsi, int idx, int ioapic, int pin,
+			     unsigned int flags)
 {
 	int irq;
-	int bus = mp_irqs[idx].srcbus;
-	struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(apic);
+	struct irq_domain *domain = mp_ioapic_irqdomain(ioapic);
+	struct mp_pin_info *info = mp_pin_info(ioapic, pin);
+
+	if (!domain)
+		return -1;
+
+	mutex_lock(&ioapic_mutex);
 
 	/*
-	 * Debugging check, we are in big trouble if this message pops up!
+	 * Don't use irqdomain to manage ISA IRQs because there may be
+	 * multiple IOAPIC pins sharing the same ISA IRQ number and
+	 * irqdomain only supports 1:1 mapping between IOAPIC pin and
+	 * IRQ number. A typical IOAPIC has 24 pins, pin 0-15 are used
+	 * for legacy IRQs and pin 16-23 are used for PCI IRQs (PIRQ A-H).
+	 * When ACPI is disabled, only legacy IRQ numbers (IRQ0-15) are
+	 * available, and some BIOSes may use MP Interrupt Source records
+	 * to override IRQ numbers for PIRQs instead of reprogramming
+	 * the interrupt routing logic. Thus there may be multiple pins
+	 * sharing the same legacy IRQ number when ACPI is disabled.
 	 */
-	if (mp_irqs[idx].dstirq != pin)
-		pr_err("broken BIOS or MPTABLE parser, ayiee!!\n");
-
-	if (test_bit(bus, mp_bus_not_pci)) {
+	if (idx >= 0 && test_bit(mp_irqs[idx].srcbus, mp_bus_not_pci)) {
 		irq = mp_irqs[idx].srcbusirq;
+		if (flags & IOAPIC_MAP_ALLOC) {
+			if (info->count == 0 &&
+			    mp_irqdomain_map(domain, irq, pin) != 0)
+				irq = -1;
+
+			/* special handling for timer IRQ0 */
+			if (irq == 0)
+				info->count++;
+		}
 	} else {
-		u32 gsi = gsi_cfg->gsi_base + pin;
+		irq = irq_find_mapping(domain, pin);
+		if (irq <= 0 && (flags & IOAPIC_MAP_ALLOC))
+			irq = alloc_irq_from_domain(domain, gsi, pin);
+	}
 
-		if (gsi >= NR_IRQS_LEGACY)
-			irq = gsi;
-		else
-			irq = gsi_top + gsi;
+	if (flags & IOAPIC_MAP_ALLOC) {
+		if (irq > 0)
+			info->count++;
+		else if (info->count == 0)
+			info->set = 0;
 	}
 
+	mutex_unlock(&ioapic_mutex);
+
+	return irq > 0 ? irq : -1;
+}
+
+static int pin_2_irq(int idx, int ioapic, int pin, unsigned int flags)
+{
+	u32 gsi = mp_pin_to_gsi(ioapic, pin);
+
+	/*
+	 * Debugging check, we are in big trouble if this message pops up!
+	 */
+	if (mp_irqs[idx].dstirq != pin)
+		pr_err("broken BIOS or MPTABLE parser, ayiee!!\n");
+
 #ifdef CONFIG_X86_32
 	/*
 	 * PCI IRQ command line redirection. Yes, limits are hardcoded.
@@ -972,16 +1101,58 @@ static int pin_2_irq(int idx, int apic, int pin)
 				apic_printk(APIC_VERBOSE, KERN_DEBUG
 						"disabling PIRQ%d\n", pin-16);
 			} else {
-				irq = pirq_entries[pin-16];
+				int irq = pirq_entries[pin-16];
 				apic_printk(APIC_VERBOSE, KERN_DEBUG
 						"using PIRQ%d -> IRQ %d\n",
 						pin-16, irq);
+				return irq;
 			}
 		}
 	}
 #endif
 
-	return irq;
+	return  mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags);
+}
+
+int mp_map_gsi_to_irq(u32 gsi, unsigned int flags)
+{
+	int ioapic, pin, idx;
+
+	ioapic = mp_find_ioapic(gsi);
+	if (ioapic < 0)
+		return -1;
+
+	pin = mp_find_ioapic_pin(ioapic, gsi);
+	idx = find_irq_entry(ioapic, pin, mp_INT);
+	if ((flags & IOAPIC_MAP_CHECK) && idx < 0)
+		return -1;
+
+	return mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags);
+}
+
+void mp_unmap_irq(int irq)
+{
+	struct irq_data *data = irq_get_irq_data(irq);
+	struct mp_pin_info *info;
+	int ioapic, pin;
+
+	if (!data || !data->domain)
+		return;
+
+	ioapic = (int)(long)data->domain->host_data;
+	pin = (int)data->hwirq;
+	info = mp_pin_info(ioapic, pin);
+
+	mutex_lock(&ioapic_mutex);
+	if (--info->count == 0) {
+		info->set = 0;
+		if (irq < nr_legacy_irqs() &&
+		    ioapics[ioapic].irqdomain_cfg.type == IOAPIC_DOMAIN_LEGACY)
+			mp_irqdomain_unmap(data->domain, irq);
+		else
+			irq_dispose_mapping(irq);
+	}
+	mutex_unlock(&ioapic_mutex);
 }
 
 /*
@@ -991,7 +1162,7 @@ static int pin_2_irq(int idx, int apic, int pin)
 int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin,
 				struct io_apic_irq_attr *irq_attr)
 {
-	int ioapic_idx, i, best_guess = -1;
+	int irq, i, best_ioapic = -1, best_idx = -1;
 
 	apic_printk(APIC_DEBUG,
 		    "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
@@ -1001,44 +1172,56 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin,
 			    "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
 		return -1;
 	}
+
 	for (i = 0; i < mp_irq_entries; i++) {
 		int lbus = mp_irqs[i].srcbus;
+		int ioapic_idx, found = 0;
 
-		for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++)
+		if (bus != lbus || mp_irqs[i].irqtype != mp_INT ||
+		    slot != ((mp_irqs[i].srcbusirq >> 2) & 0x1f))
+			continue;
+
+		for_each_ioapic(ioapic_idx)
 			if (mpc_ioapic_id(ioapic_idx) == mp_irqs[i].dstapic ||
-			    mp_irqs[i].dstapic == MP_APIC_ALL)
+			    mp_irqs[i].dstapic == MP_APIC_ALL) {
+				found = 1;
 				break;
+			}
+		if (!found)
+			continue;
 
-		if (!test_bit(lbus, mp_bus_not_pci) &&
-		    !mp_irqs[i].irqtype &&
-		    (bus == lbus) &&
-		    (slot == ((mp_irqs[i].srcbusirq >> 2) & 0x1f))) {
-			int irq = pin_2_irq(i, ioapic_idx, mp_irqs[i].dstirq);
+		/* Skip ISA IRQs */
+		irq = pin_2_irq(i, ioapic_idx, mp_irqs[i].dstirq, 0);
+		if (irq > 0 && !IO_APIC_IRQ(irq))
+			continue;
 
-			if (!(ioapic_idx || IO_APIC_IRQ(irq)))
-				continue;
+		if (pin == (mp_irqs[i].srcbusirq & 3)) {
+			best_idx = i;
+			best_ioapic = ioapic_idx;
+			goto out;
+		}
 
-			if (pin == (mp_irqs[i].srcbusirq & 3)) {
-				set_io_apic_irq_attr(irq_attr, ioapic_idx,
-						     mp_irqs[i].dstirq,
-						     irq_trigger(i),
-						     irq_polarity(i));
-				return irq;
-			}
-			/*
-			 * Use the first all-but-pin matching entry as a
-			 * best-guess fuzzy result for broken mptables.
-			 */
-			if (best_guess < 0) {
-				set_io_apic_irq_attr(irq_attr, ioapic_idx,
-						     mp_irqs[i].dstirq,
-						     irq_trigger(i),
-						     irq_polarity(i));
-				best_guess = irq;
-			}
+		/*
+		 * Use the first all-but-pin matching entry as a
+		 * best-guess fuzzy result for broken mptables.
+		 */
+		if (best_idx < 0) {
+			best_idx = i;
+			best_ioapic = ioapic_idx;
 		}
 	}
-	return best_guess;
+	if (best_idx < 0)
+		return -1;
+
+out:
+	irq = pin_2_irq(best_idx, best_ioapic, mp_irqs[best_idx].dstirq,
+			IOAPIC_MAP_ALLOC);
+	if (irq > 0)
+		set_io_apic_irq_attr(irq_attr, best_ioapic,
+				     mp_irqs[best_idx].dstirq,
+				     irq_trigger(best_idx),
+				     irq_polarity(best_idx));
+	return irq;
 }
 EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
 
@@ -1198,7 +1381,7 @@ void __setup_vector_irq(int cpu)
 	raw_spin_lock(&vector_lock);
 	/* Mark the inuse vectors */
 	for_each_active_irq(irq) {
-		cfg = irq_get_chip_data(irq);
+		cfg = irq_cfg(irq);
 		if (!cfg)
 			continue;
 
@@ -1227,12 +1410,10 @@ static inline int IO_APIC_irq_trigger(int irq)
 {
 	int apic, idx, pin;
 
-	for (apic = 0; apic < nr_ioapics; apic++) {
-		for (pin = 0; pin < ioapics[apic].nr_registers; pin++) {
-			idx = find_irq_entry(apic, pin, mp_INT);
-			if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin)))
-				return irq_trigger(idx);
-		}
+	for_each_ioapic_pin(apic, pin) {
+		idx = find_irq_entry(apic, pin, mp_INT);
+		if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin, 0)))
+			return irq_trigger(idx);
 	}
 	/*
          * nonexistent IRQs are edge default
@@ -1330,95 +1511,29 @@ static void setup_ioapic_irq(unsigned int irq, struct irq_cfg *cfg,
 	}
 
 	ioapic_register_intr(irq, cfg, attr->trigger);
-	if (irq < legacy_pic->nr_legacy_irqs)
+	if (irq < nr_legacy_irqs())
 		legacy_pic->mask(irq);
 
 	ioapic_write_entry(attr->ioapic, attr->ioapic_pin, entry);
 }
 
-static bool __init io_apic_pin_not_connected(int idx, int ioapic_idx, int pin)
-{
-	if (idx != -1)
-		return false;
-
-	apic_printk(APIC_VERBOSE, KERN_DEBUG " apic %d pin %d not connected\n",
-		    mpc_ioapic_id(ioapic_idx), pin);
-	return true;
-}
-
-static void __init __io_apic_setup_irqs(unsigned int ioapic_idx)
-{
-	int idx, node = cpu_to_node(0);
-	struct io_apic_irq_attr attr;
-	unsigned int pin, irq;
-
-	for (pin = 0; pin < ioapics[ioapic_idx].nr_registers; pin++) {
-		idx = find_irq_entry(ioapic_idx, pin, mp_INT);
-		if (io_apic_pin_not_connected(idx, ioapic_idx, pin))
-			continue;
-
-		irq = pin_2_irq(idx, ioapic_idx, pin);
-
-		if ((ioapic_idx > 0) && (irq > 16))
-			continue;
-
-		/*
-		 * Skip the timer IRQ if there's a quirk handler
-		 * installed and if it returns 1:
-		 */
-		if (apic->multi_timer_check &&
-		    apic->multi_timer_check(ioapic_idx, irq))
-			continue;
-
-		set_io_apic_irq_attr(&attr, ioapic_idx, pin, irq_trigger(idx),
-				     irq_polarity(idx));
-
-		io_apic_setup_irq_pin(irq, node, &attr);
-	}
-}
-
 static void __init setup_IO_APIC_irqs(void)
 {
-	unsigned int ioapic_idx;
+	unsigned int ioapic, pin;
+	int idx;
 
 	apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
 
-	for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++)
-		__io_apic_setup_irqs(ioapic_idx);
-}
-
-/*
- * for the gsit that is not in first ioapic
- * but could not use acpi_register_gsi()
- * like some special sci in IBM x3330
- */
-void setup_IO_APIC_irq_extra(u32 gsi)
-{
-	int ioapic_idx = 0, pin, idx, irq, node = cpu_to_node(0);
-	struct io_apic_irq_attr attr;
-
-	/*
-	 * Convert 'gsi' to 'ioapic.pin'.
-	 */
-	ioapic_idx = mp_find_ioapic(gsi);
-	if (ioapic_idx < 0)
-		return;
-
-	pin = mp_find_ioapic_pin(ioapic_idx, gsi);
-	idx = find_irq_entry(ioapic_idx, pin, mp_INT);
-	if (idx == -1)
-		return;
-
-	irq = pin_2_irq(idx, ioapic_idx, pin);
-
-	/* Only handle the non legacy irqs on secondary ioapics */
-	if (ioapic_idx == 0 || irq < NR_IRQS_LEGACY)
-		return;
-
-	set_io_apic_irq_attr(&attr, ioapic_idx, pin, irq_trigger(idx),
-			     irq_polarity(idx));
-
-	io_apic_setup_irq_pin_once(irq, node, &attr);
+	for_each_ioapic_pin(ioapic, pin) {
+		idx = find_irq_entry(ioapic, pin, mp_INT);
+		if (idx < 0)
+			apic_printk(APIC_VERBOSE,
+				    KERN_DEBUG " apic %d pin %d not connected\n",
+				    mpc_ioapic_id(ioapic), pin);
+		else
+			pin_2_irq(idx, ioapic, pin,
+				  ioapic ? 0 : IOAPIC_MAP_ALLOC);
+	}
 }
 
 /*
@@ -1586,7 +1701,7 @@ __apicdebuginit(void) print_IO_APICs(void)
 	struct irq_chip *chip;
 
 	printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
-	for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++)
+	for_each_ioapic(ioapic_idx)
 		printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
 		       mpc_ioapic_id(ioapic_idx),
 		       ioapics[ioapic_idx].nr_registers);
@@ -1597,7 +1712,7 @@ __apicdebuginit(void) print_IO_APICs(void)
 	 */
 	printk(KERN_INFO "testing the IO APIC.......................\n");
 
-	for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++)
+	for_each_ioapic(ioapic_idx)
 		print_IO_APIC(ioapic_idx);
 
 	printk(KERN_DEBUG "IRQ to pin mappings:\n");
@@ -1608,7 +1723,7 @@ __apicdebuginit(void) print_IO_APICs(void)
 		if (chip != &ioapic_chip)
 			continue;
 
-		cfg = irq_get_chip_data(irq);
+		cfg = irq_cfg(irq);
 		if (!cfg)
 			continue;
 		entry = cfg->irq_2_pin;
@@ -1758,7 +1873,7 @@ __apicdebuginit(void) print_PIC(void)
 	unsigned int v;
 	unsigned long flags;
 
-	if (!legacy_pic->nr_legacy_irqs)
+	if (!nr_legacy_irqs())
 		return;
 
 	printk(KERN_DEBUG "\nprinting PIC contents\n");
@@ -1828,26 +1943,22 @@ static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
 void __init enable_IO_APIC(void)
 {
 	int i8259_apic, i8259_pin;
-	int apic;
+	int apic, pin;
 
-	if (!legacy_pic->nr_legacy_irqs)
+	if (!nr_legacy_irqs())
 		return;
 
-	for(apic = 0; apic < nr_ioapics; apic++) {
-		int pin;
+	for_each_ioapic_pin(apic, pin) {
 		/* See if any of the pins is in ExtINT mode */
-		for (pin = 0; pin < ioapics[apic].nr_registers; pin++) {
-			struct IO_APIC_route_entry entry;
-			entry = ioapic_read_entry(apic, pin);
+		struct IO_APIC_route_entry entry = ioapic_read_entry(apic, pin);
 
-			/* If the interrupt line is enabled and in ExtInt mode
-			 * I have found the pin where the i8259 is connected.
-			 */
-			if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) {
-				ioapic_i8259.apic = apic;
-				ioapic_i8259.pin  = pin;
-				goto found_i8259;
-			}
+		/* If the interrupt line is enabled and in ExtInt mode
+		 * I have found the pin where the i8259 is connected.
+		 */
+		if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) {
+			ioapic_i8259.apic = apic;
+			ioapic_i8259.pin  = pin;
+			goto found_i8259;
 		}
 	}
  found_i8259:
@@ -1919,7 +2030,7 @@ void disable_IO_APIC(void)
 	 */
 	clear_IO_APIC();
 
-	if (!legacy_pic->nr_legacy_irqs)
+	if (!nr_legacy_irqs())
 		return;
 
 	x86_io_apic_ops.disable();
@@ -1950,7 +2061,7 @@ void __init setup_ioapic_ids_from_mpc_nocheck(void)
 	/*
 	 * Set the IOAPIC ID to the value stored in the MPC table.
 	 */
-	for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++) {
+	for_each_ioapic(ioapic_idx) {
 		/* Read the register 0 value */
 		raw_spin_lock_irqsave(&ioapic_lock, flags);
 		reg_00.raw = io_apic_read(ioapic_idx, 0);
@@ -2123,7 +2234,7 @@ static unsigned int startup_ioapic_irq(struct irq_data *data)
 	unsigned long flags;
 
 	raw_spin_lock_irqsave(&ioapic_lock, flags);
-	if (irq < legacy_pic->nr_legacy_irqs) {
+	if (irq < nr_legacy_irqs()) {
 		legacy_pic->mask(irq);
 		if (legacy_pic->irq_pending(irq))
 			was_pending = 1;
@@ -2225,7 +2336,7 @@ asmlinkage __visible void smp_irq_move_cleanup_interrupt(void)
 			apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR);
 			goto unlock;
 		}
-		__this_cpu_write(vector_irq[vector], -1);
+		__this_cpu_write(vector_irq[vector], VECTOR_UNDEFINED);
 unlock:
 		raw_spin_unlock(&desc->lock);
 	}
@@ -2253,7 +2364,7 @@ static void irq_complete_move(struct irq_cfg *cfg)
 
 void irq_force_complete_move(int irq)
 {
-	struct irq_cfg *cfg = irq_get_chip_data(irq);
+	struct irq_cfg *cfg = irq_cfg(irq);
 
 	if (!cfg)
 		return;
@@ -2514,26 +2625,15 @@ static inline void init_IO_APIC_traps(void)
 	struct irq_cfg *cfg;
 	unsigned int irq;
 
-	/*
-	 * NOTE! The local APIC isn't very good at handling
-	 * multiple interrupts at the same interrupt level.
-	 * As the interrupt level is determined by taking the
-	 * vector number and shifting that right by 4, we
-	 * want to spread these out a bit so that they don't
-	 * all fall in the same interrupt level.
-	 *
-	 * Also, we've got to be careful not to trash gate
-	 * 0x80, because int 0x80 is hm, kind of importantish. ;)
-	 */
 	for_each_active_irq(irq) {
-		cfg = irq_get_chip_data(irq);
+		cfg = irq_cfg(irq);
 		if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) {
 			/*
 			 * Hmm.. We don't have an entry for this,
 			 * so default to an old-fashioned 8259
 			 * interrupt if we can..
 			 */
-			if (irq < legacy_pic->nr_legacy_irqs)
+			if (irq < nr_legacy_irqs())
 				legacy_pic->make_irq(irq);
 			else
 				/* Strange. Oh, well.. */
@@ -2649,8 +2749,6 @@ static int __init disable_timer_pin_setup(char *arg)
 }
 early_param("disable_timer_pin_1", disable_timer_pin_setup);
 
-int timer_through_8259 __initdata;
-
 /*
  * This code may look a bit paranoid, but it's supposed to cooperate with
  * a wide range of boards and BIOS bugs.  Fortunately only the timer IRQ
@@ -2661,7 +2759,7 @@ int timer_through_8259 __initdata;
  */
 static inline void __init check_timer(void)
 {
-	struct irq_cfg *cfg = irq_get_chip_data(0);
+	struct irq_cfg *cfg = irq_cfg(0);
 	int node = cpu_to_node(0);
 	int apic1, pin1, apic2, pin2;
 	unsigned long flags;
@@ -2755,7 +2853,6 @@ static inline void __init check_timer(void)
 		legacy_pic->unmask(0);
 		if (timer_irq_works()) {
 			apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
-			timer_through_8259 = 1;
 			goto out;
 		}
 		/*
@@ -2827,15 +2924,54 @@ out:
  */
 #define PIC_IRQS	(1UL << PIC_CASCADE_IR)
 
+static int mp_irqdomain_create(int ioapic)
+{
+	size_t size;
+	int hwirqs = mp_ioapic_pin_count(ioapic);
+	struct ioapic *ip = &ioapics[ioapic];
+	struct ioapic_domain_cfg *cfg = &ip->irqdomain_cfg;
+	struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(ioapic);
+
+	size = sizeof(struct mp_pin_info) * mp_ioapic_pin_count(ioapic);
+	ip->pin_info = kzalloc(size, GFP_KERNEL);
+	if (!ip->pin_info)
+		return -ENOMEM;
+
+	if (cfg->type == IOAPIC_DOMAIN_INVALID)
+		return 0;
+
+	ip->irqdomain = irq_domain_add_linear(cfg->dev, hwirqs, cfg->ops,
+					      (void *)(long)ioapic);
+	if(!ip->irqdomain) {
+		kfree(ip->pin_info);
+		ip->pin_info = NULL;
+		return -ENOMEM;
+	}
+
+	if (cfg->type == IOAPIC_DOMAIN_LEGACY ||
+	    cfg->type == IOAPIC_DOMAIN_STRICT)
+		ioapic_dynirq_base = max(ioapic_dynirq_base,
+					 gsi_cfg->gsi_end + 1);
+
+	if (gsi_cfg->gsi_base == 0)
+		irq_set_default_host(ip->irqdomain);
+
+	return 0;
+}
+
 void __init setup_IO_APIC(void)
 {
+	int ioapic;
 
 	/*
 	 * calling enable_IO_APIC() is moved to setup_local_APIC for BP
 	 */
-	io_apic_irqs = legacy_pic->nr_legacy_irqs ? ~PIC_IRQS : ~0UL;
+	io_apic_irqs = nr_legacy_irqs() ? ~PIC_IRQS : ~0UL;
 
 	apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
+	for_each_ioapic(ioapic)
+		BUG_ON(mp_irqdomain_create(ioapic));
+
 	/*
          * Set up IO-APIC IRQ routing.
          */
@@ -2844,8 +2980,10 @@ void __init setup_IO_APIC(void)
 	sync_Arb_IDs();
 	setup_IO_APIC_irqs();
 	init_IO_APIC_traps();
-	if (legacy_pic->nr_legacy_irqs)
+	if (nr_legacy_irqs())
 		check_timer();
+
+	ioapic_initialized = 1;
 }
 
 /*
@@ -2880,7 +3018,7 @@ static void ioapic_resume(void)
 {
 	int ioapic_idx;
 
-	for (ioapic_idx = nr_ioapics - 1; ioapic_idx >= 0; ioapic_idx--)
+	for_each_ioapic_reverse(ioapic_idx)
 		resume_ioapic_id(ioapic_idx);
 
 	restore_ioapic_entries();
@@ -2926,7 +3064,7 @@ int arch_setup_hwirq(unsigned int irq, int node)
 
 void arch_teardown_hwirq(unsigned int irq)
 {
-	struct irq_cfg *cfg = irq_get_chip_data(irq);
+	struct irq_cfg *cfg = irq_cfg(irq);
 	unsigned long flags;
 
 	free_remapped_irq(irq);
@@ -3053,7 +3191,7 @@ int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc,
 	if (!irq_offset)
 		write_msi_msg(irq, &msg);
 
-	setup_remapped_irq(irq, irq_get_chip_data(irq), chip);
+	setup_remapped_irq(irq, irq_cfg(irq), chip);
 
 	irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge");
 
@@ -3192,7 +3330,7 @@ int default_setup_hpet_msi(unsigned int irq, unsigned int id)
 
 	hpet_msi_write(irq_get_handler_data(irq), &msg);
 	irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
-	setup_remapped_irq(irq, irq_get_chip_data(irq), chip);
+	setup_remapped_irq(irq, irq_cfg(irq), chip);
 
 	irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge");
 	return 0;
@@ -3303,27 +3441,6 @@ io_apic_setup_irq_pin(unsigned int irq, int node, struct io_apic_irq_attr *attr)
 	return ret;
 }
 
-int io_apic_setup_irq_pin_once(unsigned int irq, int node,
-			       struct io_apic_irq_attr *attr)
-{
-	unsigned int ioapic_idx = attr->ioapic, pin = attr->ioapic_pin;
-	int ret;
-	struct IO_APIC_route_entry orig_entry;
-
-	/* Avoid redundant programming */
-	if (test_bit(pin, ioapics[ioapic_idx].pin_programmed)) {
-		pr_debug("Pin %d-%d already programmed\n", mpc_ioapic_id(ioapic_idx), pin);
-		orig_entry = ioapic_read_entry(attr->ioapic, pin);
-		if (attr->trigger == orig_entry.trigger && attr->polarity == orig_entry.polarity)
-			return 0;
-		return -EBUSY;
-	}
-	ret = io_apic_setup_irq_pin(irq, node, attr);
-	if (!ret)
-		set_bit(pin, ioapics[ioapic_idx].pin_programmed);
-	return ret;
-}
-
 static int __init io_apic_get_redir_entries(int ioapic)
 {
 	union IO_APIC_reg_01	reg_01;
@@ -3340,20 +3457,13 @@ static int __init io_apic_get_redir_entries(int ioapic)
 	return reg_01.bits.entries + 1;
 }
 
-static void __init probe_nr_irqs_gsi(void)
-{
-	int nr;
-
-	nr = gsi_top + NR_IRQS_LEGACY;
-	if (nr > nr_irqs_gsi)
-		nr_irqs_gsi = nr;
-
-	printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi);
-}
-
 unsigned int arch_dynirq_lower_bound(unsigned int from)
 {
-	return from < nr_irqs_gsi ? nr_irqs_gsi : from;
+	/*
+	 * dmar_alloc_hwirq() may be called before setup_IO_APIC(), so use
+	 * gsi_top if ioapic_dynirq_base hasn't been initialized yet.
+	 */
+	return ioapic_initialized ? ioapic_dynirq_base : gsi_top;
 }
 
 int __init arch_probe_nr_irqs(void)
@@ -3363,33 +3473,17 @@ int __init arch_probe_nr_irqs(void)
 	if (nr_irqs > (NR_VECTORS * nr_cpu_ids))
 		nr_irqs = NR_VECTORS * nr_cpu_ids;
 
-	nr = nr_irqs_gsi + 8 * nr_cpu_ids;
+	nr = (gsi_top + nr_legacy_irqs()) + 8 * nr_cpu_ids;
 #if defined(CONFIG_PCI_MSI) || defined(CONFIG_HT_IRQ)
 	/*
 	 * for MSI and HT dyn irq
 	 */
-	nr += nr_irqs_gsi * 16;
+	nr += gsi_top * 16;
 #endif
 	if (nr < nr_irqs)
 		nr_irqs = nr;
 
-	return NR_IRQS_LEGACY;
-}
-
-int io_apic_set_pci_routing(struct device *dev, int irq,
-			    struct io_apic_irq_attr *irq_attr)
-{
-	int node;
-
-	if (!IO_APIC_IRQ(irq)) {
-		apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
-			    irq_attr->ioapic);
-		return -EINVAL;
-	}
-
-	node = dev ? dev_to_node(dev) : cpu_to_node(0);
-
-	return io_apic_setup_irq_pin_once(irq, node, irq_attr);
+	return 0;
 }
 
 #ifdef CONFIG_X86_32
@@ -3483,9 +3577,8 @@ static u8 __init io_apic_unique_id(u8 id)
 	DECLARE_BITMAP(used, 256);
 
 	bitmap_zero(used, 256);
-	for (i = 0; i < nr_ioapics; i++) {
+	for_each_ioapic(i)
 		__set_bit(mpc_ioapic_id(i), used);
-	}
 	if (!test_bit(id, used))
 		return id;
 	return find_first_zero_bit(used, 256);
@@ -3543,14 +3636,13 @@ void __init setup_ioapic_dest(void)
 	if (skip_ioapic_setup == 1)
 		return;
 
-	for (ioapic = 0; ioapic < nr_ioapics; ioapic++)
-	for (pin = 0; pin < ioapics[ioapic].nr_registers; pin++) {
+	for_each_ioapic_pin(ioapic, pin) {
 		irq_entry = find_irq_entry(ioapic, pin, mp_INT);
 		if (irq_entry == -1)
 			continue;
-		irq = pin_2_irq(irq_entry, ioapic, pin);
 
-		if ((ioapic > 0) && (irq > 16))
+		irq = pin_2_irq(irq_entry, ioapic, pin, 0);
+		if (irq < 0 || !mp_init_irq_at_boot(ioapic, irq))
 			continue;
 
 		idata = irq_get_irq_data(irq);
@@ -3573,29 +3665,33 @@ void __init setup_ioapic_dest(void)
 
 static struct resource *ioapic_resources;
 
-static struct resource * __init ioapic_setup_resources(int nr_ioapics)
+static struct resource * __init ioapic_setup_resources(void)
 {
 	unsigned long n;
 	struct resource *res;
 	char *mem;
-	int i;
+	int i, num = 0;
 
-	if (nr_ioapics <= 0)
+	for_each_ioapic(i)
+		num++;
+	if (num == 0)
 		return NULL;
 
 	n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource);
-	n *= nr_ioapics;
+	n *= num;
 
 	mem = alloc_bootmem(n);
 	res = (void *)mem;
 
-	mem += sizeof(struct resource) * nr_ioapics;
+	mem += sizeof(struct resource) * num;
 
-	for (i = 0; i < nr_ioapics; i++) {
-		res[i].name = mem;
-		res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+	num = 0;
+	for_each_ioapic(i) {
+		res[num].name = mem;
+		res[num].flags = IORESOURCE_MEM | IORESOURCE_BUSY;
 		snprintf(mem, IOAPIC_RESOURCE_NAME_SIZE, "IOAPIC %u", i);
 		mem += IOAPIC_RESOURCE_NAME_SIZE;
+		num++;
 	}
 
 	ioapic_resources = res;
@@ -3609,8 +3705,8 @@ void __init native_io_apic_init_mappings(void)
 	struct resource *ioapic_res;
 	int i;
 
-	ioapic_res = ioapic_setup_resources(nr_ioapics);
-	for (i = 0; i < nr_ioapics; i++) {
+	ioapic_res = ioapic_setup_resources();
+	for_each_ioapic(i) {
 		if (smp_found_config) {
 			ioapic_phys = mpc_ioapic_addr(i);
 #ifdef CONFIG_X86_32
@@ -3641,8 +3737,6 @@ fake_ioapic_page:
 		ioapic_res->end = ioapic_phys + IO_APIC_SLOT_SIZE - 1;
 		ioapic_res++;
 	}
-
-	probe_nr_irqs_gsi();
 }
 
 void __init ioapic_insert_resources(void)
@@ -3657,7 +3751,7 @@ void __init ioapic_insert_resources(void)
 		return;
 	}
 
-	for (i = 0; i < nr_ioapics; i++) {
+	for_each_ioapic(i) {
 		insert_resource(&iomem_resource, r);
 		r++;
 	}
@@ -3665,16 +3759,15 @@ void __init ioapic_insert_resources(void)
 
 int mp_find_ioapic(u32 gsi)
 {
-	int i = 0;
+	int i;
 
 	if (nr_ioapics == 0)
 		return -1;
 
 	/* Find the IOAPIC that manages this GSI. */
-	for (i = 0; i < nr_ioapics; i++) {
+	for_each_ioapic(i) {
 		struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(i);
-		if ((gsi >= gsi_cfg->gsi_base)
-		    && (gsi <= gsi_cfg->gsi_end))
+		if (gsi >= gsi_cfg->gsi_base && gsi <= gsi_cfg->gsi_end)
 			return i;
 	}
 
@@ -3686,7 +3779,7 @@ int mp_find_ioapic_pin(int ioapic, u32 gsi)
 {
 	struct mp_ioapic_gsi *gsi_cfg;
 
-	if (WARN_ON(ioapic == -1))
+	if (WARN_ON(ioapic < 0))
 		return -1;
 
 	gsi_cfg = mp_ioapic_gsi_routing(ioapic);
@@ -3729,7 +3822,8 @@ static __init int bad_ioapic_register(int idx)
 	return 0;
 }
 
-void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
+void __init mp_register_ioapic(int id, u32 address, u32 gsi_base,
+			       struct ioapic_domain_cfg *cfg)
 {
 	int idx = 0;
 	int entries;
@@ -3743,6 +3837,8 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
 	ioapics[idx].mp_config.type = MP_IOAPIC;
 	ioapics[idx].mp_config.flags = MPC_APIC_USABLE;
 	ioapics[idx].mp_config.apicaddr = address;
+	ioapics[idx].irqdomain = NULL;
+	ioapics[idx].irqdomain_cfg = *cfg;
 
 	set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
 
@@ -3779,6 +3875,77 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
 	nr_ioapics++;
 }
 
+int mp_irqdomain_map(struct irq_domain *domain, unsigned int virq,
+		     irq_hw_number_t hwirq)
+{
+	int ioapic = (int)(long)domain->host_data;
+	struct mp_pin_info *info = mp_pin_info(ioapic, hwirq);
+	struct io_apic_irq_attr attr;
+
+	/* Get default attribute if not set by caller yet */
+	if (!info->set) {
+		u32 gsi = mp_pin_to_gsi(ioapic, hwirq);
+
+		if (acpi_get_override_irq(gsi, &info->trigger,
+					  &info->polarity) < 0) {
+			/*
+			 * PCI interrupts are always polarity one level
+			 * triggered.
+			 */
+			info->trigger = 1;
+			info->polarity = 1;
+		}
+		info->node = NUMA_NO_NODE;
+		info->set = 1;
+	}
+	set_io_apic_irq_attr(&attr, ioapic, hwirq, info->trigger,
+			     info->polarity);
+
+	return io_apic_setup_irq_pin(virq, info->node, &attr);
+}
+
+void mp_irqdomain_unmap(struct irq_domain *domain, unsigned int virq)
+{
+	struct irq_data *data = irq_get_irq_data(virq);
+	struct irq_cfg *cfg = irq_cfg(virq);
+	int ioapic = (int)(long)domain->host_data;
+	int pin = (int)data->hwirq;
+
+	ioapic_mask_entry(ioapic, pin);
+	__remove_pin_from_irq(cfg, ioapic, pin);
+	WARN_ON(cfg->irq_2_pin != NULL);
+	arch_teardown_hwirq(virq);
+}
+
+int mp_set_gsi_attr(u32 gsi, int trigger, int polarity, int node)
+{
+	int ret = 0;
+	int ioapic, pin;
+	struct mp_pin_info *info;
+
+	ioapic = mp_find_ioapic(gsi);
+	if (ioapic < 0)
+		return -ENODEV;
+
+	pin = mp_find_ioapic_pin(ioapic, gsi);
+	info = mp_pin_info(ioapic, pin);
+	trigger = trigger ? 1 : 0;
+	polarity = polarity ? 1 : 0;
+
+	mutex_lock(&ioapic_mutex);
+	if (!info->set) {
+		info->trigger = trigger;
+		info->polarity = polarity;
+		info->node = node;
+		info->set = 1;
+	} else if (info->trigger != trigger || info->polarity != polarity) {
+		ret = -EBUSY;
+	}
+	mutex_unlock(&ioapic_mutex);
+
+	return ret;
+}
+
 /* Enable IOAPIC early just for system timer */
 void __init pre_init_apic_IRQ0(void)
 {
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index cceb352c968c..bda488680dbc 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -88,21 +88,16 @@ static struct apic apic_default = {
 	.disable_esr			= 0,
 	.dest_logical			= APIC_DEST_LOGICAL,
 	.check_apicid_used		= default_check_apicid_used,
-	.check_apicid_present		= default_check_apicid_present,
 
 	.vector_allocation_domain	= flat_vector_allocation_domain,
 	.init_apic_ldr			= default_init_apic_ldr,
 
 	.ioapic_phys_id_map		= default_ioapic_phys_id_map,
 	.setup_apic_routing		= setup_apic_flat_routing,
-	.multi_timer_check		= NULL,
 	.cpu_present_to_apicid		= default_cpu_present_to_apicid,
 	.apicid_to_cpu_present		= physid_set_mask_of_physid,
-	.setup_portio_remap		= NULL,
 	.check_phys_apicid_present	= default_check_phys_apicid_present,
-	.enable_apic_mode		= NULL,
 	.phys_pkg_id			= default_phys_pkg_id,
-	.mps_oem_check			= NULL,
 
 	.get_apic_id			= default_get_apic_id,
 	.set_apic_id			= NULL,
@@ -116,11 +111,7 @@ static struct apic apic_default = {
 	.send_IPI_all			= default_send_IPI_all,
 	.send_IPI_self			= default_send_IPI_self,
 
-	.trampoline_phys_low		= DEFAULT_TRAMPOLINE_PHYS_LOW,
-	.trampoline_phys_high		= DEFAULT_TRAMPOLINE_PHYS_HIGH,
-
 	.wait_for_init_deassert		= true,
-	.smp_callin_clear_local_apic	= NULL,
 	.inquire_remote_apic		= default_inquire_remote_apic,
 
 	.read				= native_apic_mem_read,
@@ -214,29 +205,7 @@ void __init generic_apic_probe(void)
 	printk(KERN_INFO "Using APIC driver %s\n", apic->name);
 }
 
-/* These functions can switch the APIC even after the initial ->probe() */
-
-int __init
-generic_mps_oem_check(struct mpc_table *mpc, char *oem, char *productid)
-{
-	struct apic **drv;
-
-	for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
-		if (!((*drv)->mps_oem_check))
-			continue;
-		if (!(*drv)->mps_oem_check(mpc, oem, productid))
-			continue;
-
-		if (!cmdline_apic) {
-			apic = *drv;
-			printk(KERN_INFO "Switched to APIC driver `%s'.\n",
-			       apic->name);
-		}
-		return 1;
-	}
-	return 0;
-}
-
+/* This function can switch the APIC even after the initial ->probe() */
 int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 {
 	struct apic **drv;
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index e66766bf1641..6ce600f9bc78 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -249,21 +249,16 @@ static struct apic apic_x2apic_cluster = {
 	.disable_esr			= 0,
 	.dest_logical			= APIC_DEST_LOGICAL,
 	.check_apicid_used		= NULL,
-	.check_apicid_present		= NULL,
 
 	.vector_allocation_domain	= cluster_vector_allocation_domain,
 	.init_apic_ldr			= init_x2apic_ldr,
 
 	.ioapic_phys_id_map		= NULL,
 	.setup_apic_routing		= NULL,
-	.multi_timer_check		= NULL,
 	.cpu_present_to_apicid		= default_cpu_present_to_apicid,
 	.apicid_to_cpu_present		= NULL,
-	.setup_portio_remap		= NULL,
 	.check_phys_apicid_present	= default_check_phys_apicid_present,
-	.enable_apic_mode		= NULL,
 	.phys_pkg_id			= x2apic_phys_pkg_id,
-	.mps_oem_check			= NULL,
 
 	.get_apic_id			= x2apic_get_apic_id,
 	.set_apic_id			= x2apic_set_apic_id,
@@ -277,10 +272,7 @@ static struct apic apic_x2apic_cluster = {
 	.send_IPI_all			= x2apic_send_IPI_all,
 	.send_IPI_self			= x2apic_send_IPI_self,
 
-	.trampoline_phys_low		= DEFAULT_TRAMPOLINE_PHYS_LOW,
-	.trampoline_phys_high		= DEFAULT_TRAMPOLINE_PHYS_HIGH,
 	.wait_for_init_deassert		= false,
-	.smp_callin_clear_local_apic	= NULL,
 	.inquire_remote_apic		= NULL,
 
 	.read				= native_apic_msr_read,
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index 6d600ebf6c12..6fae733e9194 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -103,21 +103,16 @@ static struct apic apic_x2apic_phys = {
 	.disable_esr			= 0,
 	.dest_logical			= 0,
 	.check_apicid_used		= NULL,
-	.check_apicid_present		= NULL,
 
 	.vector_allocation_domain	= default_vector_allocation_domain,
 	.init_apic_ldr			= init_x2apic_ldr,
 
 	.ioapic_phys_id_map		= NULL,
 	.setup_apic_routing		= NULL,
-	.multi_timer_check		= NULL,
 	.cpu_present_to_apicid		= default_cpu_present_to_apicid,
 	.apicid_to_cpu_present		= NULL,
-	.setup_portio_remap		= NULL,
 	.check_phys_apicid_present	= default_check_phys_apicid_present,
-	.enable_apic_mode		= NULL,
 	.phys_pkg_id			= x2apic_phys_pkg_id,
-	.mps_oem_check			= NULL,
 
 	.get_apic_id			= x2apic_get_apic_id,
 	.set_apic_id			= x2apic_set_apic_id,
@@ -131,10 +126,7 @@ static struct apic apic_x2apic_phys = {
 	.send_IPI_all			= x2apic_send_IPI_all,
 	.send_IPI_self			= x2apic_send_IPI_self,
 
-	.trampoline_phys_low		= DEFAULT_TRAMPOLINE_PHYS_LOW,
-	.trampoline_phys_high		= DEFAULT_TRAMPOLINE_PHYS_HIGH,
 	.wait_for_init_deassert		= false,
-	.smp_callin_clear_local_apic	= NULL,
 	.inquire_remote_apic		= NULL,
 
 	.read				= native_apic_msr_read,
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 293b41df54ef..004f017aa7b9 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -365,21 +365,16 @@ static struct apic __refdata apic_x2apic_uv_x = {
 	.disable_esr			= 0,
 	.dest_logical			= APIC_DEST_LOGICAL,
 	.check_apicid_used		= NULL,
-	.check_apicid_present		= NULL,
 
 	.vector_allocation_domain	= default_vector_allocation_domain,
 	.init_apic_ldr			= uv_init_apic_ldr,
 
 	.ioapic_phys_id_map		= NULL,
 	.setup_apic_routing		= NULL,
-	.multi_timer_check		= NULL,
 	.cpu_present_to_apicid		= default_cpu_present_to_apicid,
 	.apicid_to_cpu_present		= NULL,
-	.setup_portio_remap		= NULL,
 	.check_phys_apicid_present	= default_check_phys_apicid_present,
-	.enable_apic_mode		= NULL,
 	.phys_pkg_id			= uv_phys_pkg_id,
-	.mps_oem_check			= NULL,
 
 	.get_apic_id			= x2apic_get_apic_id,
 	.set_apic_id			= set_apic_id,
@@ -394,10 +389,7 @@ static struct apic __refdata apic_x2apic_uv_x = {
 	.send_IPI_self			= uv_send_IPI_self,
 
 	.wakeup_secondary_cpu		= uv_wakeup_secondary,
-	.trampoline_phys_low		= DEFAULT_TRAMPOLINE_PHYS_LOW,
-	.trampoline_phys_high		= DEFAULT_TRAMPOLINE_PHYS_HIGH,
 	.wait_for_init_deassert		= false,
-	.smp_callin_clear_local_apic	= NULL,
 	.inquire_remote_apic		= NULL,
 
 	.read				= native_apic_msr_read,
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 333fd5209336..e4ab2b42bd6f 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -148,6 +148,7 @@ static int __init x86_xsave_setup(char *s)
 {
 	setup_clear_cpu_cap(X86_FEATURE_XSAVE);
 	setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
+	setup_clear_cpu_cap(X86_FEATURE_XSAVES);
 	setup_clear_cpu_cap(X86_FEATURE_AVX);
 	setup_clear_cpu_cap(X86_FEATURE_AVX2);
 	return 1;
@@ -161,6 +162,13 @@ static int __init x86_xsaveopt_setup(char *s)
 }
 __setup("noxsaveopt", x86_xsaveopt_setup);
 
+static int __init x86_xsaves_setup(char *s)
+{
+	setup_clear_cpu_cap(X86_FEATURE_XSAVES);
+	return 1;
+}
+__setup("noxsaves", x86_xsaves_setup);
+
 #ifdef CONFIG_X86_32
 static int cachesize_override = -1;
 static int disable_x86_serial_nr = 1;
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
index 7db54b5d5f86..3d3503351242 100644
--- a/arch/x86/kernel/devicetree.c
+++ b/arch/x86/kernel/devicetree.c
@@ -21,6 +21,7 @@
 #include <asm/apic.h>
 #include <asm/pci_x86.h>
 #include <asm/setup.h>
+#include <asm/i8259.h>
 
 __initdata u64 initial_dtb;
 char __initdata cmd_line[COMMAND_LINE_SIZE];
@@ -165,82 +166,6 @@ static void __init dtb_lapic_setup(void)
 #ifdef CONFIG_X86_IO_APIC
 static unsigned int ioapic_id;
 
-static void __init dtb_add_ioapic(struct device_node *dn)
-{
-	struct resource r;
-	int ret;
-
-	ret = of_address_to_resource(dn, 0, &r);
-	if (ret) {
-		printk(KERN_ERR "Can't obtain address from node %s.\n",
-				dn->full_name);
-		return;
-	}
-	mp_register_ioapic(++ioapic_id, r.start, gsi_top);
-}
-
-static void __init dtb_ioapic_setup(void)
-{
-	struct device_node *dn;
-
-	for_each_compatible_node(dn, NULL, "intel,ce4100-ioapic")
-		dtb_add_ioapic(dn);
-
-	if (nr_ioapics) {
-		of_ioapic = 1;
-		return;
-	}
-	printk(KERN_ERR "Error: No information about IO-APIC in OF.\n");
-}
-#else
-static void __init dtb_ioapic_setup(void) {}
-#endif
-
-static void __init dtb_apic_setup(void)
-{
-	dtb_lapic_setup();
-	dtb_ioapic_setup();
-}
-
-#ifdef CONFIG_OF_FLATTREE
-static void __init x86_flattree_get_config(void)
-{
-	u32 size, map_len;
-	void *dt;
-
-	if (!initial_dtb)
-		return;
-
-	map_len = max(PAGE_SIZE - (initial_dtb & ~PAGE_MASK), (u64)128);
-
-	initial_boot_params = dt = early_memremap(initial_dtb, map_len);
-	size = of_get_flat_dt_size();
-	if (map_len < size) {
-		early_iounmap(dt, map_len);
-		initial_boot_params = dt = early_memremap(initial_dtb, size);
-		map_len = size;
-	}
-
-	unflatten_and_copy_device_tree();
-	early_iounmap(dt, map_len);
-}
-#else
-static inline void x86_flattree_get_config(void) { }
-#endif
-
-void __init x86_dtb_init(void)
-{
-	x86_flattree_get_config();
-
-	if (!of_have_populated_dt())
-		return;
-
-	dtb_setup_hpet();
-	dtb_apic_setup();
-}
-
-#ifdef CONFIG_X86_IO_APIC
-
 struct of_ioapic_type {
 	u32 out_type;
 	u32 trigger;
@@ -276,10 +201,8 @@ static int ioapic_xlate(struct irq_domain *domain,
 			const u32 *intspec, u32 intsize,
 			irq_hw_number_t *out_hwirq, u32 *out_type)
 {
-	struct io_apic_irq_attr attr;
 	struct of_ioapic_type *it;
-	u32 line, idx;
-	int rc;
+	u32 line, idx, gsi;
 
 	if (WARN_ON(intsize < 2))
 		return -EINVAL;
@@ -291,13 +214,10 @@ static int ioapic_xlate(struct irq_domain *domain,
 
 	it = &of_ioapic_type[intspec[1]];
 
-	idx = (u32) domain->host_data;
-	set_io_apic_irq_attr(&attr, idx, line, it->trigger, it->polarity);
-
-	rc = io_apic_setup_irq_pin_once(irq_find_mapping(domain, line),
-					cpu_to_node(0), &attr);
-	if (rc)
-		return rc;
+	idx = (u32)(long)domain->host_data;
+	gsi = mp_pin_to_gsi(idx, line);
+	if (mp_set_gsi_attr(gsi, it->trigger, it->polarity, cpu_to_node(0)))
+		return -EBUSY;
 
 	*out_hwirq = line;
 	*out_type = it->out_type;
@@ -305,81 +225,86 @@ static int ioapic_xlate(struct irq_domain *domain,
 }
 
 const struct irq_domain_ops ioapic_irq_domain_ops = {
+	.map = mp_irqdomain_map,
+	.unmap = mp_irqdomain_unmap,
 	.xlate = ioapic_xlate,
 };
 
-static void dt_add_ioapic_domain(unsigned int ioapic_num,
-		struct device_node *np)
+static void __init dtb_add_ioapic(struct device_node *dn)
 {
-	struct irq_domain *id;
-	struct mp_ioapic_gsi *gsi_cfg;
+	struct resource r;
 	int ret;
-	int num;
-
-	gsi_cfg = mp_ioapic_gsi_routing(ioapic_num);
-	num = gsi_cfg->gsi_end - gsi_cfg->gsi_base + 1;
-
-	id = irq_domain_add_linear(np, num, &ioapic_irq_domain_ops,
-			(void *)ioapic_num);
-	BUG_ON(!id);
-	if (gsi_cfg->gsi_base == 0) {
-		/*
-		 * The first NR_IRQS_LEGACY irq descs are allocated in
-		 * early_irq_init() and need just a mapping. The
-		 * remaining irqs need both. All of them are preallocated
-		 * and assigned so we can keep the 1:1 mapping which the ioapic
-		 * is having.
-		 */
-		irq_domain_associate_many(id, 0, 0, NR_IRQS_LEGACY);
-
-		if (num > NR_IRQS_LEGACY) {
-			ret = irq_create_strict_mappings(id, NR_IRQS_LEGACY,
-					NR_IRQS_LEGACY, num - NR_IRQS_LEGACY);
-			if (ret)
-				pr_err("Error creating mapping for the "
-						"remaining IRQs: %d\n", ret);
-		}
-		irq_set_default_host(id);
-	} else {
-		ret = irq_create_strict_mappings(id, gsi_cfg->gsi_base, 0, num);
-		if (ret)
-			pr_err("Error creating IRQ mapping: %d\n", ret);
+	struct ioapic_domain_cfg cfg = {
+		.type = IOAPIC_DOMAIN_DYNAMIC,
+		.ops = &ioapic_irq_domain_ops,
+		.dev = dn,
+	};
+
+	ret = of_address_to_resource(dn, 0, &r);
+	if (ret) {
+		printk(KERN_ERR "Can't obtain address from node %s.\n",
+				dn->full_name);
+		return;
 	}
+	mp_register_ioapic(++ioapic_id, r.start, gsi_top, &cfg);
 }
 
-static void __init ioapic_add_ofnode(struct device_node *np)
+static void __init dtb_ioapic_setup(void)
 {
-	struct resource r;
-	int i, ret;
+	struct device_node *dn;
 
-	ret = of_address_to_resource(np, 0, &r);
-	if (ret) {
-		printk(KERN_ERR "Failed to obtain address for %s\n",
-				np->full_name);
+	for_each_compatible_node(dn, NULL, "intel,ce4100-ioapic")
+		dtb_add_ioapic(dn);
+
+	if (nr_ioapics) {
+		of_ioapic = 1;
 		return;
 	}
+	printk(KERN_ERR "Error: No information about IO-APIC in OF.\n");
+}
+#else
+static void __init dtb_ioapic_setup(void) {}
+#endif
 
-	for (i = 0; i < nr_ioapics; i++) {
-		if (r.start == mpc_ioapic_addr(i)) {
-			dt_add_ioapic_domain(i, np);
-			return;
-		}
-	}
-	printk(KERN_ERR "IOxAPIC at %s is not registered.\n", np->full_name);
+static void __init dtb_apic_setup(void)
+{
+	dtb_lapic_setup();
+	dtb_ioapic_setup();
 }
 
-void __init x86_add_irq_domains(void)
+#ifdef CONFIG_OF_FLATTREE
+static void __init x86_flattree_get_config(void)
 {
-	struct device_node *dp;
+	u32 size, map_len;
+	void *dt;
 
-	if (!of_have_populated_dt())
+	if (!initial_dtb)
 		return;
 
-	for_each_node_with_property(dp, "interrupt-controller") {
-		if (of_device_is_compatible(dp, "intel,ce4100-ioapic"))
-			ioapic_add_ofnode(dp);
+	map_len = max(PAGE_SIZE - (initial_dtb & ~PAGE_MASK), (u64)128);
+
+	initial_boot_params = dt = early_memremap(initial_dtb, map_len);
+	size = of_get_flat_dt_size();
+	if (map_len < size) {
+		early_iounmap(dt, map_len);
+		initial_boot_params = dt = early_memremap(initial_dtb, size);
+		map_len = size;
 	}
+
+	unflatten_and_copy_device_tree();
+	early_iounmap(dt, map_len);
 }
 #else
-void __init x86_add_irq_domains(void) { }
+static inline void x86_flattree_get_config(void) { }
 #endif
+
+void __init x86_dtb_init(void)
+{
+	x86_flattree_get_config();
+
+	if (!of_have_populated_dt())
+		return;
+
+	dtb_setup_hpet();
+	dtb_apic_setup();
+}
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index d5dd80814419..a9a4229f6161 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -375,7 +375,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
 	/*
 	 * These bits must be zero.
 	 */
-	xsave_hdr->reserved1[0] = xsave_hdr->reserved1[1] = 0;
+	memset(xsave_hdr->reserved, 0, 48);
 
 	return ret;
 }
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 7f50156542fb..1e6cff5814fa 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -78,7 +78,7 @@ void __init init_ISA_irqs(void)
 #endif
 	legacy_pic->init(0);
 
-	for (i = 0; i < legacy_pic->nr_legacy_irqs; i++)
+	for (i = 0; i < nr_legacy_irqs(); i++)
 		irq_set_chip_and_handler_name(i, chip, handle_level_irq, name);
 }
 
@@ -87,12 +87,6 @@ void __init init_IRQ(void)
 	int i;
 
 	/*
-	 * We probably need a better place for this, but it works for
-	 * now ...
-	 */
-	x86_add_irq_domains();
-
-	/*
 	 * On cpu 0, Assign IRQ0_VECTOR..IRQ15_VECTOR's to IRQ 0..15.
 	 * If these IRQ's are handled by legacy interrupt-controllers like PIC,
 	 * then this configuration will likely be static after the boot. If
@@ -100,7 +94,7 @@ void __init init_IRQ(void)
 	 * then this vector space can be freed and re-used dynamically as the
 	 * irq's migrate etc.
 	 */
-	for (i = 0; i < legacy_pic->nr_legacy_irqs; i++)
+	for (i = 0; i < nr_legacy_irqs(); i++)
 		per_cpu(vector_irq, 0)[IRQ0_VECTOR + i] = i;
 
 	x86_init.irqs.intr_init();
@@ -121,7 +115,7 @@ void setup_vector_irq(int cpu)
 	 * legacy PIC, for the new cpu that is coming online, setup the static
 	 * legacy vector to irq mapping:
 	 */
-	for (irq = 0; irq < legacy_pic->nr_legacy_irqs; irq++)
+	for (irq = 0; irq < nr_legacy_irqs(); irq++)
 		per_cpu(vector_irq, cpu)[IRQ0_VECTOR + irq] = irq;
 #endif
 
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index d2b56489d70f..2d2a237f2c73 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -19,6 +19,7 @@
 #include <linux/module.h>
 #include <linux/smp.h>
 #include <linux/pci.h>
+#include <linux/irqdomain.h>
 
 #include <asm/mtrr.h>
 #include <asm/mpspec.h>
@@ -67,7 +68,7 @@ static void __init MP_processor_info(struct mpc_cpu *m)
 		boot_cpu_physical_apicid = m->apicid;
 	}
 
-	printk(KERN_INFO "Processor #%d%s\n", m->apicid, bootup_cpu);
+	pr_info("Processor #%d%s\n", m->apicid, bootup_cpu);
 	generic_processor_info(apicid, m->apicver);
 }
 
@@ -87,9 +88,8 @@ static void __init MP_bus_info(struct mpc_bus *m)
 
 #if MAX_MP_BUSSES < 256
 	if (m->busid >= MAX_MP_BUSSES) {
-		printk(KERN_WARNING "MP table busid value (%d) for bustype %s "
-		       " is too large, max. supported is %d\n",
-		       m->busid, str, MAX_MP_BUSSES - 1);
+		pr_warn("MP table busid value (%d) for bustype %s is too large, max. supported is %d\n",
+			m->busid, str, MAX_MP_BUSSES - 1);
 		return;
 	}
 #endif
@@ -110,19 +110,29 @@ static void __init MP_bus_info(struct mpc_bus *m)
 		mp_bus_id_to_type[m->busid] = MP_BUS_EISA;
 #endif
 	} else
-		printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
+		pr_warn("Unknown bustype %s - ignoring\n", str);
 }
 
+static struct irq_domain_ops mp_ioapic_irqdomain_ops = {
+	.map = mp_irqdomain_map,
+	.unmap = mp_irqdomain_unmap,
+};
+
 static void __init MP_ioapic_info(struct mpc_ioapic *m)
 {
+	struct ioapic_domain_cfg cfg = {
+		.type = IOAPIC_DOMAIN_LEGACY,
+		.ops = &mp_ioapic_irqdomain_ops,
+	};
+
 	if (m->flags & MPC_APIC_USABLE)
-		mp_register_ioapic(m->apicid, m->apicaddr, gsi_top);
+		mp_register_ioapic(m->apicid, m->apicaddr, gsi_top, &cfg);
 }
 
 static void __init print_mp_irq_info(struct mpc_intsrc *mp_irq)
 {
-	apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x,"
-		" IRQ %02x, APIC ID %x, APIC INT %02x\n",
+	apic_printk(APIC_VERBOSE,
+		"Int: type %d, pol %d, trig %d, bus %02x, IRQ %02x, APIC ID %x, APIC INT %02x\n",
 		mp_irq->irqtype, mp_irq->irqflag & 3,
 		(mp_irq->irqflag >> 2) & 3, mp_irq->srcbus,
 		mp_irq->srcbusirq, mp_irq->dstapic, mp_irq->dstirq);
@@ -135,8 +145,8 @@ static inline void __init MP_ioapic_info(struct mpc_ioapic *m) {}
 
 static void __init MP_lintsrc_info(struct mpc_lintsrc *m)
 {
-	apic_printk(APIC_VERBOSE, "Lint: type %d, pol %d, trig %d, bus %02x,"
-		" IRQ %02x, APIC ID %x, APIC LINT %02x\n",
+	apic_printk(APIC_VERBOSE,
+		"Lint: type %d, pol %d, trig %d, bus %02x, IRQ %02x, APIC ID %x, APIC LINT %02x\n",
 		m->irqtype, m->irqflag & 3, (m->irqflag >> 2) & 3, m->srcbusid,
 		m->srcbusirq, m->destapic, m->destapiclint);
 }
@@ -148,34 +158,33 @@ static int __init smp_check_mpc(struct mpc_table *mpc, char *oem, char *str)
 {
 
 	if (memcmp(mpc->signature, MPC_SIGNATURE, 4)) {
-		printk(KERN_ERR "MPTABLE: bad signature [%c%c%c%c]!\n",
+		pr_err("MPTABLE: bad signature [%c%c%c%c]!\n",
 		       mpc->signature[0], mpc->signature[1],
 		       mpc->signature[2], mpc->signature[3]);
 		return 0;
 	}
 	if (mpf_checksum((unsigned char *)mpc, mpc->length)) {
-		printk(KERN_ERR "MPTABLE: checksum error!\n");
+		pr_err("MPTABLE: checksum error!\n");
 		return 0;
 	}
 	if (mpc->spec != 0x01 && mpc->spec != 0x04) {
-		printk(KERN_ERR "MPTABLE: bad table version (%d)!!\n",
-		       mpc->spec);
+		pr_err("MPTABLE: bad table version (%d)!!\n", mpc->spec);
 		return 0;
 	}
 	if (!mpc->lapic) {
-		printk(KERN_ERR "MPTABLE: null local APIC address!\n");
+		pr_err("MPTABLE: null local APIC address!\n");
 		return 0;
 	}
 	memcpy(oem, mpc->oem, 8);
 	oem[8] = 0;
-	printk(KERN_INFO "MPTABLE: OEM ID: %s\n", oem);
+	pr_info("MPTABLE: OEM ID: %s\n", oem);
 
 	memcpy(str, mpc->productid, 12);
 	str[12] = 0;
 
-	printk(KERN_INFO "MPTABLE: Product ID: %s\n", str);
+	pr_info("MPTABLE: Product ID: %s\n", str);
 
-	printk(KERN_INFO "MPTABLE: APIC at: 0x%X\n", mpc->lapic);
+	pr_info("MPTABLE: APIC at: 0x%X\n", mpc->lapic);
 
 	return 1;
 }
@@ -188,8 +197,8 @@ static void skip_entry(unsigned char **ptr, int *count, int size)
 
 static void __init smp_dump_mptable(struct mpc_table *mpc, unsigned char *mpt)
 {
-	printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n"
-		"type %x\n", *mpt);
+	pr_err("Your mptable is wrong, contact your HW vendor!\n");
+	pr_cont("type %x\n", *mpt);
 	print_hex_dump(KERN_ERR, "  ", DUMP_PREFIX_ADDRESS, 16,
 			1, mpc, mpc->length, 1);
 }
@@ -207,9 +216,6 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
 	if (!smp_check_mpc(mpc, oem, str))
 		return 0;
 
-#ifdef CONFIG_X86_32
-	generic_mps_oem_check(mpc, oem, str);
-#endif
 	/* Initialize the lapic mapping */
 	if (!acpi_lapic)
 		register_lapic_address(mpc->lapic);
@@ -259,7 +265,7 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
 	}
 
 	if (!num_processors)
-		printk(KERN_ERR "MPTABLE: no processors registered!\n");
+		pr_err("MPTABLE: no processors registered!\n");
 	return num_processors;
 }
 
@@ -295,16 +301,13 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type)
 	 *  If it does, we assume it's valid.
 	 */
 	if (mpc_default_type == 5) {
-		printk(KERN_INFO "ISA/PCI bus type with no IRQ information... "
-		       "falling back to ELCR\n");
+		pr_info("ISA/PCI bus type with no IRQ information... falling back to ELCR\n");
 
 		if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) ||
 		    ELCR_trigger(13))
-			printk(KERN_ERR "ELCR contains invalid data... "
-			       "not using ELCR\n");
+			pr_err("ELCR contains invalid data... not using ELCR\n");
 		else {
-			printk(KERN_INFO
-			       "Using ELCR to identify PCI interrupts\n");
+			pr_info("Using ELCR to identify PCI interrupts\n");
 			ELCR_fallback = 1;
 		}
 	}
@@ -353,7 +356,7 @@ static void __init construct_ioapic_table(int mpc_default_type)
 	bus.busid = 0;
 	switch (mpc_default_type) {
 	default:
-		printk(KERN_ERR "???\nUnknown standard configuration %d\n",
+		pr_err("???\nUnknown standard configuration %d\n",
 		       mpc_default_type);
 		/* fall through */
 	case 1:
@@ -462,8 +465,8 @@ static int __init check_physptr(struct mpf_intel *mpf, unsigned int early)
 #ifdef CONFIG_X86_LOCAL_APIC
 		smp_found_config = 0;
 #endif
-		printk(KERN_ERR "BIOS bug, MP table errors detected!...\n"
-			"... disabling SMP support. (tell your hw vendor)\n");
+		pr_err("BIOS bug, MP table errors detected!...\n");
+		pr_cont("... disabling SMP support. (tell your hw vendor)\n");
 		early_iounmap(mpc, size);
 		return -1;
 	}
@@ -481,8 +484,7 @@ static int __init check_physptr(struct mpf_intel *mpf, unsigned int early)
 	if (!mp_irq_entries) {
 		struct mpc_bus bus;
 
-		printk(KERN_ERR "BIOS bug, no explicit IRQ entries, "
-		       "using default mptable. (tell your hw vendor)\n");
+		pr_err("BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n");
 
 		bus.type = MP_BUS;
 		bus.busid = 0;
@@ -516,14 +518,14 @@ void __init default_get_smp_config(unsigned int early)
 	if (acpi_lapic && acpi_ioapic)
 		return;
 
-	printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n",
-	       mpf->specification);
+	pr_info("Intel MultiProcessor Specification v1.%d\n",
+		mpf->specification);
 #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32)
 	if (mpf->feature2 & (1 << 7)) {
-		printk(KERN_INFO "    IMCR and PIC compatibility mode.\n");
+		pr_info("    IMCR and PIC compatibility mode.\n");
 		pic_mode = 1;
 	} else {
-		printk(KERN_INFO "    Virtual Wire compatibility mode.\n");
+		pr_info("    Virtual Wire compatibility mode.\n");
 		pic_mode = 0;
 	}
 #endif
@@ -539,8 +541,7 @@ void __init default_get_smp_config(unsigned int early)
 			return;
 		}
 
-		printk(KERN_INFO "Default MP configuration #%d\n",
-		       mpf->feature1);
+		pr_info("Default MP configuration #%d\n", mpf->feature1);
 		construct_default_ISA_mptable(mpf->feature1);
 
 	} else if (mpf->physptr) {
@@ -550,7 +551,7 @@ void __init default_get_smp_config(unsigned int early)
 		BUG();
 
 	if (!early)
-		printk(KERN_INFO "Processors: %d\n", num_processors);
+		pr_info("Processors: %d\n", num_processors);
 	/*
 	 * Only use the first configuration found.
 	 */
@@ -583,10 +584,10 @@ static int __init smp_scan_config(unsigned long base, unsigned long length)
 #endif
 			mpf_found = mpf;
 
-			printk(KERN_INFO "found SMP MP-table at [mem %#010llx-%#010llx] mapped at [%p]\n",
-			       (unsigned long long) virt_to_phys(mpf),
-			       (unsigned long long) virt_to_phys(mpf) +
-			       sizeof(*mpf) - 1, mpf);
+			pr_info("found SMP MP-table at [mem %#010llx-%#010llx] mapped at [%p]\n",
+				(unsigned long long) virt_to_phys(mpf),
+				(unsigned long long) virt_to_phys(mpf) +
+				sizeof(*mpf) - 1, mpf);
 
 			mem = virt_to_phys(mpf);
 			memblock_reserve(mem, sizeof(*mpf));
@@ -735,7 +736,7 @@ static int  __init replace_intsrc_all(struct mpc_table *mpc,
 	int nr_m_spare = 0;
 	unsigned char *mpt = ((unsigned char *)mpc) + count;
 
-	printk(KERN_INFO "mpc_length %x\n", mpc->length);
+	pr_info("mpc_length %x\n", mpc->length);
 	while (count < mpc->length) {
 		switch (*mpt) {
 		case MP_PROCESSOR:
@@ -862,13 +863,13 @@ static int __init update_mp_table(void)
 	if (!smp_check_mpc(mpc, oem, str))
 		return 0;
 
-	printk(KERN_INFO "mpf: %llx\n", (u64)virt_to_phys(mpf));
-	printk(KERN_INFO "physptr: %x\n", mpf->physptr);
+	pr_info("mpf: %llx\n", (u64)virt_to_phys(mpf));
+	pr_info("physptr: %x\n", mpf->physptr);
 
 	if (mpc_new_phys && mpc->length > mpc_new_length) {
 		mpc_new_phys = 0;
-		printk(KERN_INFO "mpc_new_length is %ld, please use alloc_mptable=8k\n",
-			 mpc_new_length);
+		pr_info("mpc_new_length is %ld, please use alloc_mptable=8k\n",
+			mpc_new_length);
 	}
 
 	if (!mpc_new_phys) {
@@ -879,10 +880,10 @@ static int __init update_mp_table(void)
 		mpc->checksum = 0xff;
 		new = mpf_checksum((unsigned char *)mpc, mpc->length);
 		if (old == new) {
-			printk(KERN_INFO "mpc is readonly, please try alloc_mptable instead\n");
+			pr_info("mpc is readonly, please try alloc_mptable instead\n");
 			return 0;
 		}
-		printk(KERN_INFO "use in-position replacing\n");
+		pr_info("use in-position replacing\n");
 	} else {
 		mpf->physptr = mpc_new_phys;
 		mpc_new = phys_to_virt(mpc_new_phys);
@@ -892,7 +893,7 @@ static int __init update_mp_table(void)
 		if (mpc_new_phys - mpf->physptr) {
 			struct mpf_intel *mpf_new;
 			/* steal 16 bytes from [0, 1k) */
-			printk(KERN_INFO "mpf new: %x\n", 0x400 - 16);
+			pr_info("mpf new: %x\n", 0x400 - 16);
 			mpf_new = phys_to_virt(0x400 - 16);
 			memcpy(mpf_new, mpf, 16);
 			mpf = mpf_new;
@@ -900,7 +901,7 @@ static int __init update_mp_table(void)
 		}
 		mpf->checksum = 0;
 		mpf->checksum -= mpf_checksum((unsigned char *)mpf, 16);
-		printk(KERN_INFO "physptr new: %x\n", mpf->physptr);
+		pr_info("physptr new: %x\n", mpf->physptr);
 	}
 
 	/*
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 4505e2a950d8..f804dc935d2a 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -93,6 +93,7 @@ void arch_task_cache_init(void)
         	kmem_cache_create("task_xstate", xstate_size,
 				  __alignof__(union thread_xstate),
 				  SLAB_PANIC | SLAB_NOTRACK, NULL);
+	setup_xstate_comp();
 }
 
 /*
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 5492798930ef..2d872e08fab9 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -168,10 +168,6 @@ static void smp_callin(void)
 	 * CPU, first the APIC. (this is probably redundant on most
 	 * boards)
 	 */
-
-	pr_debug("CALLIN, before setup_local_APIC()\n");
-	if (apic->smp_callin_clear_local_apic)
-		apic->smp_callin_clear_local_apic();
 	setup_local_APIC();
 	end_local_APIC_setup();
 
@@ -1143,10 +1139,6 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
 		enable_IO_APIC();
 
 	bsp_end_local_APIC_setup();
-
-	if (apic->setup_portio_remap)
-		apic->setup_portio_remap();
-
 	smpboot_setup_io_apic();
 	/*
 	 * Set up local APIC timer on boot CPU.
diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c
index b99b9ad8540c..ee22c1d93ae5 100644
--- a/arch/x86/kernel/vsmp_64.c
+++ b/arch/x86/kernel/vsmp_64.c
@@ -152,7 +152,7 @@ static void __init detect_vsmp_box(void)
 		is_vsmp = 1;
 }
 
-int is_vsmp_box(void)
+static int is_vsmp_box(void)
 {
 	if (is_vsmp != -1)
 		return is_vsmp;
@@ -166,7 +166,7 @@ int is_vsmp_box(void)
 static void __init detect_vsmp_box(void)
 {
 }
-int is_vsmp_box(void)
+static int is_vsmp_box(void)
 {
 	return 0;
 }
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index a4b451c6addf..940b142cc11f 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -8,6 +8,7 @@
 
 #include <linux/bootmem.h>
 #include <linux/compat.h>
+#include <linux/cpu.h>
 #include <asm/i387.h>
 #include <asm/fpu-internal.h>
 #include <asm/sigframe.h>
@@ -24,7 +25,9 @@ u64 pcntxt_mask;
 struct xsave_struct *init_xstate_buf;
 
 static struct _fpx_sw_bytes fx_sw_reserved, fx_sw_reserved_ia32;
-static unsigned int *xstate_offsets, *xstate_sizes, xstate_features;
+static unsigned int *xstate_offsets, *xstate_sizes;
+static unsigned int xstate_comp_offsets[sizeof(pcntxt_mask)*8];
+static unsigned int xstate_features;
 
 /*
  * If a processor implementation discern that a processor state component is
@@ -283,7 +286,7 @@ sanitize_restored_xstate(struct task_struct *tsk,
 
 	if (use_xsave()) {
 		/* These bits must be zero. */
-		xsave_hdr->reserved1[0] = xsave_hdr->reserved1[1] = 0;
+		memset(xsave_hdr->reserved, 0, 48);
 
 		/*
 		 * Init the state that is not present in the memory
@@ -479,6 +482,52 @@ static void __init setup_xstate_features(void)
 }
 
 /*
+ * This function sets up offsets and sizes of all extended states in
+ * xsave area. This supports both standard format and compacted format
+ * of the xsave aread.
+ *
+ * Input: void
+ * Output: void
+ */
+void setup_xstate_comp(void)
+{
+	unsigned int xstate_comp_sizes[sizeof(pcntxt_mask)*8];
+	int i;
+
+	/*
+	 * The FP xstates and SSE xstates are legacy states. They are always
+	 * in the fixed offsets in the xsave area in either compacted form
+	 * or standard form.
+	 */
+	xstate_comp_offsets[0] = 0;
+	xstate_comp_offsets[1] = offsetof(struct i387_fxsave_struct, xmm_space);
+
+	if (!cpu_has_xsaves) {
+		for (i = 2; i < xstate_features; i++) {
+			if (test_bit(i, (unsigned long *)&pcntxt_mask)) {
+				xstate_comp_offsets[i] = xstate_offsets[i];
+				xstate_comp_sizes[i] = xstate_sizes[i];
+			}
+		}
+		return;
+	}
+
+	xstate_comp_offsets[2] = FXSAVE_SIZE + XSAVE_HDR_SIZE;
+
+	for (i = 2; i < xstate_features; i++) {
+		if (test_bit(i, (unsigned long *)&pcntxt_mask))
+			xstate_comp_sizes[i] = xstate_sizes[i];
+		else
+			xstate_comp_sizes[i] = 0;
+
+		if (i > 2)
+			xstate_comp_offsets[i] = xstate_comp_offsets[i-1]
+					+ xstate_comp_sizes[i-1];
+
+	}
+}
+
+/*
  * setup the xstate image representing the init state
  */
 static void __init setup_init_fpu_buf(void)
@@ -496,15 +545,21 @@ static void __init setup_init_fpu_buf(void)
 
 	setup_xstate_features();
 
+	if (cpu_has_xsaves) {
+		init_xstate_buf->xsave_hdr.xcomp_bv =
+						(u64)1 << 63 | pcntxt_mask;
+		init_xstate_buf->xsave_hdr.xstate_bv = pcntxt_mask;
+	}
+
 	/*
 	 * Init all the features state with header_bv being 0x0
 	 */
-	xrstor_state(init_xstate_buf, -1);
+	xrstor_state_booting(init_xstate_buf, -1);
 	/*
 	 * Dump the init state again. This is to identify the init state
 	 * of any feature which is not represented by all zero's.
 	 */
-	xsave_state(init_xstate_buf, -1);
+	xsave_state_booting(init_xstate_buf, -1);
 }
 
 static enum { AUTO, ENABLE, DISABLE } eagerfpu = AUTO;
@@ -520,6 +575,30 @@ static int __init eager_fpu_setup(char *s)
 }
 __setup("eagerfpu=", eager_fpu_setup);
 
+
+/*
+ * Calculate total size of enabled xstates in XCR0/pcntxt_mask.
+ */
+static void __init init_xstate_size(void)
+{
+	unsigned int eax, ebx, ecx, edx;
+	int i;
+
+	if (!cpu_has_xsaves) {
+		cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
+		xstate_size = ebx;
+		return;
+	}
+
+	xstate_size = FXSAVE_SIZE + XSAVE_HDR_SIZE;
+	for (i = 2; i < 64; i++) {
+		if (test_bit(i, (unsigned long *)&pcntxt_mask)) {
+			cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx);
+			xstate_size += eax;
+		}
+	}
+}
+
 /*
  * Enable and initialize the xsave feature.
  */
@@ -551,8 +630,7 @@ static void __init xstate_enable_boot_cpu(void)
 	/*
 	 * Recompute the context size for enabled features
 	 */
-	cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
-	xstate_size = ebx;
+	init_xstate_size();
 
 	update_regset_xstate_info(xstate_size, pcntxt_mask);
 	prepare_fx_sw_frame();
@@ -572,8 +650,9 @@ static void __init xstate_enable_boot_cpu(void)
 		}
 	}
 
-	pr_info("enabled xstate_bv 0x%llx, cntxt size 0x%x\n",
-		pcntxt_mask, xstate_size);
+	pr_info("enabled xstate_bv 0x%llx, cntxt size 0x%x using %s\n",
+		pcntxt_mask, xstate_size,
+		cpu_has_xsaves ? "compacted form" : "standard form");
 }
 
 /*
@@ -635,3 +714,26 @@ void eager_fpu_init(void)
 	else
 		fxrstor_checking(&init_xstate_buf->i387);
 }
+
+/*
+ * Given the xsave area and a state inside, this function returns the
+ * address of the state.
+ *
+ * This is the API that is called to get xstate address in either
+ * standard format or compacted format of xsave area.
+ *
+ * Inputs:
+ *	xsave: base address of the xsave area;
+ *	xstate: state which is defined in xsave.h (e.g. XSTATE_FP, XSTATE_SSE,
+ *	etc.)
+ * Output:
+ *	address of the state in the xsave area.
+ */
+void *get_xsave_addr(struct xsave_struct *xsave, int xstate)
+{
+	int feature = fls64(xstate) - 1;
+	if (!test_bit(feature, (unsigned long *)&pcntxt_mask))
+		return NULL;
+
+	return (void *)xsave + xstate_comp_offsets[feature];
+}
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index 5075371ab593..cfd1b132b8e3 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -448,7 +448,7 @@ static void probe_pci_root_info(struct pci_root_info *info,
 		return;
 
 	size = sizeof(*info->res) * info->res_num;
-	info->res = kzalloc(size, GFP_KERNEL);
+	info->res = kzalloc_node(size, GFP_KERNEL, info->sd.node);
 	if (!info->res) {
 		info->res_num = 0;
 		return;
@@ -456,7 +456,7 @@ static void probe_pci_root_info(struct pci_root_info *info,
 
 	size = sizeof(*info->res_offset) * info->res_num;
 	info->res_num = 0;
-	info->res_offset = kzalloc(size, GFP_KERNEL);
+	info->res_offset = kzalloc_node(size, GFP_KERNEL, info->sd.node);
 	if (!info->res_offset) {
 		kfree(info->res);
 		info->res = NULL;
@@ -499,7 +499,7 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root)
 	if (node != NUMA_NO_NODE && !node_online(node))
 		node = NUMA_NO_NODE;
 
-	info = kzalloc(sizeof(*info), GFP_KERNEL);
+	info = kzalloc_node(sizeof(*info), GFP_KERNEL, node);
 	if (!info) {
 		printk(KERN_WARNING "pci_bus %04x:%02x: "
 		       "ignored (out of memory)\n", domain, busnum);
diff --git a/arch/x86/pci/intel_mid_pci.c b/arch/x86/pci/intel_mid_pci.c
index 84b9d672843d..3865116c51fb 100644
--- a/arch/x86/pci/intel_mid_pci.c
+++ b/arch/x86/pci/intel_mid_pci.c
@@ -208,27 +208,31 @@ static int pci_write(struct pci_bus *bus, unsigned int devfn, int where,
 
 static int intel_mid_pci_irq_enable(struct pci_dev *dev)
 {
-	u8 pin;
-	struct io_apic_irq_attr irq_attr;
+	int polarity;
 
-	pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
+	if (intel_mid_identify_cpu() == INTEL_MID_CPU_CHIP_TANGIER)
+		polarity = 0; /* active high */
+	else
+		polarity = 1; /* active low */
 
 	/*
 	 * MRST only have IOAPIC, the PCI irq lines are 1:1 mapped to
 	 * IOAPIC RTE entries, so we just enable RTE for the device.
 	 */
-	irq_attr.ioapic = mp_find_ioapic(dev->irq);
-	irq_attr.ioapic_pin = dev->irq;
-	irq_attr.trigger = 1; /* level */
-	if (intel_mid_identify_cpu() == INTEL_MID_CPU_CHIP_TANGIER)
-		irq_attr.polarity = 0; /* active high */
-	else
-		irq_attr.polarity = 1; /* active low */
-	io_apic_set_pci_routing(&dev->dev, dev->irq, &irq_attr);
+	if (mp_set_gsi_attr(dev->irq, 1, polarity, dev_to_node(&dev->dev)))
+		return -EBUSY;
+	if (mp_map_gsi_to_irq(dev->irq, IOAPIC_MAP_ALLOC) < 0)
+		return -EBUSY;
 
 	return 0;
 }
 
+static void intel_mid_pci_irq_disable(struct pci_dev *dev)
+{
+	if (!dev->dev.power.is_prepared && dev->irq > 0)
+		mp_unmap_irq(dev->irq);
+}
+
 struct pci_ops intel_mid_pci_ops = {
 	.read = pci_read,
 	.write = pci_write,
@@ -245,6 +249,7 @@ int __init intel_mid_pci_init(void)
 	pr_info("Intel MID platform detected, using MID PCI ops\n");
 	pci_mmcfg_late_init();
 	pcibios_enable_irq = intel_mid_pci_irq_enable;
+	pcibios_disable_irq = intel_mid_pci_irq_disable;
 	pci_root_ops = intel_mid_pci_ops;
 	pci_soc_mode = 1;
 	/* Continue with standard init */
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index 84112f55dd7a..bc1a2c341891 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -26,6 +26,7 @@ static int acer_tm360_irqrouting;
 static struct irq_routing_table *pirq_table;
 
 static int pirq_enable_irq(struct pci_dev *dev);
+static void pirq_disable_irq(struct pci_dev *dev);
 
 /*
  * Never use: 0, 1, 2 (timer, keyboard, and cascade)
@@ -53,7 +54,7 @@ struct irq_router_handler {
 };
 
 int (*pcibios_enable_irq)(struct pci_dev *dev) = pirq_enable_irq;
-void (*pcibios_disable_irq)(struct pci_dev *dev) = NULL;
+void (*pcibios_disable_irq)(struct pci_dev *dev) = pirq_disable_irq;
 
 /*
  *  Check passed address for the PCI IRQ Routing Table signature
@@ -1186,7 +1187,7 @@ void pcibios_penalize_isa_irq(int irq, int active)
 
 static int pirq_enable_irq(struct pci_dev *dev)
 {
-	u8 pin;
+	u8 pin = 0;
 
 	pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
 	if (pin && !pcibios_lookup_irq(dev, 1)) {
@@ -1227,8 +1228,6 @@ static int pirq_enable_irq(struct pci_dev *dev)
 			}
 			dev = temp_dev;
 			if (irq >= 0) {
-				io_apic_set_pci_routing(&dev->dev, irq,
-							 &irq_attr);
 				dev->irq = irq;
 				dev_info(&dev->dev, "PCI->APIC IRQ transform: "
 					 "INT %c -> IRQ %d\n", 'A' + pin - 1, irq);
@@ -1254,3 +1253,12 @@ static int pirq_enable_irq(struct pci_dev *dev)
 	}
 	return 0;
 }
+
+static void pirq_disable_irq(struct pci_dev *dev)
+{
+	if (io_apic_assign_pci_irqs && !dev->dev.power.is_prepared &&
+	    dev->irq) {
+		mp_unmap_irq(dev->irq);
+		dev->irq = 0;
+	}
+}
diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
index 905956f16465..093f5f4272d3 100644
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -23,6 +23,7 @@
 #include <xen/features.h>
 #include <xen/events.h>
 #include <asm/xen/pci.h>
+#include <asm/i8259.h>
 
 static int xen_pcifront_enable_irq(struct pci_dev *dev)
 {
@@ -40,7 +41,7 @@ static int xen_pcifront_enable_irq(struct pci_dev *dev)
 	/* In PV DomU the Xen PCI backend puts the PIRQ in the interrupt line.*/
 	pirq = gsi;
 
-	if (gsi < NR_IRQS_LEGACY)
+	if (gsi < nr_legacy_irqs())
 		share = 0;
 
 	rc = xen_bind_pirq_gsi_to_irq(gsi, pirq, share, "pcifront");
@@ -511,7 +512,7 @@ int __init pci_xen_initial_domain(void)
 	xen_setup_acpi_sci();
 	__acpi_register_gsi = acpi_register_gsi_xen;
 	/* Pre-allocate legacy irqs */
-	for (irq = 0; irq < NR_IRQS_LEGACY; irq++) {
+	for (irq = 0; irq < nr_legacy_irqs(); irq++) {
 		int trigger, polarity;
 
 		if (acpi_get_override_irq(irq, &trigger, &polarity) == -1)
@@ -522,7 +523,7 @@ int __init pci_xen_initial_domain(void)
 			true /* Map GSI to PIRQ */);
 	}
 	if (0 == nr_ioapics) {
-		for (irq = 0; irq < NR_IRQS_LEGACY; irq++)
+		for (irq = 0; irq < nr_legacy_irqs(); irq++)
 			xen_bind_pirq_gsi_to_irq(irq, irq, 0, "xt-pic");
 	}
 	return 0;
diff --git a/arch/x86/platform/ce4100/ce4100.c b/arch/x86/platform/ce4100/ce4100.c
index 8244f5ec2f4c..701fd5843c87 100644
--- a/arch/x86/platform/ce4100/ce4100.c
+++ b/arch/x86/platform/ce4100/ce4100.c
@@ -135,14 +135,10 @@ static void __init sdv_arch_setup(void)
 	sdv_serial_fixup();
 }
 
-#ifdef CONFIG_X86_IO_APIC
 static void sdv_pci_init(void)
 {
 	x86_of_pci_init();
-	/* We can't set this earlier, because we need to calibrate the timer */
-	legacy_pic = &null_legacy_pic;
 }
-#endif
 
 /*
  * CE4100 specific x86_init function overrides and early setup
@@ -155,7 +151,9 @@ void __init x86_ce4100_early_setup(void)
 	x86_init.resources.probe_roms = x86_init_noop;
 	x86_init.mpparse.get_smp_config = x86_init_uint_noop;
 	x86_init.mpparse.find_smp_config = x86_init_noop;
+	x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc_nocheck;
 	x86_init.pci.init = ce4100_pci_init;
+	x86_init.pci.init_irq = sdv_pci_init;
 
 	/*
 	 * By default, the reboot method is ACPI which is supported by the
@@ -166,10 +164,5 @@ void __init x86_ce4100_early_setup(void)
 	 */
 	reboot_type = BOOT_KBD;
 
-#ifdef CONFIG_X86_IO_APIC
-	x86_init.pci.init_irq = sdv_pci_init;
-	x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc_nocheck;
-#endif
-
 	pm_power_off = ce4100_power_off;
 }
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_wdt.c b/arch/x86/platform/intel-mid/device_libs/platform_wdt.c
index 973cf3bfa9fd..0b283d4d0ad7 100644
--- a/arch/x86/platform/intel-mid/device_libs/platform_wdt.c
+++ b/arch/x86/platform/intel-mid/device_libs/platform_wdt.c
@@ -26,28 +26,18 @@ static struct platform_device wdt_dev = {
 
 static int tangier_probe(struct platform_device *pdev)
 {
-	int ioapic;
-	int irq;
+	int gsi;
 	struct intel_mid_wdt_pdata *pdata = pdev->dev.platform_data;
-	struct io_apic_irq_attr irq_attr = { 0 };
 
 	if (!pdata)
 		return -EINVAL;
 
-	irq = pdata->irq;
-	ioapic = mp_find_ioapic(irq);
-	if (ioapic >= 0) {
-		int ret;
-		irq_attr.ioapic = ioapic;
-		irq_attr.ioapic_pin = irq;
-		irq_attr.trigger = 1;
-		/* irq_attr.polarity = 0; -> Active high */
-		ret = io_apic_set_pci_routing(NULL, irq, &irq_attr);
-		if (ret)
-			return ret;
-	} else {
+	/* IOAPIC builds identity mapping between GSI and IRQ on MID */
+	gsi = pdata->irq;
+	if (mp_set_gsi_attr(gsi, 1, 0, cpu_to_node(0)) ||
+	    mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC) <= 0) {
 		dev_warn(&pdev->dev, "cannot find interrupt %d in ioapic\n",
-			 irq);
+			 gsi);
 		return -EINVAL;
 	}
 
diff --git a/arch/x86/platform/intel-mid/sfi.c b/arch/x86/platform/intel-mid/sfi.c
index 994c40bd7cb7..3c53a90fdb18 100644
--- a/arch/x86/platform/intel-mid/sfi.c
+++ b/arch/x86/platform/intel-mid/sfi.c
@@ -432,9 +432,8 @@ static int __init sfi_parse_devs(struct sfi_table_header *table)
 	struct sfi_table_simple *sb;
 	struct sfi_device_table_entry *pentry;
 	struct devs_id *dev = NULL;
-	int num, i;
-	int ioapic;
-	struct io_apic_irq_attr irq_attr;
+	int num, i, ret;
+	int polarity;
 
 	sb = (struct sfi_table_simple *)table;
 	num = SFI_GET_NUM_ENTRIES(sb, struct sfi_device_table_entry);
@@ -448,35 +447,30 @@ static int __init sfi_parse_devs(struct sfi_table_header *table)
 			 * devices, but they have separate RTE entry in IOAPIC
 			 * so we have to enable them one by one here
 			 */
-			ioapic = mp_find_ioapic(irq);
-			if (ioapic >= 0) {
-				irq_attr.ioapic = ioapic;
-				irq_attr.ioapic_pin = irq;
-				irq_attr.trigger = 1;
-				if (intel_mid_identify_cpu() ==
-						INTEL_MID_CPU_CHIP_TANGIER) {
-					if (!strncmp(pentry->name,
-							"r69001-ts-i2c", 13))
-						/* active low */
-						irq_attr.polarity = 1;
-					else if (!strncmp(pentry->name,
-							"synaptics_3202", 14))
-						/* active low */
-						irq_attr.polarity = 1;
-					else if (irq == 41)
-						/* fast_int_1 */
-						irq_attr.polarity = 1;
-					else
-						/* active high */
-						irq_attr.polarity = 0;
-				} else {
-					/* PNW and CLV go with active low */
-					irq_attr.polarity = 1;
-				}
-				io_apic_set_pci_routing(NULL, irq, &irq_attr);
+			if (intel_mid_identify_cpu() ==
+					INTEL_MID_CPU_CHIP_TANGIER) {
+				if (!strncmp(pentry->name, "r69001-ts-i2c", 13))
+					/* active low */
+					polarity = 1;
+				else if (!strncmp(pentry->name,
+						"synaptics_3202", 14))
+					/* active low */
+					polarity = 1;
+				else if (irq == 41)
+					/* fast_int_1 */
+					polarity = 1;
+				else
+					/* active high */
+					polarity = 0;
+			} else {
+				/* PNW and CLV go with active low */
+				polarity = 1;
 			}
-		} else {
-			irq = 0; /* No irq */
+
+			ret = mp_set_gsi_attr(irq, 1, polarity, NUMA_NO_NODE);
+			if (ret == 0)
+				ret = mp_map_gsi_to_irq(irq, IOAPIC_MAP_ALLOC);
+			WARN_ON(ret < 0);
 		}
 
 		dev = get_device_id(pentry->type, pentry->name);
diff --git a/arch/x86/platform/sfi/sfi.c b/arch/x86/platform/sfi/sfi.c
index bcd1a703e3e6..2a8a74f3bd76 100644
--- a/arch/x86/platform/sfi/sfi.c
+++ b/arch/x86/platform/sfi/sfi.c
@@ -25,6 +25,7 @@
 #include <linux/init.h>
 #include <linux/sfi.h>
 #include <linux/io.h>
+#include <linux/irqdomain.h>
 
 #include <asm/io_apic.h>
 #include <asm/mpspec.h>
@@ -70,19 +71,26 @@ static int __init sfi_parse_cpus(struct sfi_table_header *table)
 #endif /* CONFIG_X86_LOCAL_APIC */
 
 #ifdef CONFIG_X86_IO_APIC
+static struct irq_domain_ops sfi_ioapic_irqdomain_ops = {
+	.map = mp_irqdomain_map,
+};
 
 static int __init sfi_parse_ioapic(struct sfi_table_header *table)
 {
 	struct sfi_table_simple *sb;
 	struct sfi_apic_table_entry *pentry;
 	int i, num;
+	struct ioapic_domain_cfg cfg = {
+		.type = IOAPIC_DOMAIN_STRICT,
+		.ops = &sfi_ioapic_irqdomain_ops,
+	};
 
 	sb = (struct sfi_table_simple *)table;
 	num = SFI_GET_NUM_ENTRIES(sb, struct sfi_apic_table_entry);
 	pentry = (struct sfi_apic_table_entry *)sb->pentry;
 
 	for (i = 0; i < num; i++) {
-		mp_register_ioapic(i, pentry->phys_addr, gsi_top);
+		mp_register_ioapic(i, pentry->phys_addr, gsi_top, &cfg);
 		pentry++;
 	}
 
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 9e241063a616..bc423f7b02da 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -70,8 +70,10 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
 					  bs->bvec_integrity_pool);
 		if (!bip->bip_vec)
 			goto err;
+		bip->bip_max_vcnt = bvec_nr_vecs(idx);
 	} else {
 		bip->bip_vec = bip->bip_inline_vecs;
+		bip->bip_max_vcnt = inline_vecs;
 	}
 
 	bip->bip_slab = idx;
@@ -114,14 +116,6 @@ void bio_integrity_free(struct bio *bio)
 }
 EXPORT_SYMBOL(bio_integrity_free);
 
-static inline unsigned int bip_integrity_vecs(struct bio_integrity_payload *bip)
-{
-	if (bip->bip_slab == BIO_POOL_NONE)
-		return BIP_INLINE_VECS;
-
-	return bvec_nr_vecs(bip->bip_slab);
-}
-
 /**
  * bio_integrity_add_page - Attach integrity metadata
  * @bio:	bio to update
@@ -137,7 +131,7 @@ int bio_integrity_add_page(struct bio *bio, struct page *page,
 	struct bio_integrity_payload *bip = bio->bi_integrity;
 	struct bio_vec *iv;
 
-	if (bip->bip_vcnt >= bip_integrity_vecs(bip)) {
+	if (bip->bip_vcnt >= bip->bip_max_vcnt) {
 		printk(KERN_ERR "%s: bip_vec full\n", __func__);
 		return 0;
 	}
diff --git a/block/bio.c b/block/bio.c
index 0ec61c9e536c..3e6331d25d90 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -112,7 +112,8 @@ static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size)
 	bslab = &bio_slabs[entry];
 
 	snprintf(bslab->name, sizeof(bslab->name), "bio-%d", entry);
-	slab = kmem_cache_create(bslab->name, sz, 0, SLAB_HWCACHE_ALIGN, NULL);
+	slab = kmem_cache_create(bslab->name, sz, ARCH_KMALLOC_MINALIGN,
+				 SLAB_HWCACHE_ALIGN, NULL);
 	if (!slab)
 		goto out_unlock;
 
diff --git a/block/blk-core.c b/block/blk-core.c
index 6f8dba161bfe..c359d72e9d76 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -438,14 +438,17 @@ static void __blk_drain_queue(struct request_queue *q, bool drain_all)
  */
 void blk_queue_bypass_start(struct request_queue *q)
 {
-	bool drain;
-
 	spin_lock_irq(q->queue_lock);
-	drain = !q->bypass_depth++;
+	q->bypass_depth++;
 	queue_flag_set(QUEUE_FLAG_BYPASS, q);
 	spin_unlock_irq(q->queue_lock);
 
-	if (drain) {
+	/*
+	 * Queues start drained.  Skip actual draining till init is
+	 * complete.  This avoids lenghty delays during queue init which
+	 * can happen many times during boot.
+	 */
+	if (blk_queue_init_done(q)) {
 		spin_lock_irq(q->queue_lock);
 		__blk_drain_queue(q, false);
 		spin_unlock_irq(q->queue_lock);
@@ -511,7 +514,7 @@ void blk_cleanup_queue(struct request_queue *q)
 	 * prevent that q->request_fn() gets invoked after draining finished.
 	 */
 	if (q->mq_ops) {
-		blk_mq_drain_queue(q);
+		blk_mq_freeze_queue(q);
 		spin_lock_irq(lock);
 	} else {
 		spin_lock_irq(lock);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index ad69ef657e85..5189cb1e478a 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -78,68 +78,47 @@ static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
 
 static int blk_mq_queue_enter(struct request_queue *q)
 {
-	int ret;
-
-	__percpu_counter_add(&q->mq_usage_counter, 1, 1000000);
-	smp_wmb();
-
-	/* we have problems freezing the queue if it's initializing */
-	if (!blk_queue_dying(q) &&
-	    (!blk_queue_bypass(q) || !blk_queue_init_done(q)))
-		return 0;
-
-	__percpu_counter_add(&q->mq_usage_counter, -1, 1000000);
+	while (true) {
+		int ret;
 
-	spin_lock_irq(q->queue_lock);
-	ret = wait_event_interruptible_lock_irq(q->mq_freeze_wq,
-		!blk_queue_bypass(q) || blk_queue_dying(q),
-		*q->queue_lock);
-	/* inc usage with lock hold to avoid freeze_queue runs here */
-	if (!ret && !blk_queue_dying(q))
-		__percpu_counter_add(&q->mq_usage_counter, 1, 1000000);
-	else if (blk_queue_dying(q))
-		ret = -ENODEV;
-	spin_unlock_irq(q->queue_lock);
+		if (percpu_ref_tryget_live(&q->mq_usage_counter))
+			return 0;
 
-	return ret;
+		ret = wait_event_interruptible(q->mq_freeze_wq,
+				!q->mq_freeze_depth || blk_queue_dying(q));
+		if (blk_queue_dying(q))
+			return -ENODEV;
+		if (ret)
+			return ret;
+	}
 }
 
 static void blk_mq_queue_exit(struct request_queue *q)
 {
-	__percpu_counter_add(&q->mq_usage_counter, -1, 1000000);
+	percpu_ref_put(&q->mq_usage_counter);
 }
 
-void blk_mq_drain_queue(struct request_queue *q)
+static void blk_mq_usage_counter_release(struct percpu_ref *ref)
 {
-	while (true) {
-		s64 count;
-
-		spin_lock_irq(q->queue_lock);
-		count = percpu_counter_sum(&q->mq_usage_counter);
-		spin_unlock_irq(q->queue_lock);
+	struct request_queue *q =
+		container_of(ref, struct request_queue, mq_usage_counter);
 
-		if (count == 0)
-			break;
-		blk_mq_start_hw_queues(q);
-		msleep(10);
-	}
+	wake_up_all(&q->mq_freeze_wq);
 }
 
 /*
  * Guarantee no request is in use, so we can change any data structure of
  * the queue afterward.
  */
-static void blk_mq_freeze_queue(struct request_queue *q)
+void blk_mq_freeze_queue(struct request_queue *q)
 {
-	bool drain;
-
 	spin_lock_irq(q->queue_lock);
-	drain = !q->bypass_depth++;
-	queue_flag_set(QUEUE_FLAG_BYPASS, q);
+	q->mq_freeze_depth++;
 	spin_unlock_irq(q->queue_lock);
 
-	if (drain)
-		blk_mq_drain_queue(q);
+	percpu_ref_kill(&q->mq_usage_counter);
+	blk_mq_run_queues(q, false);
+	wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->mq_usage_counter));
 }
 
 static void blk_mq_unfreeze_queue(struct request_queue *q)
@@ -147,14 +126,13 @@ static void blk_mq_unfreeze_queue(struct request_queue *q)
 	bool wake = false;
 
 	spin_lock_irq(q->queue_lock);
-	if (!--q->bypass_depth) {
-		queue_flag_clear(QUEUE_FLAG_BYPASS, q);
-		wake = true;
-	}
-	WARN_ON_ONCE(q->bypass_depth < 0);
+	wake = !--q->mq_freeze_depth;
+	WARN_ON_ONCE(q->mq_freeze_depth < 0);
 	spin_unlock_irq(q->queue_lock);
-	if (wake)
+	if (wake) {
+		percpu_ref_reinit(&q->mq_usage_counter);
 		wake_up_all(&q->mq_freeze_wq);
+	}
 }
 
 bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
@@ -1798,7 +1776,7 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
 	if (!q)
 		goto err_hctxs;
 
-	if (percpu_counter_init(&q->mq_usage_counter, 0))
+	if (percpu_ref_init(&q->mq_usage_counter, blk_mq_usage_counter_release))
 		goto err_map;
 
 	setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q);
@@ -1891,7 +1869,7 @@ void blk_mq_free_queue(struct request_queue *q)
 	blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
 	blk_mq_free_hw_queues(q, set);
 
-	percpu_counter_destroy(&q->mq_usage_counter);
+	percpu_ref_exit(&q->mq_usage_counter);
 
 	free_percpu(q->queue_ctx);
 	kfree(q->queue_hw_ctx);
@@ -2050,8 +2028,7 @@ static int __init blk_mq_init(void)
 {
 	blk_mq_cpu_init();
 
-	/* Must be called after percpu_counter_hotcpu_callback() */
-	hotcpu_notifier(blk_mq_queue_reinit_notify, -10);
+	hotcpu_notifier(blk_mq_queue_reinit_notify, 0);
 
 	return 0;
 }
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 26460884c6cd..ca4964a6295d 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -28,7 +28,7 @@ struct blk_mq_ctx {
 void __blk_mq_complete_request(struct request *rq);
 void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
 void blk_mq_init_flush(struct request_queue *q);
-void blk_mq_drain_queue(struct request_queue *q);
+void blk_mq_freeze_queue(struct request_queue *q);
 void blk_mq_free_queue(struct request_queue *q);
 void blk_mq_clone_flush_request(struct request *flush_rq,
 		struct request *orig_rq);
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 23321fbab293..4db5abf96b9e 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -554,8 +554,8 @@ int blk_register_queue(struct gendisk *disk)
 	 * Initialization must be complete by now.  Finish the initial
 	 * bypass from queue allocation.
 	 */
-	blk_queue_bypass_end(q);
 	queue_flag_set_unlocked(QUEUE_FLAG_INIT_DONE, q);
+	blk_queue_bypass_end(q);
 
 	ret = blk_trace_init_sysfs(dev);
 	if (ret)
diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c
index a0926a6094b2..18b282ce361e 100644
--- a/block/compat_ioctl.c
+++ b/block/compat_ioctl.c
@@ -663,6 +663,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 	fmode_t mode = file->f_mode;
 	struct backing_dev_info *bdi;
 	loff_t size;
+	unsigned int max_sectors;
 
 	/*
 	 * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have
@@ -719,8 +720,9 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 	case BLKSSZGET: /* get block device hardware sector size */
 		return compat_put_int(arg, bdev_logical_block_size(bdev));
 	case BLKSECTGET:
-		return compat_put_ushort(arg,
-					 queue_max_sectors(bdev_get_queue(bdev)));
+		max_sectors = min_t(unsigned int, USHRT_MAX,
+				    queue_max_sectors(bdev_get_queue(bdev)));
+		return compat_put_ushort(arg, max_sectors);
 	case BLKROTATIONAL:
 		return compat_put_ushort(arg,
 					 !blk_queue_nonrot(bdev_get_queue(bdev)));
diff --git a/block/ioctl.c b/block/ioctl.c
index 7d5c3b20af45..d6cda8147c91 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -278,6 +278,7 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
 	struct backing_dev_info *bdi;
 	loff_t size;
 	int ret, n;
+	unsigned int max_sectors;
 
 	switch(cmd) {
 	case BLKFLSBUF:
@@ -375,7 +376,9 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
 	case BLKDISCARDZEROES:
 		return put_uint(arg, bdev_discard_zeroes_data(bdev));
 	case BLKSECTGET:
-		return put_ushort(arg, queue_max_sectors(bdev_get_queue(bdev)));
+		max_sectors = min_t(unsigned int, USHRT_MAX,
+				    queue_max_sectors(bdev_get_queue(bdev)));
+		return put_ushort(arg, max_sectors);
 	case BLKROTATIONAL:
 		return put_ushort(arg, !blk_queue_nonrot(bdev_get_queue(bdev)));
 	case BLKRASET:
diff --git a/block/partitions/aix.c b/block/partitions/aix.c
index 43be471d9b1d..f3ed7b2d89bf 100644
--- a/block/partitions/aix.c
+++ b/block/partitions/aix.c
@@ -215,7 +215,7 @@ int aix_partition(struct parsed_partitions *state)
 		numlvs = be16_to_cpu(p->numlvs);
 		put_dev_sector(sect);
 	}
-	lvip = kzalloc(sizeof(struct lv_info) * state->limit, GFP_KERNEL);
+	lvip = kcalloc(state->limit, sizeof(struct lv_info), GFP_KERNEL);
 	if (!lvip)
 		return 0;
 	if (numlvs && (d = read_part_sector(state, vgda_sector + 1, &sect))) {
@@ -253,7 +253,7 @@ int aix_partition(struct parsed_partitions *state)
 				continue;
 			}
 			lv_ix = be16_to_cpu(p->lv_ix) - 1;
-			if (lv_ix > state->limit) {
+			if (lv_ix >= state->limit) {
 				cur_lv_ix = -1;
 				continue;
 			}
diff --git a/block/partitions/amiga.c b/block/partitions/amiga.c
index 70cbf44a1560..2b13533d60a2 100644
--- a/block/partitions/amiga.c
+++ b/block/partitions/amiga.c
@@ -7,6 +7,8 @@
  *  Re-organised Feb 1998 Russell King
  */
 
+#define pr_fmt(fmt) fmt
+
 #include <linux/types.h>
 #include <linux/affs_hardblocks.h>
 
@@ -40,7 +42,7 @@ int amiga_partition(struct parsed_partitions *state)
 		data = read_part_sector(state, blk, &sect);
 		if (!data) {
 			if (warn_no_part)
-				printk("Dev %s: unable to read RDB block %d\n",
+				pr_err("Dev %s: unable to read RDB block %d\n",
 				       bdevname(state->bdev, b), blk);
 			res = -1;
 			goto rdb_done;
@@ -57,12 +59,12 @@ int amiga_partition(struct parsed_partitions *state)
 		*(__be32 *)(data+0xdc) = 0;
 		if (checksum_block((__be32 *)data,
 				be32_to_cpu(rdb->rdb_SummedLongs) & 0x7F)==0) {
-			printk("Warning: Trashed word at 0xd0 in block %d "
-				"ignored in checksum calculation\n",blk);
+			pr_err("Trashed word at 0xd0 in block %d ignored in checksum calculation\n",
+			       blk);
 			break;
 		}
 
-		printk("Dev %s: RDB in block %d has bad checksum\n",
+		pr_err("Dev %s: RDB in block %d has bad checksum\n",
 		       bdevname(state->bdev, b), blk);
 	}
 
@@ -83,7 +85,7 @@ int amiga_partition(struct parsed_partitions *state)
 		data = read_part_sector(state, blk, &sect);
 		if (!data) {
 			if (warn_no_part)
-				printk("Dev %s: unable to read partition block %d\n",
+				pr_err("Dev %s: unable to read partition block %d\n",
 				       bdevname(state->bdev, b), blk);
 			res = -1;
 			goto rdb_done;
diff --git a/block/partitions/efi.c b/block/partitions/efi.c
index dc51f467a560..56d08fd75b1a 100644
--- a/block/partitions/efi.c
+++ b/block/partitions/efi.c
@@ -121,7 +121,7 @@ __setup("gpt", force_gpt_fn);
 /**
  * efi_crc32() - EFI version of crc32 function
  * @buf: buffer to calculate crc32 of
- * @len - length of buf
+ * @len: length of buf
  *
  * Description: Returns EFI-style CRC32 value for @buf
  * 
@@ -240,10 +240,10 @@ done:
 
 /**
  * read_lba(): Read bytes from disk, starting at given LBA
- * @state
- * @lba
- * @buffer
- * @size_t
+ * @state: disk parsed partitions
+ * @lba: the Logical Block Address of the partition table
+ * @buffer: destination buffer
+ * @count: bytes to read
  *
  * Description: Reads @count bytes from @state->bdev into @buffer.
  * Returns number of bytes read on success, 0 on error.
@@ -277,8 +277,8 @@ static size_t read_lba(struct parsed_partitions *state,
 
 /**
  * alloc_read_gpt_entries(): reads partition entries from disk
- * @state
- * @gpt - GPT header
+ * @state: disk parsed partitions
+ * @gpt: GPT header
  * 
  * Description: Returns ptes on success,  NULL on error.
  * Allocates space for PTEs based on information found in @gpt.
@@ -312,8 +312,8 @@ static gpt_entry *alloc_read_gpt_entries(struct parsed_partitions *state,
 
 /**
  * alloc_read_gpt_header(): Allocates GPT header, reads into it from disk
- * @state
- * @lba is the Logical Block Address of the partition table
+ * @state: disk parsed partitions
+ * @lba: the Logical Block Address of the partition table
  * 
  * Description: returns GPT header on success, NULL on error.   Allocates
  * and fills a GPT header starting at @ from @state->bdev.
@@ -340,10 +340,10 @@ static gpt_header *alloc_read_gpt_header(struct parsed_partitions *state,
 
 /**
  * is_gpt_valid() - tests one GPT header and PTEs for validity
- * @state
- * @lba is the logical block address of the GPT header to test
- * @gpt is a GPT header ptr, filled on return.
- * @ptes is a PTEs ptr, filled on return.
+ * @state: disk parsed partitions
+ * @lba: logical block address of the GPT header to test
+ * @gpt: GPT header ptr, filled on return.
+ * @ptes: PTEs ptr, filled on return.
  *
  * Description: returns 1 if valid,  0 on error.
  * If valid, returns pointers to newly allocated GPT header and PTEs.
@@ -461,8 +461,8 @@ static int is_gpt_valid(struct parsed_partitions *state, u64 lba,
 
 /**
  * is_pte_valid() - tests one PTE for validity
- * @pte is the pte to check
- * @lastlba is last lba of the disk
+ * @pte:pte to check
+ * @lastlba: last lba of the disk
  *
  * Description: returns 1 if valid,  0 on error.
  */
@@ -478,9 +478,10 @@ is_pte_valid(const gpt_entry *pte, const u64 lastlba)
 
 /**
  * compare_gpts() - Search disk for valid GPT headers and PTEs
- * @pgpt is the primary GPT header
- * @agpt is the alternate GPT header
- * @lastlba is the last LBA number
+ * @pgpt: primary GPT header
+ * @agpt: alternate GPT header
+ * @lastlba: last LBA number
+ *
  * Description: Returns nothing.  Sanity checks pgpt and agpt fields
  * and prints warnings on discrepancies.
  * 
@@ -572,9 +573,10 @@ compare_gpts(gpt_header *pgpt, gpt_header *agpt, u64 lastlba)
 
 /**
  * find_valid_gpt() - Search disk for valid GPT headers and PTEs
- * @state
- * @gpt is a GPT header ptr, filled on return.
- * @ptes is a PTEs ptr, filled on return.
+ * @state: disk parsed partitions
+ * @gpt: GPT header ptr, filled on return.
+ * @ptes: PTEs ptr, filled on return.
+ *
  * Description: Returns 1 if valid, 0 on error.
  * If valid, returns pointers to newly allocated GPT header and PTEs.
  * Validity depends on PMBR being valid (or being overridden by the
@@ -663,7 +665,7 @@ static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt,
 
 /**
  * efi_partition(struct parsed_partitions *state)
- * @state
+ * @state: disk parsed partitions
  *
  * Description: called from check.c, if the disk contains GPT
  * partitions, sets up partition entries in the kernel.
diff --git a/block/partitions/msdos.c b/block/partitions/msdos.c
index 9123f250b425..93e7c1b32edd 100644
--- a/block/partitions/msdos.c
+++ b/block/partitions/msdos.c
@@ -159,8 +159,9 @@ static void parse_extended(struct parsed_partitions *state,
 		/*
 		 * First process the data partition(s)
 		 */
-		for (i=0; i<4; i++, p++) {
+		for (i = 0; i < 4; i++, p++) {
 			sector_t offs, size, next;
+
 			if (!nr_sects(p) || is_extended_partition(p))
 				continue;
 
@@ -194,7 +195,7 @@ static void parse_extended(struct parsed_partitions *state,
 		 * It should be a link to the next logical partition.
 		 */
 		p -= 4;
-		for (i=0; i<4; i++, p++)
+		for (i = 0; i < 4; i++, p++)
 			if (nr_sects(p) && is_extended_partition(p))
 				break;
 		if (i == 4)
@@ -243,8 +244,8 @@ static void parse_solaris_x86(struct parsed_partitions *state,
 		return;
 	}
 	/* Ensure we can handle previous case of VTOC with 8 entries gracefully */
-	max_nparts = le16_to_cpu (v->v_nparts) > 8 ? SOLARIS_X86_NUMSLICE : 8;
-	for (i=0; i<max_nparts && state->next<state->limit; i++) {
+	max_nparts = le16_to_cpu(v->v_nparts) > 8 ? SOLARIS_X86_NUMSLICE : 8;
+	for (i = 0; i < max_nparts && state->next < state->limit; i++) {
 		struct solaris_x86_slice *s = &v->v_slice[i];
 		char tmp[3 + 10 + 1 + 1];
 
@@ -409,7 +410,7 @@ static void parse_minix(struct parsed_partitions *state,
 	/* The first sector of a Minix partition can have either
 	 * a secondary MBR describing its subpartitions, or
 	 * the normal boot sector. */
-	if (msdos_magic_present (data + 510) &&
+	if (msdos_magic_present(data + 510) &&
 	    SYS_IND(p) == MINIX_PARTITION) { /* subpartition table present */
 		char tmp[1 + BDEVNAME_SIZE + 10 + 9 + 1];
 
@@ -527,6 +528,7 @@ int msdos_partition(struct parsed_partitions *state)
 	for (slot = 1 ; slot <= 4 ; slot++, p++) {
 		sector_t start = start_sect(p)*sector_size;
 		sector_t size = nr_sects(p)*sector_size;
+
 		if (!size)
 			continue;
 		if (is_extended_partition(p)) {
@@ -537,6 +539,7 @@ int msdos_partition(struct parsed_partitions *state)
 			 * sector, although it may not be enough/proper.
 			 */
 			sector_t n = 2;
+
 			n = min(size, max(sector_size, n));
 			put_partition(state, slot, start, n);
 
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index 14695c6221c8..51bf5155ee75 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -82,9 +82,18 @@ static int sg_set_timeout(struct request_queue *q, int __user *p)
 	return err;
 }
 
+static int max_sectors_bytes(struct request_queue *q)
+{
+	unsigned int max_sectors = queue_max_sectors(q);
+
+	max_sectors = min_t(unsigned int, max_sectors, INT_MAX >> 9);
+
+	return max_sectors << 9;
+}
+
 static int sg_get_reserved_size(struct request_queue *q, int __user *p)
 {
-	unsigned val = min(q->sg_reserved_size, queue_max_sectors(q) << 9);
+	int val = min_t(int, q->sg_reserved_size, max_sectors_bytes(q));
 
 	return put_user(val, p);
 }
@@ -98,10 +107,8 @@ static int sg_set_reserved_size(struct request_queue *q, int __user *p)
 
 	if (size < 0)
 		return -EINVAL;
-	if (size > (queue_max_sectors(q) << 9))
-		size = queue_max_sectors(q) << 9;
 
-	q->sg_reserved_size = size;
+	q->sg_reserved_size = min(size, max_sectors_bytes(q));
 	return 0;
 }
 
@@ -283,6 +290,7 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk,
 	unsigned long start_time;
 	ssize_t ret = 0;
 	int writing = 0;
+	int at_head = 0;
 	struct request *rq;
 	char sense[SCSI_SENSE_BUFFERSIZE];
 	struct bio *bio;
@@ -306,6 +314,8 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk,
 		case SG_DXFER_FROM_DEV:
 			break;
 		}
+	if (hdr->flags & SG_FLAG_Q_AT_HEAD)
+		at_head = 1;
 
 	rq = blk_get_request(q, writing ? WRITE : READ, GFP_KERNEL);
 	if (!rq)
@@ -362,7 +372,7 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk,
 	 * (if he doesn't check that is his problem).
 	 * N.B. a non-zero SCSI status is _not_ necessarily an error.
 	 */
-	blk_execute_rq(q, bd_disk, rq, 0);
+	blk_execute_rq(q, bd_disk, rq, at_head);
 
 	hdr->duration = jiffies_to_msecs(jiffies - start_time);
 
diff --git a/drivers/acpi/pci_irq.c b/drivers/acpi/pci_irq.c
index 9c62340c2360..c96887d5289e 100644
--- a/drivers/acpi/pci_irq.c
+++ b/drivers/acpi/pci_irq.c
@@ -481,6 +481,10 @@ void acpi_pci_irq_disable(struct pci_dev *dev)
 	if (!pin)
 		return;
 
+	/* Keep IOAPIC pin configuration when suspending */
+	if (dev->dev.power.is_prepared)
+		return;
+
 	entry = acpi_pci_irq_lookup(dev, pin);
 	if (!entry)
 		return;
@@ -498,5 +502,6 @@ void acpi_pci_irq_disable(struct pci_dev *dev)
 	 */
 
 	dev_dbg(&dev->dev, "PCI INT %c disabled\n", pin_name(pin));
-	acpi_unregister_gsi(gsi);
+	if (gsi >= 0 && dev->irq > 0)
+		acpi_unregister_gsi(gsi);
 }
diff --git a/drivers/atm/atmtcp.c b/drivers/atm/atmtcp.c
index 0e3f8f9dcd29..480fa6ffbc09 100644
--- a/drivers/atm/atmtcp.c
+++ b/drivers/atm/atmtcp.c
@@ -299,6 +299,7 @@ static int atmtcp_c_send(struct atm_vcc *vcc,struct sk_buff *skb)
 	out_vcc = find_vcc(dev, ntohs(hdr->vpi), ntohs(hdr->vci));
 	read_unlock(&vcc_sklist_lock);
 	if (!out_vcc) {
+		result = -EUNATCH;
 		atomic_inc(&vcc->stats->tx_err);
 		goto done;
 	}
diff --git a/drivers/atm/solos-pci.c b/drivers/atm/solos-pci.c
index 943cf0d6abaf..7652e8dc188f 100644
--- a/drivers/atm/solos-pci.c
+++ b/drivers/atm/solos-pci.c
@@ -1278,6 +1278,7 @@ static int fpga_probe(struct pci_dev *dev, const struct pci_device_id *id)
 			card->dma_bounce = kmalloc(card->nr_ports * BUF_SIZE, GFP_KERNEL);
 			if (!card->dma_bounce) {
 				dev_warn(&card->dev->dev, "Failed to allocate DMA bounce buffers\n");
+				err = -ENOMEM;
 				/* Fallback to MMIO doesn't work */
 				goto out_unmap_both;
 			}
diff --git a/drivers/block/drbd/Makefile b/drivers/block/drbd/Makefile
index 8b450338075e..4464e353c1e8 100644
--- a/drivers/block/drbd/Makefile
+++ b/drivers/block/drbd/Makefile
@@ -3,5 +3,6 @@ drbd-y += drbd_worker.o drbd_receiver.o drbd_req.o drbd_actlog.o
 drbd-y += drbd_main.o drbd_strings.o drbd_nl.o
 drbd-y += drbd_interval.o drbd_state.o
 drbd-y += drbd_nla.o
+drbd-$(CONFIG_DEBUG_FS) += drbd_debugfs.o
 
 obj-$(CONFIG_BLK_DEV_DRBD)     += drbd.o
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index 05a1780ffa85..d26a3fa63688 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -92,34 +92,26 @@ struct __packed al_transaction_on_disk {
 	__be32	context[AL_CONTEXT_PER_TRANSACTION];
 };
 
-struct update_odbm_work {
-	struct drbd_work w;
-	struct drbd_device *device;
-	unsigned int enr;
-};
-
-struct update_al_work {
-	struct drbd_work w;
-	struct drbd_device *device;
-	struct completion event;
-	int err;
-};
-
-
-void *drbd_md_get_buffer(struct drbd_device *device)
+void *drbd_md_get_buffer(struct drbd_device *device, const char *intent)
 {
 	int r;
 
 	wait_event(device->misc_wait,
-		   (r = atomic_cmpxchg(&device->md_io_in_use, 0, 1)) == 0 ||
+		   (r = atomic_cmpxchg(&device->md_io.in_use, 0, 1)) == 0 ||
 		   device->state.disk <= D_FAILED);
 
-	return r ? NULL : page_address(device->md_io_page);
+	if (r)
+		return NULL;
+
+	device->md_io.current_use = intent;
+	device->md_io.start_jif = jiffies;
+	device->md_io.submit_jif = device->md_io.start_jif - 1;
+	return page_address(device->md_io.page);
 }
 
 void drbd_md_put_buffer(struct drbd_device *device)
 {
-	if (atomic_dec_and_test(&device->md_io_in_use))
+	if (atomic_dec_and_test(&device->md_io.in_use))
 		wake_up(&device->misc_wait);
 }
 
@@ -145,10 +137,11 @@ void wait_until_done_or_force_detached(struct drbd_device *device, struct drbd_b
 
 static int _drbd_md_sync_page_io(struct drbd_device *device,
 				 struct drbd_backing_dev *bdev,
-				 struct page *page, sector_t sector,
-				 int rw, int size)
+				 sector_t sector, int rw)
 {
 	struct bio *bio;
+	/* we do all our meta data IO in aligned 4k blocks. */
+	const int size = 4096;
 	int err;
 
 	device->md_io.done = 0;
@@ -156,15 +149,15 @@ static int _drbd_md_sync_page_io(struct drbd_device *device,
 
 	if ((rw & WRITE) && !test_bit(MD_NO_FUA, &device->flags))
 		rw |= REQ_FUA | REQ_FLUSH;
-	rw |= REQ_SYNC;
+	rw |= REQ_SYNC | REQ_NOIDLE;
 
 	bio = bio_alloc_drbd(GFP_NOIO);
 	bio->bi_bdev = bdev->md_bdev;
 	bio->bi_iter.bi_sector = sector;
 	err = -EIO;
-	if (bio_add_page(bio, page, size, 0) != size)
+	if (bio_add_page(bio, device->md_io.page, size, 0) != size)
 		goto out;
-	bio->bi_private = &device->md_io;
+	bio->bi_private = device;
 	bio->bi_end_io = drbd_md_io_complete;
 	bio->bi_rw = rw;
 
@@ -179,7 +172,8 @@ static int _drbd_md_sync_page_io(struct drbd_device *device,
 	}
 
 	bio_get(bio); /* one bio_put() is in the completion handler */
-	atomic_inc(&device->md_io_in_use); /* drbd_md_put_buffer() is in the completion handler */
+	atomic_inc(&device->md_io.in_use); /* drbd_md_put_buffer() is in the completion handler */
+	device->md_io.submit_jif = jiffies;
 	if (drbd_insert_fault(device, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD))
 		bio_endio(bio, -EIO);
 	else
@@ -197,9 +191,7 @@ int drbd_md_sync_page_io(struct drbd_device *device, struct drbd_backing_dev *bd
 			 sector_t sector, int rw)
 {
 	int err;
-	struct page *iop = device->md_io_page;
-
-	D_ASSERT(device, atomic_read(&device->md_io_in_use) == 1);
+	D_ASSERT(device, atomic_read(&device->md_io.in_use) == 1);
 
 	BUG_ON(!bdev->md_bdev);
 
@@ -214,8 +206,7 @@ int drbd_md_sync_page_io(struct drbd_device *device, struct drbd_backing_dev *bd
 		     current->comm, current->pid, __func__,
 		     (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ");
 
-	/* we do all our meta data IO in aligned 4k blocks. */
-	err = _drbd_md_sync_page_io(device, bdev, iop, sector, rw, 4096);
+	err = _drbd_md_sync_page_io(device, bdev, sector, rw);
 	if (err) {
 		drbd_err(device, "drbd_md_sync_page_io(,%llus,%s) failed with error %d\n",
 		    (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ", err);
@@ -297,26 +288,12 @@ bool drbd_al_begin_io_prepare(struct drbd_device *device, struct drbd_interval *
 	return need_transaction;
 }
 
-static int al_write_transaction(struct drbd_device *device, bool delegate);
-
-/* When called through generic_make_request(), we must delegate
- * activity log I/O to the worker thread: a further request
- * submitted via generic_make_request() within the same task
- * would be queued on current->bio_list, and would only start
- * after this function returns (see generic_make_request()).
- *
- * However, if we *are* the worker, we must not delegate to ourselves.
- */
+static int al_write_transaction(struct drbd_device *device);
 
-/*
- * @delegate:   delegate activity log I/O to the worker thread
- */
-void drbd_al_begin_io_commit(struct drbd_device *device, bool delegate)
+void drbd_al_begin_io_commit(struct drbd_device *device)
 {
 	bool locked = false;
 
-	BUG_ON(delegate && current == first_peer_device(device)->connection->worker.task);
-
 	/* Serialize multiple transactions.
 	 * This uses test_and_set_bit, memory barrier is implicit.
 	 */
@@ -335,7 +312,7 @@ void drbd_al_begin_io_commit(struct drbd_device *device, bool delegate)
 			rcu_read_unlock();
 
 			if (write_al_updates)
-				al_write_transaction(device, delegate);
+				al_write_transaction(device);
 			spin_lock_irq(&device->al_lock);
 			/* FIXME
 			if (err)
@@ -352,12 +329,10 @@ void drbd_al_begin_io_commit(struct drbd_device *device, bool delegate)
 /*
  * @delegate:   delegate activity log I/O to the worker thread
  */
-void drbd_al_begin_io(struct drbd_device *device, struct drbd_interval *i, bool delegate)
+void drbd_al_begin_io(struct drbd_device *device, struct drbd_interval *i)
 {
-	BUG_ON(delegate && current == first_peer_device(device)->connection->worker.task);
-
 	if (drbd_al_begin_io_prepare(device, i))
-		drbd_al_begin_io_commit(device, delegate);
+		drbd_al_begin_io_commit(device);
 }
 
 int drbd_al_begin_io_nonblock(struct drbd_device *device, struct drbd_interval *i)
@@ -380,8 +355,19 @@ int drbd_al_begin_io_nonblock(struct drbd_device *device, struct drbd_interval *
 	/* We want all necessary updates for a given request within the same transaction
 	 * We could first check how many updates are *actually* needed,
 	 * and use that instead of the worst-case nr_al_extents */
-	if (available_update_slots < nr_al_extents)
-		return -EWOULDBLOCK;
+	if (available_update_slots < nr_al_extents) {
+		/* Too many activity log extents are currently "hot".
+		 *
+		 * If we have accumulated pending changes already,
+		 * we made progress.
+		 *
+		 * If we cannot get even a single pending change through,
+		 * stop the fast path until we made some progress,
+		 * or requests to "cold" extents could be starved. */
+		if (!al->pending_changes)
+			__set_bit(__LC_STARVING, &device->act_log->flags);
+		return -ENOBUFS;
+	}
 
 	/* Is resync active in this area? */
 	for (enr = first; enr <= last; enr++) {
@@ -452,15 +438,6 @@ static unsigned int al_extent_to_bm_page(unsigned int al_enr)
 		 (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT));
 }
 
-static unsigned int rs_extent_to_bm_page(unsigned int rs_enr)
-{
-	return rs_enr >>
-		/* bit to page */
-		((PAGE_SHIFT + 3) -
-		/* resync extent number to bit */
-		 (BM_EXT_SHIFT - BM_BLOCK_SHIFT));
-}
-
 static sector_t al_tr_number_to_on_disk_sector(struct drbd_device *device)
 {
 	const unsigned int stripes = device->ldev->md.al_stripes;
@@ -479,8 +456,7 @@ static sector_t al_tr_number_to_on_disk_sector(struct drbd_device *device)
 	return device->ldev->md.md_offset + device->ldev->md.al_offset + t;
 }
 
-static int
-_al_write_transaction(struct drbd_device *device)
+int al_write_transaction(struct drbd_device *device)
 {
 	struct al_transaction_on_disk *buffer;
 	struct lc_element *e;
@@ -505,7 +481,8 @@ _al_write_transaction(struct drbd_device *device)
 		return -EIO;
 	}
 
-	buffer = drbd_md_get_buffer(device); /* protects md_io_buffer, al_tr_cycle, ... */
+	/* protects md_io_buffer, al_tr_cycle, ... */
+	buffer = drbd_md_get_buffer(device, __func__);
 	if (!buffer) {
 		drbd_err(device, "disk failed while waiting for md_io buffer\n");
 		put_ldev(device);
@@ -590,38 +567,6 @@ _al_write_transaction(struct drbd_device *device)
 	return err;
 }
 
-
-static int w_al_write_transaction(struct drbd_work *w, int unused)
-{
-	struct update_al_work *aw = container_of(w, struct update_al_work, w);
-	struct drbd_device *device = aw->device;
-	int err;
-
-	err = _al_write_transaction(device);
-	aw->err = err;
-	complete(&aw->event);
-
-	return err != -EIO ? err : 0;
-}
-
-/* Calls from worker context (see w_restart_disk_io()) need to write the
-   transaction directly. Others came through generic_make_request(),
-   those need to delegate it to the worker. */
-static int al_write_transaction(struct drbd_device *device, bool delegate)
-{
-	if (delegate) {
-		struct update_al_work al_work;
-		init_completion(&al_work.event);
-		al_work.w.cb = w_al_write_transaction;
-		al_work.device = device;
-		drbd_queue_work_front(&first_peer_device(device)->connection->sender_work,
-				      &al_work.w);
-		wait_for_completion(&al_work.event);
-		return al_work.err;
-	} else
-		return _al_write_transaction(device);
-}
-
 static int _try_lc_del(struct drbd_device *device, struct lc_element *al_ext)
 {
 	int rv;
@@ -682,72 +627,56 @@ int drbd_initialize_al(struct drbd_device *device, void *buffer)
 	return 0;
 }
 
-static int w_update_odbm(struct drbd_work *w, int unused)
-{
-	struct update_odbm_work *udw = container_of(w, struct update_odbm_work, w);
-	struct drbd_device *device = udw->device;
-	struct sib_info sib = { .sib_reason = SIB_SYNC_PROGRESS, };
-
-	if (!get_ldev(device)) {
-		if (__ratelimit(&drbd_ratelimit_state))
-			drbd_warn(device, "Can not update on disk bitmap, local IO disabled.\n");
-		kfree(udw);
-		return 0;
-	}
-
-	drbd_bm_write_page(device, rs_extent_to_bm_page(udw->enr));
-	put_ldev(device);
-
-	kfree(udw);
-
-	if (drbd_bm_total_weight(device) <= device->rs_failed) {
-		switch (device->state.conn) {
-		case C_SYNC_SOURCE:  case C_SYNC_TARGET:
-		case C_PAUSED_SYNC_S: case C_PAUSED_SYNC_T:
-			drbd_resync_finished(device);
-		default:
-			/* nothing to do */
-			break;
-		}
-	}
-	drbd_bcast_event(device, &sib);
-
-	return 0;
-}
-
+static const char *drbd_change_sync_fname[] = {
+	[RECORD_RS_FAILED] = "drbd_rs_failed_io",
+	[SET_IN_SYNC] = "drbd_set_in_sync",
+	[SET_OUT_OF_SYNC] = "drbd_set_out_of_sync"
+};
 
 /* ATTENTION. The AL's extents are 4MB each, while the extents in the
  * resync LRU-cache are 16MB each.
  * The caller of this function has to hold an get_ldev() reference.
  *
+ * Adjusts the caching members ->rs_left (success) or ->rs_failed (!success),
+ * potentially pulling in (and recounting the corresponding bits)
+ * this resync extent into the resync extent lru cache.
+ *
+ * Returns whether all bits have been cleared for this resync extent,
+ * precisely: (rs_left <= rs_failed)
+ *
  * TODO will be obsoleted once we have a caching lru of the on disk bitmap
  */
-static void drbd_try_clear_on_disk_bm(struct drbd_device *device, sector_t sector,
-				      int count, int success)
+static bool update_rs_extent(struct drbd_device *device,
+		unsigned int enr, int count,
+		enum update_sync_bits_mode mode)
 {
 	struct lc_element *e;
-	struct update_odbm_work *udw;
-
-	unsigned int enr;
 
 	D_ASSERT(device, atomic_read(&device->local_cnt));
 
-	/* I simply assume that a sector/size pair never crosses
-	 * a 16 MB extent border. (Currently this is true...) */
-	enr = BM_SECT_TO_EXT(sector);
-
-	e = lc_get(device->resync, enr);
+	/* When setting out-of-sync bits,
+	 * we don't need it cached (lc_find).
+	 * But if it is present in the cache,
+	 * we should update the cached bit count.
+	 * Otherwise, that extent should be in the resync extent lru cache
+	 * already -- or we want to pull it in if necessary -- (lc_get),
+	 * then update and check rs_left and rs_failed. */
+	if (mode == SET_OUT_OF_SYNC)
+		e = lc_find(device->resync, enr);
+	else
+		e = lc_get(device->resync, enr);
 	if (e) {
 		struct bm_extent *ext = lc_entry(e, struct bm_extent, lce);
 		if (ext->lce.lc_number == enr) {
-			if (success)
+			if (mode == SET_IN_SYNC)
 				ext->rs_left -= count;
+			else if (mode == SET_OUT_OF_SYNC)
+				ext->rs_left += count;
 			else
 				ext->rs_failed += count;
 			if (ext->rs_left < ext->rs_failed) {
-				drbd_warn(device, "BAD! sector=%llus enr=%u rs_left=%d "
+				drbd_warn(device, "BAD! enr=%u rs_left=%d "
 				    "rs_failed=%d count=%d cstate=%s\n",
-				     (unsigned long long)sector,
 				     ext->lce.lc_number, ext->rs_left,
 				     ext->rs_failed, count,
 				     drbd_conn_str(device->state.conn));
@@ -781,34 +710,27 @@ static void drbd_try_clear_on_disk_bm(struct drbd_device *device, sector_t secto
 				     ext->lce.lc_number, ext->rs_failed);
 			}
 			ext->rs_left = rs_left;
-			ext->rs_failed = success ? 0 : count;
+			ext->rs_failed = (mode == RECORD_RS_FAILED) ? count : 0;
 			/* we don't keep a persistent log of the resync lru,
 			 * we can commit any change right away. */
 			lc_committed(device->resync);
 		}
-		lc_put(device->resync, &ext->lce);
+		if (mode != SET_OUT_OF_SYNC)
+			lc_put(device->resync, &ext->lce);
 		/* no race, we are within the al_lock! */
 
-		if (ext->rs_left == ext->rs_failed) {
+		if (ext->rs_left <= ext->rs_failed) {
 			ext->rs_failed = 0;
-
-			udw = kmalloc(sizeof(*udw), GFP_ATOMIC);
-			if (udw) {
-				udw->enr = ext->lce.lc_number;
-				udw->w.cb = w_update_odbm;
-				udw->device = device;
-				drbd_queue_work_front(&first_peer_device(device)->connection->sender_work,
-						      &udw->w);
-			} else {
-				drbd_warn(device, "Could not kmalloc an udw\n");
-			}
+			return true;
 		}
-	} else {
+	} else if (mode != SET_OUT_OF_SYNC) {
+		/* be quiet if lc_find() did not find it. */
 		drbd_err(device, "lc_get() failed! locked=%d/%d flags=%lu\n",
 		    device->resync_locked,
 		    device->resync->nr_elements,
 		    device->resync->flags);
 	}
+	return false;
 }
 
 void drbd_advance_rs_marks(struct drbd_device *device, unsigned long still_to_go)
@@ -827,105 +749,105 @@ void drbd_advance_rs_marks(struct drbd_device *device, unsigned long still_to_go
 	}
 }
 
-/* clear the bit corresponding to the piece of storage in question:
- * size byte of data starting from sector.  Only clear a bits of the affected
- * one ore more _aligned_ BM_BLOCK_SIZE blocks.
- *
- * called by worker on C_SYNC_TARGET and receiver on SyncSource.
- *
- */
-void __drbd_set_in_sync(struct drbd_device *device, sector_t sector, int size,
-		       const char *file, const unsigned int line)
+/* It is called lazy update, so don't do write-out too often. */
+static bool lazy_bitmap_update_due(struct drbd_device *device)
 {
-	/* Is called from worker and receiver context _only_ */
-	unsigned long sbnr, ebnr, lbnr;
-	unsigned long count = 0;
-	sector_t esector, nr_sectors;
-	int wake_up = 0;
-	unsigned long flags;
+	return time_after(jiffies, device->rs_last_bcast + 2*HZ);
+}
 
-	if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_DISCARD_SIZE) {
-		drbd_err(device, "drbd_set_in_sync: sector=%llus size=%d nonsense!\n",
-				(unsigned long long)sector, size);
+static void maybe_schedule_on_disk_bitmap_update(struct drbd_device *device, bool rs_done)
+{
+	if (rs_done)
+		set_bit(RS_DONE, &device->flags);
+		/* and also set RS_PROGRESS below */
+	else if (!lazy_bitmap_update_due(device))
 		return;
-	}
-
-	if (!get_ldev(device))
-		return; /* no disk, no metadata, no bitmap to clear bits in */
-
-	nr_sectors = drbd_get_capacity(device->this_bdev);
-	esector = sector + (size >> 9) - 1;
-
-	if (!expect(sector < nr_sectors))
-		goto out;
-	if (!expect(esector < nr_sectors))
-		esector = nr_sectors - 1;
-
-	lbnr = BM_SECT_TO_BIT(nr_sectors-1);
-
-	/* we clear it (in sync).
-	 * round up start sector, round down end sector.  we make sure we only
-	 * clear full, aligned, BM_BLOCK_SIZE (4K) blocks */
-	if (unlikely(esector < BM_SECT_PER_BIT-1))
-		goto out;
-	if (unlikely(esector == (nr_sectors-1)))
-		ebnr = lbnr;
-	else
-		ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1));
-	sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1);
 
-	if (sbnr > ebnr)
-		goto out;
+	drbd_device_post_work(device, RS_PROGRESS);
+}
 
+static int update_sync_bits(struct drbd_device *device,
+		unsigned long sbnr, unsigned long ebnr,
+		enum update_sync_bits_mode mode)
+{
 	/*
-	 * ok, (capacity & 7) != 0 sometimes, but who cares...
-	 * we count rs_{total,left} in bits, not sectors.
+	 * We keep a count of set bits per resync-extent in the ->rs_left
+	 * caching member, so we need to loop and work within the resync extent
+	 * alignment. Typically this loop will execute exactly once.
 	 */
-	count = drbd_bm_clear_bits(device, sbnr, ebnr);
-	if (count) {
-		drbd_advance_rs_marks(device, drbd_bm_total_weight(device));
-		spin_lock_irqsave(&device->al_lock, flags);
-		drbd_try_clear_on_disk_bm(device, sector, count, true);
-		spin_unlock_irqrestore(&device->al_lock, flags);
-
-		/* just wake_up unconditional now, various lc_chaged(),
-		 * lc_put() in drbd_try_clear_on_disk_bm(). */
-		wake_up = 1;
+	unsigned long flags;
+	unsigned long count = 0;
+	unsigned int cleared = 0;
+	while (sbnr <= ebnr) {
+		/* set temporary boundary bit number to last bit number within
+		 * the resync extent of the current start bit number,
+		 * but cap at provided end bit number */
+		unsigned long tbnr = min(ebnr, sbnr | BM_BLOCKS_PER_BM_EXT_MASK);
+		unsigned long c;
+
+		if (mode == RECORD_RS_FAILED)
+			/* Only called from drbd_rs_failed_io(), bits
+			 * supposedly still set.  Recount, maybe some
+			 * of the bits have been successfully cleared
+			 * by application IO meanwhile.
+			 */
+			c = drbd_bm_count_bits(device, sbnr, tbnr);
+		else if (mode == SET_IN_SYNC)
+			c = drbd_bm_clear_bits(device, sbnr, tbnr);
+		else /* if (mode == SET_OUT_OF_SYNC) */
+			c = drbd_bm_set_bits(device, sbnr, tbnr);
+
+		if (c) {
+			spin_lock_irqsave(&device->al_lock, flags);
+			cleared += update_rs_extent(device, BM_BIT_TO_EXT(sbnr), c, mode);
+			spin_unlock_irqrestore(&device->al_lock, flags);
+			count += c;
+		}
+		sbnr = tbnr + 1;
 	}
-out:
-	put_ldev(device);
-	if (wake_up)
+	if (count) {
+		if (mode == SET_IN_SYNC) {
+			unsigned long still_to_go = drbd_bm_total_weight(device);
+			bool rs_is_done = (still_to_go <= device->rs_failed);
+			drbd_advance_rs_marks(device, still_to_go);
+			if (cleared || rs_is_done)
+				maybe_schedule_on_disk_bitmap_update(device, rs_is_done);
+		} else if (mode == RECORD_RS_FAILED)
+			device->rs_failed += count;
 		wake_up(&device->al_wait);
+	}
+	return count;
 }
 
-/*
- * this is intended to set one request worth of data out of sync.
- * affects at least 1 bit,
- * and at most 1+DRBD_MAX_BIO_SIZE/BM_BLOCK_SIZE bits.
+/* clear the bit corresponding to the piece of storage in question:
+ * size byte of data starting from sector.  Only clear a bits of the affected
+ * one ore more _aligned_ BM_BLOCK_SIZE blocks.
+ *
+ * called by worker on C_SYNC_TARGET and receiver on SyncSource.
  *
- * called by tl_clear and drbd_send_dblock (==drbd_make_request).
- * so this can be _any_ process.
  */
-int __drbd_set_out_of_sync(struct drbd_device *device, sector_t sector, int size,
-			    const char *file, const unsigned int line)
+int __drbd_change_sync(struct drbd_device *device, sector_t sector, int size,
+		enum update_sync_bits_mode mode,
+		const char *file, const unsigned int line)
 {
-	unsigned long sbnr, ebnr, flags;
+	/* Is called from worker and receiver context _only_ */
+	unsigned long sbnr, ebnr, lbnr;
+	unsigned long count = 0;
 	sector_t esector, nr_sectors;
-	unsigned int enr, count = 0;
-	struct lc_element *e;
 
-	/* this should be an empty REQ_FLUSH */
-	if (size == 0)
+	/* This would be an empty REQ_FLUSH, be silent. */
+	if ((mode == SET_OUT_OF_SYNC) && size == 0)
 		return 0;
 
-	if (size < 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_DISCARD_SIZE) {
-		drbd_err(device, "sector: %llus, size: %d\n",
-			(unsigned long long)sector, size);
+	if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_DISCARD_SIZE) {
+		drbd_err(device, "%s: sector=%llus size=%d nonsense!\n",
+				drbd_change_sync_fname[mode],
+				(unsigned long long)sector, size);
 		return 0;
 	}
 
 	if (!get_ldev(device))
-		return 0; /* no disk, no metadata, no bitmap to set bits in */
+		return 0; /* no disk, no metadata, no bitmap to manipulate bits in */
 
 	nr_sectors = drbd_get_capacity(device->this_bdev);
 	esector = sector + (size >> 9) - 1;
@@ -935,25 +857,28 @@ int __drbd_set_out_of_sync(struct drbd_device *device, sector_t sector, int size
 	if (!expect(esector < nr_sectors))
 		esector = nr_sectors - 1;
 
-	/* we set it out of sync,
-	 * we do not need to round anything here */
-	sbnr = BM_SECT_TO_BIT(sector);
-	ebnr = BM_SECT_TO_BIT(esector);
-
-	/* ok, (capacity & 7) != 0 sometimes, but who cares...
-	 * we count rs_{total,left} in bits, not sectors.  */
-	spin_lock_irqsave(&device->al_lock, flags);
-	count = drbd_bm_set_bits(device, sbnr, ebnr);
+	lbnr = BM_SECT_TO_BIT(nr_sectors-1);
 
-	enr = BM_SECT_TO_EXT(sector);
-	e = lc_find(device->resync, enr);
-	if (e)
-		lc_entry(e, struct bm_extent, lce)->rs_left += count;
-	spin_unlock_irqrestore(&device->al_lock, flags);
+	if (mode == SET_IN_SYNC) {
+		/* Round up start sector, round down end sector.  We make sure
+		 * we only clear full, aligned, BM_BLOCK_SIZE blocks. */
+		if (unlikely(esector < BM_SECT_PER_BIT-1))
+			goto out;
+		if (unlikely(esector == (nr_sectors-1)))
+			ebnr = lbnr;
+		else
+			ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1));
+		sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1);
+	} else {
+		/* We set it out of sync, or record resync failure.
+		 * Should not round anything here. */
+		sbnr = BM_SECT_TO_BIT(sector);
+		ebnr = BM_SECT_TO_BIT(esector);
+	}
 
+	count = update_sync_bits(device, sbnr, ebnr, mode);
 out:
 	put_ldev(device);
-
 	return count;
 }
 
@@ -1075,6 +1000,15 @@ int drbd_try_rs_begin_io(struct drbd_device *device, sector_t sector)
 	struct lc_element *e;
 	struct bm_extent *bm_ext;
 	int i;
+	bool throttle = drbd_rs_should_slow_down(device, sector, true);
+
+	/* If we need to throttle, a half-locked (only marked BME_NO_WRITES,
+	 * not yet BME_LOCKED) extent needs to be kicked out explicitly if we
+	 * need to throttle. There is at most one such half-locked extent,
+	 * which is remembered in resync_wenr. */
+
+	if (throttle && device->resync_wenr != enr)
+		return -EAGAIN;
 
 	spin_lock_irq(&device->al_lock);
 	if (device->resync_wenr != LC_FREE && device->resync_wenr != enr) {
@@ -1098,8 +1032,10 @@ int drbd_try_rs_begin_io(struct drbd_device *device, sector_t sector)
 			D_ASSERT(device, test_bit(BME_NO_WRITES, &bm_ext->flags));
 			clear_bit(BME_NO_WRITES, &bm_ext->flags);
 			device->resync_wenr = LC_FREE;
-			if (lc_put(device->resync, &bm_ext->lce) == 0)
+			if (lc_put(device->resync, &bm_ext->lce) == 0) {
+				bm_ext->flags = 0;
 				device->resync_locked--;
+			}
 			wake_up(&device->al_wait);
 		} else {
 			drbd_alert(device, "LOGIC BUG\n");
@@ -1161,8 +1097,20 @@ proceed:
 	return 0;
 
 try_again:
-	if (bm_ext)
-		device->resync_wenr = enr;
+	if (bm_ext) {
+		if (throttle) {
+			D_ASSERT(device, !test_bit(BME_LOCKED, &bm_ext->flags));
+			D_ASSERT(device, test_bit(BME_NO_WRITES, &bm_ext->flags));
+			clear_bit(BME_NO_WRITES, &bm_ext->flags);
+			device->resync_wenr = LC_FREE;
+			if (lc_put(device->resync, &bm_ext->lce) == 0) {
+				bm_ext->flags = 0;
+				device->resync_locked--;
+			}
+			wake_up(&device->al_wait);
+		} else
+			device->resync_wenr = enr;
+	}
 	spin_unlock_irq(&device->al_lock);
 	return -EAGAIN;
 }
@@ -1270,69 +1218,3 @@ int drbd_rs_del_all(struct drbd_device *device)
 
 	return 0;
 }
-
-/**
- * drbd_rs_failed_io() - Record information on a failure to resync the specified blocks
- * @device:	DRBD device.
- * @sector:	The sector number.
- * @size:	Size of failed IO operation, in byte.
- */
-void drbd_rs_failed_io(struct drbd_device *device, sector_t sector, int size)
-{
-	/* Is called from worker and receiver context _only_ */
-	unsigned long sbnr, ebnr, lbnr;
-	unsigned long count;
-	sector_t esector, nr_sectors;
-	int wake_up = 0;
-
-	if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_DISCARD_SIZE) {
-		drbd_err(device, "drbd_rs_failed_io: sector=%llus size=%d nonsense!\n",
-				(unsigned long long)sector, size);
-		return;
-	}
-	nr_sectors = drbd_get_capacity(device->this_bdev);
-	esector = sector + (size >> 9) - 1;
-
-	if (!expect(sector < nr_sectors))
-		return;
-	if (!expect(esector < nr_sectors))
-		esector = nr_sectors - 1;
-
-	lbnr = BM_SECT_TO_BIT(nr_sectors-1);
-
-	/*
-	 * round up start sector, round down end sector.  we make sure we only
-	 * handle full, aligned, BM_BLOCK_SIZE (4K) blocks */
-	if (unlikely(esector < BM_SECT_PER_BIT-1))
-		return;
-	if (unlikely(esector == (nr_sectors-1)))
-		ebnr = lbnr;
-	else
-		ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1));
-	sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1);
-
-	if (sbnr > ebnr)
-		return;
-
-	/*
-	 * ok, (capacity & 7) != 0 sometimes, but who cares...
-	 * we count rs_{total,left} in bits, not sectors.
-	 */
-	spin_lock_irq(&device->al_lock);
-	count = drbd_bm_count_bits(device, sbnr, ebnr);
-	if (count) {
-		device->rs_failed += count;
-
-		if (get_ldev(device)) {
-			drbd_try_clear_on_disk_bm(device, sector, count, false);
-			put_ldev(device);
-		}
-
-		/* just wake_up unconditional now, various lc_chaged(),
-		 * lc_put() in drbd_try_clear_on_disk_bm(). */
-		wake_up = 1;
-	}
-	spin_unlock_irq(&device->al_lock);
-	if (wake_up)
-		wake_up(&device->al_wait);
-}
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
index 1aa29f8fdfe1..426c97aef900 100644
--- a/drivers/block/drbd/drbd_bitmap.c
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -22,6 +22,8 @@
    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
+#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
+
 #include <linux/bitops.h>
 #include <linux/vmalloc.h>
 #include <linux/string.h>
@@ -353,9 +355,8 @@ static void bm_free_pages(struct page **pages, unsigned long number)
 
 	for (i = 0; i < number; i++) {
 		if (!pages[i]) {
-			printk(KERN_ALERT "drbd: bm_free_pages tried to free "
-					  "a NULL pointer; i=%lu n=%lu\n",
-					  i, number);
+			pr_alert("bm_free_pages tried to free a NULL pointer; i=%lu n=%lu\n",
+				 i, number);
 			continue;
 		}
 		__free_page(pages[i]);
@@ -592,7 +593,7 @@ static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len)
 	end = offset + len;
 
 	if (end > b->bm_words) {
-		printk(KERN_ALERT "drbd: bm_memset end > bm_words\n");
+		pr_alert("bm_memset end > bm_words\n");
 		return;
 	}
 
@@ -602,7 +603,7 @@ static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len)
 		p_addr = bm_map_pidx(b, idx);
 		bm = p_addr + MLPP(offset);
 		if (bm+do_now > p_addr + LWPP) {
-			printk(KERN_ALERT "drbd: BUG BUG BUG! p_addr:%p bm:%p do_now:%d\n",
+			pr_alert("BUG BUG BUG! p_addr:%p bm:%p do_now:%d\n",
 			       p_addr, bm, (int)do_now);
 		} else
 			memset(bm, c, do_now * sizeof(long));
@@ -927,22 +928,14 @@ void drbd_bm_clear_all(struct drbd_device *device)
 	spin_unlock_irq(&b->bm_lock);
 }
 
-struct bm_aio_ctx {
-	struct drbd_device *device;
-	atomic_t in_flight;
-	unsigned int done;
-	unsigned flags;
-#define BM_AIO_COPY_PAGES	1
-#define BM_AIO_WRITE_HINTED	2
-#define BM_WRITE_ALL_PAGES	4
-	int error;
-	struct kref kref;
-};
-
-static void bm_aio_ctx_destroy(struct kref *kref)
+static void drbd_bm_aio_ctx_destroy(struct kref *kref)
 {
-	struct bm_aio_ctx *ctx = container_of(kref, struct bm_aio_ctx, kref);
+	struct drbd_bm_aio_ctx *ctx = container_of(kref, struct drbd_bm_aio_ctx, kref);
+	unsigned long flags;
 
+	spin_lock_irqsave(&ctx->device->resource->req_lock, flags);
+	list_del(&ctx->list);
+	spin_unlock_irqrestore(&ctx->device->resource->req_lock, flags);
 	put_ldev(ctx->device);
 	kfree(ctx);
 }
@@ -950,7 +943,7 @@ static void bm_aio_ctx_destroy(struct kref *kref)
 /* bv_page may be a copy, or may be the original */
 static void bm_async_io_complete(struct bio *bio, int error)
 {
-	struct bm_aio_ctx *ctx = bio->bi_private;
+	struct drbd_bm_aio_ctx *ctx = bio->bi_private;
 	struct drbd_device *device = ctx->device;
 	struct drbd_bitmap *b = device->bitmap;
 	unsigned int idx = bm_page_to_idx(bio->bi_io_vec[0].bv_page);
@@ -993,17 +986,18 @@ static void bm_async_io_complete(struct bio *bio, int error)
 	if (atomic_dec_and_test(&ctx->in_flight)) {
 		ctx->done = 1;
 		wake_up(&device->misc_wait);
-		kref_put(&ctx->kref, &bm_aio_ctx_destroy);
+		kref_put(&ctx->kref, &drbd_bm_aio_ctx_destroy);
 	}
 }
 
-static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must_hold(local)
+static void bm_page_io_async(struct drbd_bm_aio_ctx *ctx, int page_nr) __must_hold(local)
 {
 	struct bio *bio = bio_alloc_drbd(GFP_NOIO);
 	struct drbd_device *device = ctx->device;
 	struct drbd_bitmap *b = device->bitmap;
 	struct page *page;
 	unsigned int len;
+	unsigned int rw = (ctx->flags & BM_AIO_READ) ? READ : WRITE;
 
 	sector_t on_disk_sector =
 		device->ldev->md.md_offset + device->ldev->md.bm_offset;
@@ -1049,9 +1043,9 @@ static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must
 /*
  * bm_rw: read/write the whole bitmap from/to its on disk location.
  */
-static int bm_rw(struct drbd_device *device, int rw, unsigned flags, unsigned lazy_writeout_upper_idx) __must_hold(local)
+static int bm_rw(struct drbd_device *device, const unsigned int flags, unsigned lazy_writeout_upper_idx) __must_hold(local)
 {
-	struct bm_aio_ctx *ctx;
+	struct drbd_bm_aio_ctx *ctx;
 	struct drbd_bitmap *b = device->bitmap;
 	int num_pages, i, count = 0;
 	unsigned long now;
@@ -1067,12 +1061,13 @@ static int bm_rw(struct drbd_device *device, int rw, unsigned flags, unsigned la
 	 * as we submit copies of pages anyways.
 	 */
 
-	ctx = kmalloc(sizeof(struct bm_aio_ctx), GFP_NOIO);
+	ctx = kmalloc(sizeof(struct drbd_bm_aio_ctx), GFP_NOIO);
 	if (!ctx)
 		return -ENOMEM;
 
-	*ctx = (struct bm_aio_ctx) {
+	*ctx = (struct drbd_bm_aio_ctx) {
 		.device = device,
+		.start_jif = jiffies,
 		.in_flight = ATOMIC_INIT(1),
 		.done = 0,
 		.flags = flags,
@@ -1080,15 +1075,21 @@ static int bm_rw(struct drbd_device *device, int rw, unsigned flags, unsigned la
 		.kref = { ATOMIC_INIT(2) },
 	};
 
-	if (!get_ldev_if_state(device, D_ATTACHING)) {  /* put is in bm_aio_ctx_destroy() */
+	if (!get_ldev_if_state(device, D_ATTACHING)) {  /* put is in drbd_bm_aio_ctx_destroy() */
 		drbd_err(device, "ASSERT FAILED: get_ldev_if_state() == 1 in bm_rw()\n");
 		kfree(ctx);
 		return -ENODEV;
 	}
+	/* Here D_ATTACHING is sufficient since drbd_bm_read() is called only from
+	   drbd_adm_attach(), after device->ldev was assigned. */
 
-	if (!ctx->flags)
+	if (0 == (ctx->flags & ~BM_AIO_READ))
 		WARN_ON(!(BM_LOCKED_MASK & b->bm_flags));
 
+	spin_lock_irq(&device->resource->req_lock);
+	list_add_tail(&ctx->list, &device->pending_bitmap_io);
+	spin_unlock_irq(&device->resource->req_lock);
+
 	num_pages = b->bm_number_of_pages;
 
 	now = jiffies;
@@ -1098,13 +1099,13 @@ static int bm_rw(struct drbd_device *device, int rw, unsigned flags, unsigned la
 		/* ignore completely unchanged pages */
 		if (lazy_writeout_upper_idx && i == lazy_writeout_upper_idx)
 			break;
-		if (rw & WRITE) {
+		if (!(flags & BM_AIO_READ)) {
 			if ((flags & BM_AIO_WRITE_HINTED) &&
 			    !test_and_clear_bit(BM_PAGE_HINT_WRITEOUT,
 				    &page_private(b->bm_pages[i])))
 				continue;
 
-			if (!(flags & BM_WRITE_ALL_PAGES) &&
+			if (!(flags & BM_AIO_WRITE_ALL_PAGES) &&
 			    bm_test_page_unchanged(b->bm_pages[i])) {
 				dynamic_drbd_dbg(device, "skipped bm write for idx %u\n", i);
 				continue;
@@ -1118,7 +1119,7 @@ static int bm_rw(struct drbd_device *device, int rw, unsigned flags, unsigned la
 			}
 		}
 		atomic_inc(&ctx->in_flight);
-		bm_page_io_async(ctx, i, rw);
+		bm_page_io_async(ctx, i);
 		++count;
 		cond_resched();
 	}
@@ -1134,12 +1135,12 @@ static int bm_rw(struct drbd_device *device, int rw, unsigned flags, unsigned la
 	if (!atomic_dec_and_test(&ctx->in_flight))
 		wait_until_done_or_force_detached(device, device->ldev, &ctx->done);
 	else
-		kref_put(&ctx->kref, &bm_aio_ctx_destroy);
+		kref_put(&ctx->kref, &drbd_bm_aio_ctx_destroy);
 
 	/* summary for global bitmap IO */
 	if (flags == 0)
 		drbd_info(device, "bitmap %s of %u pages took %lu jiffies\n",
-			 rw == WRITE ? "WRITE" : "READ",
+			 (flags & BM_AIO_READ) ? "READ" : "WRITE",
 			 count, jiffies - now);
 
 	if (ctx->error) {
@@ -1152,20 +1153,18 @@ static int bm_rw(struct drbd_device *device, int rw, unsigned flags, unsigned la
 		err = -EIO; /* Disk timeout/force-detach during IO... */
 
 	now = jiffies;
-	if (rw == WRITE) {
-		drbd_md_flush(device);
-	} else /* rw == READ */ {
+	if (flags & BM_AIO_READ) {
 		b->bm_set = bm_count_bits(b);
 		drbd_info(device, "recounting of set bits took additional %lu jiffies\n",
 		     jiffies - now);
 	}
 	now = b->bm_set;
 
-	if (flags == 0)
+	if ((flags & ~BM_AIO_READ) == 0)
 		drbd_info(device, "%s (%lu bits) marked out-of-sync by on disk bit-map.\n",
 		     ppsize(ppb, now << (BM_BLOCK_SHIFT-10)), now);
 
-	kref_put(&ctx->kref, &bm_aio_ctx_destroy);
+	kref_put(&ctx->kref, &drbd_bm_aio_ctx_destroy);
 	return err;
 }
 
@@ -1175,7 +1174,7 @@ static int bm_rw(struct drbd_device *device, int rw, unsigned flags, unsigned la
  */
 int drbd_bm_read(struct drbd_device *device) __must_hold(local)
 {
-	return bm_rw(device, READ, 0, 0);
+	return bm_rw(device, BM_AIO_READ, 0);
 }
 
 /**
@@ -1186,7 +1185,7 @@ int drbd_bm_read(struct drbd_device *device) __must_hold(local)
  */
 int drbd_bm_write(struct drbd_device *device) __must_hold(local)
 {
-	return bm_rw(device, WRITE, 0, 0);
+	return bm_rw(device, 0, 0);
 }
 
 /**
@@ -1197,7 +1196,17 @@ int drbd_bm_write(struct drbd_device *device) __must_hold(local)
  */
 int drbd_bm_write_all(struct drbd_device *device) __must_hold(local)
 {
-	return bm_rw(device, WRITE, BM_WRITE_ALL_PAGES, 0);
+	return bm_rw(device, BM_AIO_WRITE_ALL_PAGES, 0);
+}
+
+/**
+ * drbd_bm_write_lazy() - Write bitmap pages 0 to @upper_idx-1, if they have changed.
+ * @device:	DRBD device.
+ * @upper_idx:	0: write all changed pages; +ve: page index to stop scanning for changed pages
+ */
+int drbd_bm_write_lazy(struct drbd_device *device, unsigned upper_idx) __must_hold(local)
+{
+	return bm_rw(device, BM_AIO_COPY_PAGES, upper_idx);
 }
 
 /**
@@ -1213,7 +1222,7 @@ int drbd_bm_write_all(struct drbd_device *device) __must_hold(local)
  */
 int drbd_bm_write_copy_pages(struct drbd_device *device) __must_hold(local)
 {
-	return bm_rw(device, WRITE, BM_AIO_COPY_PAGES, 0);
+	return bm_rw(device, BM_AIO_COPY_PAGES, 0);
 }
 
 /**
@@ -1222,62 +1231,7 @@ int drbd_bm_write_copy_pages(struct drbd_device *device) __must_hold(local)
  */
 int drbd_bm_write_hinted(struct drbd_device *device) __must_hold(local)
 {
-	return bm_rw(device, WRITE, BM_AIO_WRITE_HINTED | BM_AIO_COPY_PAGES, 0);
-}
-
-/**
- * drbd_bm_write_page() - Writes a PAGE_SIZE aligned piece of bitmap
- * @device:	DRBD device.
- * @idx:	bitmap page index
- *
- * We don't want to special case on logical_block_size of the backend device,
- * so we submit PAGE_SIZE aligned pieces.
- * Note that on "most" systems, PAGE_SIZE is 4k.
- *
- * In case this becomes an issue on systems with larger PAGE_SIZE,
- * we may want to change this again to write 4k aligned 4k pieces.
- */
-int drbd_bm_write_page(struct drbd_device *device, unsigned int idx) __must_hold(local)
-{
-	struct bm_aio_ctx *ctx;
-	int err;
-
-	if (bm_test_page_unchanged(device->bitmap->bm_pages[idx])) {
-		dynamic_drbd_dbg(device, "skipped bm page write for idx %u\n", idx);
-		return 0;
-	}
-
-	ctx = kmalloc(sizeof(struct bm_aio_ctx), GFP_NOIO);
-	if (!ctx)
-		return -ENOMEM;
-
-	*ctx = (struct bm_aio_ctx) {
-		.device = device,
-		.in_flight = ATOMIC_INIT(1),
-		.done = 0,
-		.flags = BM_AIO_COPY_PAGES,
-		.error = 0,
-		.kref = { ATOMIC_INIT(2) },
-	};
-
-	if (!get_ldev_if_state(device, D_ATTACHING)) {  /* put is in bm_aio_ctx_destroy() */
-		drbd_err(device, "ASSERT FAILED: get_ldev_if_state() == 1 in drbd_bm_write_page()\n");
-		kfree(ctx);
-		return -ENODEV;
-	}
-
-	bm_page_io_async(ctx, idx, WRITE_SYNC);
-	wait_until_done_or_force_detached(device, device->ldev, &ctx->done);
-
-	if (ctx->error)
-		drbd_chk_io_error(device, 1, DRBD_META_IO_ERROR);
-		/* that causes us to detach, so the in memory bitmap will be
-		 * gone in a moment as well. */
-
-	device->bm_writ_cnt++;
-	err = atomic_read(&ctx->in_flight) ? -EIO : ctx->error;
-	kref_put(&ctx->kref, &bm_aio_ctx_destroy);
-	return err;
+	return bm_rw(device, BM_AIO_WRITE_HINTED | BM_AIO_COPY_PAGES, 0);
 }
 
 /* NOTE
diff --git a/drivers/block/drbd/drbd_debugfs.c b/drivers/block/drbd/drbd_debugfs.c
new file mode 100644
index 000000000000..5c20b18540b8
--- /dev/null
+++ b/drivers/block/drbd/drbd_debugfs.c
@@ -0,0 +1,958 @@
+#define pr_fmt(fmt) "drbd debugfs: " fmt
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include <linux/stat.h>
+#include <linux/jiffies.h>
+#include <linux/list.h>
+
+#include "drbd_int.h"
+#include "drbd_req.h"
+#include "drbd_debugfs.h"
+
+
+/**********************************************************************
+ * Whenever you change the file format, remember to bump the version. *
+ **********************************************************************/
+
+static struct dentry *drbd_debugfs_root;
+static struct dentry *drbd_debugfs_version;
+static struct dentry *drbd_debugfs_resources;
+static struct dentry *drbd_debugfs_minors;
+
+static void seq_print_age_or_dash(struct seq_file *m, bool valid, unsigned long dt)
+{
+	if (valid)
+		seq_printf(m, "\t%d", jiffies_to_msecs(dt));
+	else
+		seq_printf(m, "\t-");
+}
+
+static void __seq_print_rq_state_bit(struct seq_file *m,
+	bool is_set, char *sep, const char *set_name, const char *unset_name)
+{
+	if (is_set && set_name) {
+		seq_putc(m, *sep);
+		seq_puts(m, set_name);
+		*sep = '|';
+	} else if (!is_set && unset_name) {
+		seq_putc(m, *sep);
+		seq_puts(m, unset_name);
+		*sep = '|';
+	}
+}
+
+static void seq_print_rq_state_bit(struct seq_file *m,
+	bool is_set, char *sep, const char *set_name)
+{
+	__seq_print_rq_state_bit(m, is_set, sep, set_name, NULL);
+}
+
+/* pretty print enum drbd_req_state_bits req->rq_state */
+static void seq_print_request_state(struct seq_file *m, struct drbd_request *req)
+{
+	unsigned int s = req->rq_state;
+	char sep = ' ';
+	seq_printf(m, "\t0x%08x", s);
+	seq_printf(m, "\tmaster: %s", req->master_bio ? "pending" : "completed");
+
+	/* RQ_WRITE ignored, already reported */
+	seq_puts(m, "\tlocal:");
+	seq_print_rq_state_bit(m, s & RQ_IN_ACT_LOG, &sep, "in-AL");
+	seq_print_rq_state_bit(m, s & RQ_POSTPONED, &sep, "postponed");
+	seq_print_rq_state_bit(m, s & RQ_COMPLETION_SUSP, &sep, "suspended");
+	sep = ' ';
+	seq_print_rq_state_bit(m, s & RQ_LOCAL_PENDING, &sep, "pending");
+	seq_print_rq_state_bit(m, s & RQ_LOCAL_COMPLETED, &sep, "completed");
+	seq_print_rq_state_bit(m, s & RQ_LOCAL_ABORTED, &sep, "aborted");
+	seq_print_rq_state_bit(m, s & RQ_LOCAL_OK, &sep, "ok");
+	if (sep == ' ')
+		seq_puts(m, " -");
+
+	/* for_each_connection ... */
+	seq_printf(m, "\tnet:");
+	sep = ' ';
+	seq_print_rq_state_bit(m, s & RQ_NET_PENDING, &sep, "pending");
+	seq_print_rq_state_bit(m, s & RQ_NET_QUEUED, &sep, "queued");
+	seq_print_rq_state_bit(m, s & RQ_NET_SENT, &sep, "sent");
+	seq_print_rq_state_bit(m, s & RQ_NET_DONE, &sep, "done");
+	seq_print_rq_state_bit(m, s & RQ_NET_SIS, &sep, "sis");
+	seq_print_rq_state_bit(m, s & RQ_NET_OK, &sep, "ok");
+	if (sep == ' ')
+		seq_puts(m, " -");
+
+	seq_printf(m, " :");
+	sep = ' ';
+	seq_print_rq_state_bit(m, s & RQ_EXP_RECEIVE_ACK, &sep, "B");
+	seq_print_rq_state_bit(m, s & RQ_EXP_WRITE_ACK, &sep, "C");
+	seq_print_rq_state_bit(m, s & RQ_EXP_BARR_ACK, &sep, "barr");
+	if (sep == ' ')
+		seq_puts(m, " -");
+	seq_printf(m, "\n");
+}
+
+static void seq_print_one_request(struct seq_file *m, struct drbd_request *req, unsigned long now)
+{
+	/* change anything here, fixup header below! */
+	unsigned int s = req->rq_state;
+
+#define RQ_HDR_1 "epoch\tsector\tsize\trw"
+	seq_printf(m, "0x%x\t%llu\t%u\t%s",
+		req->epoch,
+		(unsigned long long)req->i.sector, req->i.size >> 9,
+		(s & RQ_WRITE) ? "W" : "R");
+
+#define RQ_HDR_2 "\tstart\tin AL\tsubmit"
+	seq_printf(m, "\t%d", jiffies_to_msecs(now - req->start_jif));
+	seq_print_age_or_dash(m, s & RQ_IN_ACT_LOG, now - req->in_actlog_jif);
+	seq_print_age_or_dash(m, s & RQ_LOCAL_PENDING, now - req->pre_submit_jif);
+
+#define RQ_HDR_3 "\tsent\tacked\tdone"
+	seq_print_age_or_dash(m, s & RQ_NET_SENT, now - req->pre_send_jif);
+	seq_print_age_or_dash(m, (s & RQ_NET_SENT) && !(s & RQ_NET_PENDING), now - req->acked_jif);
+	seq_print_age_or_dash(m, s & RQ_NET_DONE, now - req->net_done_jif);
+
+#define RQ_HDR_4 "\tstate\n"
+	seq_print_request_state(m, req);
+}
+#define RQ_HDR RQ_HDR_1 RQ_HDR_2 RQ_HDR_3 RQ_HDR_4
+
+static void seq_print_minor_vnr_req(struct seq_file *m, struct drbd_request *req, unsigned long now)
+{
+	seq_printf(m, "%u\t%u\t", req->device->minor, req->device->vnr);
+	seq_print_one_request(m, req, now);
+}
+
+static void seq_print_resource_pending_meta_io(struct seq_file *m, struct drbd_resource *resource, unsigned long now)
+{
+	struct drbd_device *device;
+	unsigned int i;
+
+	seq_puts(m, "minor\tvnr\tstart\tsubmit\tintent\n");
+	rcu_read_lock();
+	idr_for_each_entry(&resource->devices, device, i) {
+		struct drbd_md_io tmp;
+		/* In theory this is racy,
+		 * in the sense that there could have been a
+		 * drbd_md_put_buffer(); drbd_md_get_buffer();
+		 * between accessing these members here.  */
+		tmp = device->md_io;
+		if (atomic_read(&tmp.in_use)) {
+			seq_printf(m, "%u\t%u\t%d\t",
+				device->minor, device->vnr,
+				jiffies_to_msecs(now - tmp.start_jif));
+			if (time_before(tmp.submit_jif, tmp.start_jif))
+				seq_puts(m, "-\t");
+			else
+				seq_printf(m, "%d\t", jiffies_to_msecs(now - tmp.submit_jif));
+			seq_printf(m, "%s\n", tmp.current_use);
+		}
+	}
+	rcu_read_unlock();
+}
+
+static void seq_print_waiting_for_AL(struct seq_file *m, struct drbd_resource *resource, unsigned long now)
+{
+	struct drbd_device *device;
+	unsigned int i;
+
+	seq_puts(m, "minor\tvnr\tage\t#waiting\n");
+	rcu_read_lock();
+	idr_for_each_entry(&resource->devices, device, i) {
+		unsigned long jif;
+		struct drbd_request *req;
+		int n = atomic_read(&device->ap_actlog_cnt);
+		if (n) {
+			spin_lock_irq(&device->resource->req_lock);
+			req = list_first_entry_or_null(&device->pending_master_completion[1],
+				struct drbd_request, req_pending_master_completion);
+			/* if the oldest request does not wait for the activity log
+			 * it is not interesting for us here */
+			if (req && !(req->rq_state & RQ_IN_ACT_LOG))
+				jif = req->start_jif;
+			else
+				req = NULL;
+			spin_unlock_irq(&device->resource->req_lock);
+		}
+		if (n) {
+			seq_printf(m, "%u\t%u\t", device->minor, device->vnr);
+			if (req)
+				seq_printf(m, "%u\t", jiffies_to_msecs(now - jif));
+			else
+				seq_puts(m, "-\t");
+			seq_printf(m, "%u\n", n);
+		}
+	}
+	rcu_read_unlock();
+}
+
+static void seq_print_device_bitmap_io(struct seq_file *m, struct drbd_device *device, unsigned long now)
+{
+	struct drbd_bm_aio_ctx *ctx;
+	unsigned long start_jif;
+	unsigned int in_flight;
+	unsigned int flags;
+	spin_lock_irq(&device->resource->req_lock);
+	ctx = list_first_entry_or_null(&device->pending_bitmap_io, struct drbd_bm_aio_ctx, list);
+	if (ctx && ctx->done)
+		ctx = NULL;
+	if (ctx) {
+		start_jif = ctx->start_jif;
+		in_flight = atomic_read(&ctx->in_flight);
+		flags = ctx->flags;
+	}
+	spin_unlock_irq(&device->resource->req_lock);
+	if (ctx) {
+		seq_printf(m, "%u\t%u\t%c\t%u\t%u\n",
+			device->minor, device->vnr,
+			(flags & BM_AIO_READ) ? 'R' : 'W',
+			jiffies_to_msecs(now - start_jif),
+			in_flight);
+	}
+}
+
+static void seq_print_resource_pending_bitmap_io(struct seq_file *m, struct drbd_resource *resource, unsigned long now)
+{
+	struct drbd_device *device;
+	unsigned int i;
+
+	seq_puts(m, "minor\tvnr\trw\tage\t#in-flight\n");
+	rcu_read_lock();
+	idr_for_each_entry(&resource->devices, device, i) {
+		seq_print_device_bitmap_io(m, device, now);
+	}
+	rcu_read_unlock();
+}
+
+/* pretty print enum peer_req->flags */
+static void seq_print_peer_request_flags(struct seq_file *m, struct drbd_peer_request *peer_req)
+{
+	unsigned long f = peer_req->flags;
+	char sep = ' ';
+
+	__seq_print_rq_state_bit(m, f & EE_SUBMITTED, &sep, "submitted", "preparing");
+	__seq_print_rq_state_bit(m, f & EE_APPLICATION, &sep, "application", "internal");
+	seq_print_rq_state_bit(m, f & EE_CALL_AL_COMPLETE_IO, &sep, "in-AL");
+	seq_print_rq_state_bit(m, f & EE_SEND_WRITE_ACK, &sep, "C");
+	seq_print_rq_state_bit(m, f & EE_MAY_SET_IN_SYNC, &sep, "set-in-sync");
+
+	if (f & EE_IS_TRIM) {
+		seq_putc(m, sep);
+		sep = '|';
+		if (f & EE_IS_TRIM_USE_ZEROOUT)
+			seq_puts(m, "zero-out");
+		else
+			seq_puts(m, "trim");
+	}
+	seq_putc(m, '\n');
+}
+
+static void seq_print_peer_request(struct seq_file *m,
+	struct drbd_device *device, struct list_head *lh,
+	unsigned long now)
+{
+	bool reported_preparing = false;
+	struct drbd_peer_request *peer_req;
+	list_for_each_entry(peer_req, lh, w.list) {
+		if (reported_preparing && !(peer_req->flags & EE_SUBMITTED))
+			continue;
+
+		if (device)
+			seq_printf(m, "%u\t%u\t", device->minor, device->vnr);
+
+		seq_printf(m, "%llu\t%u\t%c\t%u\t",
+			(unsigned long long)peer_req->i.sector, peer_req->i.size >> 9,
+			(peer_req->flags & EE_WRITE) ? 'W' : 'R',
+			jiffies_to_msecs(now - peer_req->submit_jif));
+		seq_print_peer_request_flags(m, peer_req);
+		if (peer_req->flags & EE_SUBMITTED)
+			break;
+		else
+			reported_preparing = true;
+	}
+}
+
+static void seq_print_device_peer_requests(struct seq_file *m,
+	struct drbd_device *device, unsigned long now)
+{
+	seq_puts(m, "minor\tvnr\tsector\tsize\trw\tage\tflags\n");
+	spin_lock_irq(&device->resource->req_lock);
+	seq_print_peer_request(m, device, &device->active_ee, now);
+	seq_print_peer_request(m, device, &device->read_ee, now);
+	seq_print_peer_request(m, device, &device->sync_ee, now);
+	spin_unlock_irq(&device->resource->req_lock);
+	if (test_bit(FLUSH_PENDING, &device->flags)) {
+		seq_printf(m, "%u\t%u\t-\t-\tF\t%u\tflush\n",
+			device->minor, device->vnr,
+			jiffies_to_msecs(now - device->flush_jif));
+	}
+}
+
+static void seq_print_resource_pending_peer_requests(struct seq_file *m,
+	struct drbd_resource *resource, unsigned long now)
+{
+	struct drbd_device *device;
+	unsigned int i;
+
+	rcu_read_lock();
+	idr_for_each_entry(&resource->devices, device, i) {
+		seq_print_device_peer_requests(m, device, now);
+	}
+	rcu_read_unlock();
+}
+
+static void seq_print_resource_transfer_log_summary(struct seq_file *m,
+	struct drbd_resource *resource,
+	struct drbd_connection *connection,
+	unsigned long now)
+{
+	struct drbd_request *req;
+	unsigned int count = 0;
+	unsigned int show_state = 0;
+
+	seq_puts(m, "n\tdevice\tvnr\t" RQ_HDR);
+	spin_lock_irq(&resource->req_lock);
+	list_for_each_entry(req, &connection->transfer_log, tl_requests) {
+		unsigned int tmp = 0;
+		unsigned int s;
+		++count;
+
+		/* don't disable irq "forever" */
+		if (!(count & 0x1ff)) {
+			struct drbd_request *req_next;
+			kref_get(&req->kref);
+			spin_unlock_irq(&resource->req_lock);
+			cond_resched();
+			spin_lock_irq(&resource->req_lock);
+			req_next = list_next_entry(req, tl_requests);
+			if (kref_put(&req->kref, drbd_req_destroy))
+				req = req_next;
+			if (&req->tl_requests == &connection->transfer_log)
+				break;
+		}
+
+		s = req->rq_state;
+
+		/* This is meant to summarize timing issues, to be able to tell
+		 * local disk problems from network problems.
+		 * Skip requests, if we have shown an even older request with
+		 * similar aspects already.  */
+		if (req->master_bio == NULL)
+			tmp |= 1;
+		if ((s & RQ_LOCAL_MASK) && (s & RQ_LOCAL_PENDING))
+			tmp |= 2;
+		if (s & RQ_NET_MASK) {
+			if (!(s & RQ_NET_SENT))
+				tmp |= 4;
+			if (s & RQ_NET_PENDING)
+				tmp |= 8;
+			if (!(s & RQ_NET_DONE))
+				tmp |= 16;
+		}
+		if ((tmp & show_state) == tmp)
+			continue;
+		show_state |= tmp;
+		seq_printf(m, "%u\t", count);
+		seq_print_minor_vnr_req(m, req, now);
+		if (show_state == 0x1f)
+			break;
+	}
+	spin_unlock_irq(&resource->req_lock);
+}
+
+/* TODO: transfer_log and friends should be moved to resource */
+static int in_flight_summary_show(struct seq_file *m, void *pos)
+{
+	struct drbd_resource *resource = m->private;
+	struct drbd_connection *connection;
+	unsigned long jif = jiffies;
+
+	connection = first_connection(resource);
+	/* This does not happen, actually.
+	 * But be robust and prepare for future code changes. */
+	if (!connection || !kref_get_unless_zero(&connection->kref))
+		return -ESTALE;
+
+	/* BUMP me if you change the file format/content/presentation */
+	seq_printf(m, "v: %u\n\n", 0);
+
+	seq_puts(m, "oldest bitmap IO\n");
+	seq_print_resource_pending_bitmap_io(m, resource, jif);
+	seq_putc(m, '\n');
+
+	seq_puts(m, "meta data IO\n");
+	seq_print_resource_pending_meta_io(m, resource, jif);
+	seq_putc(m, '\n');
+
+	seq_puts(m, "socket buffer stats\n");
+	/* for each connection ... once we have more than one */
+	rcu_read_lock();
+	if (connection->data.socket) {
+		/* open coded SIOCINQ, the "relevant" part */
+		struct tcp_sock *tp = tcp_sk(connection->data.socket->sk);
+		int answ = tp->rcv_nxt - tp->copied_seq;
+		seq_printf(m, "unread receive buffer: %u Byte\n", answ);
+		/* open coded SIOCOUTQ, the "relevant" part */
+		answ = tp->write_seq - tp->snd_una;
+		seq_printf(m, "unacked send buffer: %u Byte\n", answ);
+	}
+	rcu_read_unlock();
+	seq_putc(m, '\n');
+
+	seq_puts(m, "oldest peer requests\n");
+	seq_print_resource_pending_peer_requests(m, resource, jif);
+	seq_putc(m, '\n');
+
+	seq_puts(m, "application requests waiting for activity log\n");
+	seq_print_waiting_for_AL(m, resource, jif);
+	seq_putc(m, '\n');
+
+	seq_puts(m, "oldest application requests\n");
+	seq_print_resource_transfer_log_summary(m, resource, connection, jif);
+	seq_putc(m, '\n');
+
+	jif = jiffies - jif;
+	if (jif)
+		seq_printf(m, "generated in %d ms\n", jiffies_to_msecs(jif));
+	kref_put(&connection->kref, drbd_destroy_connection);
+	return 0;
+}
+
+/* simple_positive(file->f_dentry) respectively debugfs_positive(),
+ * but neither is "reachable" from here.
+ * So we have our own inline version of it above.  :-( */
+static inline int debugfs_positive(struct dentry *dentry)
+{
+        return dentry->d_inode && !d_unhashed(dentry);
+}
+
+/* make sure at *open* time that the respective object won't go away. */
+static int drbd_single_open(struct file *file, int (*show)(struct seq_file *, void *),
+		                void *data, struct kref *kref,
+				void (*release)(struct kref *))
+{
+	struct dentry *parent;
+	int ret = -ESTALE;
+
+	/* Are we still linked,
+	 * or has debugfs_remove() already been called? */
+	parent = file->f_dentry->d_parent;
+	/* not sure if this can happen: */
+	if (!parent || !parent->d_inode)
+		goto out;
+	/* serialize with d_delete() */
+	mutex_lock(&parent->d_inode->i_mutex);
+	/* Make sure the object is still alive */
+	if (debugfs_positive(file->f_dentry)
+	&& kref_get_unless_zero(kref))
+		ret = 0;
+	mutex_unlock(&parent->d_inode->i_mutex);
+	if (!ret) {
+		ret = single_open(file, show, data);
+		if (ret)
+			kref_put(kref, release);
+	}
+out:
+	return ret;
+}
+
+static int in_flight_summary_open(struct inode *inode, struct file *file)
+{
+	struct drbd_resource *resource = inode->i_private;
+	return drbd_single_open(file, in_flight_summary_show, resource,
+				&resource->kref, drbd_destroy_resource);
+}
+
+static int in_flight_summary_release(struct inode *inode, struct file *file)
+{
+	struct drbd_resource *resource = inode->i_private;
+	kref_put(&resource->kref, drbd_destroy_resource);
+	return single_release(inode, file);
+}
+
+static const struct file_operations in_flight_summary_fops = {
+	.owner		= THIS_MODULE,
+	.open		= in_flight_summary_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= in_flight_summary_release,
+};
+
+void drbd_debugfs_resource_add(struct drbd_resource *resource)
+{
+	struct dentry *dentry;
+	if (!drbd_debugfs_resources)
+		return;
+
+	dentry = debugfs_create_dir(resource->name, drbd_debugfs_resources);
+	if (IS_ERR_OR_NULL(dentry))
+		goto fail;
+	resource->debugfs_res = dentry;
+
+	dentry = debugfs_create_dir("volumes", resource->debugfs_res);
+	if (IS_ERR_OR_NULL(dentry))
+		goto fail;
+	resource->debugfs_res_volumes = dentry;
+
+	dentry = debugfs_create_dir("connections", resource->debugfs_res);
+	if (IS_ERR_OR_NULL(dentry))
+		goto fail;
+	resource->debugfs_res_connections = dentry;
+
+	dentry = debugfs_create_file("in_flight_summary", S_IRUSR|S_IRGRP,
+			resource->debugfs_res, resource,
+			&in_flight_summary_fops);
+	if (IS_ERR_OR_NULL(dentry))
+		goto fail;
+	resource->debugfs_res_in_flight_summary = dentry;
+	return;
+
+fail:
+	drbd_debugfs_resource_cleanup(resource);
+	drbd_err(resource, "failed to create debugfs dentry\n");
+}
+
+static void drbd_debugfs_remove(struct dentry **dp)
+{
+	debugfs_remove(*dp);
+	*dp = NULL;
+}
+
+void drbd_debugfs_resource_cleanup(struct drbd_resource *resource)
+{
+	/* it is ok to call debugfs_remove(NULL) */
+	drbd_debugfs_remove(&resource->debugfs_res_in_flight_summary);
+	drbd_debugfs_remove(&resource->debugfs_res_connections);
+	drbd_debugfs_remove(&resource->debugfs_res_volumes);
+	drbd_debugfs_remove(&resource->debugfs_res);
+}
+
+static void seq_print_one_timing_detail(struct seq_file *m,
+	const struct drbd_thread_timing_details *tdp,
+	unsigned long now)
+{
+	struct drbd_thread_timing_details td;
+	/* No locking...
+	 * use temporary assignment to get at consistent data. */
+	do {
+		td = *tdp;
+	} while (td.cb_nr != tdp->cb_nr);
+	if (!td.cb_addr)
+		return;
+	seq_printf(m, "%u\t%d\t%s:%u\t%ps\n",
+			td.cb_nr,
+			jiffies_to_msecs(now - td.start_jif),
+			td.caller_fn, td.line,
+			td.cb_addr);
+}
+
+static void seq_print_timing_details(struct seq_file *m,
+		const char *title,
+		unsigned int cb_nr, struct drbd_thread_timing_details *tdp, unsigned long now)
+{
+	unsigned int start_idx;
+	unsigned int i;
+
+	seq_printf(m, "%s\n", title);
+	/* If not much is going on, this will result in natural ordering.
+	 * If it is very busy, we will possibly skip events, or even see wrap
+	 * arounds, which could only be avoided with locking.
+	 */
+	start_idx = cb_nr % DRBD_THREAD_DETAILS_HIST;
+	for (i = start_idx; i < DRBD_THREAD_DETAILS_HIST; i++)
+		seq_print_one_timing_detail(m, tdp+i, now);
+	for (i = 0; i < start_idx; i++)
+		seq_print_one_timing_detail(m, tdp+i, now);
+}
+
+static int callback_history_show(struct seq_file *m, void *ignored)
+{
+	struct drbd_connection *connection = m->private;
+	unsigned long jif = jiffies;
+
+	/* BUMP me if you change the file format/content/presentation */
+	seq_printf(m, "v: %u\n\n", 0);
+
+	seq_puts(m, "n\tage\tcallsite\tfn\n");
+	seq_print_timing_details(m, "worker", connection->w_cb_nr, connection->w_timing_details, jif);
+	seq_print_timing_details(m, "receiver", connection->r_cb_nr, connection->r_timing_details, jif);
+	return 0;
+}
+
+static int callback_history_open(struct inode *inode, struct file *file)
+{
+	struct drbd_connection *connection = inode->i_private;
+	return drbd_single_open(file, callback_history_show, connection,
+				&connection->kref, drbd_destroy_connection);
+}
+
+static int callback_history_release(struct inode *inode, struct file *file)
+{
+	struct drbd_connection *connection = inode->i_private;
+	kref_put(&connection->kref, drbd_destroy_connection);
+	return single_release(inode, file);
+}
+
+static const struct file_operations connection_callback_history_fops = {
+	.owner		= THIS_MODULE,
+	.open		= callback_history_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= callback_history_release,
+};
+
+static int connection_oldest_requests_show(struct seq_file *m, void *ignored)
+{
+	struct drbd_connection *connection = m->private;
+	unsigned long now = jiffies;
+	struct drbd_request *r1, *r2;
+
+	/* BUMP me if you change the file format/content/presentation */
+	seq_printf(m, "v: %u\n\n", 0);
+
+	spin_lock_irq(&connection->resource->req_lock);
+	r1 = connection->req_next;
+	if (r1)
+		seq_print_minor_vnr_req(m, r1, now);
+	r2 = connection->req_ack_pending;
+	if (r2 && r2 != r1) {
+		r1 = r2;
+		seq_print_minor_vnr_req(m, r1, now);
+	}
+	r2 = connection->req_not_net_done;
+	if (r2 && r2 != r1)
+		seq_print_minor_vnr_req(m, r2, now);
+	spin_unlock_irq(&connection->resource->req_lock);
+	return 0;
+}
+
+static int connection_oldest_requests_open(struct inode *inode, struct file *file)
+{
+	struct drbd_connection *connection = inode->i_private;
+	return drbd_single_open(file, connection_oldest_requests_show, connection,
+				&connection->kref, drbd_destroy_connection);
+}
+
+static int connection_oldest_requests_release(struct inode *inode, struct file *file)
+{
+	struct drbd_connection *connection = inode->i_private;
+	kref_put(&connection->kref, drbd_destroy_connection);
+	return single_release(inode, file);
+}
+
+static const struct file_operations connection_oldest_requests_fops = {
+	.owner		= THIS_MODULE,
+	.open		= connection_oldest_requests_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= connection_oldest_requests_release,
+};
+
+void drbd_debugfs_connection_add(struct drbd_connection *connection)
+{
+	struct dentry *conns_dir = connection->resource->debugfs_res_connections;
+	struct dentry *dentry;
+	if (!conns_dir)
+		return;
+
+	/* Once we enable mutliple peers,
+	 * these connections will have descriptive names.
+	 * For now, it is just the one connection to the (only) "peer". */
+	dentry = debugfs_create_dir("peer", conns_dir);
+	if (IS_ERR_OR_NULL(dentry))
+		goto fail;
+	connection->debugfs_conn = dentry;
+
+	dentry = debugfs_create_file("callback_history", S_IRUSR|S_IRGRP,
+			connection->debugfs_conn, connection,
+			&connection_callback_history_fops);
+	if (IS_ERR_OR_NULL(dentry))
+		goto fail;
+	connection->debugfs_conn_callback_history = dentry;
+
+	dentry = debugfs_create_file("oldest_requests", S_IRUSR|S_IRGRP,
+			connection->debugfs_conn, connection,
+			&connection_oldest_requests_fops);
+	if (IS_ERR_OR_NULL(dentry))
+		goto fail;
+	connection->debugfs_conn_oldest_requests = dentry;
+	return;
+
+fail:
+	drbd_debugfs_connection_cleanup(connection);
+	drbd_err(connection, "failed to create debugfs dentry\n");
+}
+
+void drbd_debugfs_connection_cleanup(struct drbd_connection *connection)
+{
+	drbd_debugfs_remove(&connection->debugfs_conn_callback_history);
+	drbd_debugfs_remove(&connection->debugfs_conn_oldest_requests);
+	drbd_debugfs_remove(&connection->debugfs_conn);
+}
+
+static void resync_dump_detail(struct seq_file *m, struct lc_element *e)
+{
+       struct bm_extent *bme = lc_entry(e, struct bm_extent, lce);
+
+       seq_printf(m, "%5d %s %s %s\n", bme->rs_left,
+		  test_bit(BME_NO_WRITES, &bme->flags) ? "NO_WRITES" : "---------",
+		  test_bit(BME_LOCKED, &bme->flags) ? "LOCKED" : "------",
+		  test_bit(BME_PRIORITY, &bme->flags) ? "PRIORITY" : "--------"
+		  );
+}
+
+static int device_resync_extents_show(struct seq_file *m, void *ignored)
+{
+	struct drbd_device *device = m->private;
+
+	/* BUMP me if you change the file format/content/presentation */
+	seq_printf(m, "v: %u\n\n", 0);
+
+	if (get_ldev_if_state(device, D_FAILED)) {
+		lc_seq_printf_stats(m, device->resync);
+		lc_seq_dump_details(m, device->resync, "rs_left flags", resync_dump_detail);
+		put_ldev(device);
+	}
+	return 0;
+}
+
+static int device_act_log_extents_show(struct seq_file *m, void *ignored)
+{
+	struct drbd_device *device = m->private;
+
+	/* BUMP me if you change the file format/content/presentation */
+	seq_printf(m, "v: %u\n\n", 0);
+
+	if (get_ldev_if_state(device, D_FAILED)) {
+		lc_seq_printf_stats(m, device->act_log);
+		lc_seq_dump_details(m, device->act_log, "", NULL);
+		put_ldev(device);
+	}
+	return 0;
+}
+
+static int device_oldest_requests_show(struct seq_file *m, void *ignored)
+{
+	struct drbd_device *device = m->private;
+	struct drbd_resource *resource = device->resource;
+	unsigned long now = jiffies;
+	struct drbd_request *r1, *r2;
+	int i;
+
+	/* BUMP me if you change the file format/content/presentation */
+	seq_printf(m, "v: %u\n\n", 0);
+
+	seq_puts(m, RQ_HDR);
+	spin_lock_irq(&resource->req_lock);
+	/* WRITE, then READ */
+	for (i = 1; i >= 0; --i) {
+		r1 = list_first_entry_or_null(&device->pending_master_completion[i],
+			struct drbd_request, req_pending_master_completion);
+		r2 = list_first_entry_or_null(&device->pending_completion[i],
+			struct drbd_request, req_pending_local);
+		if (r1)
+			seq_print_one_request(m, r1, now);
+		if (r2 && r2 != r1)
+			seq_print_one_request(m, r2, now);
+	}
+	spin_unlock_irq(&resource->req_lock);
+	return 0;
+}
+
+static int device_data_gen_id_show(struct seq_file *m, void *ignored)
+{
+	struct drbd_device *device = m->private;
+	struct drbd_md *md;
+	enum drbd_uuid_index idx;
+
+	if (!get_ldev_if_state(device, D_FAILED))
+		return -ENODEV;
+
+	md = &device->ldev->md;
+	spin_lock_irq(&md->uuid_lock);
+	for (idx = UI_CURRENT; idx <= UI_HISTORY_END; idx++) {
+		seq_printf(m, "0x%016llX\n", md->uuid[idx]);
+	}
+	spin_unlock_irq(&md->uuid_lock);
+	put_ldev(device);
+	return 0;
+}
+
+#define drbd_debugfs_device_attr(name)						\
+static int device_ ## name ## _open(struct inode *inode, struct file *file)	\
+{										\
+	struct drbd_device *device = inode->i_private;				\
+	return drbd_single_open(file, device_ ## name ## _show, device,		\
+				&device->kref, drbd_destroy_device);		\
+}										\
+static int device_ ## name ## _release(struct inode *inode, struct file *file)	\
+{										\
+	struct drbd_device *device = inode->i_private;				\
+	kref_put(&device->kref, drbd_destroy_device);				\
+	return single_release(inode, file);					\
+}										\
+static const struct file_operations device_ ## name ## _fops = {		\
+	.owner		= THIS_MODULE,						\
+	.open		= device_ ## name ## _open,				\
+	.read		= seq_read,						\
+	.llseek		= seq_lseek,						\
+	.release	= device_ ## name ## _release,				\
+};
+
+drbd_debugfs_device_attr(oldest_requests)
+drbd_debugfs_device_attr(act_log_extents)
+drbd_debugfs_device_attr(resync_extents)
+drbd_debugfs_device_attr(data_gen_id)
+
+void drbd_debugfs_device_add(struct drbd_device *device)
+{
+	struct dentry *vols_dir = device->resource->debugfs_res_volumes;
+	char minor_buf[8]; /* MINORMASK, MINORBITS == 20; */
+	char vnr_buf[8];   /* volume number vnr is even 16 bit only; */
+	char *slink_name = NULL;
+
+	struct dentry *dentry;
+	if (!vols_dir || !drbd_debugfs_minors)
+		return;
+
+	snprintf(vnr_buf, sizeof(vnr_buf), "%u", device->vnr);
+	dentry = debugfs_create_dir(vnr_buf, vols_dir);
+	if (IS_ERR_OR_NULL(dentry))
+		goto fail;
+	device->debugfs_vol = dentry;
+
+	snprintf(minor_buf, sizeof(minor_buf), "%u", device->minor);
+	slink_name = kasprintf(GFP_KERNEL, "../resources/%s/volumes/%u",
+			device->resource->name, device->vnr);
+	if (!slink_name)
+		goto fail;
+	dentry = debugfs_create_symlink(minor_buf, drbd_debugfs_minors, slink_name);
+	kfree(slink_name);
+	slink_name = NULL;
+	if (IS_ERR_OR_NULL(dentry))
+		goto fail;
+	device->debugfs_minor = dentry;
+
+#define DCF(name)	do {					\
+	dentry = debugfs_create_file(#name, S_IRUSR|S_IRGRP,	\
+			device->debugfs_vol, device,		\
+			&device_ ## name ## _fops);		\
+	if (IS_ERR_OR_NULL(dentry))				\
+		goto fail;					\
+	device->debugfs_vol_ ## name = dentry;			\
+	} while (0)
+
+	DCF(oldest_requests);
+	DCF(act_log_extents);
+	DCF(resync_extents);
+	DCF(data_gen_id);
+#undef DCF
+	return;
+
+fail:
+	drbd_debugfs_device_cleanup(device);
+	drbd_err(device, "failed to create debugfs entries\n");
+}
+
+void drbd_debugfs_device_cleanup(struct drbd_device *device)
+{
+	drbd_debugfs_remove(&device->debugfs_minor);
+	drbd_debugfs_remove(&device->debugfs_vol_oldest_requests);
+	drbd_debugfs_remove(&device->debugfs_vol_act_log_extents);
+	drbd_debugfs_remove(&device->debugfs_vol_resync_extents);
+	drbd_debugfs_remove(&device->debugfs_vol_data_gen_id);
+	drbd_debugfs_remove(&device->debugfs_vol);
+}
+
+void drbd_debugfs_peer_device_add(struct drbd_peer_device *peer_device)
+{
+	struct dentry *conn_dir = peer_device->connection->debugfs_conn;
+	struct dentry *dentry;
+	char vnr_buf[8];
+
+	if (!conn_dir)
+		return;
+
+	snprintf(vnr_buf, sizeof(vnr_buf), "%u", peer_device->device->vnr);
+	dentry = debugfs_create_dir(vnr_buf, conn_dir);
+	if (IS_ERR_OR_NULL(dentry))
+		goto fail;
+	peer_device->debugfs_peer_dev = dentry;
+	return;
+
+fail:
+	drbd_debugfs_peer_device_cleanup(peer_device);
+	drbd_err(peer_device, "failed to create debugfs entries\n");
+}
+
+void drbd_debugfs_peer_device_cleanup(struct drbd_peer_device *peer_device)
+{
+	drbd_debugfs_remove(&peer_device->debugfs_peer_dev);
+}
+
+static int drbd_version_show(struct seq_file *m, void *ignored)
+{
+	seq_printf(m, "# %s\n", drbd_buildtag());
+	seq_printf(m, "VERSION=%s\n", REL_VERSION);
+	seq_printf(m, "API_VERSION=%u\n", API_VERSION);
+	seq_printf(m, "PRO_VERSION_MIN=%u\n", PRO_VERSION_MIN);
+	seq_printf(m, "PRO_VERSION_MAX=%u\n", PRO_VERSION_MAX);
+	return 0;
+}
+
+static int drbd_version_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, drbd_version_show, NULL);
+}
+
+static struct file_operations drbd_version_fops = {
+	.owner = THIS_MODULE,
+	.open = drbd_version_open,
+	.llseek = seq_lseek,
+	.read = seq_read,
+	.release = single_release,
+};
+
+/* not __exit, may be indirectly called
+ * from the module-load-failure path as well. */
+void drbd_debugfs_cleanup(void)
+{
+	drbd_debugfs_remove(&drbd_debugfs_resources);
+	drbd_debugfs_remove(&drbd_debugfs_minors);
+	drbd_debugfs_remove(&drbd_debugfs_version);
+	drbd_debugfs_remove(&drbd_debugfs_root);
+}
+
+int __init drbd_debugfs_init(void)
+{
+	struct dentry *dentry;
+
+	dentry = debugfs_create_dir("drbd", NULL);
+	if (IS_ERR_OR_NULL(dentry))
+		goto fail;
+	drbd_debugfs_root = dentry;
+
+	dentry = debugfs_create_file("version", 0444, drbd_debugfs_root, NULL, &drbd_version_fops);
+	if (IS_ERR_OR_NULL(dentry))
+		goto fail;
+	drbd_debugfs_version = dentry;
+
+	dentry = debugfs_create_dir("resources", drbd_debugfs_root);
+	if (IS_ERR_OR_NULL(dentry))
+		goto fail;
+	drbd_debugfs_resources = dentry;
+
+	dentry = debugfs_create_dir("minors", drbd_debugfs_root);
+	if (IS_ERR_OR_NULL(dentry))
+		goto fail;
+	drbd_debugfs_minors = dentry;
+	return 0;
+
+fail:
+	drbd_debugfs_cleanup();
+	if (dentry)
+		return PTR_ERR(dentry);
+	else
+		return -EINVAL;
+}
diff --git a/drivers/block/drbd/drbd_debugfs.h b/drivers/block/drbd/drbd_debugfs.h
new file mode 100644
index 000000000000..8bee21340dce
--- /dev/null
+++ b/drivers/block/drbd/drbd_debugfs.h
@@ -0,0 +1,39 @@
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/debugfs.h>
+
+#include "drbd_int.h"
+
+#ifdef CONFIG_DEBUG_FS
+int __init drbd_debugfs_init(void);
+void drbd_debugfs_cleanup(void);
+
+void drbd_debugfs_resource_add(struct drbd_resource *resource);
+void drbd_debugfs_resource_cleanup(struct drbd_resource *resource);
+
+void drbd_debugfs_connection_add(struct drbd_connection *connection);
+void drbd_debugfs_connection_cleanup(struct drbd_connection *connection);
+
+void drbd_debugfs_device_add(struct drbd_device *device);
+void drbd_debugfs_device_cleanup(struct drbd_device *device);
+
+void drbd_debugfs_peer_device_add(struct drbd_peer_device *peer_device);
+void drbd_debugfs_peer_device_cleanup(struct drbd_peer_device *peer_device);
+#else
+
+static inline int __init drbd_debugfs_init(void) { return -ENODEV; }
+static inline void drbd_debugfs_cleanup(void) { }
+
+static inline void drbd_debugfs_resource_add(struct drbd_resource *resource) { }
+static inline void drbd_debugfs_resource_cleanup(struct drbd_resource *resource) { }
+
+static inline void drbd_debugfs_connection_add(struct drbd_connection *connection) { }
+static inline void drbd_debugfs_connection_cleanup(struct drbd_connection *connection) { }
+
+static inline void drbd_debugfs_device_add(struct drbd_device *device) { }
+static inline void drbd_debugfs_device_cleanup(struct drbd_device *device) { }
+
+static inline void drbd_debugfs_peer_device_add(struct drbd_peer_device *peer_device) { }
+static inline void drbd_debugfs_peer_device_cleanup(struct drbd_peer_device *peer_device) { }
+
+#endif
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index a76ceb344d64..1a000016ccdf 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -317,7 +317,63 @@ struct drbd_request {
 
 	struct list_head tl_requests; /* ring list in the transfer log */
 	struct bio *master_bio;       /* master bio pointer */
-	unsigned long start_time;
+
+	/* see struct drbd_device */
+	struct list_head req_pending_master_completion;
+	struct list_head req_pending_local;
+
+	/* for generic IO accounting */
+	unsigned long start_jif;
+
+	/* for DRBD internal statistics */
+
+	/* Minimal set of time stamps to determine if we wait for activity log
+	 * transactions, local disk or peer.  32 bit "jiffies" are good enough,
+	 * we don't expect a DRBD request to be stalled for several month.
+	 */
+
+	/* before actual request processing */
+	unsigned long in_actlog_jif;
+
+	/* local disk */
+	unsigned long pre_submit_jif;
+
+	/* per connection */
+	unsigned long pre_send_jif;
+	unsigned long acked_jif;
+	unsigned long net_done_jif;
+
+	/* Possibly even more detail to track each phase:
+	 *  master_completion_jif
+	 *      how long did it take to complete the master bio
+	 *      (application visible latency)
+	 *  allocated_jif
+	 *      how long the master bio was blocked until we finally allocated
+	 *      a tracking struct
+	 *  in_actlog_jif
+	 *      how long did we wait for activity log transactions
+	 *
+	 *  net_queued_jif
+	 *      when did we finally queue it for sending
+	 *  pre_send_jif
+	 *      when did we start sending it
+	 *  post_send_jif
+	 *      how long did we block in the network stack trying to send it
+	 *  acked_jif
+	 *      when did we receive (or fake, in protocol A) a remote ACK
+	 *  net_done_jif
+	 *      when did we receive final acknowledgement (P_BARRIER_ACK),
+	 *      or decide, e.g. on connection loss, that we do no longer expect
+	 *      anything from this peer for this request.
+	 *
+	 *  pre_submit_jif
+	 *  post_sub_jif
+	 *      when did we start submiting to the lower level device,
+	 *      and how long did we block in that submit function
+	 *  local_completion_jif
+	 *      how long did it take the lower level device to complete this request
+	 */
+
 
 	/* once it hits 0, we may complete the master_bio */
 	atomic_t completion_ref;
@@ -366,6 +422,7 @@ struct drbd_peer_request {
 	struct drbd_interval i;
 	/* see comments on ee flag bits below */
 	unsigned long flags;
+	unsigned long submit_jif;
 	union {
 		u64 block_id;
 		struct digest_info *digest;
@@ -408,6 +465,17 @@ enum {
 
 	/* Is set when net_conf had two_primaries set while creating this peer_req */
 	__EE_IN_INTERVAL_TREE,
+
+	/* for debugfs: */
+	/* has this been submitted, or does it still wait for something else? */
+	__EE_SUBMITTED,
+
+	/* this is/was a write request */
+	__EE_WRITE,
+
+	/* this originates from application on peer
+	 * (not some resync or verify or other DRBD internal request) */
+	__EE_APPLICATION,
 };
 #define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO)
 #define EE_MAY_SET_IN_SYNC     (1<<__EE_MAY_SET_IN_SYNC)
@@ -419,6 +487,9 @@ enum {
 #define EE_RESTART_REQUESTS	(1<<__EE_RESTART_REQUESTS)
 #define EE_SEND_WRITE_ACK	(1<<__EE_SEND_WRITE_ACK)
 #define EE_IN_INTERVAL_TREE	(1<<__EE_IN_INTERVAL_TREE)
+#define EE_SUBMITTED		(1<<__EE_SUBMITTED)
+#define EE_WRITE		(1<<__EE_WRITE)
+#define EE_APPLICATION		(1<<__EE_APPLICATION)
 
 /* flag bits per device */
 enum {
@@ -433,11 +504,11 @@ enum {
 	CONSIDER_RESYNC,
 
 	MD_NO_FUA,		/* Users wants us to not use FUA/FLUSH on meta data dev */
+
 	SUSPEND_IO,		/* suspend application io */
 	BITMAP_IO,		/* suspend application io;
 				   once no more io in flight, start bitmap io */
 	BITMAP_IO_QUEUED,       /* Started bitmap IO */
-	GO_DISKLESS,		/* Disk is being detached, on io-error or admin request. */
 	WAS_IO_ERROR,		/* Local disk failed, returned IO error */
 	WAS_READ_ERROR,		/* Local disk READ failed (set additionally to the above) */
 	FORCE_DETACH,		/* Force-detach from local disk, aborting any pending local IO */
@@ -450,6 +521,20 @@ enum {
 	B_RS_H_DONE,		/* Before resync handler done (already executed) */
 	DISCARD_MY_DATA,	/* discard_my_data flag per volume */
 	READ_BALANCE_RR,
+
+	FLUSH_PENDING,		/* if set, device->flush_jif is when we submitted that flush
+				 * from drbd_flush_after_epoch() */
+
+	/* cleared only after backing device related structures have been destroyed. */
+	GOING_DISKLESS,		/* Disk is being detached, because of io-error, or admin request. */
+
+	/* to be used in drbd_device_post_work() */
+	GO_DISKLESS,		/* tell worker to schedule cleanup before detach */
+	DESTROY_DISK,		/* tell worker to close backing devices and destroy related structures. */
+	MD_SYNC,		/* tell worker to call drbd_md_sync() */
+	RS_START,		/* tell worker to start resync/OV */
+	RS_PROGRESS,		/* tell worker that resync made significant progress */
+	RS_DONE,		/* tell worker that resync is done */
 };
 
 struct drbd_bitmap; /* opaque for drbd_device */
@@ -531,6 +616,11 @@ struct drbd_backing_dev {
 };
 
 struct drbd_md_io {
+	struct page *page;
+	unsigned long start_jif;	/* last call to drbd_md_get_buffer */
+	unsigned long submit_jif;	/* last _drbd_md_sync_page_io() submit */
+	const char *current_use;
+	atomic_t in_use;
 	unsigned int done;
 	int error;
 };
@@ -577,10 +667,18 @@ enum {
 				 * and potentially deadlock on, this drbd worker.
 				 */
 	DISCONNECT_SENT,
+
+	DEVICE_WORK_PENDING,	/* tell worker that some device has pending work */
 };
 
 struct drbd_resource {
 	char *name;
+#ifdef CONFIG_DEBUG_FS
+	struct dentry *debugfs_res;
+	struct dentry *debugfs_res_volumes;
+	struct dentry *debugfs_res_connections;
+	struct dentry *debugfs_res_in_flight_summary;
+#endif
 	struct kref kref;
 	struct idr devices;		/* volume number to device mapping */
 	struct list_head connections;
@@ -594,12 +692,28 @@ struct drbd_resource {
 	unsigned susp_nod:1;		/* IO suspended because no data */
 	unsigned susp_fen:1;		/* IO suspended because fence peer handler runs */
 
+	enum write_ordering_e write_ordering;
+
 	cpumask_var_t cpu_mask;
 };
 
+struct drbd_thread_timing_details
+{
+	unsigned long start_jif;
+	void *cb_addr;
+	const char *caller_fn;
+	unsigned int line;
+	unsigned int cb_nr;
+};
+
 struct drbd_connection {
 	struct list_head connections;
 	struct drbd_resource *resource;
+#ifdef CONFIG_DEBUG_FS
+	struct dentry *debugfs_conn;
+	struct dentry *debugfs_conn_callback_history;
+	struct dentry *debugfs_conn_oldest_requests;
+#endif
 	struct kref kref;
 	struct idr peer_devices;	/* volume number to peer device mapping */
 	enum drbd_conns cstate;		/* Only C_STANDALONE to C_WF_REPORT_PARAMS */
@@ -636,7 +750,6 @@ struct drbd_connection {
 	struct drbd_epoch *current_epoch;
 	spinlock_t epoch_lock;
 	unsigned int epochs;
-	enum write_ordering_e write_ordering;
 	atomic_t current_tle_nr;	/* transfer log epoch number */
 	unsigned current_tle_writes;	/* writes seen within this tl epoch */
 
@@ -645,9 +758,22 @@ struct drbd_connection {
 	struct drbd_thread worker;
 	struct drbd_thread asender;
 
+	/* cached pointers,
+	 * so we can look up the oldest pending requests more quickly.
+	 * protected by resource->req_lock */
+	struct drbd_request *req_next; /* DRBD 9: todo.req_next */
+	struct drbd_request *req_ack_pending;
+	struct drbd_request *req_not_net_done;
+
 	/* sender side */
 	struct drbd_work_queue sender_work;
 
+#define DRBD_THREAD_DETAILS_HIST	16
+	unsigned int w_cb_nr; /* keeps counting up */
+	unsigned int r_cb_nr; /* keeps counting up */
+	struct drbd_thread_timing_details w_timing_details[DRBD_THREAD_DETAILS_HIST];
+	struct drbd_thread_timing_details r_timing_details[DRBD_THREAD_DETAILS_HIST];
+
 	struct {
 		/* whether this sender thread
 		 * has processed a single write yet. */
@@ -663,11 +789,22 @@ struct drbd_connection {
 	} send;
 };
 
+void __update_timing_details(
+		struct drbd_thread_timing_details *tdp,
+		unsigned int *cb_nr,
+		void *cb,
+		const char *fn, const unsigned int line);
+
+#define update_worker_timing_details(c, cb) \
+	__update_timing_details(c->w_timing_details, &c->w_cb_nr, cb, __func__ , __LINE__ )
+#define update_receiver_timing_details(c, cb) \
+	__update_timing_details(c->r_timing_details, &c->r_cb_nr, cb, __func__ , __LINE__ )
+
 struct submit_worker {
 	struct workqueue_struct *wq;
 	struct work_struct worker;
 
-	spinlock_t lock;
+	/* protected by ..->resource->req_lock */
 	struct list_head writes;
 };
 
@@ -675,12 +812,29 @@ struct drbd_peer_device {
 	struct list_head peer_devices;
 	struct drbd_device *device;
 	struct drbd_connection *connection;
+#ifdef CONFIG_DEBUG_FS
+	struct dentry *debugfs_peer_dev;
+#endif
 };
 
 struct drbd_device {
 	struct drbd_resource *resource;
 	struct list_head peer_devices;
-	int vnr;			/* volume number within the connection */
+	struct list_head pending_bitmap_io;
+
+	unsigned long flush_jif;
+#ifdef CONFIG_DEBUG_FS
+	struct dentry *debugfs_minor;
+	struct dentry *debugfs_vol;
+	struct dentry *debugfs_vol_oldest_requests;
+	struct dentry *debugfs_vol_act_log_extents;
+	struct dentry *debugfs_vol_resync_extents;
+	struct dentry *debugfs_vol_data_gen_id;
+#endif
+
+	unsigned int vnr;	/* volume number within the connection */
+	unsigned int minor;	/* device minor number */
+
 	struct kref kref;
 
 	/* things that are stored as / read from meta data on disk */
@@ -697,19 +851,10 @@ struct drbd_device {
 	unsigned long last_reattach_jif;
 	struct drbd_work resync_work;
 	struct drbd_work unplug_work;
-	struct drbd_work go_diskless;
-	struct drbd_work md_sync_work;
-	struct drbd_work start_resync_work;
 	struct timer_list resync_timer;
 	struct timer_list md_sync_timer;
 	struct timer_list start_resync_timer;
 	struct timer_list request_timer;
-#ifdef DRBD_DEBUG_MD_SYNC
-	struct {
-		unsigned int line;
-		const char* func;
-	} last_md_mark_dirty;
-#endif
 
 	/* Used after attach while negotiating new disk state. */
 	union drbd_state new_state_tmp;
@@ -724,6 +869,7 @@ struct drbd_device {
 	unsigned int al_writ_cnt;
 	unsigned int bm_writ_cnt;
 	atomic_t ap_bio_cnt;	 /* Requests we need to complete */
+	atomic_t ap_actlog_cnt;  /* Requests waiting for activity log */
 	atomic_t ap_pending_cnt; /* AP data packets on the wire, ack expected */
 	atomic_t rs_pending_cnt; /* RS request/data packets on the wire */
 	atomic_t unacked_cnt;	 /* Need to send replies for */
@@ -733,6 +879,13 @@ struct drbd_device {
 	struct rb_root read_requests;
 	struct rb_root write_requests;
 
+	/* for statistics and timeouts */
+	/* [0] read, [1] write */
+	struct list_head pending_master_completion[2];
+	struct list_head pending_completion[2];
+
+	/* use checksums for *this* resync */
+	bool use_csums;
 	/* blocks to resync in this run [unit BM_BLOCK_SIZE] */
 	unsigned long rs_total;
 	/* number of resync blocks that failed in this run */
@@ -788,9 +941,7 @@ struct drbd_device {
 	atomic_t pp_in_use;		/* allocated from page pool */
 	atomic_t pp_in_use_by_net;	/* sendpage()d, still referenced by tcp */
 	wait_queue_head_t ee_wait;
-	struct page *md_io_page;	/* one page buffer for md_io */
 	struct drbd_md_io md_io;
-	atomic_t md_io_in_use;		/* protects the md_io, md_io_page and md_io_tmpp */
 	spinlock_t al_lock;
 	wait_queue_head_t al_wait;
 	struct lru_cache *act_log;	/* activity log */
@@ -800,7 +951,6 @@ struct drbd_device {
 	atomic_t packet_seq;
 	unsigned int peer_seq;
 	spinlock_t peer_seq_lock;
-	unsigned int minor;
 	unsigned long comm_bm_set; /* communicated number of set bits. */
 	struct bm_io_work bm_io_work;
 	u64 ed_uuid; /* UUID of the exposed data */
@@ -824,6 +974,21 @@ struct drbd_device {
 	struct submit_worker submit;
 };
 
+struct drbd_bm_aio_ctx {
+	struct drbd_device *device;
+	struct list_head list; /* on device->pending_bitmap_io */;
+	unsigned long start_jif;
+	atomic_t in_flight;
+	unsigned int done;
+	unsigned flags;
+#define BM_AIO_COPY_PAGES	1
+#define BM_AIO_WRITE_HINTED	2
+#define BM_AIO_WRITE_ALL_PAGES	4
+#define BM_AIO_READ		8
+	int error;
+	struct kref kref;
+};
+
 struct drbd_config_context {
 	/* assigned from drbd_genlmsghdr */
 	unsigned int minor;
@@ -949,7 +1114,7 @@ extern int drbd_send_ov_request(struct drbd_peer_device *, sector_t sector, int
 extern int drbd_send_bitmap(struct drbd_device *device);
 extern void drbd_send_sr_reply(struct drbd_peer_device *, enum drbd_state_rv retcode);
 extern void conn_send_sr_reply(struct drbd_connection *connection, enum drbd_state_rv retcode);
-extern void drbd_free_bc(struct drbd_backing_dev *ldev);
+extern void drbd_free_ldev(struct drbd_backing_dev *ldev);
 extern void drbd_device_cleanup(struct drbd_device *device);
 void drbd_print_uuids(struct drbd_device *device, const char *text);
 
@@ -966,13 +1131,7 @@ extern void __drbd_uuid_set(struct drbd_device *device, int idx, u64 val) __must
 extern void drbd_md_set_flag(struct drbd_device *device, int flags) __must_hold(local);
 extern void drbd_md_clear_flag(struct drbd_device *device, int flags)__must_hold(local);
 extern int drbd_md_test_flag(struct drbd_backing_dev *, int);
-#ifndef DRBD_DEBUG_MD_SYNC
 extern void drbd_md_mark_dirty(struct drbd_device *device);
-#else
-#define drbd_md_mark_dirty(m)	drbd_md_mark_dirty_(m, __LINE__ , __func__ )
-extern void drbd_md_mark_dirty_(struct drbd_device *device,
-		unsigned int line, const char *func);
-#endif
 extern void drbd_queue_bitmap_io(struct drbd_device *device,
 				 int (*io_fn)(struct drbd_device *),
 				 void (*done)(struct drbd_device *, int),
@@ -983,9 +1142,8 @@ extern int drbd_bitmap_io(struct drbd_device *device,
 extern int drbd_bitmap_io_from_worker(struct drbd_device *device,
 		int (*io_fn)(struct drbd_device *),
 		char *why, enum bm_flag flags);
-extern int drbd_bmio_set_n_write(struct drbd_device *device);
-extern int drbd_bmio_clear_n_write(struct drbd_device *device);
-extern void drbd_ldev_destroy(struct drbd_device *device);
+extern int drbd_bmio_set_n_write(struct drbd_device *device) __must_hold(local);
+extern int drbd_bmio_clear_n_write(struct drbd_device *device) __must_hold(local);
 
 /* Meta data layout
  *
@@ -1105,17 +1263,21 @@ struct bm_extent {
 /* in which _bitmap_ extent (resp. sector) the bit for a certain
  * _storage_ sector is located in */
 #define BM_SECT_TO_EXT(x)   ((x)>>(BM_EXT_SHIFT-9))
+#define BM_BIT_TO_EXT(x)    ((x) >> (BM_EXT_SHIFT - BM_BLOCK_SHIFT))
 
-/* how much _storage_ sectors we have per bitmap sector */
+/* first storage sector a bitmap extent corresponds to */
 #define BM_EXT_TO_SECT(x)   ((sector_t)(x) << (BM_EXT_SHIFT-9))
+/* how much _storage_ sectors we have per bitmap extent */
 #define BM_SECT_PER_EXT     BM_EXT_TO_SECT(1)
+/* how many bits are covered by one bitmap extent (resync extent) */
+#define BM_BITS_PER_EXT     (1UL << (BM_EXT_SHIFT - BM_BLOCK_SHIFT))
+
+#define BM_BLOCKS_PER_BM_EXT_MASK  (BM_BITS_PER_EXT - 1)
+
 
 /* in one sector of the bitmap, we have this many activity_log extents. */
 #define AL_EXT_PER_BM_SECT  (1 << (BM_EXT_SHIFT - AL_EXTENT_SHIFT))
 
-#define BM_BLOCKS_PER_BM_EXT_B (BM_EXT_SHIFT - BM_BLOCK_SHIFT)
-#define BM_BLOCKS_PER_BM_EXT_MASK  ((1<<BM_BLOCKS_PER_BM_EXT_B) - 1)
-
 /* the extent in "PER_EXTENT" below is an activity log extent
  * we need that many (long words/bytes) to store the bitmap
  *		     of one AL_EXTENT_SIZE chunk of storage.
@@ -1195,11 +1357,11 @@ extern void _drbd_bm_set_bits(struct drbd_device *device,
 		const unsigned long s, const unsigned long e);
 extern int  drbd_bm_test_bit(struct drbd_device *device, unsigned long bitnr);
 extern int  drbd_bm_e_weight(struct drbd_device *device, unsigned long enr);
-extern int  drbd_bm_write_page(struct drbd_device *device, unsigned int idx) __must_hold(local);
 extern int  drbd_bm_read(struct drbd_device *device) __must_hold(local);
 extern void drbd_bm_mark_for_writeout(struct drbd_device *device, int page_nr);
 extern int  drbd_bm_write(struct drbd_device *device) __must_hold(local);
 extern int  drbd_bm_write_hinted(struct drbd_device *device) __must_hold(local);
+extern int  drbd_bm_write_lazy(struct drbd_device *device, unsigned upper_idx) __must_hold(local);
 extern int drbd_bm_write_all(struct drbd_device *device) __must_hold(local);
 extern int  drbd_bm_write_copy_pages(struct drbd_device *device) __must_hold(local);
 extern size_t	     drbd_bm_words(struct drbd_device *device);
@@ -1213,7 +1375,6 @@ extern unsigned long _drbd_bm_find_next(struct drbd_device *device, unsigned lon
 extern unsigned long _drbd_bm_find_next_zero(struct drbd_device *device, unsigned long bm_fo);
 extern unsigned long _drbd_bm_total_weight(struct drbd_device *device);
 extern unsigned long drbd_bm_total_weight(struct drbd_device *device);
-extern int drbd_bm_rs_done(struct drbd_device *device);
 /* for receive_bitmap */
 extern void drbd_bm_merge_lel(struct drbd_device *device, size_t offset,
 		size_t number, unsigned long *buffer);
@@ -1312,7 +1473,7 @@ enum determine_dev_size {
 extern enum determine_dev_size
 drbd_determine_dev_size(struct drbd_device *, enum dds_flags, struct resize_parms *) __must_hold(local);
 extern void resync_after_online_grow(struct drbd_device *);
-extern void drbd_reconsider_max_bio_size(struct drbd_device *device);
+extern void drbd_reconsider_max_bio_size(struct drbd_device *device, struct drbd_backing_dev *bdev);
 extern enum drbd_state_rv drbd_set_role(struct drbd_device *device,
 					enum drbd_role new_role,
 					int force);
@@ -1333,7 +1494,7 @@ extern void resume_next_sg(struct drbd_device *device);
 extern void suspend_other_sg(struct drbd_device *device);
 extern int drbd_resync_finished(struct drbd_device *device);
 /* maybe rather drbd_main.c ? */
-extern void *drbd_md_get_buffer(struct drbd_device *device);
+extern void *drbd_md_get_buffer(struct drbd_device *device, const char *intent);
 extern void drbd_md_put_buffer(struct drbd_device *device);
 extern int drbd_md_sync_page_io(struct drbd_device *device,
 		struct drbd_backing_dev *bdev, sector_t sector, int rw);
@@ -1380,7 +1541,8 @@ extern void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req);
 extern int drbd_receiver(struct drbd_thread *thi);
 extern int drbd_asender(struct drbd_thread *thi);
 extern bool drbd_rs_c_min_rate_throttle(struct drbd_device *device);
-extern bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector);
+extern bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector,
+		bool throttle_if_app_is_waiting);
 extern int drbd_submit_peer_request(struct drbd_device *,
 				    struct drbd_peer_request *, const unsigned,
 				    const int);
@@ -1464,10 +1626,7 @@ static inline void drbd_generic_make_request(struct drbd_device *device,
 {
 	__release(local);
 	if (!bio->bi_bdev) {
-		printk(KERN_ERR "drbd%d: drbd_generic_make_request: "
-				"bio->bi_bdev == NULL\n",
-		       device_to_minor(device));
-		dump_stack();
+		drbd_err(device, "drbd_generic_make_request: bio->bi_bdev == NULL\n");
 		bio_endio(bio, -ENODEV);
 		return;
 	}
@@ -1478,7 +1637,8 @@ static inline void drbd_generic_make_request(struct drbd_device *device,
 		generic_make_request(bio);
 }
 
-void drbd_bump_write_ordering(struct drbd_connection *connection, enum write_ordering_e wo);
+void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backing_dev *bdev,
+			      enum write_ordering_e wo);
 
 /* drbd_proc.c */
 extern struct proc_dir_entry *drbd_proc;
@@ -1489,9 +1649,9 @@ extern const char *drbd_role_str(enum drbd_role s);
 /* drbd_actlog.c */
 extern bool drbd_al_begin_io_prepare(struct drbd_device *device, struct drbd_interval *i);
 extern int drbd_al_begin_io_nonblock(struct drbd_device *device, struct drbd_interval *i);
-extern void drbd_al_begin_io_commit(struct drbd_device *device, bool delegate);
+extern void drbd_al_begin_io_commit(struct drbd_device *device);
 extern bool drbd_al_begin_io_fastpath(struct drbd_device *device, struct drbd_interval *i);
-extern void drbd_al_begin_io(struct drbd_device *device, struct drbd_interval *i, bool delegate);
+extern void drbd_al_begin_io(struct drbd_device *device, struct drbd_interval *i);
 extern void drbd_al_complete_io(struct drbd_device *device, struct drbd_interval *i);
 extern void drbd_rs_complete_io(struct drbd_device *device, sector_t sector);
 extern int drbd_rs_begin_io(struct drbd_device *device, sector_t sector);
@@ -1501,14 +1661,17 @@ extern int drbd_rs_del_all(struct drbd_device *device);
 extern void drbd_rs_failed_io(struct drbd_device *device,
 		sector_t sector, int size);
 extern void drbd_advance_rs_marks(struct drbd_device *device, unsigned long still_to_go);
-extern void __drbd_set_in_sync(struct drbd_device *device, sector_t sector,
-		int size, const char *file, const unsigned int line);
+
+enum update_sync_bits_mode { RECORD_RS_FAILED, SET_OUT_OF_SYNC, SET_IN_SYNC };
+extern int __drbd_change_sync(struct drbd_device *device, sector_t sector, int size,
+		enum update_sync_bits_mode mode,
+		const char *file, const unsigned int line);
 #define drbd_set_in_sync(device, sector, size) \
-	__drbd_set_in_sync(device, sector, size, __FILE__, __LINE__)
-extern int __drbd_set_out_of_sync(struct drbd_device *device, sector_t sector,
-		int size, const char *file, const unsigned int line);
+	__drbd_change_sync(device, sector, size, SET_IN_SYNC, __FILE__, __LINE__)
 #define drbd_set_out_of_sync(device, sector, size) \
-	__drbd_set_out_of_sync(device, sector, size, __FILE__, __LINE__)
+	__drbd_change_sync(device, sector, size, SET_OUT_OF_SYNC, __FILE__, __LINE__)
+#define drbd_rs_failed_io(device, sector, size) \
+	__drbd_change_sync(device, sector, size, RECORD_RS_FAILED, __FILE__, __LINE__)
 extern void drbd_al_shrink(struct drbd_device *device);
 extern int drbd_initialize_al(struct drbd_device *, void *);
 
@@ -1764,25 +1927,38 @@ static inline sector_t drbd_md_ss(struct drbd_backing_dev *bdev)
 }
 
 static inline void
-drbd_queue_work_front(struct drbd_work_queue *q, struct drbd_work *w)
+drbd_queue_work(struct drbd_work_queue *q, struct drbd_work *w)
 {
 	unsigned long flags;
 	spin_lock_irqsave(&q->q_lock, flags);
-	list_add(&w->list, &q->q);
+	list_add_tail(&w->list, &q->q);
 	spin_unlock_irqrestore(&q->q_lock, flags);
 	wake_up(&q->q_wait);
 }
 
 static inline void
-drbd_queue_work(struct drbd_work_queue *q, struct drbd_work *w)
+drbd_queue_work_if_unqueued(struct drbd_work_queue *q, struct drbd_work *w)
 {
 	unsigned long flags;
 	spin_lock_irqsave(&q->q_lock, flags);
-	list_add_tail(&w->list, &q->q);
+	if (list_empty_careful(&w->list))
+		list_add_tail(&w->list, &q->q);
 	spin_unlock_irqrestore(&q->q_lock, flags);
 	wake_up(&q->q_wait);
 }
 
+static inline void
+drbd_device_post_work(struct drbd_device *device, int work_bit)
+{
+	if (!test_and_set_bit(work_bit, &device->flags)) {
+		struct drbd_connection *connection =
+			first_peer_device(device)->connection;
+		struct drbd_work_queue *q = &connection->sender_work;
+		if (!test_and_set_bit(DEVICE_WORK_PENDING, &connection->flags))
+			wake_up(&q->q_wait);
+	}
+}
+
 extern void drbd_flush_workqueue(struct drbd_work_queue *work_queue);
 
 static inline void wake_asender(struct drbd_connection *connection)
@@ -1859,7 +2035,7 @@ static inline void inc_ap_pending(struct drbd_device *device)
 			func, line,					\
 			atomic_read(&device->which))
 
-#define dec_ap_pending(device) _dec_ap_pending(device, __FUNCTION__, __LINE__)
+#define dec_ap_pending(device) _dec_ap_pending(device, __func__, __LINE__)
 static inline void _dec_ap_pending(struct drbd_device *device, const char *func, int line)
 {
 	if (atomic_dec_and_test(&device->ap_pending_cnt))
@@ -1878,7 +2054,7 @@ static inline void inc_rs_pending(struct drbd_device *device)
 	atomic_inc(&device->rs_pending_cnt);
 }
 
-#define dec_rs_pending(device) _dec_rs_pending(device, __FUNCTION__, __LINE__)
+#define dec_rs_pending(device) _dec_rs_pending(device, __func__, __LINE__)
 static inline void _dec_rs_pending(struct drbd_device *device, const char *func, int line)
 {
 	atomic_dec(&device->rs_pending_cnt);
@@ -1899,20 +2075,29 @@ static inline void inc_unacked(struct drbd_device *device)
 	atomic_inc(&device->unacked_cnt);
 }
 
-#define dec_unacked(device) _dec_unacked(device, __FUNCTION__, __LINE__)
+#define dec_unacked(device) _dec_unacked(device, __func__, __LINE__)
 static inline void _dec_unacked(struct drbd_device *device, const char *func, int line)
 {
 	atomic_dec(&device->unacked_cnt);
 	ERR_IF_CNT_IS_NEGATIVE(unacked_cnt, func, line);
 }
 
-#define sub_unacked(device, n) _sub_unacked(device, n, __FUNCTION__, __LINE__)
+#define sub_unacked(device, n) _sub_unacked(device, n, __func__, __LINE__)
 static inline void _sub_unacked(struct drbd_device *device, int n, const char *func, int line)
 {
 	atomic_sub(n, &device->unacked_cnt);
 	ERR_IF_CNT_IS_NEGATIVE(unacked_cnt, func, line);
 }
 
+static inline bool is_sync_state(enum drbd_conns connection_state)
+{
+	return
+	   (connection_state == C_SYNC_SOURCE
+	||  connection_state == C_SYNC_TARGET
+	||  connection_state == C_PAUSED_SYNC_S
+	||  connection_state == C_PAUSED_SYNC_T);
+}
+
 /**
  * get_ldev() - Increase the ref count on device->ldev. Returns 0 if there is no ldev
  * @M:		DRBD device.
@@ -1924,6 +2109,11 @@ static inline void _sub_unacked(struct drbd_device *device, int n, const char *f
 
 static inline void put_ldev(struct drbd_device *device)
 {
+	enum drbd_disk_state ds = device->state.disk;
+	/* We must check the state *before* the atomic_dec becomes visible,
+	 * or we have a theoretical race where someone hitting zero,
+	 * while state still D_FAILED, will then see D_DISKLESS in the
+	 * condition below and calling into destroy, where he must not, yet. */
 	int i = atomic_dec_return(&device->local_cnt);
 
 	/* This may be called from some endio handler,
@@ -1932,15 +2122,13 @@ static inline void put_ldev(struct drbd_device *device)
 	__release(local);
 	D_ASSERT(device, i >= 0);
 	if (i == 0) {
-		if (device->state.disk == D_DISKLESS)
+		if (ds == D_DISKLESS)
 			/* even internal references gone, safe to destroy */
-			drbd_ldev_destroy(device);
-		if (device->state.disk == D_FAILED) {
+			drbd_device_post_work(device, DESTROY_DISK);
+		if (ds == D_FAILED)
 			/* all application IO references gone. */
-			if (!test_and_set_bit(GO_DISKLESS, &device->flags))
-				drbd_queue_work(&first_peer_device(device)->connection->sender_work,
-						&device->go_diskless);
-		}
+			if (!test_and_set_bit(GOING_DISKLESS, &device->flags))
+				drbd_device_post_work(device, GO_DISKLESS);
 		wake_up(&device->misc_wait);
 	}
 }
@@ -1964,54 +2152,6 @@ static inline int _get_ldev_if_state(struct drbd_device *device, enum drbd_disk_
 extern int _get_ldev_if_state(struct drbd_device *device, enum drbd_disk_state mins);
 #endif
 
-/* you must have an "get_ldev" reference */
-static inline void drbd_get_syncer_progress(struct drbd_device *device,
-		unsigned long *bits_left, unsigned int *per_mil_done)
-{
-	/* this is to break it at compile time when we change that, in case we
-	 * want to support more than (1<<32) bits on a 32bit arch. */
-	typecheck(unsigned long, device->rs_total);
-
-	/* note: both rs_total and rs_left are in bits, i.e. in
-	 * units of BM_BLOCK_SIZE.
-	 * for the percentage, we don't care. */
-
-	if (device->state.conn == C_VERIFY_S || device->state.conn == C_VERIFY_T)
-		*bits_left = device->ov_left;
-	else
-		*bits_left = drbd_bm_total_weight(device) - device->rs_failed;
-	/* >> 10 to prevent overflow,
-	 * +1 to prevent division by zero */
-	if (*bits_left > device->rs_total) {
-		/* doh. maybe a logic bug somewhere.
-		 * may also be just a race condition
-		 * between this and a disconnect during sync.
-		 * for now, just prevent in-kernel buffer overflow.
-		 */
-		smp_rmb();
-		drbd_warn(device, "cs:%s rs_left=%lu > rs_total=%lu (rs_failed %lu)\n",
-				drbd_conn_str(device->state.conn),
-				*bits_left, device->rs_total, device->rs_failed);
-		*per_mil_done = 0;
-	} else {
-		/* Make sure the division happens in long context.
-		 * We allow up to one petabyte storage right now,
-		 * at a granularity of 4k per bit that is 2**38 bits.
-		 * After shift right and multiplication by 1000,
-		 * this should still fit easily into a 32bit long,
-		 * so we don't need a 64bit division on 32bit arch.
-		 * Note: currently we don't support such large bitmaps on 32bit
-		 * arch anyways, but no harm done to be prepared for it here.
-		 */
-		unsigned int shift = device->rs_total > UINT_MAX ? 16 : 10;
-		unsigned long left = *bits_left >> shift;
-		unsigned long total = 1UL + (device->rs_total >> shift);
-		unsigned long tmp = 1000UL - left * 1000UL/total;
-		*per_mil_done = tmp;
-	}
-}
-
-
 /* this throttles on-the-fly application requests
  * according to max_buffers settings;
  * maybe re-implement using semaphores? */
@@ -2201,25 +2341,6 @@ static inline int drbd_queue_order_type(struct drbd_device *device)
 	return QUEUE_ORDERED_NONE;
 }
 
-static inline void drbd_md_flush(struct drbd_device *device)
-{
-	int r;
-
-	if (device->ldev == NULL) {
-		drbd_warn(device, "device->ldev == NULL in drbd_md_flush\n");
-		return;
-	}
-
-	if (test_bit(MD_NO_FUA, &device->flags))
-		return;
-
-	r = blkdev_issue_flush(device->ldev->md_bdev, GFP_NOIO, NULL);
-	if (r) {
-		set_bit(MD_NO_FUA, &device->flags);
-		drbd_err(device, "meta data flush failed with status %d, disabling md-flushes\n", r);
-	}
-}
-
 static inline struct drbd_connection *first_connection(struct drbd_resource *resource)
 {
 	return list_first_entry_or_null(&resource->connections,
diff --git a/drivers/block/drbd/drbd_interval.h b/drivers/block/drbd/drbd_interval.h
index f38fcb00c10d..f210543f05f4 100644
--- a/drivers/block/drbd/drbd_interval.h
+++ b/drivers/block/drbd/drbd_interval.h
@@ -10,7 +10,9 @@ struct drbd_interval {
 	unsigned int size;	/* size in bytes */
 	sector_t end;		/* highest interval end in subtree */
 	int local:1		/* local or remote request? */;
-	int waiting:1;
+	int waiting:1;		/* someone is waiting for this to complete */
+	int completed:1;	/* this has been completed already;
+				 * ignore for conflict detection */
 };
 
 static inline void drbd_clear_interval(struct drbd_interval *i)
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 960645c26e6f..9b465bb68487 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -26,7 +26,10 @@
 
  */
 
+#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
+
 #include <linux/module.h>
+#include <linux/jiffies.h>
 #include <linux/drbd.h>
 #include <asm/uaccess.h>
 #include <asm/types.h>
@@ -54,16 +57,14 @@
 #include "drbd_int.h"
 #include "drbd_protocol.h"
 #include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */
-
 #include "drbd_vli.h"
+#include "drbd_debugfs.h"
 
 static DEFINE_MUTEX(drbd_main_mutex);
 static int drbd_open(struct block_device *bdev, fmode_t mode);
 static void drbd_release(struct gendisk *gd, fmode_t mode);
-static int w_md_sync(struct drbd_work *w, int unused);
 static void md_sync_timer_fn(unsigned long data);
 static int w_bitmap_io(struct drbd_work *w, int unused);
-static int w_go_diskless(struct drbd_work *w, int unused);
 
 MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
 	      "Lars Ellenberg <lars@linbit.com>");
@@ -264,7 +265,7 @@ bail:
 
 /**
  * _tl_restart() - Walks the transfer log, and applies an action to all requests
- * @device:	DRBD device.
+ * @connection:	DRBD connection to operate on.
  * @what:       The action/event to perform with all request objects
  *
  * @what might be one of CONNECTION_LOST_WHILE_PENDING, RESEND, FAIL_FROZEN_DISK_IO,
@@ -662,6 +663,11 @@ static int __send_command(struct drbd_connection *connection, int vnr,
 			    msg_flags);
 	if (data && !err)
 		err = drbd_send_all(connection, sock->socket, data, size, 0);
+	/* DRBD protocol "pings" are latency critical.
+	 * This is supposed to trigger tcp_push_pending_frames() */
+	if (!err && (cmd == P_PING || cmd == P_PING_ACK))
+		drbd_tcp_nodelay(sock->socket);
+
 	return err;
 }
 
@@ -1636,7 +1642,10 @@ int drbd_send_dblock(struct drbd_peer_device *peer_device, struct drbd_request *
 	if (peer_device->connection->agreed_pro_version >= 100) {
 		if (req->rq_state & RQ_EXP_RECEIVE_ACK)
 			dp_flags |= DP_SEND_RECEIVE_ACK;
-		if (req->rq_state & RQ_EXP_WRITE_ACK)
+		/* During resync, request an explicit write ack,
+		 * even in protocol != C */
+		if (req->rq_state & RQ_EXP_WRITE_ACK
+		|| (dp_flags & DP_MAY_SET_IN_SYNC))
 			dp_flags |= DP_SEND_WRITE_ACK;
 	}
 	p->dp_flags = cpu_to_be32(dp_flags);
@@ -1900,6 +1909,7 @@ void drbd_init_set_defaults(struct drbd_device *device)
 	drbd_set_defaults(device);
 
 	atomic_set(&device->ap_bio_cnt, 0);
+	atomic_set(&device->ap_actlog_cnt, 0);
 	atomic_set(&device->ap_pending_cnt, 0);
 	atomic_set(&device->rs_pending_cnt, 0);
 	atomic_set(&device->unacked_cnt, 0);
@@ -1908,7 +1918,7 @@ void drbd_init_set_defaults(struct drbd_device *device)
 	atomic_set(&device->rs_sect_in, 0);
 	atomic_set(&device->rs_sect_ev, 0);
 	atomic_set(&device->ap_in_flight, 0);
-	atomic_set(&device->md_io_in_use, 0);
+	atomic_set(&device->md_io.in_use, 0);
 
 	mutex_init(&device->own_state_mutex);
 	device->state_mutex = &device->own_state_mutex;
@@ -1924,17 +1934,15 @@ void drbd_init_set_defaults(struct drbd_device *device)
 	INIT_LIST_HEAD(&device->resync_reads);
 	INIT_LIST_HEAD(&device->resync_work.list);
 	INIT_LIST_HEAD(&device->unplug_work.list);
-	INIT_LIST_HEAD(&device->go_diskless.list);
-	INIT_LIST_HEAD(&device->md_sync_work.list);
-	INIT_LIST_HEAD(&device->start_resync_work.list);
 	INIT_LIST_HEAD(&device->bm_io_work.w.list);
+	INIT_LIST_HEAD(&device->pending_master_completion[0]);
+	INIT_LIST_HEAD(&device->pending_master_completion[1]);
+	INIT_LIST_HEAD(&device->pending_completion[0]);
+	INIT_LIST_HEAD(&device->pending_completion[1]);
 
 	device->resync_work.cb  = w_resync_timer;
 	device->unplug_work.cb  = w_send_write_hint;
-	device->go_diskless.cb  = w_go_diskless;
-	device->md_sync_work.cb = w_md_sync;
 	device->bm_io_work.w.cb = w_bitmap_io;
-	device->start_resync_work.cb = w_start_resync;
 
 	init_timer(&device->resync_timer);
 	init_timer(&device->md_sync_timer);
@@ -1992,7 +2000,7 @@ void drbd_device_cleanup(struct drbd_device *device)
 		drbd_bm_cleanup(device);
 	}
 
-	drbd_free_bc(device->ldev);
+	drbd_free_ldev(device->ldev);
 	device->ldev = NULL;
 
 	clear_bit(AL_SUSPENDED, &device->flags);
@@ -2006,7 +2014,6 @@ void drbd_device_cleanup(struct drbd_device *device)
 	D_ASSERT(device, list_empty(&first_peer_device(device)->connection->sender_work.q));
 	D_ASSERT(device, list_empty(&device->resync_work.list));
 	D_ASSERT(device, list_empty(&device->unplug_work.list));
-	D_ASSERT(device, list_empty(&device->go_diskless.list));
 
 	drbd_set_defaults(device);
 }
@@ -2129,20 +2136,6 @@ Enomem:
 	return -ENOMEM;
 }
 
-static int drbd_notify_sys(struct notifier_block *this, unsigned long code,
-	void *unused)
-{
-	/* just so we have it.  you never know what interesting things we
-	 * might want to do here some day...
-	 */
-
-	return NOTIFY_DONE;
-}
-
-static struct notifier_block drbd_notifier = {
-	.notifier_call = drbd_notify_sys,
-};
-
 static void drbd_release_all_peer_reqs(struct drbd_device *device)
 {
 	int rr;
@@ -2173,7 +2166,7 @@ void drbd_destroy_device(struct kref *kref)
 {
 	struct drbd_device *device = container_of(kref, struct drbd_device, kref);
 	struct drbd_resource *resource = device->resource;
-	struct drbd_connection *connection;
+	struct drbd_peer_device *peer_device, *tmp_peer_device;
 
 	del_timer_sync(&device->request_timer);
 
@@ -2187,7 +2180,7 @@ void drbd_destroy_device(struct kref *kref)
 	if (device->this_bdev)
 		bdput(device->this_bdev);
 
-	drbd_free_bc(device->ldev);
+	drbd_free_ldev(device->ldev);
 	device->ldev = NULL;
 
 	drbd_release_all_peer_reqs(device);
@@ -2200,15 +2193,20 @@ void drbd_destroy_device(struct kref *kref)
 
 	if (device->bitmap) /* should no longer be there. */
 		drbd_bm_cleanup(device);
-	__free_page(device->md_io_page);
+	__free_page(device->md_io.page);
 	put_disk(device->vdisk);
 	blk_cleanup_queue(device->rq_queue);
 	kfree(device->rs_plan_s);
-	kfree(first_peer_device(device));
-	kfree(device);
 
-	for_each_connection(connection, resource)
-		kref_put(&connection->kref, drbd_destroy_connection);
+	/* not for_each_connection(connection, resource):
+	 * those may have been cleaned up and disassociated already.
+	 */
+	for_each_peer_device_safe(peer_device, tmp_peer_device, device) {
+		kref_put(&peer_device->connection->kref, drbd_destroy_connection);
+		kfree(peer_device);
+	}
+	memset(device, 0xfd, sizeof(*device));
+	kfree(device);
 	kref_put(&resource->kref, drbd_destroy_resource);
 }
 
@@ -2236,7 +2234,7 @@ static void do_retry(struct work_struct *ws)
 	list_for_each_entry_safe(req, tmp, &writes, tl_requests) {
 		struct drbd_device *device = req->device;
 		struct bio *bio = req->master_bio;
-		unsigned long start_time = req->start_time;
+		unsigned long start_jif = req->start_jif;
 		bool expected;
 
 		expected =
@@ -2271,10 +2269,12 @@ static void do_retry(struct work_struct *ws)
 		/* We are not just doing generic_make_request(),
 		 * as we want to keep the start_time information. */
 		inc_ap_bio(device);
-		__drbd_make_request(device, bio, start_time);
+		__drbd_make_request(device, bio, start_jif);
 	}
 }
 
+/* called via drbd_req_put_completion_ref(),
+ * holds resource->req_lock */
 void drbd_restart_request(struct drbd_request *req)
 {
 	unsigned long flags;
@@ -2298,6 +2298,7 @@ void drbd_destroy_resource(struct kref *kref)
 	idr_destroy(&resource->devices);
 	free_cpumask_var(resource->cpu_mask);
 	kfree(resource->name);
+	memset(resource, 0xf2, sizeof(*resource));
 	kfree(resource);
 }
 
@@ -2307,8 +2308,10 @@ void drbd_free_resource(struct drbd_resource *resource)
 
 	for_each_connection_safe(connection, tmp, resource) {
 		list_del(&connection->connections);
+		drbd_debugfs_connection_cleanup(connection);
 		kref_put(&connection->kref, drbd_destroy_connection);
 	}
+	drbd_debugfs_resource_cleanup(resource);
 	kref_put(&resource->kref, drbd_destroy_resource);
 }
 
@@ -2318,8 +2321,6 @@ static void drbd_cleanup(void)
 	struct drbd_device *device;
 	struct drbd_resource *resource, *tmp;
 
-	unregister_reboot_notifier(&drbd_notifier);
-
 	/* first remove proc,
 	 * drbdsetup uses it's presence to detect
 	 * whether DRBD is loaded.
@@ -2335,6 +2336,7 @@ static void drbd_cleanup(void)
 		destroy_workqueue(retry.wq);
 
 	drbd_genl_unregister();
+	drbd_debugfs_cleanup();
 
 	idr_for_each_entry(&drbd_devices, device, i)
 		drbd_delete_device(device);
@@ -2350,7 +2352,7 @@ static void drbd_cleanup(void)
 
 	idr_destroy(&drbd_devices);
 
-	printk(KERN_INFO "drbd: module cleanup done.\n");
+	pr_info("module cleanup done.\n");
 }
 
 /**
@@ -2539,6 +2541,20 @@ int set_resource_options(struct drbd_resource *resource, struct res_opts *res_op
 	if (nr_cpu_ids > 1 && res_opts->cpu_mask[0] != 0) {
 		err = bitmap_parse(res_opts->cpu_mask, DRBD_CPU_MASK_SIZE,
 				   cpumask_bits(new_cpu_mask), nr_cpu_ids);
+		if (err == -EOVERFLOW) {
+			/* So what. mask it out. */
+			cpumask_var_t tmp_cpu_mask;
+			if (zalloc_cpumask_var(&tmp_cpu_mask, GFP_KERNEL)) {
+				cpumask_setall(tmp_cpu_mask);
+				cpumask_and(new_cpu_mask, new_cpu_mask, tmp_cpu_mask);
+				drbd_warn(resource, "Overflow in bitmap_parse(%.12s%s), truncating to %u bits\n",
+					res_opts->cpu_mask,
+					strlen(res_opts->cpu_mask) > 12 ? "..." : "",
+					nr_cpu_ids);
+				free_cpumask_var(tmp_cpu_mask);
+				err = 0;
+			}
+		}
 		if (err) {
 			drbd_warn(resource, "bitmap_parse() failed with %d\n", err);
 			/* retcode = ERR_CPU_MASK_PARSE; */
@@ -2579,10 +2595,12 @@ struct drbd_resource *drbd_create_resource(const char *name)
 	kref_init(&resource->kref);
 	idr_init(&resource->devices);
 	INIT_LIST_HEAD(&resource->connections);
+	resource->write_ordering = WO_bdev_flush;
 	list_add_tail_rcu(&resource->resources, &drbd_resources);
 	mutex_init(&resource->conf_update);
 	mutex_init(&resource->adm_mutex);
 	spin_lock_init(&resource->req_lock);
+	drbd_debugfs_resource_add(resource);
 	return resource;
 
 fail_free_name:
@@ -2593,7 +2611,7 @@ fail:
 	return NULL;
 }
 
-/* caller must be under genl_lock() */
+/* caller must be under adm_mutex */
 struct drbd_connection *conn_create(const char *name, struct res_opts *res_opts)
 {
 	struct drbd_resource *resource;
@@ -2617,7 +2635,6 @@ struct drbd_connection *conn_create(const char *name, struct res_opts *res_opts)
 	INIT_LIST_HEAD(&connection->current_epoch->list);
 	connection->epochs = 1;
 	spin_lock_init(&connection->epoch_lock);
-	connection->write_ordering = WO_bdev_flush;
 
 	connection->send.seen_any_write_yet = false;
 	connection->send.current_epoch_nr = 0;
@@ -2652,6 +2669,7 @@ struct drbd_connection *conn_create(const char *name, struct res_opts *res_opts)
 
 	kref_get(&resource->kref);
 	list_add_tail_rcu(&connection->connections, &resource->connections);
+	drbd_debugfs_connection_add(connection);
 	return connection;
 
 fail_resource:
@@ -2680,6 +2698,7 @@ void drbd_destroy_connection(struct kref *kref)
 	drbd_free_socket(&connection->data);
 	kfree(connection->int_dig_in);
 	kfree(connection->int_dig_vv);
+	memset(connection, 0xfc, sizeof(*connection));
 	kfree(connection);
 	kref_put(&resource->kref, drbd_destroy_resource);
 }
@@ -2694,7 +2713,6 @@ static int init_submitter(struct drbd_device *device)
 		return -ENOMEM;
 
 	INIT_WORK(&device->submit.worker, do_submit);
-	spin_lock_init(&device->submit.lock);
 	INIT_LIST_HEAD(&device->submit.writes);
 	return 0;
 }
@@ -2764,8 +2782,8 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig
 	blk_queue_merge_bvec(q, drbd_merge_bvec);
 	q->queue_lock = &resource->req_lock;
 
-	device->md_io_page = alloc_page(GFP_KERNEL);
-	if (!device->md_io_page)
+	device->md_io.page = alloc_page(GFP_KERNEL);
+	if (!device->md_io.page)
 		goto out_no_io_page;
 
 	if (drbd_bm_init(device))
@@ -2794,6 +2812,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig
 	kref_get(&device->kref);
 
 	INIT_LIST_HEAD(&device->peer_devices);
+	INIT_LIST_HEAD(&device->pending_bitmap_io);
 	for_each_connection(connection, resource) {
 		peer_device = kzalloc(sizeof(struct drbd_peer_device), GFP_KERNEL);
 		if (!peer_device)
@@ -2829,7 +2848,10 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig
 		for_each_peer_device(peer_device, device)
 			drbd_connected(peer_device);
 	}
-
+	/* move to create_peer_device() */
+	for_each_peer_device(peer_device, device)
+		drbd_debugfs_peer_device_add(peer_device);
+	drbd_debugfs_device_add(device);
 	return NO_ERROR;
 
 out_idr_remove_vol:
@@ -2853,7 +2875,7 @@ out_idr_remove_minor:
 out_no_minor_idr:
 	drbd_bm_cleanup(device);
 out_no_bitmap:
-	__free_page(device->md_io_page);
+	__free_page(device->md_io.page);
 out_no_io_page:
 	put_disk(disk);
 out_no_disk:
@@ -2868,8 +2890,13 @@ void drbd_delete_device(struct drbd_device *device)
 {
 	struct drbd_resource *resource = device->resource;
 	struct drbd_connection *connection;
+	struct drbd_peer_device *peer_device;
 	int refs = 3;
 
+	/* move to free_peer_device() */
+	for_each_peer_device(peer_device, device)
+		drbd_debugfs_peer_device_cleanup(peer_device);
+	drbd_debugfs_device_cleanup(device);
 	for_each_connection(connection, resource) {
 		idr_remove(&connection->peer_devices, device->vnr);
 		refs++;
@@ -2881,13 +2908,12 @@ void drbd_delete_device(struct drbd_device *device)
 	kref_sub(&device->kref, refs, drbd_destroy_device);
 }
 
-int __init drbd_init(void)
+static int __init drbd_init(void)
 {
 	int err;
 
 	if (minor_count < DRBD_MINOR_COUNT_MIN || minor_count > DRBD_MINOR_COUNT_MAX) {
-		printk(KERN_ERR
-		       "drbd: invalid minor_count (%d)\n", minor_count);
+		pr_err("invalid minor_count (%d)\n", minor_count);
 #ifdef MODULE
 		return -EINVAL;
 #else
@@ -2897,14 +2923,11 @@ int __init drbd_init(void)
 
 	err = register_blkdev(DRBD_MAJOR, "drbd");
 	if (err) {
-		printk(KERN_ERR
-		       "drbd: unable to register block device major %d\n",
+		pr_err("unable to register block device major %d\n",
 		       DRBD_MAJOR);
 		return err;
 	}
 
-	register_reboot_notifier(&drbd_notifier);
-
 	/*
 	 * allocate all necessary structs
 	 */
@@ -2918,7 +2941,7 @@ int __init drbd_init(void)
 
 	err = drbd_genl_register();
 	if (err) {
-		printk(KERN_ERR "drbd: unable to register generic netlink family\n");
+		pr_err("unable to register generic netlink family\n");
 		goto fail;
 	}
 
@@ -2929,38 +2952,39 @@ int __init drbd_init(void)
 	err = -ENOMEM;
 	drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL);
 	if (!drbd_proc)	{
-		printk(KERN_ERR "drbd: unable to register proc file\n");
+		pr_err("unable to register proc file\n");
 		goto fail;
 	}
 
 	retry.wq = create_singlethread_workqueue("drbd-reissue");
 	if (!retry.wq) {
-		printk(KERN_ERR "drbd: unable to create retry workqueue\n");
+		pr_err("unable to create retry workqueue\n");
 		goto fail;
 	}
 	INIT_WORK(&retry.worker, do_retry);
 	spin_lock_init(&retry.lock);
 	INIT_LIST_HEAD(&retry.writes);
 
-	printk(KERN_INFO "drbd: initialized. "
+	if (drbd_debugfs_init())
+		pr_notice("failed to initialize debugfs -- will not be available\n");
+
+	pr_info("initialized. "
 	       "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n",
 	       API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX);
-	printk(KERN_INFO "drbd: %s\n", drbd_buildtag());
-	printk(KERN_INFO "drbd: registered as block device major %d\n",
-		DRBD_MAJOR);
-
+	pr_info("%s\n", drbd_buildtag());
+	pr_info("registered as block device major %d\n", DRBD_MAJOR);
 	return 0; /* Success! */
 
 fail:
 	drbd_cleanup();
 	if (err == -ENOMEM)
-		printk(KERN_ERR "drbd: ran out of memory\n");
+		pr_err("ran out of memory\n");
 	else
-		printk(KERN_ERR "drbd: initialization failure\n");
+		pr_err("initialization failure\n");
 	return err;
 }
 
-void drbd_free_bc(struct drbd_backing_dev *ldev)
+void drbd_free_ldev(struct drbd_backing_dev *ldev)
 {
 	if (ldev == NULL)
 		return;
@@ -2972,24 +2996,29 @@ void drbd_free_bc(struct drbd_backing_dev *ldev)
 	kfree(ldev);
 }
 
-void drbd_free_sock(struct drbd_connection *connection)
+static void drbd_free_one_sock(struct drbd_socket *ds)
 {
-	if (connection->data.socket) {
-		mutex_lock(&connection->data.mutex);
-		kernel_sock_shutdown(connection->data.socket, SHUT_RDWR);
-		sock_release(connection->data.socket);
-		connection->data.socket = NULL;
-		mutex_unlock(&connection->data.mutex);
-	}
-	if (connection->meta.socket) {
-		mutex_lock(&connection->meta.mutex);
-		kernel_sock_shutdown(connection->meta.socket, SHUT_RDWR);
-		sock_release(connection->meta.socket);
-		connection->meta.socket = NULL;
-		mutex_unlock(&connection->meta.mutex);
+	struct socket *s;
+	mutex_lock(&ds->mutex);
+	s = ds->socket;
+	ds->socket = NULL;
+	mutex_unlock(&ds->mutex);
+	if (s) {
+		/* so debugfs does not need to mutex_lock() */
+		synchronize_rcu();
+		kernel_sock_shutdown(s, SHUT_RDWR);
+		sock_release(s);
 	}
 }
 
+void drbd_free_sock(struct drbd_connection *connection)
+{
+	if (connection->data.socket)
+		drbd_free_one_sock(&connection->data);
+	if (connection->meta.socket)
+		drbd_free_one_sock(&connection->meta);
+}
+
 /* meta data management */
 
 void conn_md_sync(struct drbd_connection *connection)
@@ -3093,7 +3122,7 @@ void drbd_md_sync(struct drbd_device *device)
 	if (!get_ldev_if_state(device, D_FAILED))
 		return;
 
-	buffer = drbd_md_get_buffer(device);
+	buffer = drbd_md_get_buffer(device, __func__);
 	if (!buffer)
 		goto out;
 
@@ -3253,7 +3282,7 @@ int drbd_md_read(struct drbd_device *device, struct drbd_backing_dev *bdev)
 	if (device->state.disk != D_DISKLESS)
 		return ERR_DISK_CONFIGURED;
 
-	buffer = drbd_md_get_buffer(device);
+	buffer = drbd_md_get_buffer(device, __func__);
 	if (!buffer)
 		return ERR_NOMEM;
 
@@ -3466,23 +3495,19 @@ void drbd_uuid_set_bm(struct drbd_device *device, u64 val) __must_hold(local)
  *
  * Sets all bits in the bitmap and writes the whole bitmap to stable storage.
  */
-int drbd_bmio_set_n_write(struct drbd_device *device)
+int drbd_bmio_set_n_write(struct drbd_device *device) __must_hold(local)
 {
 	int rv = -EIO;
 
-	if (get_ldev_if_state(device, D_ATTACHING)) {
-		drbd_md_set_flag(device, MDF_FULL_SYNC);
-		drbd_md_sync(device);
-		drbd_bm_set_all(device);
-
-		rv = drbd_bm_write(device);
+	drbd_md_set_flag(device, MDF_FULL_SYNC);
+	drbd_md_sync(device);
+	drbd_bm_set_all(device);
 
-		if (!rv) {
-			drbd_md_clear_flag(device, MDF_FULL_SYNC);
-			drbd_md_sync(device);
-		}
+	rv = drbd_bm_write(device);
 
-		put_ldev(device);
+	if (!rv) {
+		drbd_md_clear_flag(device, MDF_FULL_SYNC);
+		drbd_md_sync(device);
 	}
 
 	return rv;
@@ -3494,18 +3519,11 @@ int drbd_bmio_set_n_write(struct drbd_device *device)
  *
  * Clears all bits in the bitmap and writes the whole bitmap to stable storage.
  */
-int drbd_bmio_clear_n_write(struct drbd_device *device)
+int drbd_bmio_clear_n_write(struct drbd_device *device) __must_hold(local)
 {
-	int rv = -EIO;
-
 	drbd_resume_al(device);
-	if (get_ldev_if_state(device, D_ATTACHING)) {
-		drbd_bm_clear_all(device);
-		rv = drbd_bm_write(device);
-		put_ldev(device);
-	}
-
-	return rv;
+	drbd_bm_clear_all(device);
+	return drbd_bm_write(device);
 }
 
 static int w_bitmap_io(struct drbd_work *w, int unused)
@@ -3537,61 +3555,6 @@ static int w_bitmap_io(struct drbd_work *w, int unused)
 	return 0;
 }
 
-void drbd_ldev_destroy(struct drbd_device *device)
-{
-	lc_destroy(device->resync);
-	device->resync = NULL;
-	lc_destroy(device->act_log);
-	device->act_log = NULL;
-	__no_warn(local,
-		drbd_free_bc(device->ldev);
-		device->ldev = NULL;);
-
-	clear_bit(GO_DISKLESS, &device->flags);
-}
-
-static int w_go_diskless(struct drbd_work *w, int unused)
-{
-	struct drbd_device *device =
-		container_of(w, struct drbd_device, go_diskless);
-
-	D_ASSERT(device, device->state.disk == D_FAILED);
-	/* we cannot assert local_cnt == 0 here, as get_ldev_if_state will
-	 * inc/dec it frequently. Once we are D_DISKLESS, no one will touch
-	 * the protected members anymore, though, so once put_ldev reaches zero
-	 * again, it will be safe to free them. */
-
-	/* Try to write changed bitmap pages, read errors may have just
-	 * set some bits outside the area covered by the activity log.
-	 *
-	 * If we have an IO error during the bitmap writeout,
-	 * we will want a full sync next time, just in case.
-	 * (Do we want a specific meta data flag for this?)
-	 *
-	 * If that does not make it to stable storage either,
-	 * we cannot do anything about that anymore.
-	 *
-	 * We still need to check if both bitmap and ldev are present, we may
-	 * end up here after a failed attach, before ldev was even assigned.
-	 */
-	if (device->bitmap && device->ldev) {
-		/* An interrupted resync or similar is allowed to recounts bits
-		 * while we detach.
-		 * Any modifications would not be expected anymore, though.
-		 */
-		if (drbd_bitmap_io_from_worker(device, drbd_bm_write,
-					"detach", BM_LOCKED_TEST_ALLOWED)) {
-			if (test_bit(WAS_READ_ERROR, &device->flags)) {
-				drbd_md_set_flag(device, MDF_FULL_SYNC);
-				drbd_md_sync(device);
-			}
-		}
-	}
-
-	drbd_force_state(device, NS(disk, D_DISKLESS));
-	return 0;
-}
-
 /**
  * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
  * @device:	DRBD device.
@@ -3603,6 +3566,9 @@ static int w_go_diskless(struct drbd_work *w, int unused)
  * that drbd_set_out_of_sync() can not be called. This function MAY ONLY be
  * called from worker context. It MUST NOT be used while a previous such
  * work is still pending!
+ *
+ * Its worker function encloses the call of io_fn() by get_ldev() and
+ * put_ldev().
  */
 void drbd_queue_bitmap_io(struct drbd_device *device,
 			  int (*io_fn)(struct drbd_device *),
@@ -3685,25 +3651,7 @@ int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag)
 static void md_sync_timer_fn(unsigned long data)
 {
 	struct drbd_device *device = (struct drbd_device *) data;
-
-	/* must not double-queue! */
-	if (list_empty(&device->md_sync_work.list))
-		drbd_queue_work_front(&first_peer_device(device)->connection->sender_work,
-				      &device->md_sync_work);
-}
-
-static int w_md_sync(struct drbd_work *w, int unused)
-{
-	struct drbd_device *device =
-		container_of(w, struct drbd_device, md_sync_work);
-
-	drbd_warn(device, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
-#ifdef DEBUG
-	drbd_warn(device, "last md_mark_dirty: %s:%u\n",
-		device->last_md_mark_dirty.func, device->last_md_mark_dirty.line);
-#endif
-	drbd_md_sync(device);
-	return 0;
+	drbd_device_post_work(device, MD_SYNC);
 }
 
 const char *cmdname(enum drbd_packet cmd)
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 3f2e16738080..1cd47df44bda 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -23,6 +23,8 @@
 
  */
 
+#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
+
 #include <linux/module.h>
 #include <linux/drbd.h>
 #include <linux/in.h>
@@ -85,7 +87,7 @@ static void drbd_adm_send_reply(struct sk_buff *skb, struct genl_info *info)
 {
 	genlmsg_end(skb, genlmsg_data(nlmsg_data(nlmsg_hdr(skb))));
 	if (genlmsg_reply(skb, info))
-		printk(KERN_ERR "drbd: error sending genl reply\n");
+		pr_err("error sending genl reply\n");
 }
 
 /* Used on a fresh "drbd_adm_prepare"d reply_skb, this cannot fail: The only
@@ -558,8 +560,10 @@ void conn_try_outdate_peer_async(struct drbd_connection *connection)
 }
 
 enum drbd_state_rv
-drbd_set_role(struct drbd_device *device, enum drbd_role new_role, int force)
+drbd_set_role(struct drbd_device *const device, enum drbd_role new_role, int force)
 {
+	struct drbd_peer_device *const peer_device = first_peer_device(device);
+	struct drbd_connection *const connection = peer_device ? peer_device->connection : NULL;
 	const int max_tries = 4;
 	enum drbd_state_rv rv = SS_UNKNOWN_ERROR;
 	struct net_conf *nc;
@@ -607,7 +611,7 @@ drbd_set_role(struct drbd_device *device, enum drbd_role new_role, int force)
 		    device->state.disk == D_CONSISTENT && mask.pdsk == 0) {
 			D_ASSERT(device, device->state.pdsk == D_UNKNOWN);
 
-			if (conn_try_outdate_peer(first_peer_device(device)->connection)) {
+			if (conn_try_outdate_peer(connection)) {
 				val.disk = D_UP_TO_DATE;
 				mask.disk = D_MASK;
 			}
@@ -617,7 +621,7 @@ drbd_set_role(struct drbd_device *device, enum drbd_role new_role, int force)
 		if (rv == SS_NOTHING_TO_DO)
 			goto out;
 		if (rv == SS_PRIMARY_NOP && mask.pdsk == 0) {
-			if (!conn_try_outdate_peer(first_peer_device(device)->connection) && force) {
+			if (!conn_try_outdate_peer(connection) && force) {
 				drbd_warn(device, "Forced into split brain situation!\n");
 				mask.pdsk = D_MASK;
 				val.pdsk  = D_OUTDATED;
@@ -630,7 +634,7 @@ drbd_set_role(struct drbd_device *device, enum drbd_role new_role, int force)
 			   retry at most once more in this case. */
 			int timeo;
 			rcu_read_lock();
-			nc = rcu_dereference(first_peer_device(device)->connection->net_conf);
+			nc = rcu_dereference(connection->net_conf);
 			timeo = nc ? (nc->ping_timeo + 1) * HZ / 10 : 1;
 			rcu_read_unlock();
 			schedule_timeout_interruptible(timeo);
@@ -659,19 +663,17 @@ drbd_set_role(struct drbd_device *device, enum drbd_role new_role, int force)
 	/* FIXME also wait for all pending P_BARRIER_ACK? */
 
 	if (new_role == R_SECONDARY) {
-		set_disk_ro(device->vdisk, true);
 		if (get_ldev(device)) {
 			device->ldev->md.uuid[UI_CURRENT] &= ~(u64)1;
 			put_ldev(device);
 		}
 	} else {
-		/* Called from drbd_adm_set_role only.
-		 * We are still holding the conf_update mutex. */
-		nc = first_peer_device(device)->connection->net_conf;
+		mutex_lock(&device->resource->conf_update);
+		nc = connection->net_conf;
 		if (nc)
 			nc->discard_my_data = 0; /* without copy; single bit op is atomic */
+		mutex_unlock(&device->resource->conf_update);
 
-		set_disk_ro(device->vdisk, false);
 		if (get_ldev(device)) {
 			if (((device->state.conn < C_CONNECTED ||
 			       device->state.pdsk <= D_FAILED)
@@ -689,12 +691,12 @@ drbd_set_role(struct drbd_device *device, enum drbd_role new_role, int force)
 	if (device->state.conn >= C_WF_REPORT_PARAMS) {
 		/* if this was forced, we should consider sync */
 		if (forced)
-			drbd_send_uuids(first_peer_device(device));
-		drbd_send_current_state(first_peer_device(device));
+			drbd_send_uuids(peer_device);
+		drbd_send_current_state(peer_device);
 	}
 
 	drbd_md_sync(device);
-
+	set_disk_ro(device->vdisk, new_role == R_SECONDARY);
 	kobject_uevent(&disk_to_dev(device->vdisk)->kobj, KOBJ_CHANGE);
 out:
 	mutex_unlock(device->state_mutex);
@@ -891,7 +893,7 @@ drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct
 	 * still lock the act_log to not trigger ASSERTs there.
 	 */
 	drbd_suspend_io(device);
-	buffer = drbd_md_get_buffer(device); /* Lock meta-data IO */
+	buffer = drbd_md_get_buffer(device, __func__); /* Lock meta-data IO */
 	if (!buffer) {
 		drbd_resume_io(device);
 		return DS_ERROR;
@@ -971,6 +973,10 @@ drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct
 	if (la_size_changed || md_moved || rs) {
 		u32 prev_flags;
 
+		/* We do some synchronous IO below, which may take some time.
+		 * Clear the timer, to avoid scary "timer expired!" messages,
+		 * "Superblock" is written out at least twice below, anyways. */
+		del_timer(&device->md_sync_timer);
 		drbd_al_shrink(device); /* All extents inactive. */
 
 		prev_flags = md->flags;
@@ -1116,15 +1122,16 @@ static int drbd_check_al_size(struct drbd_device *device, struct disk_conf *dc)
 	return 0;
 }
 
-static void drbd_setup_queue_param(struct drbd_device *device, unsigned int max_bio_size)
+static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backing_dev *bdev,
+				   unsigned int max_bio_size)
 {
 	struct request_queue * const q = device->rq_queue;
 	unsigned int max_hw_sectors = max_bio_size >> 9;
 	unsigned int max_segments = 0;
 	struct request_queue *b = NULL;
 
-	if (get_ldev_if_state(device, D_ATTACHING)) {
-		b = device->ldev->backing_bdev->bd_disk->queue;
+	if (bdev) {
+		b = bdev->backing_bdev->bd_disk->queue;
 
 		max_hw_sectors = min(queue_max_hw_sectors(b), max_bio_size >> 9);
 		rcu_read_lock();
@@ -1169,11 +1176,10 @@ static void drbd_setup_queue_param(struct drbd_device *device, unsigned int max_
 				 b->backing_dev_info.ra_pages);
 			q->backing_dev_info.ra_pages = b->backing_dev_info.ra_pages;
 		}
-		put_ldev(device);
 	}
 }
 
-void drbd_reconsider_max_bio_size(struct drbd_device *device)
+void drbd_reconsider_max_bio_size(struct drbd_device *device, struct drbd_backing_dev *bdev)
 {
 	unsigned int now, new, local, peer;
 
@@ -1181,10 +1187,9 @@ void drbd_reconsider_max_bio_size(struct drbd_device *device)
 	local = device->local_max_bio_size; /* Eventually last known value, from volatile memory */
 	peer = device->peer_max_bio_size; /* Eventually last known value, from meta data */
 
-	if (get_ldev_if_state(device, D_ATTACHING)) {
-		local = queue_max_hw_sectors(device->ldev->backing_bdev->bd_disk->queue) << 9;
+	if (bdev) {
+		local = queue_max_hw_sectors(bdev->backing_bdev->bd_disk->queue) << 9;
 		device->local_max_bio_size = local;
-		put_ldev(device);
 	}
 	local = min(local, DRBD_MAX_BIO_SIZE);
 
@@ -1217,7 +1222,7 @@ void drbd_reconsider_max_bio_size(struct drbd_device *device)
 	if (new != now)
 		drbd_info(device, "max BIO size = %u\n", new);
 
-	drbd_setup_queue_param(device, new);
+	drbd_setup_queue_param(device, bdev, new);
 }
 
 /* Starts the worker thread */
@@ -1299,6 +1304,13 @@ static unsigned int drbd_al_extents_max(struct drbd_backing_dev *bdev)
 	return (al_size_4k - 1) * AL_CONTEXT_PER_TRANSACTION;
 }
 
+static bool write_ordering_changed(struct disk_conf *a, struct disk_conf *b)
+{
+	return	a->disk_barrier != b->disk_barrier ||
+		a->disk_flushes != b->disk_flushes ||
+		a->disk_drain != b->disk_drain;
+}
+
 int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info)
 {
 	struct drbd_config_context adm_ctx;
@@ -1405,7 +1417,8 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info)
 	else
 		set_bit(MD_NO_FUA, &device->flags);
 
-	drbd_bump_write_ordering(first_peer_device(device)->connection, WO_bdev_flush);
+	if (write_ordering_changed(old_disk_conf, new_disk_conf))
+		drbd_bump_write_ordering(device->resource, NULL, WO_bdev_flush);
 
 	drbd_md_sync(device);
 
@@ -1440,6 +1453,8 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
 {
 	struct drbd_config_context adm_ctx;
 	struct drbd_device *device;
+	struct drbd_peer_device *peer_device;
+	struct drbd_connection *connection;
 	int err;
 	enum drbd_ret_code retcode;
 	enum determine_dev_size dd;
@@ -1462,7 +1477,9 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
 
 	device = adm_ctx.device;
 	mutex_lock(&adm_ctx.resource->adm_mutex);
-	conn_reconfig_start(first_peer_device(device)->connection);
+	peer_device = first_peer_device(device);
+	connection = peer_device ? peer_device->connection : NULL;
+	conn_reconfig_start(connection);
 
 	/* if you want to reconfigure, please tear down first */
 	if (device->state.disk > D_DISKLESS) {
@@ -1473,7 +1490,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
 	 * drbd_ldev_destroy is done already, we may end up here very fast,
 	 * e.g. if someone calls attach from the on-io-error handler,
 	 * to realize a "hot spare" feature (not that I'd recommend that) */
-	wait_event(device->misc_wait, !atomic_read(&device->local_cnt));
+	wait_event(device->misc_wait, !test_bit(GOING_DISKLESS, &device->flags));
 
 	/* make sure there is no leftover from previous force-detach attempts */
 	clear_bit(FORCE_DETACH, &device->flags);
@@ -1529,7 +1546,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
 		goto fail;
 
 	rcu_read_lock();
-	nc = rcu_dereference(first_peer_device(device)->connection->net_conf);
+	nc = rcu_dereference(connection->net_conf);
 	if (nc) {
 		if (new_disk_conf->fencing == FP_STONITH && nc->wire_protocol == DRBD_PROT_A) {
 			rcu_read_unlock();
@@ -1649,7 +1666,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
 	 */
 	wait_event(device->misc_wait, !atomic_read(&device->ap_pending_cnt) || drbd_suspended(device));
 	/* and for any other previously queued work */
-	drbd_flush_workqueue(&first_peer_device(device)->connection->sender_work);
+	drbd_flush_workqueue(&connection->sender_work);
 
 	rv = _drbd_request_state(device, NS(disk, D_ATTACHING), CS_VERBOSE);
 	retcode = rv;  /* FIXME: Type mismatch. */
@@ -1710,7 +1727,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
 	new_disk_conf = NULL;
 	new_plan = NULL;
 
-	drbd_bump_write_ordering(first_peer_device(device)->connection, WO_bdev_flush);
+	drbd_bump_write_ordering(device->resource, device->ldev, WO_bdev_flush);
 
 	if (drbd_md_test_flag(device->ldev, MDF_CRASHED_PRIMARY))
 		set_bit(CRASHED_PRIMARY, &device->flags);
@@ -1726,7 +1743,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
 	device->read_cnt = 0;
 	device->writ_cnt = 0;
 
-	drbd_reconsider_max_bio_size(device);
+	drbd_reconsider_max_bio_size(device, device->ldev);
 
 	/* If I am currently not R_PRIMARY,
 	 * but meta data primary indicator is set,
@@ -1845,7 +1862,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
 
 	kobject_uevent(&disk_to_dev(device->vdisk)->kobj, KOBJ_CHANGE);
 	put_ldev(device);
-	conn_reconfig_done(first_peer_device(device)->connection);
+	conn_reconfig_done(connection);
 	mutex_unlock(&adm_ctx.resource->adm_mutex);
 	drbd_adm_finish(&adm_ctx, info, retcode);
 	return 0;
@@ -1856,7 +1873,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
 	drbd_force_state(device, NS(disk, D_DISKLESS));
 	drbd_md_sync(device);
  fail:
-	conn_reconfig_done(first_peer_device(device)->connection);
+	conn_reconfig_done(connection);
 	if (nbc) {
 		if (nbc->backing_bdev)
 			blkdev_put(nbc->backing_bdev,
@@ -1888,7 +1905,7 @@ static int adm_detach(struct drbd_device *device, int force)
 	}
 
 	drbd_suspend_io(device); /* so no-one is stuck in drbd_al_begin_io */
-	drbd_md_get_buffer(device); /* make sure there is no in-flight meta-data IO */
+	drbd_md_get_buffer(device, __func__); /* make sure there is no in-flight meta-data IO */
 	retcode = drbd_request_state(device, NS(disk, D_FAILED));
 	drbd_md_put_buffer(device);
 	/* D_FAILED will transition to DISKLESS. */
@@ -2654,8 +2671,13 @@ int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info)
 	if (retcode != NO_ERROR)
 		goto out;
 
-	mutex_lock(&adm_ctx.resource->adm_mutex);
 	device = adm_ctx.device;
+	if (!get_ldev(device)) {
+		retcode = ERR_NO_DISK;
+		goto out;
+	}
+
+	mutex_lock(&adm_ctx.resource->adm_mutex);
 
 	/* If there is still bitmap IO pending, probably because of a previous
 	 * resync just being finished, wait for it before requesting a new resync.
@@ -2679,6 +2701,7 @@ int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info)
 		retcode = drbd_request_state(device, NS(conn, C_STARTING_SYNC_T));
 	drbd_resume_io(device);
 	mutex_unlock(&adm_ctx.resource->adm_mutex);
+	put_ldev(device);
 out:
 	drbd_adm_finish(&adm_ctx, info, retcode);
 	return 0;
@@ -2704,7 +2727,7 @@ out:
 	return 0;
 }
 
-static int drbd_bmio_set_susp_al(struct drbd_device *device)
+static int drbd_bmio_set_susp_al(struct drbd_device *device) __must_hold(local)
 {
 	int rv;
 
@@ -2725,8 +2748,13 @@ int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info)
 	if (retcode != NO_ERROR)
 		goto out;
 
-	mutex_lock(&adm_ctx.resource->adm_mutex);
 	device = adm_ctx.device;
+	if (!get_ldev(device)) {
+		retcode = ERR_NO_DISK;
+		goto out;
+	}
+
+	mutex_lock(&adm_ctx.resource->adm_mutex);
 
 	/* If there is still bitmap IO pending, probably because of a previous
 	 * resync just being finished, wait for it before requesting a new resync.
@@ -2753,6 +2781,7 @@ int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info)
 		retcode = drbd_request_state(device, NS(conn, C_STARTING_SYNC_S));
 	drbd_resume_io(device);
 	mutex_unlock(&adm_ctx.resource->adm_mutex);
+	put_ldev(device);
 out:
 	drbd_adm_finish(&adm_ctx, info, retcode);
 	return 0;
@@ -2892,7 +2921,7 @@ static struct drbd_connection *the_only_connection(struct drbd_resource *resourc
 	return list_first_entry(&resource->connections, struct drbd_connection, connections);
 }
 
-int nla_put_status_info(struct sk_buff *skb, struct drbd_device *device,
+static int nla_put_status_info(struct sk_buff *skb, struct drbd_device *device,
 		const struct sib_info *sib)
 {
 	struct drbd_resource *resource = device->resource;
@@ -3622,13 +3651,6 @@ void drbd_bcast_event(struct drbd_device *device, const struct sib_info *sib)
 	unsigned seq;
 	int err = -ENOMEM;
 
-	if (sib->sib_reason == SIB_SYNC_PROGRESS) {
-		if (time_after(jiffies, device->rs_last_bcast + HZ))
-			device->rs_last_bcast = jiffies;
-		else
-			return;
-	}
-
 	seq = atomic_inc_return(&drbd_genl_seq);
 	msg = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO);
 	if (!msg)
diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c
index 89736bdbbc70..06e6147c7601 100644
--- a/drivers/block/drbd/drbd_proc.c
+++ b/drivers/block/drbd/drbd_proc.c
@@ -60,20 +60,65 @@ static void seq_printf_with_thousands_grouping(struct seq_file *seq, long v)
 		seq_printf(seq, "%ld", v);
 }
 
+static void drbd_get_syncer_progress(struct drbd_device *device,
+		union drbd_dev_state state, unsigned long *rs_total,
+		unsigned long *bits_left, unsigned int *per_mil_done)
+{
+	/* this is to break it at compile time when we change that, in case we
+	 * want to support more than (1<<32) bits on a 32bit arch. */
+	typecheck(unsigned long, device->rs_total);
+	*rs_total = device->rs_total;
+
+	/* note: both rs_total and rs_left are in bits, i.e. in
+	 * units of BM_BLOCK_SIZE.
+	 * for the percentage, we don't care. */
+
+	if (state.conn == C_VERIFY_S || state.conn == C_VERIFY_T)
+		*bits_left = device->ov_left;
+	else
+		*bits_left = drbd_bm_total_weight(device) - device->rs_failed;
+	/* >> 10 to prevent overflow,
+	 * +1 to prevent division by zero */
+	if (*bits_left > *rs_total) {
+		/* D'oh. Maybe a logic bug somewhere.  More likely just a race
+		 * between state change and reset of rs_total.
+		 */
+		*bits_left = *rs_total;
+		*per_mil_done = *rs_total ? 0 : 1000;
+	} else {
+		/* Make sure the division happens in long context.
+		 * We allow up to one petabyte storage right now,
+		 * at a granularity of 4k per bit that is 2**38 bits.
+		 * After shift right and multiplication by 1000,
+		 * this should still fit easily into a 32bit long,
+		 * so we don't need a 64bit division on 32bit arch.
+		 * Note: currently we don't support such large bitmaps on 32bit
+		 * arch anyways, but no harm done to be prepared for it here.
+		 */
+		unsigned int shift = *rs_total > UINT_MAX ? 16 : 10;
+		unsigned long left = *bits_left >> shift;
+		unsigned long total = 1UL + (*rs_total >> shift);
+		unsigned long tmp = 1000UL - left * 1000UL/total;
+		*per_mil_done = tmp;
+	}
+}
+
+
 /*lge
  * progress bars shamelessly adapted from driver/md/md.c
  * output looks like
  *	[=====>..............] 33.5% (23456/123456)
  *	finish: 2:20:20 speed: 6,345 (6,456) K/sec
  */
-static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *seq)
+static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *seq,
+		union drbd_dev_state state)
 {
-	unsigned long db, dt, dbdt, rt, rs_left;
+	unsigned long db, dt, dbdt, rt, rs_total, rs_left;
 	unsigned int res;
 	int i, x, y;
 	int stalled = 0;
 
-	drbd_get_syncer_progress(device, &rs_left, &res);
+	drbd_get_syncer_progress(device, state, &rs_total, &rs_left, &res);
 
 	x = res/50;
 	y = 20-x;
@@ -85,21 +130,21 @@ static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *se
 		seq_printf(seq, ".");
 	seq_printf(seq, "] ");
 
-	if (device->state.conn == C_VERIFY_S || device->state.conn == C_VERIFY_T)
+	if (state.conn == C_VERIFY_S || state.conn == C_VERIFY_T)
 		seq_printf(seq, "verified:");
 	else
 		seq_printf(seq, "sync'ed:");
 	seq_printf(seq, "%3u.%u%% ", res / 10, res % 10);
 
 	/* if more than a few GB, display in MB */
-	if (device->rs_total > (4UL << (30 - BM_BLOCK_SHIFT)))
+	if (rs_total > (4UL << (30 - BM_BLOCK_SHIFT)))
 		seq_printf(seq, "(%lu/%lu)M",
 			    (unsigned long) Bit2KB(rs_left >> 10),
-			    (unsigned long) Bit2KB(device->rs_total >> 10));
+			    (unsigned long) Bit2KB(rs_total >> 10));
 	else
 		seq_printf(seq, "(%lu/%lu)K\n\t",
 			    (unsigned long) Bit2KB(rs_left),
-			    (unsigned long) Bit2KB(device->rs_total));
+			    (unsigned long) Bit2KB(rs_total));
 
 	/* see drivers/md/md.c
 	 * We do not want to overflow, so the order of operands and
@@ -150,13 +195,13 @@ static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *se
 	dt = (jiffies - device->rs_start - device->rs_paused) / HZ;
 	if (dt == 0)
 		dt = 1;
-	db = device->rs_total - rs_left;
+	db = rs_total - rs_left;
 	dbdt = Bit2KB(db/dt);
 	seq_printf_with_thousands_grouping(seq, dbdt);
 	seq_printf(seq, ")");
 
-	if (device->state.conn == C_SYNC_TARGET ||
-	    device->state.conn == C_VERIFY_S) {
+	if (state.conn == C_SYNC_TARGET ||
+	    state.conn == C_VERIFY_S) {
 		seq_printf(seq, " want: ");
 		seq_printf_with_thousands_grouping(seq, device->c_sync_rate);
 	}
@@ -168,8 +213,8 @@ static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *se
 		unsigned long bm_bits = drbd_bm_bits(device);
 		unsigned long bit_pos;
 		unsigned long long stop_sector = 0;
-		if (device->state.conn == C_VERIFY_S ||
-		    device->state.conn == C_VERIFY_T) {
+		if (state.conn == C_VERIFY_S ||
+		    state.conn == C_VERIFY_T) {
 			bit_pos = bm_bits - device->ov_left;
 			if (verify_can_do_stop_sector(device))
 				stop_sector = device->ov_stop_sector;
@@ -188,22 +233,13 @@ static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *se
 	}
 }
 
-static void resync_dump_detail(struct seq_file *seq, struct lc_element *e)
-{
-	struct bm_extent *bme = lc_entry(e, struct bm_extent, lce);
-
-	seq_printf(seq, "%5d %s %s\n", bme->rs_left,
-		   bme->flags & BME_NO_WRITES ? "NO_WRITES" : "---------",
-		   bme->flags & BME_LOCKED ? "LOCKED" : "------"
-		   );
-}
-
 static int drbd_seq_show(struct seq_file *seq, void *v)
 {
 	int i, prev_i = -1;
 	const char *sn;
 	struct drbd_device *device;
 	struct net_conf *nc;
+	union drbd_dev_state state;
 	char wp;
 
 	static char write_ordering_chars[] = {
@@ -241,11 +277,12 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
 			seq_printf(seq, "\n");
 		prev_i = i;
 
-		sn = drbd_conn_str(device->state.conn);
+		state = device->state;
+		sn = drbd_conn_str(state.conn);
 
-		if (device->state.conn == C_STANDALONE &&
-		    device->state.disk == D_DISKLESS &&
-		    device->state.role == R_SECONDARY) {
+		if (state.conn == C_STANDALONE &&
+		    state.disk == D_DISKLESS &&
+		    state.role == R_SECONDARY) {
 			seq_printf(seq, "%2d: cs:Unconfigured\n", i);
 		} else {
 			/* reset device->congestion_reason */
@@ -258,15 +295,15 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
 			   "    ns:%u nr:%u dw:%u dr:%u al:%u bm:%u "
 			   "lo:%d pe:%d ua:%d ap:%d ep:%d wo:%c",
 			   i, sn,
-			   drbd_role_str(device->state.role),
-			   drbd_role_str(device->state.peer),
-			   drbd_disk_str(device->state.disk),
-			   drbd_disk_str(device->state.pdsk),
+			   drbd_role_str(state.role),
+			   drbd_role_str(state.peer),
+			   drbd_disk_str(state.disk),
+			   drbd_disk_str(state.pdsk),
 			   wp,
 			   drbd_suspended(device) ? 's' : 'r',
-			   device->state.aftr_isp ? 'a' : '-',
-			   device->state.peer_isp ? 'p' : '-',
-			   device->state.user_isp ? 'u' : '-',
+			   state.aftr_isp ? 'a' : '-',
+			   state.peer_isp ? 'p' : '-',
+			   state.user_isp ? 'u' : '-',
 			   device->congestion_reason ?: '-',
 			   test_bit(AL_SUSPENDED, &device->flags) ? 's' : '-',
 			   device->send_cnt/2,
@@ -281,17 +318,17 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
 			   atomic_read(&device->unacked_cnt),
 			   atomic_read(&device->ap_bio_cnt),
 			   first_peer_device(device)->connection->epochs,
-			   write_ordering_chars[first_peer_device(device)->connection->write_ordering]
+			   write_ordering_chars[device->resource->write_ordering]
 			);
 			seq_printf(seq, " oos:%llu\n",
 				   Bit2KB((unsigned long long)
 					   drbd_bm_total_weight(device)));
 		}
-		if (device->state.conn == C_SYNC_SOURCE ||
-		    device->state.conn == C_SYNC_TARGET ||
-		    device->state.conn == C_VERIFY_S ||
-		    device->state.conn == C_VERIFY_T)
-			drbd_syncer_progress(device, seq);
+		if (state.conn == C_SYNC_SOURCE ||
+		    state.conn == C_SYNC_TARGET ||
+		    state.conn == C_VERIFY_S ||
+		    state.conn == C_VERIFY_T)
+			drbd_syncer_progress(device, seq, state);
 
 		if (proc_details >= 1 && get_ldev_if_state(device, D_FAILED)) {
 			lc_seq_printf_stats(seq, device->resync);
@@ -299,12 +336,8 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
 			put_ldev(device);
 		}
 
-		if (proc_details >= 2) {
-			if (device->resync) {
-				lc_seq_dump_details(seq, device->resync, "rs_left",
-					resync_dump_detail);
-			}
-		}
+		if (proc_details >= 2)
+			seq_printf(seq, "\tblocked on activity log: %d\n", atomic_read(&device->ap_actlog_cnt));
 	}
 	rcu_read_unlock();
 
@@ -316,7 +349,7 @@ static int drbd_proc_open(struct inode *inode, struct file *file)
 	int err;
 
 	if (try_module_get(THIS_MODULE)) {
-		err = single_open(file, drbd_seq_show, PDE_DATA(inode));
+		err = single_open(file, drbd_seq_show, NULL);
 		if (err)
 			module_put(THIS_MODULE);
 		return err;
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 5b17ec88ea05..9342b8da73ab 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -362,17 +362,14 @@ drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t secto
 			goto fail;
 	}
 
+	memset(peer_req, 0, sizeof(*peer_req));
+	INIT_LIST_HEAD(&peer_req->w.list);
 	drbd_clear_interval(&peer_req->i);
 	peer_req->i.size = data_size;
 	peer_req->i.sector = sector;
-	peer_req->i.local = false;
-	peer_req->i.waiting = false;
-
-	peer_req->epoch = NULL;
+	peer_req->submit_jif = jiffies;
 	peer_req->peer_device = peer_device;
 	peer_req->pages = page;
-	atomic_set(&peer_req->pending_bios, 0);
-	peer_req->flags = 0;
 	/*
 	 * The block_id is opaque to the receiver.  It is not endianness
 	 * converted, and sent back to the sender unchanged.
@@ -389,11 +386,16 @@ drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t secto
 void __drbd_free_peer_req(struct drbd_device *device, struct drbd_peer_request *peer_req,
 		       int is_net)
 {
+	might_sleep();
 	if (peer_req->flags & EE_HAS_DIGEST)
 		kfree(peer_req->digest);
 	drbd_free_pages(device, peer_req->pages, is_net);
 	D_ASSERT(device, atomic_read(&peer_req->pending_bios) == 0);
 	D_ASSERT(device, drbd_interval_empty(&peer_req->i));
+	if (!expect(!(peer_req->flags & EE_CALL_AL_COMPLETE_IO))) {
+		peer_req->flags &= ~EE_CALL_AL_COMPLETE_IO;
+		drbd_al_complete_io(device, &peer_req->i);
+	}
 	mempool_free(peer_req, drbd_ee_mempool);
 }
 
@@ -791,8 +793,18 @@ static int receive_first_packet(struct drbd_connection *connection, struct socke
 {
 	unsigned int header_size = drbd_header_size(connection);
 	struct packet_info pi;
+	struct net_conf *nc;
 	int err;
 
+	rcu_read_lock();
+	nc = rcu_dereference(connection->net_conf);
+	if (!nc) {
+		rcu_read_unlock();
+		return -EIO;
+	}
+	sock->sk->sk_rcvtimeo = nc->ping_timeo * 4 * HZ / 10;
+	rcu_read_unlock();
+
 	err = drbd_recv_short(sock, connection->data.rbuf, header_size, 0);
 	if (err != header_size) {
 		if (err >= 0)
@@ -809,7 +821,7 @@ static int receive_first_packet(struct drbd_connection *connection, struct socke
  * drbd_socket_okay() - Free the socket if its connection is not okay
  * @sock:	pointer to the pointer to the socket.
  */
-static int drbd_socket_okay(struct socket **sock)
+static bool drbd_socket_okay(struct socket **sock)
 {
 	int rr;
 	char tb[4];
@@ -827,6 +839,30 @@ static int drbd_socket_okay(struct socket **sock)
 		return false;
 	}
 }
+
+static bool connection_established(struct drbd_connection *connection,
+				   struct socket **sock1,
+				   struct socket **sock2)
+{
+	struct net_conf *nc;
+	int timeout;
+	bool ok;
+
+	if (!*sock1 || !*sock2)
+		return false;
+
+	rcu_read_lock();
+	nc = rcu_dereference(connection->net_conf);
+	timeout = (nc->sock_check_timeo ?: nc->ping_timeo) * HZ / 10;
+	rcu_read_unlock();
+	schedule_timeout_interruptible(timeout);
+
+	ok = drbd_socket_okay(sock1);
+	ok = drbd_socket_okay(sock2) && ok;
+
+	return ok;
+}
+
 /* Gets called if a connection is established, or if a new minor gets created
    in a connection */
 int drbd_connected(struct drbd_peer_device *peer_device)
@@ -868,8 +904,8 @@ static int conn_connect(struct drbd_connection *connection)
 	struct drbd_socket sock, msock;
 	struct drbd_peer_device *peer_device;
 	struct net_conf *nc;
-	int vnr, timeout, h, ok;
-	bool discard_my_data;
+	int vnr, timeout, h;
+	bool discard_my_data, ok;
 	enum drbd_state_rv rv;
 	struct accept_wait_data ad = {
 		.connection = connection,
@@ -913,17 +949,8 @@ static int conn_connect(struct drbd_connection *connection)
 			}
 		}
 
-		if (sock.socket && msock.socket) {
-			rcu_read_lock();
-			nc = rcu_dereference(connection->net_conf);
-			timeout = nc->ping_timeo * HZ / 10;
-			rcu_read_unlock();
-			schedule_timeout_interruptible(timeout);
-			ok = drbd_socket_okay(&sock.socket);
-			ok = drbd_socket_okay(&msock.socket) && ok;
-			if (ok)
-				break;
-		}
+		if (connection_established(connection, &sock.socket, &msock.socket))
+			break;
 
 retry:
 		s = drbd_wait_for_connect(connection, &ad);
@@ -969,8 +996,7 @@ randomize:
 				goto out_release_sockets;
 		}
 
-		ok = drbd_socket_okay(&sock.socket);
-		ok = drbd_socket_okay(&msock.socket) && ok;
+		ok = connection_established(connection, &sock.socket, &msock.socket);
 	} while (!ok);
 
 	if (ad.s_listen)
@@ -1151,7 +1177,7 @@ static void drbd_flush(struct drbd_connection *connection)
 	struct drbd_peer_device *peer_device;
 	int vnr;
 
-	if (connection->write_ordering >= WO_bdev_flush) {
+	if (connection->resource->write_ordering >= WO_bdev_flush) {
 		rcu_read_lock();
 		idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
 			struct drbd_device *device = peer_device->device;
@@ -1161,14 +1187,22 @@ static void drbd_flush(struct drbd_connection *connection)
 			kref_get(&device->kref);
 			rcu_read_unlock();
 
+			/* Right now, we have only this one synchronous code path
+			 * for flushes between request epochs.
+			 * We may want to make those asynchronous,
+			 * or at least parallelize the flushes to the volume devices.
+			 */
+			device->flush_jif = jiffies;
+			set_bit(FLUSH_PENDING, &device->flags);
 			rv = blkdev_issue_flush(device->ldev->backing_bdev,
 					GFP_NOIO, NULL);
+			clear_bit(FLUSH_PENDING, &device->flags);
 			if (rv) {
 				drbd_info(device, "local disk flush failed with status %d\n", rv);
 				/* would rather check on EOPNOTSUPP, but that is not reliable.
 				 * don't try again for ANY return value != 0
 				 * if (rv == -EOPNOTSUPP) */
-				drbd_bump_write_ordering(connection, WO_drain_io);
+				drbd_bump_write_ordering(connection->resource, NULL, WO_drain_io);
 			}
 			put_ldev(device);
 			kref_put(&device->kref, drbd_destroy_device);
@@ -1257,15 +1291,30 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_connection *connectio
 	return rv;
 }
 
+static enum write_ordering_e
+max_allowed_wo(struct drbd_backing_dev *bdev, enum write_ordering_e wo)
+{
+	struct disk_conf *dc;
+
+	dc = rcu_dereference(bdev->disk_conf);
+
+	if (wo == WO_bdev_flush && !dc->disk_flushes)
+		wo = WO_drain_io;
+	if (wo == WO_drain_io && !dc->disk_drain)
+		wo = WO_none;
+
+	return wo;
+}
+
 /**
  * drbd_bump_write_ordering() - Fall back to an other write ordering method
  * @connection:	DRBD connection.
  * @wo:		Write ordering method to try.
  */
-void drbd_bump_write_ordering(struct drbd_connection *connection, enum write_ordering_e wo)
+void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backing_dev *bdev,
+			      enum write_ordering_e wo)
 {
-	struct disk_conf *dc;
-	struct drbd_peer_device *peer_device;
+	struct drbd_device *device;
 	enum write_ordering_e pwo;
 	int vnr;
 	static char *write_ordering_str[] = {
@@ -1274,26 +1323,27 @@ void drbd_bump_write_ordering(struct drbd_connection *connection, enum write_ord
 		[WO_bdev_flush] = "flush",
 	};
 
-	pwo = connection->write_ordering;
-	wo = min(pwo, wo);
+	pwo = resource->write_ordering;
+	if (wo != WO_bdev_flush)
+		wo = min(pwo, wo);
 	rcu_read_lock();
-	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
-		struct drbd_device *device = peer_device->device;
+	idr_for_each_entry(&resource->devices, device, vnr) {
+		if (get_ldev(device)) {
+			wo = max_allowed_wo(device->ldev, wo);
+			if (device->ldev == bdev)
+				bdev = NULL;
+			put_ldev(device);
+		}
+	}
 
-		if (!get_ldev_if_state(device, D_ATTACHING))
-			continue;
-		dc = rcu_dereference(device->ldev->disk_conf);
+	if (bdev)
+		wo = max_allowed_wo(bdev, wo);
 
-		if (wo == WO_bdev_flush && !dc->disk_flushes)
-			wo = WO_drain_io;
-		if (wo == WO_drain_io && !dc->disk_drain)
-			wo = WO_none;
-		put_ldev(device);
-	}
 	rcu_read_unlock();
-	connection->write_ordering = wo;
-	if (pwo != connection->write_ordering || wo == WO_bdev_flush)
-		drbd_info(connection, "Method to ensure write ordering: %s\n", write_ordering_str[connection->write_ordering]);
+
+	resource->write_ordering = wo;
+	if (pwo != resource->write_ordering || wo == WO_bdev_flush)
+		drbd_info(resource, "Method to ensure write ordering: %s\n", write_ordering_str[resource->write_ordering]);
 }
 
 /**
@@ -1330,6 +1380,13 @@ int drbd_submit_peer_request(struct drbd_device *device,
 		/* wait for all pending IO completions, before we start
 		 * zeroing things out. */
 		conn_wait_active_ee_empty(first_peer_device(device)->connection);
+		/* add it to the active list now,
+		 * so we can find it to present it in debugfs */
+		peer_req->submit_jif = jiffies;
+		peer_req->flags |= EE_SUBMITTED;
+		spin_lock_irq(&device->resource->req_lock);
+		list_add_tail(&peer_req->w.list, &device->active_ee);
+		spin_unlock_irq(&device->resource->req_lock);
 		if (blkdev_issue_zeroout(device->ldev->backing_bdev,
 			sector, ds >> 9, GFP_NOIO))
 			peer_req->flags |= EE_WAS_ERROR;
@@ -1398,6 +1455,9 @@ submit:
 	D_ASSERT(device, page == NULL);
 
 	atomic_set(&peer_req->pending_bios, n_bios);
+	/* for debugfs: update timestamp, mark as submitted */
+	peer_req->submit_jif = jiffies;
+	peer_req->flags |= EE_SUBMITTED;
 	do {
 		bio = bios;
 		bios = bios->bi_next;
@@ -1471,7 +1531,7 @@ static int receive_Barrier(struct drbd_connection *connection, struct packet_inf
 	 * R_PRIMARY crashes now.
 	 * Therefore we must send the barrier_ack after the barrier request was
 	 * completed. */
-	switch (connection->write_ordering) {
+	switch (connection->resource->write_ordering) {
 	case WO_none:
 		if (rv == FE_RECYCLED)
 			return 0;
@@ -1498,7 +1558,8 @@ static int receive_Barrier(struct drbd_connection *connection, struct packet_inf
 
 		return 0;
 	default:
-		drbd_err(connection, "Strangeness in connection->write_ordering %d\n", connection->write_ordering);
+		drbd_err(connection, "Strangeness in connection->write_ordering %d\n",
+			 connection->resource->write_ordering);
 		return -EIO;
 	}
 
@@ -1531,7 +1592,7 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
 	struct drbd_peer_request *peer_req;
 	struct page *page;
 	int dgs, ds, err;
-	int data_size = pi->size;
+	unsigned int data_size = pi->size;
 	void *dig_in = peer_device->connection->int_dig_in;
 	void *dig_vv = peer_device->connection->int_dig_vv;
 	unsigned long *data;
@@ -1578,6 +1639,7 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
 	if (!peer_req)
 		return NULL;
 
+	peer_req->flags |= EE_WRITE;
 	if (trim)
 		return peer_req;
 
@@ -1734,9 +1796,10 @@ static int recv_resync_read(struct drbd_peer_device *peer_device, sector_t secto
 	 * respective _drbd_clear_done_ee */
 
 	peer_req->w.cb = e_end_resync_block;
+	peer_req->submit_jif = jiffies;
 
 	spin_lock_irq(&device->resource->req_lock);
-	list_add(&peer_req->w.list, &device->sync_ee);
+	list_add_tail(&peer_req->w.list, &device->sync_ee);
 	spin_unlock_irq(&device->resource->req_lock);
 
 	atomic_add(pi->size >> 9, &device->rs_sect_ev);
@@ -1889,6 +1952,7 @@ static int e_end_block(struct drbd_work *w, int cancel)
 		}
 		dec_unacked(device);
 	}
+
 	/* we delete from the conflict detection hash _after_ we sent out the
 	 * P_WRITE_ACK / P_NEG_ACK, to get the sequence number right.  */
 	if (peer_req->flags & EE_IN_INTERVAL_TREE) {
@@ -2115,6 +2179,8 @@ static int handle_write_conflicts(struct drbd_device *device,
 	drbd_for_each_overlap(i, &device->write_requests, sector, size) {
 		if (i == &peer_req->i)
 			continue;
+		if (i->completed)
+			continue;
 
 		if (!i->local) {
 			/*
@@ -2147,7 +2213,6 @@ static int handle_write_conflicts(struct drbd_device *device,
 					  (unsigned long long)sector, size,
 					  superseded ? "local" : "remote");
 
-			inc_unacked(device);
 			peer_req->w.cb = superseded ? e_send_superseded :
 						   e_send_retry_write;
 			list_add_tail(&peer_req->w.list, &device->done_ee);
@@ -2206,6 +2271,7 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
 {
 	struct drbd_peer_device *peer_device;
 	struct drbd_device *device;
+	struct net_conf *nc;
 	sector_t sector;
 	struct drbd_peer_request *peer_req;
 	struct p_data *p = pi->data;
@@ -2245,6 +2311,8 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
 	}
 
 	peer_req->w.cb = e_end_block;
+	peer_req->submit_jif = jiffies;
+	peer_req->flags |= EE_APPLICATION;
 
 	dp_flags = be32_to_cpu(p->dp_flags);
 	rw |= wire_flags_to_bio(dp_flags);
@@ -2271,9 +2339,36 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
 	spin_unlock(&connection->epoch_lock);
 
 	rcu_read_lock();
-	tp = rcu_dereference(peer_device->connection->net_conf)->two_primaries;
+	nc = rcu_dereference(peer_device->connection->net_conf);
+	tp = nc->two_primaries;
+	if (peer_device->connection->agreed_pro_version < 100) {
+		switch (nc->wire_protocol) {
+		case DRBD_PROT_C:
+			dp_flags |= DP_SEND_WRITE_ACK;
+			break;
+		case DRBD_PROT_B:
+			dp_flags |= DP_SEND_RECEIVE_ACK;
+			break;
+		}
+	}
 	rcu_read_unlock();
+
+	if (dp_flags & DP_SEND_WRITE_ACK) {
+		peer_req->flags |= EE_SEND_WRITE_ACK;
+		inc_unacked(device);
+		/* corresponding dec_unacked() in e_end_block()
+		 * respective _drbd_clear_done_ee */
+	}
+
+	if (dp_flags & DP_SEND_RECEIVE_ACK) {
+		/* I really don't like it that the receiver thread
+		 * sends on the msock, but anyways */
+		drbd_send_ack(first_peer_device(device), P_RECV_ACK, peer_req);
+	}
+
 	if (tp) {
+		/* two primaries implies protocol C */
+		D_ASSERT(device, dp_flags & DP_SEND_WRITE_ACK);
 		peer_req->flags |= EE_IN_INTERVAL_TREE;
 		err = wait_for_and_update_peer_seq(peer_device, peer_seq);
 		if (err)
@@ -2297,44 +2392,18 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
 	 * active_ee to become empty in drbd_submit_peer_request();
 	 * better not add ourselves here. */
 	if ((peer_req->flags & EE_IS_TRIM_USE_ZEROOUT) == 0)
-		list_add(&peer_req->w.list, &device->active_ee);
+		list_add_tail(&peer_req->w.list, &device->active_ee);
 	spin_unlock_irq(&device->resource->req_lock);
 
 	if (device->state.conn == C_SYNC_TARGET)
 		wait_event(device->ee_wait, !overlapping_resync_write(device, peer_req));
 
-	if (peer_device->connection->agreed_pro_version < 100) {
-		rcu_read_lock();
-		switch (rcu_dereference(peer_device->connection->net_conf)->wire_protocol) {
-		case DRBD_PROT_C:
-			dp_flags |= DP_SEND_WRITE_ACK;
-			break;
-		case DRBD_PROT_B:
-			dp_flags |= DP_SEND_RECEIVE_ACK;
-			break;
-		}
-		rcu_read_unlock();
-	}
-
-	if (dp_flags & DP_SEND_WRITE_ACK) {
-		peer_req->flags |= EE_SEND_WRITE_ACK;
-		inc_unacked(device);
-		/* corresponding dec_unacked() in e_end_block()
-		 * respective _drbd_clear_done_ee */
-	}
-
-	if (dp_flags & DP_SEND_RECEIVE_ACK) {
-		/* I really don't like it that the receiver thread
-		 * sends on the msock, but anyways */
-		drbd_send_ack(first_peer_device(device), P_RECV_ACK, peer_req);
-	}
-
 	if (device->state.pdsk < D_INCONSISTENT) {
 		/* In case we have the only disk of the cluster, */
 		drbd_set_out_of_sync(device, peer_req->i.sector, peer_req->i.size);
-		peer_req->flags |= EE_CALL_AL_COMPLETE_IO;
 		peer_req->flags &= ~EE_MAY_SET_IN_SYNC;
-		drbd_al_begin_io(device, &peer_req->i, true);
+		drbd_al_begin_io(device, &peer_req->i);
+		peer_req->flags |= EE_CALL_AL_COMPLETE_IO;
 	}
 
 	err = drbd_submit_peer_request(device, peer_req, rw, DRBD_FAULT_DT_WR);
@@ -2347,8 +2416,10 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
 	list_del(&peer_req->w.list);
 	drbd_remove_epoch_entry_interval(device, peer_req);
 	spin_unlock_irq(&device->resource->req_lock);
-	if (peer_req->flags & EE_CALL_AL_COMPLETE_IO)
+	if (peer_req->flags & EE_CALL_AL_COMPLETE_IO) {
+		peer_req->flags &= ~EE_CALL_AL_COMPLETE_IO;
 		drbd_al_complete_io(device, &peer_req->i);
+	}
 
 out_interrupted:
 	drbd_may_finish_epoch(connection, peer_req->epoch, EV_PUT + EV_CLEANUP);
@@ -2368,13 +2439,14 @@ out_interrupted:
  * The current sync rate used here uses only the most recent two step marks,
  * to have a short time average so we can react faster.
  */
-bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector)
+bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector,
+		bool throttle_if_app_is_waiting)
 {
 	struct lc_element *tmp;
-	bool throttle = true;
+	bool throttle = drbd_rs_c_min_rate_throttle(device);
 
-	if (!drbd_rs_c_min_rate_throttle(device))
-		return false;
+	if (!throttle || throttle_if_app_is_waiting)
+		return throttle;
 
 	spin_lock_irq(&device->al_lock);
 	tmp = lc_find(device->resync, BM_SECT_TO_EXT(sector));
@@ -2382,7 +2454,8 @@ bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector)
 		struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce);
 		if (test_bit(BME_PRIORITY, &bm_ext->flags))
 			throttle = false;
-		/* Do not slow down if app IO is already waiting for this extent */
+		/* Do not slow down if app IO is already waiting for this extent,
+		 * and our progress is necessary for application IO to complete. */
 	}
 	spin_unlock_irq(&device->al_lock);
 
@@ -2407,7 +2480,9 @@ bool drbd_rs_c_min_rate_throttle(struct drbd_device *device)
 	curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
 		      (int)part_stat_read(&disk->part0, sectors[1]) -
 			atomic_read(&device->rs_sect_ev);
-	if (!device->rs_last_events || curr_events - device->rs_last_events > 64) {
+
+	if (atomic_read(&device->ap_actlog_cnt)
+	    || !device->rs_last_events || curr_events - device->rs_last_events > 64) {
 		unsigned long rs_left;
 		int i;
 
@@ -2508,6 +2583,7 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet
 		peer_req->w.cb = w_e_end_data_req;
 		fault_type = DRBD_FAULT_DT_RD;
 		/* application IO, don't drbd_rs_begin_io */
+		peer_req->flags |= EE_APPLICATION;
 		goto submit;
 
 	case P_RS_DATA_REQUEST:
@@ -2538,6 +2614,8 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet
 			peer_req->w.cb = w_e_end_csum_rs_req;
 			/* used in the sector offset progress display */
 			device->bm_resync_fo = BM_SECT_TO_BIT(sector);
+			/* remember to report stats in drbd_resync_finished */
+			device->use_csums = true;
 		} else if (pi->cmd == P_OV_REPLY) {
 			/* track progress, we may need to throttle */
 			atomic_add(size >> 9, &device->rs_sect_in);
@@ -2595,8 +2673,20 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet
 	 * we would also throttle its application reads.
 	 * In that case, throttling is done on the SyncTarget only.
 	 */
-	if (device->state.peer != R_PRIMARY && drbd_rs_should_slow_down(device, sector))
+
+	/* Even though this may be a resync request, we do add to "read_ee";
+	 * "sync_ee" is only used for resync WRITEs.
+	 * Add to list early, so debugfs can find this request
+	 * even if we have to sleep below. */
+	spin_lock_irq(&device->resource->req_lock);
+	list_add_tail(&peer_req->w.list, &device->read_ee);
+	spin_unlock_irq(&device->resource->req_lock);
+
+	update_receiver_timing_details(connection, drbd_rs_should_slow_down);
+	if (device->state.peer != R_PRIMARY
+	&& drbd_rs_should_slow_down(device, sector, false))
 		schedule_timeout_uninterruptible(HZ/10);
+	update_receiver_timing_details(connection, drbd_rs_begin_io);
 	if (drbd_rs_begin_io(device, sector))
 		goto out_free_e;
 
@@ -2604,22 +2694,20 @@ submit_for_resync:
 	atomic_add(size >> 9, &device->rs_sect_ev);
 
 submit:
+	update_receiver_timing_details(connection, drbd_submit_peer_request);
 	inc_unacked(device);
-	spin_lock_irq(&device->resource->req_lock);
-	list_add_tail(&peer_req->w.list, &device->read_ee);
-	spin_unlock_irq(&device->resource->req_lock);
-
 	if (drbd_submit_peer_request(device, peer_req, READ, fault_type) == 0)
 		return 0;
 
 	/* don't care for the reason here */
 	drbd_err(device, "submit failed, triggering re-connect\n");
+
+out_free_e:
 	spin_lock_irq(&device->resource->req_lock);
 	list_del(&peer_req->w.list);
 	spin_unlock_irq(&device->resource->req_lock);
 	/* no drbd_rs_complete_io(), we are dropping the connection anyways */
 
-out_free_e:
 	put_ldev(device);
 	drbd_free_peer_req(device, peer_req);
 	return -EIO;
@@ -2842,8 +2930,10 @@ static void drbd_uuid_dump(struct drbd_device *device, char *text, u64 *uuid,
 -1091   requires proto 91
 -1096   requires proto 96
  */
-static int drbd_uuid_compare(struct drbd_device *device, int *rule_nr) __must_hold(local)
+static int drbd_uuid_compare(struct drbd_device *const device, int *rule_nr) __must_hold(local)
 {
+	struct drbd_peer_device *const peer_device = first_peer_device(device);
+	struct drbd_connection *const connection = peer_device ? peer_device->connection : NULL;
 	u64 self, peer;
 	int i, j;
 
@@ -2869,7 +2959,7 @@ static int drbd_uuid_compare(struct drbd_device *device, int *rule_nr) __must_ho
 
 		if (device->p_uuid[UI_BITMAP] == (u64)0 && device->ldev->md.uuid[UI_BITMAP] != (u64)0) {
 
-			if (first_peer_device(device)->connection->agreed_pro_version < 91)
+			if (connection->agreed_pro_version < 91)
 				return -1091;
 
 			if ((device->ldev->md.uuid[UI_BITMAP] & ~((u64)1)) == (device->p_uuid[UI_HISTORY_START] & ~((u64)1)) &&
@@ -2892,7 +2982,7 @@ static int drbd_uuid_compare(struct drbd_device *device, int *rule_nr) __must_ho
 
 		if (device->ldev->md.uuid[UI_BITMAP] == (u64)0 && device->p_uuid[UI_BITMAP] != (u64)0) {
 
-			if (first_peer_device(device)->connection->agreed_pro_version < 91)
+			if (connection->agreed_pro_version < 91)
 				return -1091;
 
 			if ((device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (device->p_uuid[UI_BITMAP] & ~((u64)1)) &&
@@ -2925,7 +3015,7 @@ static int drbd_uuid_compare(struct drbd_device *device, int *rule_nr) __must_ho
 		case 1: /*  self_pri && !peer_pri */ return 1;
 		case 2: /* !self_pri &&  peer_pri */ return -1;
 		case 3: /*  self_pri &&  peer_pri */
-			dc = test_bit(RESOLVE_CONFLICTS, &first_peer_device(device)->connection->flags);
+			dc = test_bit(RESOLVE_CONFLICTS, &connection->flags);
 			return dc ? -1 : 1;
 		}
 	}
@@ -2938,14 +3028,14 @@ static int drbd_uuid_compare(struct drbd_device *device, int *rule_nr) __must_ho
 	*rule_nr = 51;
 	peer = device->p_uuid[UI_HISTORY_START] & ~((u64)1);
 	if (self == peer) {
-		if (first_peer_device(device)->connection->agreed_pro_version < 96 ?
+		if (connection->agreed_pro_version < 96 ?
 		    (device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) ==
 		    (device->p_uuid[UI_HISTORY_START + 1] & ~((u64)1)) :
 		    peer + UUID_NEW_BM_OFFSET == (device->p_uuid[UI_BITMAP] & ~((u64)1))) {
 			/* The last P_SYNC_UUID did not get though. Undo the last start of
 			   resync as sync source modifications of the peer's UUIDs. */
 
-			if (first_peer_device(device)->connection->agreed_pro_version < 91)
+			if (connection->agreed_pro_version < 91)
 				return -1091;
 
 			device->p_uuid[UI_BITMAP] = device->p_uuid[UI_HISTORY_START];
@@ -2975,14 +3065,14 @@ static int drbd_uuid_compare(struct drbd_device *device, int *rule_nr) __must_ho
 	*rule_nr = 71;
 	self = device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1);
 	if (self == peer) {
-		if (first_peer_device(device)->connection->agreed_pro_version < 96 ?
+		if (connection->agreed_pro_version < 96 ?
 		    (device->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) ==
 		    (device->p_uuid[UI_HISTORY_START] & ~((u64)1)) :
 		    self + UUID_NEW_BM_OFFSET == (device->ldev->md.uuid[UI_BITMAP] & ~((u64)1))) {
 			/* The last P_SYNC_UUID did not get though. Undo the last start of
 			   resync as sync source modifications of our UUIDs. */
 
-			if (first_peer_device(device)->connection->agreed_pro_version < 91)
+			if (connection->agreed_pro_version < 91)
 				return -1091;
 
 			__drbd_uuid_set(device, UI_BITMAP, device->ldev->md.uuid[UI_HISTORY_START]);
@@ -3352,8 +3442,7 @@ disconnect:
  * return: NULL (alg name was "")
  *         ERR_PTR(error) if something goes wrong
  *         or the crypto hash ptr, if it worked out ok. */
-static
-struct crypto_hash *drbd_crypto_alloc_digest_safe(const struct drbd_device *device,
+static struct crypto_hash *drbd_crypto_alloc_digest_safe(const struct drbd_device *device,
 		const char *alg, const char *name)
 {
 	struct crypto_hash *tfm;
@@ -3639,7 +3728,7 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info
 	struct drbd_device *device;
 	struct p_sizes *p = pi->data;
 	enum determine_dev_size dd = DS_UNCHANGED;
-	sector_t p_size, p_usize, my_usize;
+	sector_t p_size, p_usize, p_csize, my_usize;
 	int ldsc = 0; /* local disk size changed */
 	enum dds_flags ddsf;
 
@@ -3650,6 +3739,7 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info
 
 	p_size = be64_to_cpu(p->d_size);
 	p_usize = be64_to_cpu(p->u_size);
+	p_csize = be64_to_cpu(p->c_size);
 
 	/* just store the peer's disk size for now.
 	 * we still need to figure out whether we accept that. */
@@ -3710,7 +3800,6 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info
 	}
 
 	device->peer_max_bio_size = be32_to_cpu(p->max_bio_size);
-	drbd_reconsider_max_bio_size(device);
 	/* Leave drbd_reconsider_max_bio_size() before drbd_determine_dev_size().
 	   In case we cleared the QUEUE_FLAG_DISCARD from our queue in
 	   drbd_reconsider_max_bio_size(), we can be sure that after
@@ -3718,14 +3807,28 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info
 
 	ddsf = be16_to_cpu(p->dds_flags);
 	if (get_ldev(device)) {
+		drbd_reconsider_max_bio_size(device, device->ldev);
 		dd = drbd_determine_dev_size(device, ddsf, NULL);
 		put_ldev(device);
 		if (dd == DS_ERROR)
 			return -EIO;
 		drbd_md_sync(device);
 	} else {
-		/* I am diskless, need to accept the peer's size. */
-		drbd_set_my_capacity(device, p_size);
+		/*
+		 * I am diskless, need to accept the peer's *current* size.
+		 * I must NOT accept the peers backing disk size,
+		 * it may have been larger than mine all along...
+		 *
+		 * At this point, the peer knows more about my disk, or at
+		 * least about what we last agreed upon, than myself.
+		 * So if his c_size is less than his d_size, the most likely
+		 * reason is that *my* d_size was smaller last time we checked.
+		 *
+		 * However, if he sends a zero current size,
+		 * take his (user-capped or) backing disk size anyways.
+		 */
+		drbd_reconsider_max_bio_size(device, NULL);
+		drbd_set_my_capacity(device, p_csize ?: p_usize ?: p_size);
 	}
 
 	if (get_ldev(device)) {
@@ -4501,6 +4604,7 @@ static void drbdd(struct drbd_connection *connection)
 		struct data_cmd *cmd;
 
 		drbd_thread_current_set_cpu(&connection->receiver);
+		update_receiver_timing_details(connection, drbd_recv_header);
 		if (drbd_recv_header(connection, &pi))
 			goto err_out;
 
@@ -4519,12 +4623,14 @@ static void drbdd(struct drbd_connection *connection)
 		}
 
 		if (shs) {
+			update_receiver_timing_details(connection, drbd_recv_all_warn);
 			err = drbd_recv_all_warn(connection, pi.data, shs);
 			if (err)
 				goto err_out;
 			pi.size -= shs;
 		}
 
+		update_receiver_timing_details(connection, cmd->fn);
 		err = cmd->fn(connection, &pi);
 		if (err) {
 			drbd_err(connection, "error receiving %s, e: %d l: %d!\n",
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 09803d0d5207..c67717d572d1 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -52,7 +52,7 @@ static void _drbd_start_io_acct(struct drbd_device *device, struct drbd_request
 static void _drbd_end_io_acct(struct drbd_device *device, struct drbd_request *req)
 {
 	int rw = bio_data_dir(req->master_bio);
-	unsigned long duration = jiffies - req->start_time;
+	unsigned long duration = jiffies - req->start_jif;
 	int cpu;
 	cpu = part_stat_lock();
 	part_stat_add(cpu, &device->vdisk->part0, ticks[rw], duration);
@@ -66,7 +66,7 @@ static struct drbd_request *drbd_req_new(struct drbd_device *device,
 {
 	struct drbd_request *req;
 
-	req = mempool_alloc(drbd_request_mempool, GFP_NOIO);
+	req = mempool_alloc(drbd_request_mempool, GFP_NOIO | __GFP_ZERO);
 	if (!req)
 		return NULL;
 
@@ -84,6 +84,8 @@ static struct drbd_request *drbd_req_new(struct drbd_device *device,
 
 	INIT_LIST_HEAD(&req->tl_requests);
 	INIT_LIST_HEAD(&req->w.list);
+	INIT_LIST_HEAD(&req->req_pending_master_completion);
+	INIT_LIST_HEAD(&req->req_pending_local);
 
 	/* one reference to be put by __drbd_make_request */
 	atomic_set(&req->completion_ref, 1);
@@ -92,6 +94,19 @@ static struct drbd_request *drbd_req_new(struct drbd_device *device,
 	return req;
 }
 
+static void drbd_remove_request_interval(struct rb_root *root,
+					 struct drbd_request *req)
+{
+	struct drbd_device *device = req->device;
+	struct drbd_interval *i = &req->i;
+
+	drbd_remove_interval(root, i);
+
+	/* Wake up any processes waiting for this request to complete.  */
+	if (i->waiting)
+		wake_up(&device->misc_wait);
+}
+
 void drbd_req_destroy(struct kref *kref)
 {
 	struct drbd_request *req = container_of(kref, struct drbd_request, kref);
@@ -107,14 +122,30 @@ void drbd_req_destroy(struct kref *kref)
 		return;
 	}
 
-	/* remove it from the transfer log.
-	 * well, only if it had been there in the first
-	 * place... if it had not (local only or conflicting
-	 * and never sent), it should still be "empty" as
-	 * initialized in drbd_req_new(), so we can list_del() it
-	 * here unconditionally */
+	/* If called from mod_rq_state (expected normal case) or
+	 * drbd_send_and_submit (the less likely normal path), this holds the
+	 * req_lock, and req->tl_requests will typicaly be on ->transfer_log,
+	 * though it may be still empty (never added to the transfer log).
+	 *
+	 * If called from do_retry(), we do NOT hold the req_lock, but we are
+	 * still allowed to unconditionally list_del(&req->tl_requests),
+	 * because it will be on a local on-stack list only. */
 	list_del_init(&req->tl_requests);
 
+	/* finally remove the request from the conflict detection
+	 * respective block_id verification interval tree. */
+	if (!drbd_interval_empty(&req->i)) {
+		struct rb_root *root;
+
+		if (s & RQ_WRITE)
+			root = &device->write_requests;
+		else
+			root = &device->read_requests;
+		drbd_remove_request_interval(root, req);
+	} else if (s & (RQ_NET_MASK & ~RQ_NET_DONE) && req->i.size != 0)
+		drbd_err(device, "drbd_req_destroy: Logic BUG: interval empty, but: rq_state=0x%x, sect=%llu, size=%u\n",
+			s, (unsigned long long)req->i.sector, req->i.size);
+
 	/* if it was a write, we may have to set the corresponding
 	 * bit(s) out-of-sync first. If it had a local part, we need to
 	 * release the reference to the activity log. */
@@ -188,19 +219,6 @@ void complete_master_bio(struct drbd_device *device,
 }
 
 
-static void drbd_remove_request_interval(struct rb_root *root,
-					 struct drbd_request *req)
-{
-	struct drbd_device *device = req->device;
-	struct drbd_interval *i = &req->i;
-
-	drbd_remove_interval(root, i);
-
-	/* Wake up any processes waiting for this request to complete.  */
-	if (i->waiting)
-		wake_up(&device->misc_wait);
-}
-
 /* Helper for __req_mod().
  * Set m->bio to the master bio, if it is fit to be completed,
  * or leave it alone (it is initialized to NULL in __req_mod),
@@ -254,18 +272,6 @@ void drbd_req_complete(struct drbd_request *req, struct bio_and_error *m)
 	ok = (s & RQ_LOCAL_OK) || (s & RQ_NET_OK);
 	error = PTR_ERR(req->private_bio);
 
-	/* remove the request from the conflict detection
-	 * respective block_id verification hash */
-	if (!drbd_interval_empty(&req->i)) {
-		struct rb_root *root;
-
-		if (rw == WRITE)
-			root = &device->write_requests;
-		else
-			root = &device->read_requests;
-		drbd_remove_request_interval(root, req);
-	}
-
 	/* Before we can signal completion to the upper layers,
 	 * we may need to close the current transfer log epoch.
 	 * We are within the request lock, so we can simply compare
@@ -301,9 +307,24 @@ void drbd_req_complete(struct drbd_request *req, struct bio_and_error *m)
 		m->error = ok ? 0 : (error ?: -EIO);
 		m->bio = req->master_bio;
 		req->master_bio = NULL;
+		/* We leave it in the tree, to be able to verify later
+		 * write-acks in protocol != C during resync.
+		 * But we mark it as "complete", so it won't be counted as
+		 * conflict in a multi-primary setup. */
+		req->i.completed = true;
 	}
+
+	if (req->i.waiting)
+		wake_up(&device->misc_wait);
+
+	/* Either we are about to complete to upper layers,
+	 * or we will restart this request.
+	 * In either case, the request object will be destroyed soon,
+	 * so better remove it from all lists. */
+	list_del_init(&req->req_pending_master_completion);
 }
 
+/* still holds resource->req_lock */
 static int drbd_req_put_completion_ref(struct drbd_request *req, struct bio_and_error *m, int put)
 {
 	struct drbd_device *device = req->device;
@@ -324,12 +345,91 @@ static int drbd_req_put_completion_ref(struct drbd_request *req, struct bio_and_
 	return 1;
 }
 
+static void set_if_null_req_next(struct drbd_peer_device *peer_device, struct drbd_request *req)
+{
+	struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
+	if (!connection)
+		return;
+	if (connection->req_next == NULL)
+		connection->req_next = req;
+}
+
+static void advance_conn_req_next(struct drbd_peer_device *peer_device, struct drbd_request *req)
+{
+	struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
+	if (!connection)
+		return;
+	if (connection->req_next != req)
+		return;
+	list_for_each_entry_continue(req, &connection->transfer_log, tl_requests) {
+		const unsigned s = req->rq_state;
+		if (s & RQ_NET_QUEUED)
+			break;
+	}
+	if (&req->tl_requests == &connection->transfer_log)
+		req = NULL;
+	connection->req_next = req;
+}
+
+static void set_if_null_req_ack_pending(struct drbd_peer_device *peer_device, struct drbd_request *req)
+{
+	struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
+	if (!connection)
+		return;
+	if (connection->req_ack_pending == NULL)
+		connection->req_ack_pending = req;
+}
+
+static void advance_conn_req_ack_pending(struct drbd_peer_device *peer_device, struct drbd_request *req)
+{
+	struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
+	if (!connection)
+		return;
+	if (connection->req_ack_pending != req)
+		return;
+	list_for_each_entry_continue(req, &connection->transfer_log, tl_requests) {
+		const unsigned s = req->rq_state;
+		if ((s & RQ_NET_SENT) && (s & RQ_NET_PENDING))
+			break;
+	}
+	if (&req->tl_requests == &connection->transfer_log)
+		req = NULL;
+	connection->req_ack_pending = req;
+}
+
+static void set_if_null_req_not_net_done(struct drbd_peer_device *peer_device, struct drbd_request *req)
+{
+	struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
+	if (!connection)
+		return;
+	if (connection->req_not_net_done == NULL)
+		connection->req_not_net_done = req;
+}
+
+static void advance_conn_req_not_net_done(struct drbd_peer_device *peer_device, struct drbd_request *req)
+{
+	struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
+	if (!connection)
+		return;
+	if (connection->req_not_net_done != req)
+		return;
+	list_for_each_entry_continue(req, &connection->transfer_log, tl_requests) {
+		const unsigned s = req->rq_state;
+		if ((s & RQ_NET_SENT) && !(s & RQ_NET_DONE))
+			break;
+	}
+	if (&req->tl_requests == &connection->transfer_log)
+		req = NULL;
+	connection->req_not_net_done = req;
+}
+
 /* I'd like this to be the only place that manipulates
  * req->completion_ref and req->kref. */
 static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m,
 		int clear, int set)
 {
 	struct drbd_device *device = req->device;
+	struct drbd_peer_device *peer_device = first_peer_device(device);
 	unsigned s = req->rq_state;
 	int c_put = 0;
 	int k_put = 0;
@@ -356,14 +456,23 @@ static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m,
 		atomic_inc(&req->completion_ref);
 	}
 
-	if (!(s & RQ_NET_QUEUED) && (set & RQ_NET_QUEUED))
+	if (!(s & RQ_NET_QUEUED) && (set & RQ_NET_QUEUED)) {
 		atomic_inc(&req->completion_ref);
+		set_if_null_req_next(peer_device, req);
+	}
 
 	if (!(s & RQ_EXP_BARR_ACK) && (set & RQ_EXP_BARR_ACK))
 		kref_get(&req->kref); /* wait for the DONE */
 
-	if (!(s & RQ_NET_SENT) && (set & RQ_NET_SENT))
-		atomic_add(req->i.size >> 9, &device->ap_in_flight);
+	if (!(s & RQ_NET_SENT) && (set & RQ_NET_SENT)) {
+		/* potentially already completed in the asender thread */
+		if (!(s & RQ_NET_DONE)) {
+			atomic_add(req->i.size >> 9, &device->ap_in_flight);
+			set_if_null_req_not_net_done(peer_device, req);
+		}
+		if (s & RQ_NET_PENDING)
+			set_if_null_req_ack_pending(peer_device, req);
+	}
 
 	if (!(s & RQ_COMPLETION_SUSP) && (set & RQ_COMPLETION_SUSP))
 		atomic_inc(&req->completion_ref);
@@ -386,20 +495,34 @@ static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m,
 			++k_put;
 		else
 			++c_put;
+		list_del_init(&req->req_pending_local);
 	}
 
 	if ((s & RQ_NET_PENDING) && (clear & RQ_NET_PENDING)) {
 		dec_ap_pending(device);
 		++c_put;
+		req->acked_jif = jiffies;
+		advance_conn_req_ack_pending(peer_device, req);
 	}
 
-	if ((s & RQ_NET_QUEUED) && (clear & RQ_NET_QUEUED))
+	if ((s & RQ_NET_QUEUED) && (clear & RQ_NET_QUEUED)) {
 		++c_put;
+		advance_conn_req_next(peer_device, req);
+	}
 
-	if ((s & RQ_EXP_BARR_ACK) && !(s & RQ_NET_DONE) && (set & RQ_NET_DONE)) {
-		if (req->rq_state & RQ_NET_SENT)
+	if (!(s & RQ_NET_DONE) && (set & RQ_NET_DONE)) {
+		if (s & RQ_NET_SENT)
 			atomic_sub(req->i.size >> 9, &device->ap_in_flight);
-		++k_put;
+		if (s & RQ_EXP_BARR_ACK)
+			++k_put;
+		req->net_done_jif = jiffies;
+
+		/* in ahead/behind mode, or just in case,
+		 * before we finally destroy this request,
+		 * the caching pointers must not reference it anymore */
+		advance_conn_req_next(peer_device, req);
+		advance_conn_req_ack_pending(peer_device, req);
+		advance_conn_req_not_net_done(peer_device, req);
 	}
 
 	/* potentially complete and destroy */
@@ -439,6 +562,19 @@ static void drbd_report_io_error(struct drbd_device *device, struct drbd_request
 			bdevname(device->ldev->backing_bdev, b));
 }
 
+/* Helper for HANDED_OVER_TO_NETWORK.
+ * Is this a protocol A write (neither WRITE_ACK nor RECEIVE_ACK expected)?
+ * Is it also still "PENDING"?
+ * --> If so, clear PENDING and set NET_OK below.
+ * If it is a protocol A write, but not RQ_PENDING anymore, neg-ack was faster
+ * (and we must not set RQ_NET_OK) */
+static inline bool is_pending_write_protocol_A(struct drbd_request *req)
+{
+	return (req->rq_state &
+		   (RQ_WRITE|RQ_NET_PENDING|RQ_EXP_WRITE_ACK|RQ_EXP_RECEIVE_ACK))
+		== (RQ_WRITE|RQ_NET_PENDING);
+}
+
 /* obviously this could be coded as many single functions
  * instead of one huge switch,
  * or by putting the code directly in the respective locations
@@ -454,7 +590,9 @@ static void drbd_report_io_error(struct drbd_device *device, struct drbd_request
 int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 		struct bio_and_error *m)
 {
-	struct drbd_device *device = req->device;
+	struct drbd_device *const device = req->device;
+	struct drbd_peer_device *const peer_device = first_peer_device(device);
+	struct drbd_connection *const connection = peer_device ? peer_device->connection : NULL;
 	struct net_conf *nc;
 	int p, rv = 0;
 
@@ -477,7 +615,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 		 * and from w_read_retry_remote */
 		D_ASSERT(device, !(req->rq_state & RQ_NET_MASK));
 		rcu_read_lock();
-		nc = rcu_dereference(first_peer_device(device)->connection->net_conf);
+		nc = rcu_dereference(connection->net_conf);
 		p = nc->wire_protocol;
 		rcu_read_unlock();
 		req->rq_state |=
@@ -549,7 +687,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 		D_ASSERT(device, (req->rq_state & RQ_LOCAL_MASK) == 0);
 		mod_rq_state(req, m, 0, RQ_NET_QUEUED);
 		req->w.cb = w_send_read_req;
-		drbd_queue_work(&first_peer_device(device)->connection->sender_work,
+		drbd_queue_work(&connection->sender_work,
 				&req->w);
 		break;
 
@@ -585,23 +723,23 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 		D_ASSERT(device, req->rq_state & RQ_NET_PENDING);
 		mod_rq_state(req, m, 0, RQ_NET_QUEUED|RQ_EXP_BARR_ACK);
 		req->w.cb =  w_send_dblock;
-		drbd_queue_work(&first_peer_device(device)->connection->sender_work,
+		drbd_queue_work(&connection->sender_work,
 				&req->w);
 
 		/* close the epoch, in case it outgrew the limit */
 		rcu_read_lock();
-		nc = rcu_dereference(first_peer_device(device)->connection->net_conf);
+		nc = rcu_dereference(connection->net_conf);
 		p = nc->max_epoch_size;
 		rcu_read_unlock();
-		if (first_peer_device(device)->connection->current_tle_writes >= p)
-			start_new_tl_epoch(first_peer_device(device)->connection);
+		if (connection->current_tle_writes >= p)
+			start_new_tl_epoch(connection);
 
 		break;
 
 	case QUEUE_FOR_SEND_OOS:
 		mod_rq_state(req, m, 0, RQ_NET_QUEUED);
 		req->w.cb =  w_send_out_of_sync;
-		drbd_queue_work(&first_peer_device(device)->connection->sender_work,
+		drbd_queue_work(&connection->sender_work,
 				&req->w);
 		break;
 
@@ -615,18 +753,16 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 
 	case HANDED_OVER_TO_NETWORK:
 		/* assert something? */
-		if (bio_data_dir(req->master_bio) == WRITE &&
-		    !(req->rq_state & (RQ_EXP_RECEIVE_ACK | RQ_EXP_WRITE_ACK))) {
+		if (is_pending_write_protocol_A(req))
 			/* this is what is dangerous about protocol A:
 			 * pretend it was successfully written on the peer. */
-			if (req->rq_state & RQ_NET_PENDING)
-				mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_OK);
-			/* else: neg-ack was faster... */
-			/* it is still not yet RQ_NET_DONE until the
-			 * corresponding epoch barrier got acked as well,
-			 * so we know what to dirty on connection loss */
-		}
-		mod_rq_state(req, m, RQ_NET_QUEUED, RQ_NET_SENT);
+			mod_rq_state(req, m, RQ_NET_QUEUED|RQ_NET_PENDING,
+						RQ_NET_SENT|RQ_NET_OK);
+		else
+			mod_rq_state(req, m, RQ_NET_QUEUED, RQ_NET_SENT);
+		/* It is still not yet RQ_NET_DONE until the
+		 * corresponding epoch barrier got acked as well,
+		 * so we know what to dirty on connection loss. */
 		break;
 
 	case OOS_HANDED_TO_NETWORK:
@@ -658,12 +794,13 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 	case WRITE_ACKED_BY_PEER_AND_SIS:
 		req->rq_state |= RQ_NET_SIS;
 	case WRITE_ACKED_BY_PEER:
-		D_ASSERT(device, req->rq_state & RQ_EXP_WRITE_ACK);
-		/* protocol C; successfully written on peer.
+		/* Normal operation protocol C: successfully written on peer.
+		 * During resync, even in protocol != C,
+		 * we requested an explicit write ack anyways.
+		 * Which means we cannot even assert anything here.
 		 * Nothing more to do here.
 		 * We want to keep the tl in place for all protocols, to cater
 		 * for volatile write-back caches on lower level devices. */
-
 		goto ack_common;
 	case RECV_ACKED_BY_PEER:
 		D_ASSERT(device, req->rq_state & RQ_EXP_RECEIVE_ACK);
@@ -671,7 +808,6 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 		 * see also notes above in HANDED_OVER_TO_NETWORK about
 		 * protocol != C */
 	ack_common:
-		D_ASSERT(device, req->rq_state & RQ_NET_PENDING);
 		mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_OK);
 		break;
 
@@ -714,7 +850,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 
 		get_ldev(device); /* always succeeds in this call path */
 		req->w.cb = w_restart_disk_io;
-		drbd_queue_work(&first_peer_device(device)->connection->sender_work,
+		drbd_queue_work(&connection->sender_work,
 				&req->w);
 		break;
 
@@ -736,7 +872,8 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 
 			mod_rq_state(req, m, RQ_COMPLETION_SUSP, RQ_NET_QUEUED|RQ_NET_PENDING);
 			if (req->w.cb) {
-				drbd_queue_work(&first_peer_device(device)->connection->sender_work,
+				/* w.cb expected to be w_send_dblock, or w_send_read_req */
+				drbd_queue_work(&connection->sender_work,
 						&req->w);
 				rv = req->rq_state & RQ_WRITE ? MR_WRITE : MR_READ;
 			} /* else: FIXME can this happen? */
@@ -769,7 +906,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 		break;
 
 	case QUEUE_AS_DRBD_BARRIER:
-		start_new_tl_epoch(first_peer_device(device)->connection);
+		start_new_tl_epoch(connection);
 		mod_rq_state(req, m, 0, RQ_NET_OK|RQ_NET_DONE);
 		break;
 	};
@@ -886,6 +1023,9 @@ static void maybe_pull_ahead(struct drbd_device *device)
 	    connection->agreed_pro_version < 96)
 		return;
 
+	if (on_congestion == OC_PULL_AHEAD && device->state.conn == C_AHEAD)
+		return; /* nothing to do ... */
+
 	/* If I don't even have good local storage, we can not reasonably try
 	 * to pull ahead of the peer. We also need the local reference to make
 	 * sure device->act_log is there.
@@ -1021,6 +1161,7 @@ drbd_submit_req_private_bio(struct drbd_request *req)
 	 * stable storage, and this is a WRITE, we may not even submit
 	 * this bio. */
 	if (get_ldev(device)) {
+		req->pre_submit_jif = jiffies;
 		if (drbd_insert_fault(device,
 				      rw == WRITE ? DRBD_FAULT_DT_WR
 				    : rw == READ  ? DRBD_FAULT_DT_RD
@@ -1035,10 +1176,14 @@ drbd_submit_req_private_bio(struct drbd_request *req)
 
 static void drbd_queue_write(struct drbd_device *device, struct drbd_request *req)
 {
-	spin_lock(&device->submit.lock);
+	spin_lock_irq(&device->resource->req_lock);
 	list_add_tail(&req->tl_requests, &device->submit.writes);
-	spin_unlock(&device->submit.lock);
+	list_add_tail(&req->req_pending_master_completion,
+			&device->pending_master_completion[1 /* WRITE */]);
+	spin_unlock_irq(&device->resource->req_lock);
 	queue_work(device->submit.wq, &device->submit.worker);
+	/* do_submit() may sleep internally on al_wait, too */
+	wake_up(&device->al_wait);
 }
 
 /* returns the new drbd_request pointer, if the caller is expected to
@@ -1047,7 +1192,7 @@ static void drbd_queue_write(struct drbd_device *device, struct drbd_request *re
  * Returns ERR_PTR(-ENOMEM) if we cannot allocate a drbd_request.
  */
 static struct drbd_request *
-drbd_request_prepare(struct drbd_device *device, struct bio *bio, unsigned long start_time)
+drbd_request_prepare(struct drbd_device *device, struct bio *bio, unsigned long start_jif)
 {
 	const int rw = bio_data_dir(bio);
 	struct drbd_request *req;
@@ -1062,7 +1207,7 @@ drbd_request_prepare(struct drbd_device *device, struct bio *bio, unsigned long
 		bio_endio(bio, -ENOMEM);
 		return ERR_PTR(-ENOMEM);
 	}
-	req->start_time = start_time;
+	req->start_jif = start_jif;
 
 	if (!get_ldev(device)) {
 		bio_put(req->private_bio);
@@ -1075,10 +1220,12 @@ drbd_request_prepare(struct drbd_device *device, struct bio *bio, unsigned long
 	if (rw == WRITE && req->private_bio && req->i.size
 	&& !test_bit(AL_SUSPENDED, &device->flags)) {
 		if (!drbd_al_begin_io_fastpath(device, &req->i)) {
+			atomic_inc(&device->ap_actlog_cnt);
 			drbd_queue_write(device, req);
 			return NULL;
 		}
 		req->rq_state |= RQ_IN_ACT_LOG;
+		req->in_actlog_jif = jiffies;
 	}
 
 	return req;
@@ -1086,11 +1233,13 @@ drbd_request_prepare(struct drbd_device *device, struct bio *bio, unsigned long
 
 static void drbd_send_and_submit(struct drbd_device *device, struct drbd_request *req)
 {
+	struct drbd_resource *resource = device->resource;
 	const int rw = bio_rw(req->master_bio);
 	struct bio_and_error m = { NULL, };
 	bool no_remote = false;
+	bool submit_private_bio = false;
 
-	spin_lock_irq(&device->resource->req_lock);
+	spin_lock_irq(&resource->req_lock);
 	if (rw == WRITE) {
 		/* This may temporarily give up the req_lock,
 		 * but will re-aquire it before it returns here.
@@ -1148,13 +1297,18 @@ static void drbd_send_and_submit(struct drbd_device *device, struct drbd_request
 			no_remote = true;
 	}
 
+	/* If it took the fast path in drbd_request_prepare, add it here.
+	 * The slow path has added it already. */
+	if (list_empty(&req->req_pending_master_completion))
+		list_add_tail(&req->req_pending_master_completion,
+			&device->pending_master_completion[rw == WRITE]);
 	if (req->private_bio) {
 		/* needs to be marked within the same spinlock */
+		list_add_tail(&req->req_pending_local,
+			&device->pending_completion[rw == WRITE]);
 		_req_mod(req, TO_BE_SUBMITTED);
 		/* but we need to give up the spinlock to submit */
-		spin_unlock_irq(&device->resource->req_lock);
-		drbd_submit_req_private_bio(req);
-		spin_lock_irq(&device->resource->req_lock);
+		submit_private_bio = true;
 	} else if (no_remote) {
 nodata:
 		if (__ratelimit(&drbd_ratelimit_state))
@@ -1167,15 +1321,23 @@ nodata:
 out:
 	if (drbd_req_put_completion_ref(req, &m, 1))
 		kref_put(&req->kref, drbd_req_destroy);
-	spin_unlock_irq(&device->resource->req_lock);
-
+	spin_unlock_irq(&resource->req_lock);
+
+	/* Even though above is a kref_put(), this is safe.
+	 * As long as we still need to submit our private bio,
+	 * we hold a completion ref, and the request cannot disappear.
+	 * If however this request did not even have a private bio to submit
+	 * (e.g. remote read), req may already be invalid now.
+	 * That's why we cannot check on req->private_bio. */
+	if (submit_private_bio)
+		drbd_submit_req_private_bio(req);
 	if (m.bio)
 		complete_master_bio(device, &m);
 }
 
-void __drbd_make_request(struct drbd_device *device, struct bio *bio, unsigned long start_time)
+void __drbd_make_request(struct drbd_device *device, struct bio *bio, unsigned long start_jif)
 {
-	struct drbd_request *req = drbd_request_prepare(device, bio, start_time);
+	struct drbd_request *req = drbd_request_prepare(device, bio, start_jif);
 	if (IS_ERR_OR_NULL(req))
 		return;
 	drbd_send_and_submit(device, req);
@@ -1194,6 +1356,8 @@ static void submit_fast_path(struct drbd_device *device, struct list_head *incom
 				continue;
 
 			req->rq_state |= RQ_IN_ACT_LOG;
+			req->in_actlog_jif = jiffies;
+			atomic_dec(&device->ap_actlog_cnt);
 		}
 
 		list_del_init(&req->tl_requests);
@@ -1203,7 +1367,8 @@ static void submit_fast_path(struct drbd_device *device, struct list_head *incom
 
 static bool prepare_al_transaction_nonblock(struct drbd_device *device,
 					    struct list_head *incoming,
-					    struct list_head *pending)
+					    struct list_head *pending,
+					    struct list_head *later)
 {
 	struct drbd_request *req, *tmp;
 	int wake = 0;
@@ -1212,45 +1377,105 @@ static bool prepare_al_transaction_nonblock(struct drbd_device *device,
 	spin_lock_irq(&device->al_lock);
 	list_for_each_entry_safe(req, tmp, incoming, tl_requests) {
 		err = drbd_al_begin_io_nonblock(device, &req->i);
+		if (err == -ENOBUFS)
+			break;
 		if (err == -EBUSY)
 			wake = 1;
 		if (err)
-			continue;
-		req->rq_state |= RQ_IN_ACT_LOG;
-		list_move_tail(&req->tl_requests, pending);
+			list_move_tail(&req->tl_requests, later);
+		else
+			list_move_tail(&req->tl_requests, pending);
 	}
 	spin_unlock_irq(&device->al_lock);
 	if (wake)
 		wake_up(&device->al_wait);
-
 	return !list_empty(pending);
 }
 
+void send_and_submit_pending(struct drbd_device *device, struct list_head *pending)
+{
+	struct drbd_request *req, *tmp;
+
+	list_for_each_entry_safe(req, tmp, pending, tl_requests) {
+		req->rq_state |= RQ_IN_ACT_LOG;
+		req->in_actlog_jif = jiffies;
+		atomic_dec(&device->ap_actlog_cnt);
+		list_del_init(&req->tl_requests);
+		drbd_send_and_submit(device, req);
+	}
+}
+
 void do_submit(struct work_struct *ws)
 {
 	struct drbd_device *device = container_of(ws, struct drbd_device, submit.worker);
-	LIST_HEAD(incoming);
-	LIST_HEAD(pending);
-	struct drbd_request *req, *tmp;
+	LIST_HEAD(incoming);	/* from drbd_make_request() */
+	LIST_HEAD(pending);	/* to be submitted after next AL-transaction commit */
+	LIST_HEAD(busy);	/* blocked by resync requests */
+
+	/* grab new incoming requests */
+	spin_lock_irq(&device->resource->req_lock);
+	list_splice_tail_init(&device->submit.writes, &incoming);
+	spin_unlock_irq(&device->resource->req_lock);
 
 	for (;;) {
-		spin_lock(&device->submit.lock);
-		list_splice_tail_init(&device->submit.writes, &incoming);
-		spin_unlock(&device->submit.lock);
+		DEFINE_WAIT(wait);
 
+		/* move used-to-be-busy back to front of incoming */
+		list_splice_init(&busy, &incoming);
 		submit_fast_path(device, &incoming);
 		if (list_empty(&incoming))
 			break;
 
-skip_fast_path:
-		wait_event(device->al_wait, prepare_al_transaction_nonblock(device, &incoming, &pending));
-		/* Maybe more was queued, while we prepared the transaction?
-		 * Try to stuff them into this transaction as well.
-		 * Be strictly non-blocking here, no wait_event, we already
-		 * have something to commit.
-		 * Stop if we don't make any more progres.
-		 */
 		for (;;) {
+			prepare_to_wait(&device->al_wait, &wait, TASK_UNINTERRUPTIBLE);
+
+			list_splice_init(&busy, &incoming);
+			prepare_al_transaction_nonblock(device, &incoming, &pending, &busy);
+			if (!list_empty(&pending))
+				break;
+
+			schedule();
+
+			/* If all currently "hot" activity log extents are kept busy by
+			 * incoming requests, we still must not totally starve new
+			 * requests to "cold" extents.
+			 * Something left on &incoming means there had not been
+			 * enough update slots available, and the activity log
+			 * has been marked as "starving".
+			 *
+			 * Try again now, without looking for new requests,
+			 * effectively blocking all new requests until we made
+			 * at least _some_ progress with what we currently have.
+			 */
+			if (!list_empty(&incoming))
+				continue;
+
+			/* Nothing moved to pending, but nothing left
+			 * on incoming: all moved to busy!
+			 * Grab new and iterate. */
+			spin_lock_irq(&device->resource->req_lock);
+			list_splice_tail_init(&device->submit.writes, &incoming);
+			spin_unlock_irq(&device->resource->req_lock);
+		}
+		finish_wait(&device->al_wait, &wait);
+
+		/* If the transaction was full, before all incoming requests
+		 * had been processed, skip ahead to commit, and iterate
+		 * without splicing in more incoming requests from upper layers.
+		 *
+		 * Else, if all incoming have been processed,
+		 * they have become either "pending" (to be submitted after
+		 * next transaction commit) or "busy" (blocked by resync).
+		 *
+		 * Maybe more was queued, while we prepared the transaction?
+		 * Try to stuff those into this transaction as well.
+		 * Be strictly non-blocking here,
+		 * we already have something to commit.
+		 *
+		 * Commit if we don't make any more progres.
+		 */
+
+		while (list_empty(&incoming)) {
 			LIST_HEAD(more_pending);
 			LIST_HEAD(more_incoming);
 			bool made_progress;
@@ -1260,55 +1485,32 @@ skip_fast_path:
 			if (list_empty(&device->submit.writes))
 				break;
 
-			spin_lock(&device->submit.lock);
+			spin_lock_irq(&device->resource->req_lock);
 			list_splice_tail_init(&device->submit.writes, &more_incoming);
-			spin_unlock(&device->submit.lock);
+			spin_unlock_irq(&device->resource->req_lock);
 
 			if (list_empty(&more_incoming))
 				break;
 
-			made_progress = prepare_al_transaction_nonblock(device, &more_incoming, &more_pending);
+			made_progress = prepare_al_transaction_nonblock(device, &more_incoming, &more_pending, &busy);
 
 			list_splice_tail_init(&more_pending, &pending);
 			list_splice_tail_init(&more_incoming, &incoming);
-
 			if (!made_progress)
 				break;
 		}
-		drbd_al_begin_io_commit(device, false);
-
-		list_for_each_entry_safe(req, tmp, &pending, tl_requests) {
-			list_del_init(&req->tl_requests);
-			drbd_send_and_submit(device, req);
-		}
 
-		/* If all currently hot activity log extents are kept busy by
-		 * incoming requests, we still must not totally starve new
-		 * requests to cold extents. In that case, prepare one request
-		 * in blocking mode. */
-		list_for_each_entry_safe(req, tmp, &incoming, tl_requests) {
-			list_del_init(&req->tl_requests);
-			req->rq_state |= RQ_IN_ACT_LOG;
-			if (!drbd_al_begin_io_prepare(device, &req->i)) {
-				/* Corresponding extent was hot after all? */
-				drbd_send_and_submit(device, req);
-			} else {
-				/* Found a request to a cold extent.
-				 * Put on "pending" list,
-				 * and try to cumulate with more. */
-				list_add(&req->tl_requests, &pending);
-				goto skip_fast_path;
-			}
-		}
+		drbd_al_begin_io_commit(device);
+		send_and_submit_pending(device, &pending);
 	}
 }
 
 void drbd_make_request(struct request_queue *q, struct bio *bio)
 {
 	struct drbd_device *device = (struct drbd_device *) q->queuedata;
-	unsigned long start_time;
+	unsigned long start_jif;
 
-	start_time = jiffies;
+	start_jif = jiffies;
 
 	/*
 	 * what we "blindly" assume:
@@ -1316,7 +1518,7 @@ void drbd_make_request(struct request_queue *q, struct bio *bio)
 	D_ASSERT(device, IS_ALIGNED(bio->bi_iter.bi_size, 512));
 
 	inc_ap_bio(device);
-	__drbd_make_request(device, bio, start_time);
+	__drbd_make_request(device, bio, start_jif);
 }
 
 /* This is called by bio_add_page().
@@ -1353,36 +1555,13 @@ int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct
 	return limit;
 }
 
-static void find_oldest_requests(
-		struct drbd_connection *connection,
-		struct drbd_device *device,
-		struct drbd_request **oldest_req_waiting_for_peer,
-		struct drbd_request **oldest_req_waiting_for_disk)
-{
-	struct drbd_request *r;
-	*oldest_req_waiting_for_peer = NULL;
-	*oldest_req_waiting_for_disk = NULL;
-	list_for_each_entry(r, &connection->transfer_log, tl_requests) {
-		const unsigned s = r->rq_state;
-		if (!*oldest_req_waiting_for_peer
-		&& ((s & RQ_NET_MASK) && !(s & RQ_NET_DONE)))
-			*oldest_req_waiting_for_peer = r;
-
-		if (!*oldest_req_waiting_for_disk
-		&& (s & RQ_LOCAL_PENDING) && r->device == device)
-			*oldest_req_waiting_for_disk = r;
-
-		if (*oldest_req_waiting_for_peer && *oldest_req_waiting_for_disk)
-			break;
-	}
-}
-
 void request_timer_fn(unsigned long data)
 {
 	struct drbd_device *device = (struct drbd_device *) data;
 	struct drbd_connection *connection = first_peer_device(device)->connection;
-	struct drbd_request *req_disk, *req_peer; /* oldest request */
+	struct drbd_request *req_read, *req_write, *req_peer; /* oldest request */
 	struct net_conf *nc;
+	unsigned long oldest_submit_jif;
 	unsigned long ent = 0, dt = 0, et, nt; /* effective timeout = ko_count * timeout */
 	unsigned long now;
 
@@ -1403,14 +1582,31 @@ void request_timer_fn(unsigned long data)
 		return; /* Recurring timer stopped */
 
 	now = jiffies;
+	nt = now + et;
 
 	spin_lock_irq(&device->resource->req_lock);
-	find_oldest_requests(connection, device, &req_peer, &req_disk);
-	if (req_peer == NULL && req_disk == NULL) {
-		spin_unlock_irq(&device->resource->req_lock);
-		mod_timer(&device->request_timer, now + et);
-		return;
-	}
+	req_read = list_first_entry_or_null(&device->pending_completion[0], struct drbd_request, req_pending_local);
+	req_write = list_first_entry_or_null(&device->pending_completion[1], struct drbd_request, req_pending_local);
+	req_peer = connection->req_not_net_done;
+	/* maybe the oldest request waiting for the peer is in fact still
+	 * blocking in tcp sendmsg */
+	if (!req_peer && connection->req_next && connection->req_next->pre_send_jif)
+		req_peer = connection->req_next;
+
+	/* evaluate the oldest peer request only in one timer! */
+	if (req_peer && req_peer->device != device)
+		req_peer = NULL;
+
+	/* do we have something to evaluate? */
+	if (req_peer == NULL && req_write == NULL && req_read == NULL)
+		goto out;
+
+	oldest_submit_jif =
+		(req_write && req_read)
+		? ( time_before(req_write->pre_submit_jif, req_read->pre_submit_jif)
+		  ? req_write->pre_submit_jif : req_read->pre_submit_jif )
+		: req_write ? req_write->pre_submit_jif
+		: req_read ? req_read->pre_submit_jif : now;
 
 	/* The request is considered timed out, if
 	 * - we have some effective timeout from the configuration,
@@ -1429,13 +1625,13 @@ void request_timer_fn(unsigned long data)
 	 * to expire twice (worst case) to become effective. Good enough.
 	 */
 	if (ent && req_peer &&
-		 time_after(now, req_peer->start_time + ent) &&
+		 time_after(now, req_peer->pre_send_jif + ent) &&
 		!time_in_range(now, connection->last_reconnect_jif, connection->last_reconnect_jif + ent)) {
 		drbd_warn(device, "Remote failed to finish a request within ko-count * timeout\n");
 		_drbd_set_state(_NS(device, conn, C_TIMEOUT), CS_VERBOSE | CS_HARD, NULL);
 	}
-	if (dt && req_disk &&
-		 time_after(now, req_disk->start_time + dt) &&
+	if (dt && oldest_submit_jif != now &&
+		 time_after(now, oldest_submit_jif + dt) &&
 		!time_in_range(now, device->last_reattach_jif, device->last_reattach_jif + dt)) {
 		drbd_warn(device, "Local backing device failed to meet the disk-timeout\n");
 		__drbd_chk_io_error(device, DRBD_FORCE_DETACH);
@@ -1443,11 +1639,12 @@ void request_timer_fn(unsigned long data)
 
 	/* Reschedule timer for the nearest not already expired timeout.
 	 * Fallback to now + min(effective network timeout, disk timeout). */
-	ent = (ent && req_peer && time_before(now, req_peer->start_time + ent))
-		? req_peer->start_time + ent : now + et;
-	dt = (dt && req_disk && time_before(now, req_disk->start_time + dt))
-		? req_disk->start_time + dt : now + et;
+	ent = (ent && req_peer && time_before(now, req_peer->pre_send_jif + ent))
+		? req_peer->pre_send_jif + ent : now + et;
+	dt = (dt && oldest_submit_jif != now && time_before(now, oldest_submit_jif + dt))
+		? oldest_submit_jif + dt : now + et;
 	nt = time_before(ent, dt) ? ent : dt;
+out:
 	spin_unlock_irq(&connection->resource->req_lock);
 	mod_timer(&device->request_timer, nt);
 }
diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h
index 8566cd5866b4..9f6a04080e9f 100644
--- a/drivers/block/drbd/drbd_req.h
+++ b/drivers/block/drbd/drbd_req.h
@@ -288,6 +288,7 @@ extern void complete_master_bio(struct drbd_device *device,
 extern void request_timer_fn(unsigned long data);
 extern void tl_restart(struct drbd_connection *connection, enum drbd_req_event what);
 extern void _tl_restart(struct drbd_connection *connection, enum drbd_req_event what);
+extern void tl_abort_disk_io(struct drbd_device *device);
 
 /* this is in drbd_main.c */
 extern void drbd_restart_request(struct drbd_request *req);
diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c
index a5d8aae00e04..c35c0f001bb7 100644
--- a/drivers/block/drbd/drbd_state.c
+++ b/drivers/block/drbd/drbd_state.c
@@ -410,7 +410,7 @@ _drbd_request_state(struct drbd_device *device, union drbd_state mask,
 	return rv;
 }
 
-static void print_st(struct drbd_device *device, char *name, union drbd_state ns)
+static void print_st(struct drbd_device *device, const char *name, union drbd_state ns)
 {
 	drbd_err(device, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c%c%c }\n",
 	    name,
@@ -952,11 +952,12 @@ enum drbd_state_rv
 __drbd_set_state(struct drbd_device *device, union drbd_state ns,
 	         enum chg_state_flags flags, struct completion *done)
 {
+	struct drbd_peer_device *peer_device = first_peer_device(device);
+	struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
 	union drbd_state os;
 	enum drbd_state_rv rv = SS_SUCCESS;
 	enum sanitize_state_warnings ssw;
 	struct after_state_chg_work *ascw;
-	bool did_remote, should_do_remote;
 
 	os = drbd_read_state(device);
 
@@ -978,9 +979,9 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns,
 			   this happen...*/
 
 			if (is_valid_state(device, os) == rv)
-				rv = is_valid_soft_transition(os, ns, first_peer_device(device)->connection);
+				rv = is_valid_soft_transition(os, ns, connection);
 		} else
-			rv = is_valid_soft_transition(os, ns, first_peer_device(device)->connection);
+			rv = is_valid_soft_transition(os, ns, connection);
 	}
 
 	if (rv < SS_SUCCESS) {
@@ -997,7 +998,7 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns,
 	   sanitize_state(). Only display it here if we where not called from
 	   _conn_request_state() */
 	if (!(flags & CS_DC_SUSP))
-		conn_pr_state_change(first_peer_device(device)->connection, os, ns,
+		conn_pr_state_change(connection, os, ns,
 				     (flags & ~CS_DC_MASK) | CS_DC_SUSP);
 
 	/* if we are going -> D_FAILED or D_DISKLESS, grab one extra reference
@@ -1008,28 +1009,35 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns,
 	    (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))
 		atomic_inc(&device->local_cnt);
 
-	did_remote = drbd_should_do_remote(device->state);
+	if (!is_sync_state(os.conn) && is_sync_state(ns.conn))
+		clear_bit(RS_DONE, &device->flags);
+
+	/* changes to local_cnt and device flags should be visible before
+	 * changes to state, which again should be visible before anything else
+	 * depending on that change happens. */
+	smp_wmb();
 	device->state.i = ns.i;
-	should_do_remote = drbd_should_do_remote(device->state);
 	device->resource->susp = ns.susp;
 	device->resource->susp_nod = ns.susp_nod;
 	device->resource->susp_fen = ns.susp_fen;
+	smp_wmb();
 
 	/* put replicated vs not-replicated requests in seperate epochs */
-	if (did_remote != should_do_remote)
-		start_new_tl_epoch(first_peer_device(device)->connection);
+	if (drbd_should_do_remote((union drbd_dev_state)os.i) !=
+	    drbd_should_do_remote((union drbd_dev_state)ns.i))
+		start_new_tl_epoch(connection);
 
 	if (os.disk == D_ATTACHING && ns.disk >= D_NEGOTIATING)
 		drbd_print_uuids(device, "attached to UUIDs");
 
 	/* Wake up role changes, that were delayed because of connection establishing */
 	if (os.conn == C_WF_REPORT_PARAMS && ns.conn != C_WF_REPORT_PARAMS &&
-	    no_peer_wf_report_params(first_peer_device(device)->connection))
-		clear_bit(STATE_SENT, &first_peer_device(device)->connection->flags);
+	    no_peer_wf_report_params(connection))
+		clear_bit(STATE_SENT, &connection->flags);
 
 	wake_up(&device->misc_wait);
 	wake_up(&device->state_wait);
-	wake_up(&first_peer_device(device)->connection->ping_wait);
+	wake_up(&connection->ping_wait);
 
 	/* Aborted verify run, or we reached the stop sector.
 	 * Log the last position, unless end-of-device. */
@@ -1118,21 +1126,21 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns,
 
 	/* Receiver should clean up itself */
 	if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING)
-		drbd_thread_stop_nowait(&first_peer_device(device)->connection->receiver);
+		drbd_thread_stop_nowait(&connection->receiver);
 
 	/* Now the receiver finished cleaning up itself, it should die */
 	if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE)
-		drbd_thread_stop_nowait(&first_peer_device(device)->connection->receiver);
+		drbd_thread_stop_nowait(&connection->receiver);
 
 	/* Upon network failure, we need to restart the receiver. */
 	if (os.conn > C_WF_CONNECTION &&
 	    ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
-		drbd_thread_restart_nowait(&first_peer_device(device)->connection->receiver);
+		drbd_thread_restart_nowait(&connection->receiver);
 
 	/* Resume AL writing if we get a connection */
 	if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
 		drbd_resume_al(device);
-		first_peer_device(device)->connection->connect_cnt++;
+		connection->connect_cnt++;
 	}
 
 	/* remember last attach time so request_timer_fn() won't
@@ -1150,7 +1158,7 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns,
 		ascw->w.cb = w_after_state_ch;
 		ascw->device = device;
 		ascw->done = done;
-		drbd_queue_work(&first_peer_device(device)->connection->sender_work,
+		drbd_queue_work(&connection->sender_work,
 				&ascw->w);
 	} else {
 		drbd_err(device, "Could not kmalloc an ascw\n");
@@ -1222,13 +1230,16 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
 			   union drbd_state ns, enum chg_state_flags flags)
 {
 	struct drbd_resource *resource = device->resource;
+	struct drbd_peer_device *peer_device = first_peer_device(device);
+	struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
 	struct sib_info sib;
 
 	sib.sib_reason = SIB_STATE_CHANGE;
 	sib.os = os;
 	sib.ns = ns;
 
-	if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) {
+	if ((os.disk != D_UP_TO_DATE || os.pdsk != D_UP_TO_DATE)
+	&&  (ns.disk == D_UP_TO_DATE && ns.pdsk == D_UP_TO_DATE)) {
 		clear_bit(CRASHED_PRIMARY, &device->flags);
 		if (device->p_uuid)
 			device->p_uuid[UI_FLAGS] &= ~((u64)2);
@@ -1245,7 +1256,6 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
 	   state change. This function might sleep */
 
 	if (ns.susp_nod) {
-		struct drbd_connection *connection = first_peer_device(device)->connection;
 		enum drbd_req_event what = NOTHING;
 
 		spin_lock_irq(&device->resource->req_lock);
@@ -1267,8 +1277,6 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
 	}
 
 	if (ns.susp_fen) {
-		struct drbd_connection *connection = first_peer_device(device)->connection;
-
 		spin_lock_irq(&device->resource->req_lock);
 		if (resource->susp_fen && conn_lowest_conn(connection) >= C_CONNECTED) {
 			/* case2: The connection was established again: */
@@ -1294,8 +1302,8 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
 	 * which is unexpected. */
 	if ((os.conn != C_SYNC_SOURCE && os.conn != C_PAUSED_SYNC_S) &&
 	    (ns.conn == C_SYNC_SOURCE || ns.conn == C_PAUSED_SYNC_S) &&
-	    first_peer_device(device)->connection->agreed_pro_version >= 96 && get_ldev(device)) {
-		drbd_gen_and_send_sync_uuid(first_peer_device(device));
+	    connection->agreed_pro_version >= 96 && get_ldev(device)) {
+		drbd_gen_and_send_sync_uuid(peer_device);
 		put_ldev(device);
 	}
 
@@ -1309,8 +1317,8 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
 		atomic_set(&device->rs_pending_cnt, 0);
 		drbd_rs_cancel_all(device);
 
-		drbd_send_uuids(first_peer_device(device));
-		drbd_send_state(first_peer_device(device), ns);
+		drbd_send_uuids(peer_device);
+		drbd_send_state(peer_device, ns);
 	}
 	/* No point in queuing send_bitmap if we don't have a connection
 	 * anymore, so check also the _current_ state, not only the new state
@@ -1335,7 +1343,7 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
 					set_bit(NEW_CUR_UUID, &device->flags);
 				} else {
 					drbd_uuid_new_current(device);
-					drbd_send_uuids(first_peer_device(device));
+					drbd_send_uuids(peer_device);
 				}
 			}
 			put_ldev(device);
@@ -1346,7 +1354,7 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
 		if (os.peer == R_SECONDARY && ns.peer == R_PRIMARY &&
 		    device->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
 			drbd_uuid_new_current(device);
-			drbd_send_uuids(first_peer_device(device));
+			drbd_send_uuids(peer_device);
 		}
 		/* D_DISKLESS Peer becomes secondary */
 		if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
@@ -1373,16 +1381,16 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
 	/* Last part of the attaching process ... */
 	if (ns.conn >= C_CONNECTED &&
 	    os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
-		drbd_send_sizes(first_peer_device(device), 0, 0);  /* to start sync... */
-		drbd_send_uuids(first_peer_device(device));
-		drbd_send_state(first_peer_device(device), ns);
+		drbd_send_sizes(peer_device, 0, 0);  /* to start sync... */
+		drbd_send_uuids(peer_device);
+		drbd_send_state(peer_device, ns);
 	}
 
 	/* We want to pause/continue resync, tell peer. */
 	if (ns.conn >= C_CONNECTED &&
 	     ((os.aftr_isp != ns.aftr_isp) ||
 	      (os.user_isp != ns.user_isp)))
-		drbd_send_state(first_peer_device(device), ns);
+		drbd_send_state(peer_device, ns);
 
 	/* In case one of the isp bits got set, suspend other devices. */
 	if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) &&
@@ -1392,10 +1400,10 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
 	/* Make sure the peer gets informed about eventual state
 	   changes (ISP bits) while we were in WFReportParams. */
 	if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED)
-		drbd_send_state(first_peer_device(device), ns);
+		drbd_send_state(peer_device, ns);
 
 	if (os.conn != C_AHEAD && ns.conn == C_AHEAD)
-		drbd_send_state(first_peer_device(device), ns);
+		drbd_send_state(peer_device, ns);
 
 	/* We are in the progress to start a full sync... */
 	if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
@@ -1449,7 +1457,7 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
 					drbd_disk_str(device->state.disk));
 
 			if (ns.conn >= C_CONNECTED)
-				drbd_send_state(first_peer_device(device), ns);
+				drbd_send_state(peer_device, ns);
 
 			drbd_rs_cancel_all(device);
 
@@ -1473,7 +1481,7 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
 				 drbd_disk_str(device->state.disk));
 
 		if (ns.conn >= C_CONNECTED)
-			drbd_send_state(first_peer_device(device), ns);
+			drbd_send_state(peer_device, ns);
 		/* corresponding get_ldev in __drbd_set_state
 		 * this may finally trigger drbd_ldev_destroy. */
 		put_ldev(device);
@@ -1481,7 +1489,7 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
 
 	/* Notify peer that I had a local IO error, and did not detached.. */
 	if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT && ns.conn >= C_CONNECTED)
-		drbd_send_state(first_peer_device(device), ns);
+		drbd_send_state(peer_device, ns);
 
 	/* Disks got bigger while they were detached */
 	if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING &&
@@ -1499,14 +1507,14 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
 	/* sync target done with resync.  Explicitly notify peer, even though
 	 * it should (at least for non-empty resyncs) already know itself. */
 	if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED)
-		drbd_send_state(first_peer_device(device), ns);
+		drbd_send_state(peer_device, ns);
 
 	/* Verify finished, or reached stop sector.  Peer did not know about
 	 * the stop sector, and we may even have changed the stop sector during
 	 * verify to interrupt/stop early.  Send the new state. */
 	if (os.conn == C_VERIFY_S && ns.conn == C_CONNECTED
 	&& verify_can_do_stop_sector(device))
-		drbd_send_state(first_peer_device(device), ns);
+		drbd_send_state(peer_device, ns);
 
 	/* This triggers bitmap writeout of potentially still unwritten pages
 	 * if the resync finished cleanly, or aborted because of peer disk
@@ -1563,7 +1571,7 @@ static int w_after_conn_state_ch(struct drbd_work *w, int unused)
 		old_conf = connection->net_conf;
 		connection->my_addr_len = 0;
 		connection->peer_addr_len = 0;
-		rcu_assign_pointer(connection->net_conf, NULL);
+		RCU_INIT_POINTER(connection->net_conf, NULL);
 		conn_free_crypto(connection);
 		mutex_unlock(&connection->resource->conf_update);
 
@@ -1599,7 +1607,7 @@ static int w_after_conn_state_ch(struct drbd_work *w, int unused)
 	return 0;
 }
 
-void conn_old_common_state(struct drbd_connection *connection, union drbd_state *pcs, enum chg_state_flags *pf)
+static void conn_old_common_state(struct drbd_connection *connection, union drbd_state *pcs, enum chg_state_flags *pf)
 {
 	enum chg_state_flags flags = ~0;
 	struct drbd_peer_device *peer_device;
@@ -1688,7 +1696,7 @@ conn_is_valid_transition(struct drbd_connection *connection, union drbd_state ma
 	return rv;
 }
 
-void
+static void
 conn_set_state(struct drbd_connection *connection, union drbd_state mask, union drbd_state val,
 	       union drbd_state *pns_min, union drbd_state *pns_max, enum chg_state_flags flags)
 {
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index d8f57b6305cd..50776b362828 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -67,13 +67,10 @@ rwlock_t global_state_lock;
  */
 void drbd_md_io_complete(struct bio *bio, int error)
 {
-	struct drbd_md_io *md_io;
 	struct drbd_device *device;
 
-	md_io = (struct drbd_md_io *)bio->bi_private;
-	device = container_of(md_io, struct drbd_device, md_io);
-
-	md_io->error = error;
+	device = bio->bi_private;
+	device->md_io.error = error;
 
 	/* We grabbed an extra reference in _drbd_md_sync_page_io() to be able
 	 * to timeout on the lower level device, and eventually detach from it.
@@ -87,7 +84,7 @@ void drbd_md_io_complete(struct bio *bio, int error)
 	 * ASSERT(atomic_read(&device->md_io_in_use) == 1) there.
 	 */
 	drbd_md_put_buffer(device);
-	md_io->done = 1;
+	device->md_io.done = 1;
 	wake_up(&device->misc_wait);
 	bio_put(bio);
 	if (device->ldev) /* special case: drbd_md_read() during drbd_adm_attach() */
@@ -135,6 +132,7 @@ void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(l
 	i = peer_req->i;
 	do_al_complete_io = peer_req->flags & EE_CALL_AL_COMPLETE_IO;
 	block_id = peer_req->block_id;
+	peer_req->flags &= ~EE_CALL_AL_COMPLETE_IO;
 
 	spin_lock_irqsave(&device->resource->req_lock, flags);
 	device->writ_cnt += peer_req->i.size >> 9;
@@ -398,9 +396,6 @@ static int read_for_csum(struct drbd_peer_device *peer_device, sector_t sector,
 	if (!get_ldev(device))
 		return -EIO;
 
-	if (drbd_rs_should_slow_down(device, sector))
-		goto defer;
-
 	/* GFP_TRY, because if there is no memory available right now, this may
 	 * be rescheduled for later. It is "only" background resync, after all. */
 	peer_req = drbd_alloc_peer_req(peer_device, ID_SYNCER /* unused */, sector,
@@ -410,7 +405,7 @@ static int read_for_csum(struct drbd_peer_device *peer_device, sector_t sector,
 
 	peer_req->w.cb = w_e_send_csum;
 	spin_lock_irq(&device->resource->req_lock);
-	list_add(&peer_req->w.list, &device->read_ee);
+	list_add_tail(&peer_req->w.list, &device->read_ee);
 	spin_unlock_irq(&device->resource->req_lock);
 
 	atomic_add(size >> 9, &device->rs_sect_ev);
@@ -452,9 +447,9 @@ void resync_timer_fn(unsigned long data)
 {
 	struct drbd_device *device = (struct drbd_device *) data;
 
-	if (list_empty(&device->resync_work.list))
-		drbd_queue_work(&first_peer_device(device)->connection->sender_work,
-				&device->resync_work);
+	drbd_queue_work_if_unqueued(
+		&first_peer_device(device)->connection->sender_work,
+		&device->resync_work);
 }
 
 static void fifo_set(struct fifo_buffer *fb, int value)
@@ -504,9 +499,9 @@ struct fifo_buffer *fifo_alloc(int fifo_size)
 static int drbd_rs_controller(struct drbd_device *device, unsigned int sect_in)
 {
 	struct disk_conf *dc;
-	unsigned int want;     /* The number of sectors we want in the proxy */
+	unsigned int want;     /* The number of sectors we want in-flight */
 	int req_sect; /* Number of sectors to request in this turn */
-	int correction; /* Number of sectors more we need in the proxy*/
+	int correction; /* Number of sectors more we need in-flight */
 	int cps; /* correction per invocation of drbd_rs_controller() */
 	int steps; /* Number of time steps to plan ahead */
 	int curr_corr;
@@ -577,20 +572,27 @@ static int drbd_rs_number_requests(struct drbd_device *device)
 	 * potentially causing a distributed deadlock on congestion during
 	 * online-verify or (checksum-based) resync, if max-buffers,
 	 * socket buffer sizes and resync rate settings are mis-configured. */
-	if (mxb - device->rs_in_flight < number)
-		number = mxb - device->rs_in_flight;
+
+	/* note that "number" is in units of "BM_BLOCK_SIZE" (which is 4k),
+	 * mxb (as used here, and in drbd_alloc_pages on the peer) is
+	 * "number of pages" (typically also 4k),
+	 * but "rs_in_flight" is in "sectors" (512 Byte). */
+	if (mxb - device->rs_in_flight/8 < number)
+		number = mxb - device->rs_in_flight/8;
 
 	return number;
 }
 
-static int make_resync_request(struct drbd_device *device, int cancel)
+static int make_resync_request(struct drbd_device *const device, int cancel)
 {
+	struct drbd_peer_device *const peer_device = first_peer_device(device);
+	struct drbd_connection *const connection = peer_device ? peer_device->connection : NULL;
 	unsigned long bit;
 	sector_t sector;
 	const sector_t capacity = drbd_get_capacity(device->this_bdev);
 	int max_bio_size;
 	int number, rollback_i, size;
-	int align, queued, sndbuf;
+	int align, requeue = 0;
 	int i = 0;
 
 	if (unlikely(cancel))
@@ -617,17 +619,22 @@ static int make_resync_request(struct drbd_device *device, int cancel)
 		goto requeue;
 
 	for (i = 0; i < number; i++) {
-		/* Stop generating RS requests, when half of the send buffer is filled */
-		mutex_lock(&first_peer_device(device)->connection->data.mutex);
-		if (first_peer_device(device)->connection->data.socket) {
-			queued = first_peer_device(device)->connection->data.socket->sk->sk_wmem_queued;
-			sndbuf = first_peer_device(device)->connection->data.socket->sk->sk_sndbuf;
-		} else {
-			queued = 1;
-			sndbuf = 0;
-		}
-		mutex_unlock(&first_peer_device(device)->connection->data.mutex);
-		if (queued > sndbuf / 2)
+		/* Stop generating RS requests when half of the send buffer is filled,
+		 * but notify TCP that we'd like to have more space. */
+		mutex_lock(&connection->data.mutex);
+		if (connection->data.socket) {
+			struct sock *sk = connection->data.socket->sk;
+			int queued = sk->sk_wmem_queued;
+			int sndbuf = sk->sk_sndbuf;
+			if (queued > sndbuf / 2) {
+				requeue = 1;
+				if (sk->sk_socket)
+					set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+			}
+		} else
+			requeue = 1;
+		mutex_unlock(&connection->data.mutex);
+		if (requeue)
 			goto requeue;
 
 next_sector:
@@ -642,8 +649,7 @@ next_sector:
 
 		sector = BM_BIT_TO_SECT(bit);
 
-		if (drbd_rs_should_slow_down(device, sector) ||
-		    drbd_try_rs_begin_io(device, sector)) {
+		if (drbd_try_rs_begin_io(device, sector)) {
 			device->bm_resync_fo = bit;
 			goto requeue;
 		}
@@ -696,9 +702,9 @@ next_sector:
 		/* adjust very last sectors, in case we are oddly sized */
 		if (sector + (size>>9) > capacity)
 			size = (capacity-sector)<<9;
-		if (first_peer_device(device)->connection->agreed_pro_version >= 89 &&
-		    first_peer_device(device)->connection->csums_tfm) {
-			switch (read_for_csum(first_peer_device(device), sector, size)) {
+
+		if (device->use_csums) {
+			switch (read_for_csum(peer_device, sector, size)) {
 			case -EIO: /* Disk failure */
 				put_ldev(device);
 				return -EIO;
@@ -717,7 +723,7 @@ next_sector:
 			int err;
 
 			inc_rs_pending(device);
-			err = drbd_send_drequest(first_peer_device(device), P_RS_DATA_REQUEST,
+			err = drbd_send_drequest(peer_device, P_RS_DATA_REQUEST,
 						 sector, size, ID_SYNCER);
 			if (err) {
 				drbd_err(device, "drbd_send_drequest() failed, aborting...\n");
@@ -774,8 +780,7 @@ static int make_ov_request(struct drbd_device *device, int cancel)
 
 		size = BM_BLOCK_SIZE;
 
-		if (drbd_rs_should_slow_down(device, sector) ||
-		    drbd_try_rs_begin_io(device, sector)) {
+		if (drbd_try_rs_begin_io(device, sector)) {
 			device->ov_position = sector;
 			goto requeue;
 		}
@@ -911,7 +916,7 @@ int drbd_resync_finished(struct drbd_device *device)
 		if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T)
 			khelper_cmd = "after-resync-target";
 
-		if (first_peer_device(device)->connection->csums_tfm && device->rs_total) {
+		if (device->use_csums && device->rs_total) {
 			const unsigned long s = device->rs_same_csum;
 			const unsigned long t = device->rs_total;
 			const int ratio =
@@ -1351,13 +1356,15 @@ int w_send_out_of_sync(struct drbd_work *w, int cancel)
 {
 	struct drbd_request *req = container_of(w, struct drbd_request, w);
 	struct drbd_device *device = req->device;
-	struct drbd_connection *connection = first_peer_device(device)->connection;
+	struct drbd_peer_device *const peer_device = first_peer_device(device);
+	struct drbd_connection *const connection = peer_device->connection;
 	int err;
 
 	if (unlikely(cancel)) {
 		req_mod(req, SEND_CANCELED);
 		return 0;
 	}
+	req->pre_send_jif = jiffies;
 
 	/* this time, no connection->send.current_epoch_writes++;
 	 * If it was sent, it was the closing barrier for the last
@@ -1365,7 +1372,7 @@ int w_send_out_of_sync(struct drbd_work *w, int cancel)
 	 * No more barriers will be sent, until we leave AHEAD mode again. */
 	maybe_send_barrier(connection, req->epoch);
 
-	err = drbd_send_out_of_sync(first_peer_device(device), req);
+	err = drbd_send_out_of_sync(peer_device, req);
 	req_mod(req, OOS_HANDED_TO_NETWORK);
 
 	return err;
@@ -1380,19 +1387,21 @@ int w_send_dblock(struct drbd_work *w, int cancel)
 {
 	struct drbd_request *req = container_of(w, struct drbd_request, w);
 	struct drbd_device *device = req->device;
-	struct drbd_connection *connection = first_peer_device(device)->connection;
+	struct drbd_peer_device *const peer_device = first_peer_device(device);
+	struct drbd_connection *connection = peer_device->connection;
 	int err;
 
 	if (unlikely(cancel)) {
 		req_mod(req, SEND_CANCELED);
 		return 0;
 	}
+	req->pre_send_jif = jiffies;
 
 	re_init_if_first_write(connection, req->epoch);
 	maybe_send_barrier(connection, req->epoch);
 	connection->send.current_epoch_writes++;
 
-	err = drbd_send_dblock(first_peer_device(device), req);
+	err = drbd_send_dblock(peer_device, req);
 	req_mod(req, err ? SEND_FAILED : HANDED_OVER_TO_NETWORK);
 
 	return err;
@@ -1407,19 +1416,21 @@ int w_send_read_req(struct drbd_work *w, int cancel)
 {
 	struct drbd_request *req = container_of(w, struct drbd_request, w);
 	struct drbd_device *device = req->device;
-	struct drbd_connection *connection = first_peer_device(device)->connection;
+	struct drbd_peer_device *const peer_device = first_peer_device(device);
+	struct drbd_connection *connection = peer_device->connection;
 	int err;
 
 	if (unlikely(cancel)) {
 		req_mod(req, SEND_CANCELED);
 		return 0;
 	}
+	req->pre_send_jif = jiffies;
 
 	/* Even read requests may close a write epoch,
 	 * if there was any yet. */
 	maybe_send_barrier(connection, req->epoch);
 
-	err = drbd_send_drequest(first_peer_device(device), P_DATA_REQUEST, req->i.sector, req->i.size,
+	err = drbd_send_drequest(peer_device, P_DATA_REQUEST, req->i.sector, req->i.size,
 				 (unsigned long)req);
 
 	req_mod(req, err ? SEND_FAILED : HANDED_OVER_TO_NETWORK);
@@ -1433,7 +1444,7 @@ int w_restart_disk_io(struct drbd_work *w, int cancel)
 	struct drbd_device *device = req->device;
 
 	if (bio_data_dir(req->master_bio) == WRITE && req->rq_state & RQ_IN_ACT_LOG)
-		drbd_al_begin_io(device, &req->i, false);
+		drbd_al_begin_io(device, &req->i);
 
 	drbd_req_make_private_bio(req, req->master_bio);
 	req->private_bio->bi_bdev = device->ldev->backing_bdev;
@@ -1601,26 +1612,32 @@ void drbd_rs_controller_reset(struct drbd_device *device)
 void start_resync_timer_fn(unsigned long data)
 {
 	struct drbd_device *device = (struct drbd_device *) data;
-
-	drbd_queue_work(&first_peer_device(device)->connection->sender_work,
-			&device->start_resync_work);
+	drbd_device_post_work(device, RS_START);
 }
 
-int w_start_resync(struct drbd_work *w, int cancel)
+static void do_start_resync(struct drbd_device *device)
 {
-	struct drbd_device *device =
-		container_of(w, struct drbd_device, start_resync_work);
-
 	if (atomic_read(&device->unacked_cnt) || atomic_read(&device->rs_pending_cnt)) {
-		drbd_warn(device, "w_start_resync later...\n");
+		drbd_warn(device, "postponing start_resync ...\n");
 		device->start_resync_timer.expires = jiffies + HZ/10;
 		add_timer(&device->start_resync_timer);
-		return 0;
+		return;
 	}
 
 	drbd_start_resync(device, C_SYNC_SOURCE);
 	clear_bit(AHEAD_TO_SYNC_SOURCE, &device->flags);
-	return 0;
+}
+
+static bool use_checksum_based_resync(struct drbd_connection *connection, struct drbd_device *device)
+{
+	bool csums_after_crash_only;
+	rcu_read_lock();
+	csums_after_crash_only = rcu_dereference(connection->net_conf)->csums_after_crash_only;
+	rcu_read_unlock();
+	return connection->agreed_pro_version >= 89 &&		/* supported? */
+		connection->csums_tfm &&			/* configured? */
+		(csums_after_crash_only == 0			/* use for each resync? */
+		 || test_bit(CRASHED_PRIMARY, &device->flags));	/* or only after Primary crash? */
 }
 
 /**
@@ -1633,6 +1650,8 @@ int w_start_resync(struct drbd_work *w, int cancel)
  */
 void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
 {
+	struct drbd_peer_device *peer_device = first_peer_device(device);
+	struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
 	union drbd_state ns;
 	int r;
 
@@ -1651,7 +1670,7 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
 			if (r > 0) {
 				drbd_info(device, "before-resync-target handler returned %d, "
 					 "dropping connection.\n", r);
-				conn_request_state(first_peer_device(device)->connection, NS(conn, C_DISCONNECTING), CS_HARD);
+				conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
 				return;
 			}
 		} else /* C_SYNC_SOURCE */ {
@@ -1664,7 +1683,7 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
 				} else {
 					drbd_info(device, "before-resync-source handler returned %d, "
 						 "dropping connection.\n", r);
-					conn_request_state(first_peer_device(device)->connection,
+					conn_request_state(connection,
 							   NS(conn, C_DISCONNECTING), CS_HARD);
 					return;
 				}
@@ -1672,7 +1691,7 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
 		}
 	}
 
-	if (current == first_peer_device(device)->connection->worker.task) {
+	if (current == connection->worker.task) {
 		/* The worker should not sleep waiting for state_mutex,
 		   that can take long */
 		if (!mutex_trylock(device->state_mutex)) {
@@ -1733,11 +1752,20 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
 			device->rs_mark_time[i] = now;
 		}
 		_drbd_pause_after(device);
+		/* Forget potentially stale cached per resync extent bit-counts.
+		 * Open coded drbd_rs_cancel_all(device), we already have IRQs
+		 * disabled, and know the disk state is ok. */
+		spin_lock(&device->al_lock);
+		lc_reset(device->resync);
+		device->resync_locked = 0;
+		device->resync_wenr = LC_FREE;
+		spin_unlock(&device->al_lock);
 	}
 	write_unlock(&global_state_lock);
 	spin_unlock_irq(&device->resource->req_lock);
 
 	if (r == SS_SUCCESS) {
+		wake_up(&device->al_wait); /* for lc_reset() above */
 		/* reset rs_last_bcast when a resync or verify is started,
 		 * to deal with potential jiffies wrap. */
 		device->rs_last_bcast = jiffies - HZ;
@@ -1746,8 +1774,12 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
 		     drbd_conn_str(ns.conn),
 		     (unsigned long) device->rs_total << (BM_BLOCK_SHIFT-10),
 		     (unsigned long) device->rs_total);
-		if (side == C_SYNC_TARGET)
+		if (side == C_SYNC_TARGET) {
 			device->bm_resync_fo = 0;
+			device->use_csums = use_checksum_based_resync(connection, device);
+		} else {
+			device->use_csums = 0;
+		}
 
 		/* Since protocol 96, we must serialize drbd_gen_and_send_sync_uuid
 		 * with w_send_oos, or the sync target will get confused as to
@@ -1756,12 +1788,10 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
 		 * drbd_resync_finished from here in that case.
 		 * We drbd_gen_and_send_sync_uuid here for protocol < 96,
 		 * and from after_state_ch otherwise. */
-		if (side == C_SYNC_SOURCE &&
-		    first_peer_device(device)->connection->agreed_pro_version < 96)
-			drbd_gen_and_send_sync_uuid(first_peer_device(device));
+		if (side == C_SYNC_SOURCE && connection->agreed_pro_version < 96)
+			drbd_gen_and_send_sync_uuid(peer_device);
 
-		if (first_peer_device(device)->connection->agreed_pro_version < 95 &&
-		    device->rs_total == 0) {
+		if (connection->agreed_pro_version < 95 && device->rs_total == 0) {
 			/* This still has a race (about when exactly the peers
 			 * detect connection loss) that can lead to a full sync
 			 * on next handshake. In 8.3.9 we fixed this with explicit
@@ -1777,7 +1807,7 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
 				int timeo;
 
 				rcu_read_lock();
-				nc = rcu_dereference(first_peer_device(device)->connection->net_conf);
+				nc = rcu_dereference(connection->net_conf);
 				timeo = nc->ping_int * HZ + nc->ping_timeo * HZ / 9;
 				rcu_read_unlock();
 				schedule_timeout_interruptible(timeo);
@@ -1799,10 +1829,165 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
 	mutex_unlock(device->state_mutex);
 }
 
+static void update_on_disk_bitmap(struct drbd_device *device, bool resync_done)
+{
+	struct sib_info sib = { .sib_reason = SIB_SYNC_PROGRESS, };
+	device->rs_last_bcast = jiffies;
+
+	if (!get_ldev(device))
+		return;
+
+	drbd_bm_write_lazy(device, 0);
+	if (resync_done && is_sync_state(device->state.conn))
+		drbd_resync_finished(device);
+
+	drbd_bcast_event(device, &sib);
+	/* update timestamp, in case it took a while to write out stuff */
+	device->rs_last_bcast = jiffies;
+	put_ldev(device);
+}
+
+static void drbd_ldev_destroy(struct drbd_device *device)
+{
+	lc_destroy(device->resync);
+	device->resync = NULL;
+	lc_destroy(device->act_log);
+	device->act_log = NULL;
+	__no_warn(local,
+		drbd_free_ldev(device->ldev);
+		device->ldev = NULL;);
+	clear_bit(GOING_DISKLESS, &device->flags);
+	wake_up(&device->misc_wait);
+}
+
+static void go_diskless(struct drbd_device *device)
+{
+	D_ASSERT(device, device->state.disk == D_FAILED);
+	/* we cannot assert local_cnt == 0 here, as get_ldev_if_state will
+	 * inc/dec it frequently. Once we are D_DISKLESS, no one will touch
+	 * the protected members anymore, though, so once put_ldev reaches zero
+	 * again, it will be safe to free them. */
+
+	/* Try to write changed bitmap pages, read errors may have just
+	 * set some bits outside the area covered by the activity log.
+	 *
+	 * If we have an IO error during the bitmap writeout,
+	 * we will want a full sync next time, just in case.
+	 * (Do we want a specific meta data flag for this?)
+	 *
+	 * If that does not make it to stable storage either,
+	 * we cannot do anything about that anymore.
+	 *
+	 * We still need to check if both bitmap and ldev are present, we may
+	 * end up here after a failed attach, before ldev was even assigned.
+	 */
+	if (device->bitmap && device->ldev) {
+		/* An interrupted resync or similar is allowed to recounts bits
+		 * while we detach.
+		 * Any modifications would not be expected anymore, though.
+		 */
+		if (drbd_bitmap_io_from_worker(device, drbd_bm_write,
+					"detach", BM_LOCKED_TEST_ALLOWED)) {
+			if (test_bit(WAS_READ_ERROR, &device->flags)) {
+				drbd_md_set_flag(device, MDF_FULL_SYNC);
+				drbd_md_sync(device);
+			}
+		}
+	}
+
+	drbd_force_state(device, NS(disk, D_DISKLESS));
+}
+
+static int do_md_sync(struct drbd_device *device)
+{
+	drbd_warn(device, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
+	drbd_md_sync(device);
+	return 0;
+}
+
+/* only called from drbd_worker thread, no locking */
+void __update_timing_details(
+		struct drbd_thread_timing_details *tdp,
+		unsigned int *cb_nr,
+		void *cb,
+		const char *fn, const unsigned int line)
+{
+	unsigned int i = *cb_nr % DRBD_THREAD_DETAILS_HIST;
+	struct drbd_thread_timing_details *td = tdp + i;
+
+	td->start_jif = jiffies;
+	td->cb_addr = cb;
+	td->caller_fn = fn;
+	td->line = line;
+	td->cb_nr = *cb_nr;
+
+	i = (i+1) % DRBD_THREAD_DETAILS_HIST;
+	td = tdp + i;
+	memset(td, 0, sizeof(*td));
+
+	++(*cb_nr);
+}
+
+#define WORK_PENDING(work_bit, todo)	(todo & (1UL << work_bit))
+static void do_device_work(struct drbd_device *device, const unsigned long todo)
+{
+	if (WORK_PENDING(MD_SYNC, todo))
+		do_md_sync(device);
+	if (WORK_PENDING(RS_DONE, todo) ||
+	    WORK_PENDING(RS_PROGRESS, todo))
+		update_on_disk_bitmap(device, WORK_PENDING(RS_DONE, todo));
+	if (WORK_PENDING(GO_DISKLESS, todo))
+		go_diskless(device);
+	if (WORK_PENDING(DESTROY_DISK, todo))
+		drbd_ldev_destroy(device);
+	if (WORK_PENDING(RS_START, todo))
+		do_start_resync(device);
+}
+
+#define DRBD_DEVICE_WORK_MASK	\
+	((1UL << GO_DISKLESS)	\
+	|(1UL << DESTROY_DISK)	\
+	|(1UL << MD_SYNC)	\
+	|(1UL << RS_START)	\
+	|(1UL << RS_PROGRESS)	\
+	|(1UL << RS_DONE)	\
+	)
+
+static unsigned long get_work_bits(unsigned long *flags)
+{
+	unsigned long old, new;
+	do {
+		old = *flags;
+		new = old & ~DRBD_DEVICE_WORK_MASK;
+	} while (cmpxchg(flags, old, new) != old);
+	return old & DRBD_DEVICE_WORK_MASK;
+}
+
+static void do_unqueued_work(struct drbd_connection *connection)
+{
+	struct drbd_peer_device *peer_device;
+	int vnr;
+
+	rcu_read_lock();
+	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+		struct drbd_device *device = peer_device->device;
+		unsigned long todo = get_work_bits(&device->flags);
+		if (!todo)
+			continue;
+
+		kref_get(&device->kref);
+		rcu_read_unlock();
+		do_device_work(device, todo);
+		kref_put(&device->kref, drbd_destroy_device);
+		rcu_read_lock();
+	}
+	rcu_read_unlock();
+}
+
 static bool dequeue_work_batch(struct drbd_work_queue *queue, struct list_head *work_list)
 {
 	spin_lock_irq(&queue->q_lock);
-	list_splice_init(&queue->q, work_list);
+	list_splice_tail_init(&queue->q, work_list);
 	spin_unlock_irq(&queue->q_lock);
 	return !list_empty(work_list);
 }
@@ -1851,7 +2036,7 @@ static void wait_for_work(struct drbd_connection *connection, struct list_head *
 		/* dequeue single item only,
 		 * we still use drbd_queue_work_front() in some places */
 		if (!list_empty(&connection->sender_work.q))
-			list_move(connection->sender_work.q.next, work_list);
+			list_splice_tail_init(&connection->sender_work.q, work_list);
 		spin_unlock(&connection->sender_work.q_lock);	/* FIXME get rid of this one? */
 		if (!list_empty(work_list) || signal_pending(current)) {
 			spin_unlock_irq(&connection->resource->req_lock);
@@ -1873,6 +2058,14 @@ static void wait_for_work(struct drbd_connection *connection, struct list_head *
 		if (send_barrier)
 			maybe_send_barrier(connection,
 					connection->send.current_epoch_nr + 1);
+
+		if (test_bit(DEVICE_WORK_PENDING, &connection->flags))
+			break;
+
+		/* drbd_send() may have called flush_signals() */
+		if (get_t_state(&connection->worker) != RUNNING)
+			break;
+
 		schedule();
 		/* may be woken up for other things but new work, too,
 		 * e.g. if the current epoch got closed.
@@ -1906,10 +2099,15 @@ int drbd_worker(struct drbd_thread *thi)
 	while (get_t_state(thi) == RUNNING) {
 		drbd_thread_current_set_cpu(thi);
 
-		/* as long as we use drbd_queue_work_front(),
-		 * we may only dequeue single work items here, not batches. */
-		if (list_empty(&work_list))
+		if (list_empty(&work_list)) {
+			update_worker_timing_details(connection, wait_for_work);
 			wait_for_work(connection, &work_list);
+		}
+
+		if (test_and_clear_bit(DEVICE_WORK_PENDING, &connection->flags)) {
+			update_worker_timing_details(connection, do_unqueued_work);
+			do_unqueued_work(connection);
+		}
 
 		if (signal_pending(current)) {
 			flush_signals(current);
@@ -1926,6 +2124,7 @@ int drbd_worker(struct drbd_thread *thi)
 		while (!list_empty(&work_list)) {
 			w = list_first_entry(&work_list, struct drbd_work, list);
 			list_del_init(&w->list);
+			update_worker_timing_details(connection, w->cb);
 			if (w->cb(w, connection->cstate < C_WF_REPORT_PARAMS) == 0)
 				continue;
 			if (connection->cstate >= C_WF_REPORT_PARAMS)
@@ -1934,13 +2133,18 @@ int drbd_worker(struct drbd_thread *thi)
 	}
 
 	do {
+		if (test_and_clear_bit(DEVICE_WORK_PENDING, &connection->flags)) {
+			update_worker_timing_details(connection, do_unqueued_work);
+			do_unqueued_work(connection);
+		}
 		while (!list_empty(&work_list)) {
 			w = list_first_entry(&work_list, struct drbd_work, list);
 			list_del_init(&w->list);
+			update_worker_timing_details(connection, w->cb);
 			w->cb(w, 1);
 		}
 		dequeue_work_batch(&connection->sender_work, &work_list);
-	} while (!list_empty(&work_list));
+	} while (!list_empty(&work_list) || test_bit(DEVICE_WORK_PENDING, &connection->flags));
 
 	rcu_read_lock();
 	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index b2c98c1bc037..623c84145b79 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -42,6 +42,7 @@
 #include <linux/blkdev.h>
 #include <linux/slab.h>
 #include <linux/idr.h>
+#include <linux/workqueue.h>
 
 #include "rbd_types.h"
 
@@ -332,7 +333,10 @@ struct rbd_device {
 
 	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
 
+	struct list_head	rq_queue;	/* incoming rq queue */
 	spinlock_t		lock;		/* queue, flags, open_count */
+	struct workqueue_struct	*rq_wq;
+	struct work_struct	rq_work;
 
 	struct rbd_image_header	header;
 	unsigned long		flags;		/* possibly lock protected */
@@ -514,7 +518,8 @@ static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);
 
 static int rbd_dev_refresh(struct rbd_device *rbd_dev);
 static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev);
-static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev);
+static int rbd_dev_header_info(struct rbd_device *rbd_dev);
+static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev);
 static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
 					u64 snap_id);
 static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
@@ -971,12 +976,6 @@ static int rbd_header_from_disk(struct rbd_device *rbd_dev,
 	header->snap_names = snap_names;
 	header->snap_sizes = snap_sizes;
 
-	/* Make sure mapping size is consistent with header info */
-
-	if (rbd_dev->spec->snap_id == CEPH_NOSNAP || first_time)
-		if (rbd_dev->mapping.size != header->image_size)
-			rbd_dev->mapping.size = header->image_size;
-
 	return 0;
 out_2big:
 	ret = -EIO;
@@ -1139,6 +1138,13 @@ static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
 	rbd_dev->mapping.features = 0;
 }
 
+static void rbd_segment_name_free(const char *name)
+{
+	/* The explicit cast here is needed to drop the const qualifier */
+
+	kmem_cache_free(rbd_segment_name_cache, (void *)name);
+}
+
 static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
 {
 	char *name;
@@ -1158,20 +1164,13 @@ static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
 	if (ret < 0 || ret > CEPH_MAX_OID_NAME_LEN) {
 		pr_err("error formatting segment name for #%llu (%d)\n",
 			segment, ret);
-		kfree(name);
+		rbd_segment_name_free(name);
 		name = NULL;
 	}
 
 	return name;
 }
 
-static void rbd_segment_name_free(const char *name)
-{
-	/* The explicit cast here is needed to drop the const qualifier */
-
-	kmem_cache_free(rbd_segment_name_cache, (void *)name);
-}
-
 static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
 {
 	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
@@ -1371,7 +1370,7 @@ static void obj_request_img_data_set(struct rbd_obj_request *obj_request)
 		struct rbd_device *rbd_dev;
 
 		rbd_dev = obj_request->img_request->rbd_dev;
-		rbd_warn(rbd_dev, "obj_request %p already marked img_data\n",
+		rbd_warn(rbd_dev, "obj_request %p already marked img_data",
 			obj_request);
 	}
 }
@@ -1389,7 +1388,7 @@ static void obj_request_done_set(struct rbd_obj_request *obj_request)
 
 		if (obj_request_img_data_test(obj_request))
 			rbd_dev = obj_request->img_request->rbd_dev;
-		rbd_warn(rbd_dev, "obj_request %p already marked done\n",
+		rbd_warn(rbd_dev, "obj_request %p already marked done",
 			obj_request);
 	}
 }
@@ -1527,11 +1526,37 @@ static bool obj_request_type_valid(enum obj_request_type type)
 static int rbd_obj_request_submit(struct ceph_osd_client *osdc,
 				struct rbd_obj_request *obj_request)
 {
-	dout("%s: osdc %p obj %p\n", __func__, osdc, obj_request);
-
+	dout("%s %p\n", __func__, obj_request);
 	return ceph_osdc_start_request(osdc, obj_request->osd_req, false);
 }
 
+static void rbd_obj_request_end(struct rbd_obj_request *obj_request)
+{
+	dout("%s %p\n", __func__, obj_request);
+	ceph_osdc_cancel_request(obj_request->osd_req);
+}
+
+/*
+ * Wait for an object request to complete.  If interrupted, cancel the
+ * underlying osd request.
+ */
+static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
+{
+	int ret;
+
+	dout("%s %p\n", __func__, obj_request);
+
+	ret = wait_for_completion_interruptible(&obj_request->completion);
+	if (ret < 0) {
+		dout("%s %p interrupted\n", __func__, obj_request);
+		rbd_obj_request_end(obj_request);
+		return ret;
+	}
+
+	dout("%s %p done\n", __func__, obj_request);
+	return 0;
+}
+
 static void rbd_img_request_complete(struct rbd_img_request *img_request)
 {
 
@@ -1558,15 +1583,6 @@ static void rbd_img_request_complete(struct rbd_img_request *img_request)
 		rbd_img_request_put(img_request);
 }
 
-/* Caller is responsible for rbd_obj_request_destroy(obj_request) */
-
-static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
-{
-	dout("%s: obj %p\n", __func__, obj_request);
-
-	return wait_for_completion_interruptible(&obj_request->completion);
-}
-
 /*
  * The default/initial value for all image request flags is 0.  Each
  * is conditionally set to 1 at image request initialization time
@@ -1763,7 +1779,7 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
 		rbd_osd_trivial_callback(obj_request);
 		break;
 	default:
-		rbd_warn(NULL, "%s: unsupported op %hu\n",
+		rbd_warn(NULL, "%s: unsupported op %hu",
 			obj_request->object_name, (unsigned short) opcode);
 		break;
 	}
@@ -1998,7 +2014,7 @@ static void rbd_dev_parent_put(struct rbd_device *rbd_dev)
 	if (!counter)
 		rbd_dev_unparent(rbd_dev);
 	else
-		rbd_warn(rbd_dev, "parent reference underflow\n");
+		rbd_warn(rbd_dev, "parent reference underflow");
 }
 
 /*
@@ -2028,7 +2044,7 @@ static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
 	/* Image was flattened, but parent is not yet torn down */
 
 	if (counter < 0)
-		rbd_warn(rbd_dev, "parent reference overflow\n");
+		rbd_warn(rbd_dev, "parent reference overflow");
 
 	return false;
 }
@@ -2045,7 +2061,7 @@ static struct rbd_img_request *rbd_img_request_create(
 {
 	struct rbd_img_request *img_request;
 
-	img_request = kmem_cache_alloc(rbd_img_request_cache, GFP_ATOMIC);
+	img_request = kmem_cache_alloc(rbd_img_request_cache, GFP_NOIO);
 	if (!img_request)
 		return NULL;
 
@@ -2161,11 +2177,11 @@ static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request)
 	if (result) {
 		struct rbd_device *rbd_dev = img_request->rbd_dev;
 
-		rbd_warn(rbd_dev, "%s %llx at %llx (%llx)\n",
+		rbd_warn(rbd_dev, "%s %llx at %llx (%llx)",
 			img_request_write_test(img_request) ? "write" : "read",
 			obj_request->length, obj_request->img_offset,
 			obj_request->offset);
-		rbd_warn(rbd_dev, "  result %d xferred %x\n",
+		rbd_warn(rbd_dev, "  result %d xferred %x",
 			result, xferred);
 		if (!img_request->result)
 			img_request->result = result;
@@ -2946,154 +2962,135 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
 	dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__,
 		rbd_dev->header_name, (unsigned long long)notify_id,
 		(unsigned int)opcode);
+
+	/*
+	 * Until adequate refresh error handling is in place, there is
+	 * not much we can do here, except warn.
+	 *
+	 * See http://tracker.ceph.com/issues/5040
+	 */
 	ret = rbd_dev_refresh(rbd_dev);
 	if (ret)
-		rbd_warn(rbd_dev, "header refresh error (%d)\n", ret);
+		rbd_warn(rbd_dev, "refresh failed: %d", ret);
 
-	rbd_obj_notify_ack_sync(rbd_dev, notify_id);
+	ret = rbd_obj_notify_ack_sync(rbd_dev, notify_id);
+	if (ret)
+		rbd_warn(rbd_dev, "notify_ack ret %d", ret);
 }
 
 /*
- * Initiate a watch request, synchronously.
+ * Send a (un)watch request and wait for the ack.  Return a request
+ * with a ref held on success or error.
  */
-static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev)
+static struct rbd_obj_request *rbd_obj_watch_request_helper(
+						struct rbd_device *rbd_dev,
+						bool watch)
 {
 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
 	struct rbd_obj_request *obj_request;
 	int ret;
 
-	rbd_assert(!rbd_dev->watch_event);
-	rbd_assert(!rbd_dev->watch_request);
-
-	ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev,
-				     &rbd_dev->watch_event);
-	if (ret < 0)
-		return ret;
-
-	rbd_assert(rbd_dev->watch_event);
-
 	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
 					     OBJ_REQUEST_NODATA);
-	if (!obj_request) {
-		ret = -ENOMEM;
-		goto out_cancel;
-	}
+	if (!obj_request)
+		return ERR_PTR(-ENOMEM);
 
 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, 1,
 						  obj_request);
 	if (!obj_request->osd_req) {
 		ret = -ENOMEM;
-		goto out_put;
+		goto out;
 	}
 
-	ceph_osdc_set_request_linger(osdc, obj_request->osd_req);
-
 	osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH,
-			      rbd_dev->watch_event->cookie, 0, 1);
+			      rbd_dev->watch_event->cookie, 0, watch);
 	rbd_osd_req_format_write(obj_request);
 
+	if (watch)
+		ceph_osdc_set_request_linger(osdc, obj_request->osd_req);
+
 	ret = rbd_obj_request_submit(osdc, obj_request);
 	if (ret)
-		goto out_linger;
+		goto out;
 
 	ret = rbd_obj_request_wait(obj_request);
 	if (ret)
-		goto out_linger;
+		goto out;
 
 	ret = obj_request->result;
-	if (ret)
-		goto out_linger;
-
-	/*
-	 * A watch request is set to linger, so the underlying osd
-	 * request won't go away until we unregister it.  We retain
-	 * a pointer to the object request during that time (in
-	 * rbd_dev->watch_request), so we'll keep a reference to
-	 * it.  We'll drop that reference (below) after we've
-	 * unregistered it.
-	 */
-	rbd_dev->watch_request = obj_request;
+	if (ret) {
+		if (watch)
+			rbd_obj_request_end(obj_request);
+		goto out;
+	}
 
-	return 0;
+	return obj_request;
 
-out_linger:
-	ceph_osdc_unregister_linger_request(osdc, obj_request->osd_req);
-out_put:
+out:
 	rbd_obj_request_put(obj_request);
-out_cancel:
-	ceph_osdc_cancel_event(rbd_dev->watch_event);
-	rbd_dev->watch_event = NULL;
-
-	return ret;
+	return ERR_PTR(ret);
 }
 
 /*
- * Tear down a watch request, synchronously.
+ * Initiate a watch request, synchronously.
  */
-static int __rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev)
+static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev)
 {
 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
 	struct rbd_obj_request *obj_request;
 	int ret;
 
-	rbd_assert(rbd_dev->watch_event);
-	rbd_assert(rbd_dev->watch_request);
+	rbd_assert(!rbd_dev->watch_event);
+	rbd_assert(!rbd_dev->watch_request);
 
-	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
-					     OBJ_REQUEST_NODATA);
-	if (!obj_request) {
-		ret = -ENOMEM;
-		goto out_cancel;
-	}
+	ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev,
+				     &rbd_dev->watch_event);
+	if (ret < 0)
+		return ret;
 
-	obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, 1,
-						  obj_request);
-	if (!obj_request->osd_req) {
-		ret = -ENOMEM;
-		goto out_put;
+	obj_request = rbd_obj_watch_request_helper(rbd_dev, true);
+	if (IS_ERR(obj_request)) {
+		ceph_osdc_cancel_event(rbd_dev->watch_event);
+		rbd_dev->watch_event = NULL;
+		return PTR_ERR(obj_request);
 	}
 
-	osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH,
-			      rbd_dev->watch_event->cookie, 0, 0);
-	rbd_osd_req_format_write(obj_request);
-
-	ret = rbd_obj_request_submit(osdc, obj_request);
-	if (ret)
-		goto out_put;
+	/*
+	 * A watch request is set to linger, so the underlying osd
+	 * request won't go away until we unregister it.  We retain
+	 * a pointer to the object request during that time (in
+	 * rbd_dev->watch_request), so we'll keep a reference to it.
+	 * We'll drop that reference after we've unregistered it in
+	 * rbd_dev_header_unwatch_sync().
+	 */
+	rbd_dev->watch_request = obj_request;
 
-	ret = rbd_obj_request_wait(obj_request);
-	if (ret)
-		goto out_put;
+	return 0;
+}
 
-	ret = obj_request->result;
-	if (ret)
-		goto out_put;
+/*
+ * Tear down a watch request, synchronously.
+ */
+static void rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev)
+{
+	struct rbd_obj_request *obj_request;
 
-	/* We have successfully torn down the watch request */
+	rbd_assert(rbd_dev->watch_event);
+	rbd_assert(rbd_dev->watch_request);
 
-	ceph_osdc_unregister_linger_request(osdc,
-					    rbd_dev->watch_request->osd_req);
+	rbd_obj_request_end(rbd_dev->watch_request);
 	rbd_obj_request_put(rbd_dev->watch_request);
 	rbd_dev->watch_request = NULL;
 
-out_put:
-	rbd_obj_request_put(obj_request);
-out_cancel:
+	obj_request = rbd_obj_watch_request_helper(rbd_dev, false);
+	if (!IS_ERR(obj_request))
+		rbd_obj_request_put(obj_request);
+	else
+		rbd_warn(rbd_dev, "unable to tear down watch request (%ld)",
+			 PTR_ERR(obj_request));
+
 	ceph_osdc_cancel_event(rbd_dev->watch_event);
 	rbd_dev->watch_event = NULL;
-
-	return ret;
-}
-
-static void rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev)
-{
-	int ret;
-
-	ret = __rbd_dev_header_unwatch_sync(rbd_dev);
-	if (ret) {
-		rbd_warn(rbd_dev, "unable to tear down watch request: %d\n",
-			 ret);
-	}
 }
 
 /*
@@ -3183,102 +3180,129 @@ out:
 	return ret;
 }
 
-static void rbd_request_fn(struct request_queue *q)
-		__releases(q->queue_lock) __acquires(q->queue_lock)
+static void rbd_handle_request(struct rbd_device *rbd_dev, struct request *rq)
 {
-	struct rbd_device *rbd_dev = q->queuedata;
-	struct request *rq;
+	struct rbd_img_request *img_request;
+	u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT;
+	u64 length = blk_rq_bytes(rq);
+	bool wr = rq_data_dir(rq) == WRITE;
 	int result;
 
-	while ((rq = blk_fetch_request(q))) {
-		bool write_request = rq_data_dir(rq) == WRITE;
-		struct rbd_img_request *img_request;
-		u64 offset;
-		u64 length;
+	/* Ignore/skip any zero-length requests */
 
-		/* Ignore any non-FS requests that filter through. */
+	if (!length) {
+		dout("%s: zero-length request\n", __func__);
+		result = 0;
+		goto err_rq;
+	}
 
-		if (rq->cmd_type != REQ_TYPE_FS) {
-			dout("%s: non-fs request type %d\n", __func__,
-				(int) rq->cmd_type);
-			__blk_end_request_all(rq, 0);
-			continue;
+	/* Disallow writes to a read-only device */
+
+	if (wr) {
+		if (rbd_dev->mapping.read_only) {
+			result = -EROFS;
+			goto err_rq;
 		}
+		rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
+	}
 
-		/* Ignore/skip any zero-length requests */
+	/*
+	 * Quit early if the mapped snapshot no longer exists.  It's
+	 * still possible the snapshot will have disappeared by the
+	 * time our request arrives at the osd, but there's no sense in
+	 * sending it if we already know.
+	 */
+	if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
+		dout("request for non-existent snapshot");
+		rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
+		result = -ENXIO;
+		goto err_rq;
+	}
 
-		offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT;
-		length = (u64) blk_rq_bytes(rq);
+	if (offset && length > U64_MAX - offset + 1) {
+		rbd_warn(rbd_dev, "bad request range (%llu~%llu)", offset,
+			 length);
+		result = -EINVAL;
+		goto err_rq;	/* Shouldn't happen */
+	}
 
-		if (!length) {
-			dout("%s: zero-length request\n", __func__);
-			__blk_end_request_all(rq, 0);
-			continue;
-		}
+	if (offset + length > rbd_dev->mapping.size) {
+		rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset,
+			 length, rbd_dev->mapping.size);
+		result = -EIO;
+		goto err_rq;
+	}
 
-		spin_unlock_irq(q->queue_lock);
+	img_request = rbd_img_request_create(rbd_dev, offset, length, wr);
+	if (!img_request) {
+		result = -ENOMEM;
+		goto err_rq;
+	}
+	img_request->rq = rq;
 
-		/* Disallow writes to a read-only device */
+	result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, rq->bio);
+	if (result)
+		goto err_img_request;
 
-		if (write_request) {
-			result = -EROFS;
-			if (rbd_dev->mapping.read_only)
-				goto end_request;
-			rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
-		}
+	result = rbd_img_request_submit(img_request);
+	if (result)
+		goto err_img_request;
 
-		/*
-		 * Quit early if the mapped snapshot no longer
-		 * exists.  It's still possible the snapshot will
-		 * have disappeared by the time our request arrives
-		 * at the osd, but there's no sense in sending it if
-		 * we already know.
-		 */
-		if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
-			dout("request for non-existent snapshot");
-			rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
-			result = -ENXIO;
-			goto end_request;
-		}
+	return;
 
-		result = -EINVAL;
-		if (offset && length > U64_MAX - offset + 1) {
-			rbd_warn(rbd_dev, "bad request range (%llu~%llu)\n",
-				offset, length);
-			goto end_request;	/* Shouldn't happen */
-		}
+err_img_request:
+	rbd_img_request_put(img_request);
+err_rq:
+	if (result)
+		rbd_warn(rbd_dev, "%s %llx at %llx result %d",
+			 wr ? "write" : "read", length, offset, result);
+	blk_end_request_all(rq, result);
+}
 
-		result = -EIO;
-		if (offset + length > rbd_dev->mapping.size) {
-			rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)\n",
-				offset, length, rbd_dev->mapping.size);
-			goto end_request;
-		}
+static void rbd_request_workfn(struct work_struct *work)
+{
+	struct rbd_device *rbd_dev =
+	    container_of(work, struct rbd_device, rq_work);
+	struct request *rq, *next;
+	LIST_HEAD(requests);
 
-		result = -ENOMEM;
-		img_request = rbd_img_request_create(rbd_dev, offset, length,
-							write_request);
-		if (!img_request)
-			goto end_request;
+	spin_lock_irq(&rbd_dev->lock); /* rq->q->queue_lock */
+	list_splice_init(&rbd_dev->rq_queue, &requests);
+	spin_unlock_irq(&rbd_dev->lock);
 
-		img_request->rq = rq;
+	list_for_each_entry_safe(rq, next, &requests, queuelist) {
+		list_del_init(&rq->queuelist);
+		rbd_handle_request(rbd_dev, rq);
+	}
+}
 
-		result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
-						rq->bio);
-		if (!result)
-			result = rbd_img_request_submit(img_request);
-		if (result)
-			rbd_img_request_put(img_request);
-end_request:
-		spin_lock_irq(q->queue_lock);
-		if (result < 0) {
-			rbd_warn(rbd_dev, "%s %llx at %llx result %d\n",
-				write_request ? "write" : "read",
-				length, offset, result);
-
-			__blk_end_request_all(rq, result);
+/*
+ * Called with q->queue_lock held and interrupts disabled, possibly on
+ * the way to schedule().  Do not sleep here!
+ */
+static void rbd_request_fn(struct request_queue *q)
+{
+	struct rbd_device *rbd_dev = q->queuedata;
+	struct request *rq;
+	int queued = 0;
+
+	rbd_assert(rbd_dev);
+
+	while ((rq = blk_fetch_request(q))) {
+		/* Ignore any non-FS requests that filter through. */
+		if (rq->cmd_type != REQ_TYPE_FS) {
+			dout("%s: non-fs request type %d\n", __func__,
+				(int) rq->cmd_type);
+			__blk_end_request_all(rq, 0);
+			continue;
 		}
+
+		list_add_tail(&rq->queuelist, &rbd_dev->rq_queue);
+		queued++;
 	}
+
+	if (queued)
+		queue_work(rbd_dev->rq_wq, &rbd_dev->rq_work);
 }
 
 /*
@@ -3517,24 +3541,37 @@ static int rbd_dev_refresh(struct rbd_device *rbd_dev)
 	u64 mapping_size;
 	int ret;
 
-	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
 	down_write(&rbd_dev->header_rwsem);
 	mapping_size = rbd_dev->mapping.size;
-	if (rbd_dev->image_format == 1)
-		ret = rbd_dev_v1_header_info(rbd_dev);
-	else
-		ret = rbd_dev_v2_header_info(rbd_dev);
 
-	/* If it's a mapped snapshot, validate its EXISTS flag */
+	ret = rbd_dev_header_info(rbd_dev);
+	if (ret)
+		return ret;
+
+	/*
+	 * If there is a parent, see if it has disappeared due to the
+	 * mapped image getting flattened.
+	 */
+	if (rbd_dev->parent) {
+		ret = rbd_dev_v2_parent_info(rbd_dev);
+		if (ret)
+			return ret;
+	}
+
+	if (rbd_dev->spec->snap_id == CEPH_NOSNAP) {
+		if (rbd_dev->mapping.size != rbd_dev->header.image_size)
+			rbd_dev->mapping.size = rbd_dev->header.image_size;
+	} else {
+		/* validate mapped snapshot's EXISTS flag */
+		rbd_exists_validate(rbd_dev);
+	}
 
-	rbd_exists_validate(rbd_dev);
 	up_write(&rbd_dev->header_rwsem);
 
-	if (mapping_size != rbd_dev->mapping.size) {
+	if (mapping_size != rbd_dev->mapping.size)
 		rbd_dev_update_size(rbd_dev);
-	}
 
-	return ret;
+	return 0;
 }
 
 static int rbd_init_disk(struct rbd_device *rbd_dev)
@@ -3696,46 +3733,36 @@ static ssize_t rbd_snap_show(struct device *dev,
 }
 
 /*
- * For an rbd v2 image, shows the pool id, image id, and snapshot id
- * for the parent image.  If there is no parent, simply shows
- * "(no parent image)".
+ * For a v2 image, shows the chain of parent images, separated by empty
+ * lines.  For v1 images or if there is no parent, shows "(no parent
+ * image)".
  */
 static ssize_t rbd_parent_show(struct device *dev,
-			     struct device_attribute *attr,
-			     char *buf)
+			       struct device_attribute *attr,
+			       char *buf)
 {
 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
-	struct rbd_spec *spec = rbd_dev->parent_spec;
-	int count;
-	char *bufp = buf;
+	ssize_t count = 0;
 
-	if (!spec)
+	if (!rbd_dev->parent)
 		return sprintf(buf, "(no parent image)\n");
 
-	count = sprintf(bufp, "pool_id %llu\npool_name %s\n",
-			(unsigned long long) spec->pool_id, spec->pool_name);
-	if (count < 0)
-		return count;
-	bufp += count;
-
-	count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id,
-			spec->image_name ? spec->image_name : "(unknown)");
-	if (count < 0)
-		return count;
-	bufp += count;
-
-	count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n",
-			(unsigned long long) spec->snap_id, spec->snap_name);
-	if (count < 0)
-		return count;
-	bufp += count;
-
-	count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap);
-	if (count < 0)
-		return count;
-	bufp += count;
+	for ( ; rbd_dev->parent; rbd_dev = rbd_dev->parent) {
+		struct rbd_spec *spec = rbd_dev->parent_spec;
+
+		count += sprintf(&buf[count], "%s"
+			    "pool_id %llu\npool_name %s\n"
+			    "image_id %s\nimage_name %s\n"
+			    "snap_id %llu\nsnap_name %s\n"
+			    "overlap %llu\n",
+			    !count ? "" : "\n", /* first? */
+			    spec->pool_id, spec->pool_name,
+			    spec->image_id, spec->image_name ?: "(unknown)",
+			    spec->snap_id, spec->snap_name,
+			    rbd_dev->parent_overlap);
+	}
 
-	return (ssize_t) (bufp - buf);
+	return count;
 }
 
 static ssize_t rbd_image_refresh(struct device *dev,
@@ -3748,9 +3775,9 @@ static ssize_t rbd_image_refresh(struct device *dev,
 
 	ret = rbd_dev_refresh(rbd_dev);
 	if (ret)
-		rbd_warn(rbd_dev, ": manual header refresh error (%d)\n", ret);
+		return ret;
 
-	return ret < 0 ? ret : size;
+	return size;
 }
 
 static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
@@ -3822,6 +3849,9 @@ static struct rbd_spec *rbd_spec_alloc(void)
 	spec = kzalloc(sizeof (*spec), GFP_KERNEL);
 	if (!spec)
 		return NULL;
+
+	spec->pool_id = CEPH_NOPOOL;
+	spec->snap_id = CEPH_NOSNAP;
 	kref_init(&spec->kref);
 
 	return spec;
@@ -3848,6 +3878,8 @@ static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
 		return NULL;
 
 	spin_lock_init(&rbd_dev->lock);
+	INIT_LIST_HEAD(&rbd_dev->rq_queue);
+	INIT_WORK(&rbd_dev->rq_work, rbd_request_workfn);
 	rbd_dev->flags = 0;
 	atomic_set(&rbd_dev->parent_ref, 0);
 	INIT_LIST_HEAD(&rbd_dev->node);
@@ -4021,7 +4053,7 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
 		goto out_err;
 	}
 
-	snapid = cpu_to_le64(CEPH_NOSNAP);
+	snapid = cpu_to_le64(rbd_dev->spec->snap_id);
 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
 				"rbd", "get_parent",
 				&snapid, sizeof (snapid),
@@ -4059,7 +4091,7 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
 
 	ret = -EIO;
 	if (pool_id > (u64)U32_MAX) {
-		rbd_warn(NULL, "parent pool id too large (%llu > %u)\n",
+		rbd_warn(NULL, "parent pool id too large (%llu > %u)",
 			(unsigned long long)pool_id, U32_MAX);
 		goto out_err;
 	}
@@ -4083,6 +4115,8 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
 		parent_spec->snap_id = snap_id;
 		rbd_dev->parent_spec = parent_spec;
 		parent_spec = NULL;	/* rbd_dev now owns this */
+	} else {
+		kfree(image_id);
 	}
 
 	/*
@@ -4110,8 +4144,7 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
 			 * overlap is zero we just pretend there was
 			 * no parent image.
 			 */
-			rbd_warn(rbd_dev, "ignoring parent of "
-						"clone with overlap 0\n");
+			rbd_warn(rbd_dev, "ignoring parent with overlap 0");
 		}
 	}
 out:
@@ -4279,18 +4312,38 @@ static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
 }
 
 /*
- * When an rbd image has a parent image, it is identified by the
- * pool, image, and snapshot ids (not names).  This function fills
- * in the names for those ids.  (It's OK if we can't figure out the
- * name for an image id, but the pool and snapshot ids should always
- * exist and have names.)  All names in an rbd spec are dynamically
- * allocated.
+ * An image being mapped will have everything but the snap id.
+ */
+static int rbd_spec_fill_snap_id(struct rbd_device *rbd_dev)
+{
+	struct rbd_spec *spec = rbd_dev->spec;
+
+	rbd_assert(spec->pool_id != CEPH_NOPOOL && spec->pool_name);
+	rbd_assert(spec->image_id && spec->image_name);
+	rbd_assert(spec->snap_name);
+
+	if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
+		u64 snap_id;
+
+		snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name);
+		if (snap_id == CEPH_NOSNAP)
+			return -ENOENT;
+
+		spec->snap_id = snap_id;
+	} else {
+		spec->snap_id = CEPH_NOSNAP;
+	}
+
+	return 0;
+}
+
+/*
+ * A parent image will have all ids but none of the names.
  *
- * When an image being mapped (not a parent) is probed, we have the
- * pool name and pool id, image name and image id, and the snapshot
- * name.  The only thing we're missing is the snapshot id.
+ * All names in an rbd spec are dynamically allocated.  It's OK if we
+ * can't figure out the name for an image id.
  */
-static int rbd_dev_spec_update(struct rbd_device *rbd_dev)
+static int rbd_spec_fill_names(struct rbd_device *rbd_dev)
 {
 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
 	struct rbd_spec *spec = rbd_dev->spec;
@@ -4299,24 +4352,9 @@ static int rbd_dev_spec_update(struct rbd_device *rbd_dev)
 	const char *snap_name;
 	int ret;
 
-	/*
-	 * An image being mapped will have the pool name (etc.), but
-	 * we need to look up the snapshot id.
-	 */
-	if (spec->pool_name) {
-		if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
-			u64 snap_id;
-
-			snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name);
-			if (snap_id == CEPH_NOSNAP)
-				return -ENOENT;
-			spec->snap_id = snap_id;
-		} else {
-			spec->snap_id = CEPH_NOSNAP;
-		}
-
-		return 0;
-	}
+	rbd_assert(spec->pool_id != CEPH_NOPOOL);
+	rbd_assert(spec->image_id);
+	rbd_assert(spec->snap_id != CEPH_NOSNAP);
 
 	/* Get the pool name; we have to make our own copy of this */
 
@@ -4335,7 +4373,7 @@ static int rbd_dev_spec_update(struct rbd_device *rbd_dev)
 	if (!image_name)
 		rbd_warn(rbd_dev, "unable to get image name");
 
-	/* Look up the snapshot name, and make a copy */
+	/* Fetch the snapshot name */
 
 	snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
 	if (IS_ERR(snap_name)) {
@@ -4348,10 +4386,10 @@ static int rbd_dev_spec_update(struct rbd_device *rbd_dev)
 	spec->snap_name = snap_name;
 
 	return 0;
+
 out_err:
 	kfree(image_name);
 	kfree(pool_name);
-
 	return ret;
 }
 
@@ -4483,43 +4521,22 @@ static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev)
 			return ret;
 	}
 
-	/*
-	 * If the image supports layering, get the parent info.  We
-	 * need to probe the first time regardless.  Thereafter we
-	 * only need to if there's a parent, to see if it has
-	 * disappeared due to the mapped image getting flattened.
-	 */
-	if (rbd_dev->header.features & RBD_FEATURE_LAYERING &&
-			(first_time || rbd_dev->parent_spec)) {
-		bool warn;
-
-		ret = rbd_dev_v2_parent_info(rbd_dev);
-		if (ret)
-			return ret;
-
-		/*
-		 * Print a warning if this is the initial probe and
-		 * the image has a parent.  Don't print it if the
-		 * image now being probed is itself a parent.  We
-		 * can tell at this point because we won't know its
-		 * pool name yet (just its pool id).
-		 */
-		warn = rbd_dev->parent_spec && rbd_dev->spec->pool_name;
-		if (first_time && warn)
-			rbd_warn(rbd_dev, "WARNING: kernel layering "
-					"is EXPERIMENTAL!");
-	}
-
-	if (rbd_dev->spec->snap_id == CEPH_NOSNAP)
-		if (rbd_dev->mapping.size != rbd_dev->header.image_size)
-			rbd_dev->mapping.size = rbd_dev->header.image_size;
-
 	ret = rbd_dev_v2_snap_context(rbd_dev);
 	dout("rbd_dev_v2_snap_context returned %d\n", ret);
 
 	return ret;
 }
 
+static int rbd_dev_header_info(struct rbd_device *rbd_dev)
+{
+	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
+
+	if (rbd_dev->image_format == 1)
+		return rbd_dev_v1_header_info(rbd_dev);
+
+	return rbd_dev_v2_header_info(rbd_dev);
+}
+
 static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
 {
 	struct device *dev;
@@ -5066,12 +5083,17 @@ static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
 	ret = rbd_dev_mapping_set(rbd_dev);
 	if (ret)
 		goto err_out_disk;
+
 	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
 	set_disk_ro(rbd_dev->disk, rbd_dev->mapping.read_only);
 
+	rbd_dev->rq_wq = alloc_workqueue(rbd_dev->disk->disk_name, 0, 0);
+	if (!rbd_dev->rq_wq)
+		goto err_out_mapping;
+
 	ret = rbd_bus_add_dev(rbd_dev);
 	if (ret)
-		goto err_out_mapping;
+		goto err_out_workqueue;
 
 	/* Everything's ready.  Announce the disk to the world. */
 
@@ -5083,6 +5105,9 @@ static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
 
 	return ret;
 
+err_out_workqueue:
+	destroy_workqueue(rbd_dev->rq_wq);
+	rbd_dev->rq_wq = NULL;
 err_out_mapping:
 	rbd_dev_mapping_clear(rbd_dev);
 err_out_disk:
@@ -5155,8 +5180,6 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping)
 	ret = rbd_dev_image_id(rbd_dev);
 	if (ret)
 		return ret;
-	rbd_assert(rbd_dev->spec->image_id);
-	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
 
 	ret = rbd_dev_header_name(rbd_dev);
 	if (ret)
@@ -5168,25 +5191,45 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping)
 			goto out_header_name;
 	}
 
-	if (rbd_dev->image_format == 1)
-		ret = rbd_dev_v1_header_info(rbd_dev);
-	else
-		ret = rbd_dev_v2_header_info(rbd_dev);
+	ret = rbd_dev_header_info(rbd_dev);
 	if (ret)
 		goto err_out_watch;
 
-	ret = rbd_dev_spec_update(rbd_dev);
+	/*
+	 * If this image is the one being mapped, we have pool name and
+	 * id, image name and id, and snap name - need to fill snap id.
+	 * Otherwise this is a parent image, identified by pool, image
+	 * and snap ids - need to fill in names for those ids.
+	 */
+	if (mapping)
+		ret = rbd_spec_fill_snap_id(rbd_dev);
+	else
+		ret = rbd_spec_fill_names(rbd_dev);
 	if (ret)
 		goto err_out_probe;
 
+	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
+		ret = rbd_dev_v2_parent_info(rbd_dev);
+		if (ret)
+			goto err_out_probe;
+
+		/*
+		 * Need to warn users if this image is the one being
+		 * mapped and has a parent.
+		 */
+		if (mapping && rbd_dev->parent_spec)
+			rbd_warn(rbd_dev,
+				 "WARNING: kernel layering is EXPERIMENTAL!");
+	}
+
 	ret = rbd_dev_probe_parent(rbd_dev);
 	if (ret)
 		goto err_out_probe;
 
 	dout("discovered format %u image, header name is %s\n",
 		rbd_dev->image_format, rbd_dev->header_name);
-
 	return 0;
+
 err_out_probe:
 	rbd_dev_unprobe(rbd_dev);
 err_out_watch:
@@ -5199,9 +5242,6 @@ err_out_format:
 	rbd_dev->image_format = 0;
 	kfree(rbd_dev->spec->image_id);
 	rbd_dev->spec->image_id = NULL;
-
-	dout("probe failed, returning %d\n", ret);
-
 	return ret;
 }
 
@@ -5243,7 +5283,7 @@ static ssize_t do_rbd_add(struct bus_type *bus,
 	/* The ceph file layout needs to fit pool id in 32 bits */
 
 	if (spec->pool_id > (u64)U32_MAX) {
-		rbd_warn(NULL, "pool id too large (%llu > %u)\n",
+		rbd_warn(NULL, "pool id too large (%llu > %u)",
 				(unsigned long long)spec->pool_id, U32_MAX);
 		rc = -EIO;
 		goto err_out_client;
@@ -5314,6 +5354,7 @@ static void rbd_dev_device_release(struct device *dev)
 {
 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
 
+	destroy_workqueue(rbd_dev->rq_wq);
 	rbd_free_disk(rbd_dev);
 	clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
 	rbd_dev_mapping_clear(rbd_dev);
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index f63d358f3d93..0a581400de0f 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -15,17 +15,22 @@
 #include <linux/numa.h>
 
 #define PART_BITS 4
+#define VQ_NAME_LEN 16
 
 static int major;
 static DEFINE_IDA(vd_index_ida);
 
 static struct workqueue_struct *virtblk_wq;
 
+struct virtio_blk_vq {
+	struct virtqueue *vq;
+	spinlock_t lock;
+	char name[VQ_NAME_LEN];
+} ____cacheline_aligned_in_smp;
+
 struct virtio_blk
 {
 	struct virtio_device *vdev;
-	struct virtqueue *vq;
-	spinlock_t vq_lock;
 
 	/* The disk structure for the kernel. */
 	struct gendisk *disk;
@@ -47,6 +52,10 @@ struct virtio_blk
 
 	/* Ida index - used to track minor number allocations. */
 	int index;
+
+	/* num of vqs */
+	int num_vqs;
+	struct virtio_blk_vq *vqs;
 };
 
 struct virtblk_req
@@ -133,14 +142,15 @@ static void virtblk_done(struct virtqueue *vq)
 {
 	struct virtio_blk *vblk = vq->vdev->priv;
 	bool req_done = false;
+	int qid = vq->index;
 	struct virtblk_req *vbr;
 	unsigned long flags;
 	unsigned int len;
 
-	spin_lock_irqsave(&vblk->vq_lock, flags);
+	spin_lock_irqsave(&vblk->vqs[qid].lock, flags);
 	do {
 		virtqueue_disable_cb(vq);
-		while ((vbr = virtqueue_get_buf(vblk->vq, &len)) != NULL) {
+		while ((vbr = virtqueue_get_buf(vblk->vqs[qid].vq, &len)) != NULL) {
 			blk_mq_complete_request(vbr->req);
 			req_done = true;
 		}
@@ -151,7 +161,7 @@ static void virtblk_done(struct virtqueue *vq)
 	/* In case queue is stopped waiting for more buffers. */
 	if (req_done)
 		blk_mq_start_stopped_hw_queues(vblk->disk->queue, true);
-	spin_unlock_irqrestore(&vblk->vq_lock, flags);
+	spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags);
 }
 
 static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req)
@@ -160,6 +170,7 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req)
 	struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
 	unsigned long flags;
 	unsigned int num;
+	int qid = hctx->queue_num;
 	const bool last = (req->cmd_flags & REQ_END) != 0;
 	int err;
 	bool notify = false;
@@ -202,12 +213,12 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req)
 			vbr->out_hdr.type |= VIRTIO_BLK_T_IN;
 	}
 
-	spin_lock_irqsave(&vblk->vq_lock, flags);
-	err = __virtblk_add_req(vblk->vq, vbr, vbr->sg, num);
+	spin_lock_irqsave(&vblk->vqs[qid].lock, flags);
+	err = __virtblk_add_req(vblk->vqs[qid].vq, vbr, vbr->sg, num);
 	if (err) {
-		virtqueue_kick(vblk->vq);
+		virtqueue_kick(vblk->vqs[qid].vq);
 		blk_mq_stop_hw_queue(hctx);
-		spin_unlock_irqrestore(&vblk->vq_lock, flags);
+		spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags);
 		/* Out of mem doesn't actually happen, since we fall back
 		 * to direct descriptors */
 		if (err == -ENOMEM || err == -ENOSPC)
@@ -215,12 +226,12 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req)
 		return BLK_MQ_RQ_QUEUE_ERROR;
 	}
 
-	if (last && virtqueue_kick_prepare(vblk->vq))
+	if (last && virtqueue_kick_prepare(vblk->vqs[qid].vq))
 		notify = true;
-	spin_unlock_irqrestore(&vblk->vq_lock, flags);
+	spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags);
 
 	if (notify)
-		virtqueue_notify(vblk->vq);
+		virtqueue_notify(vblk->vqs[qid].vq);
 	return BLK_MQ_RQ_QUEUE_OK;
 }
 
@@ -377,12 +388,64 @@ static void virtblk_config_changed(struct virtio_device *vdev)
 static int init_vq(struct virtio_blk *vblk)
 {
 	int err = 0;
+	int i;
+	vq_callback_t **callbacks;
+	const char **names;
+	struct virtqueue **vqs;
+	unsigned short num_vqs;
+	struct virtio_device *vdev = vblk->vdev;
+
+	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_MQ,
+				   struct virtio_blk_config, num_queues,
+				   &num_vqs);
+	if (err)
+		num_vqs = 1;
+
+	vblk->vqs = kmalloc(sizeof(*vblk->vqs) * num_vqs, GFP_KERNEL);
+	if (!vblk->vqs) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	names = kmalloc(sizeof(*names) * num_vqs, GFP_KERNEL);
+	if (!names)
+		goto err_names;
+
+	callbacks = kmalloc(sizeof(*callbacks) * num_vqs, GFP_KERNEL);
+	if (!callbacks)
+		goto err_callbacks;
+
+	vqs = kmalloc(sizeof(*vqs) * num_vqs, GFP_KERNEL);
+	if (!vqs)
+		goto err_vqs;
 
-	/* We expect one virtqueue, for output. */
-	vblk->vq = virtio_find_single_vq(vblk->vdev, virtblk_done, "requests");
-	if (IS_ERR(vblk->vq))
-		err = PTR_ERR(vblk->vq);
+	for (i = 0; i < num_vqs; i++) {
+		callbacks[i] = virtblk_done;
+		snprintf(vblk->vqs[i].name, VQ_NAME_LEN, "req.%d", i);
+		names[i] = vblk->vqs[i].name;
+	}
+
+	/* Discover virtqueues and write information to configuration.  */
+	err = vdev->config->find_vqs(vdev, num_vqs, vqs, callbacks, names);
+	if (err)
+		goto err_find_vqs;
 
+	for (i = 0; i < num_vqs; i++) {
+		spin_lock_init(&vblk->vqs[i].lock);
+		vblk->vqs[i].vq = vqs[i];
+	}
+	vblk->num_vqs = num_vqs;
+
+ err_find_vqs:
+	kfree(vqs);
+ err_vqs:
+	kfree(callbacks);
+ err_callbacks:
+	kfree(names);
+ err_names:
+	if (err)
+		kfree(vblk->vqs);
+ out:
 	return err;
 }
 
@@ -551,7 +614,6 @@ static int virtblk_probe(struct virtio_device *vdev)
 	err = init_vq(vblk);
 	if (err)
 		goto out_free_vblk;
-	spin_lock_init(&vblk->vq_lock);
 
 	/* FIXME: How many partitions?  How long is a piece of string? */
 	vblk->disk = alloc_disk(1 << PART_BITS);
@@ -562,7 +624,7 @@ static int virtblk_probe(struct virtio_device *vdev)
 
 	/* Default queue sizing is to fill the ring. */
 	if (!virtblk_queue_depth) {
-		virtblk_queue_depth = vblk->vq->num_free;
+		virtblk_queue_depth = vblk->vqs[0].vq->num_free;
 		/* ... but without indirect descs, we use 2 descs per req */
 		if (!virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC))
 			virtblk_queue_depth /= 2;
@@ -570,7 +632,6 @@ static int virtblk_probe(struct virtio_device *vdev)
 
 	memset(&vblk->tag_set, 0, sizeof(vblk->tag_set));
 	vblk->tag_set.ops = &virtio_mq_ops;
-	vblk->tag_set.nr_hw_queues = 1;
 	vblk->tag_set.queue_depth = virtblk_queue_depth;
 	vblk->tag_set.numa_node = NUMA_NO_NODE;
 	vblk->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
@@ -578,6 +639,7 @@ static int virtblk_probe(struct virtio_device *vdev)
 		sizeof(struct virtblk_req) +
 		sizeof(struct scatterlist) * sg_elems;
 	vblk->tag_set.driver_data = vblk;
+	vblk->tag_set.nr_hw_queues = vblk->num_vqs;
 
 	err = blk_mq_alloc_tag_set(&vblk->tag_set);
 	if (err)
@@ -727,6 +789,7 @@ static void virtblk_remove(struct virtio_device *vdev)
 	refc = atomic_read(&disk_to_dev(vblk->disk)->kobj.kref.refcount);
 	put_disk(vblk->disk);
 	vdev->config->del_vqs(vdev);
+	kfree(vblk->vqs);
 	kfree(vblk);
 
 	/* Only free device id if we don't have any users */
@@ -777,7 +840,8 @@ static const struct virtio_device_id id_table[] = {
 static unsigned int features[] = {
 	VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, VIRTIO_BLK_F_GEOMETRY,
 	VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE, VIRTIO_BLK_F_SCSI,
-	VIRTIO_BLK_F_WCE, VIRTIO_BLK_F_TOPOLOGY, VIRTIO_BLK_F_CONFIG_WCE
+	VIRTIO_BLK_F_WCE, VIRTIO_BLK_F_TOPOLOGY, VIRTIO_BLK_F_CONFIG_WCE,
+	VIRTIO_BLK_F_MQ,
 };
 
 static struct virtio_driver virtio_blk = {
diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
index 8f6afbf9ba54..9b1ea0ef59af 100644
--- a/drivers/dma/Kconfig
+++ b/drivers/dma/Kconfig
@@ -393,6 +393,22 @@ config XILINX_VDMA
 	  channels, Memory Mapped to Stream (MM2S) and Stream to
 	  Memory Mapped (S2MM) for the data transfers.
 
+config DMA_SUN6I
+	tristate "Allwinner A31 SoCs DMA support"
+	depends on MACH_SUN6I || COMPILE_TEST
+	depends on RESET_CONTROLLER
+	select DMA_ENGINE
+	select DMA_VIRTUAL_CHANNELS
+	help
+	  Support for the DMA engine for Allwinner A31 SoCs.
+
+config NBPFAXI_DMA
+	tristate "Renesas Type-AXI NBPF DMA support"
+	select DMA_ENGINE
+	depends on ARM || COMPILE_TEST
+	help
+	  Support for "Type-AXI" NBPF DMA IPs from Renesas
+
 config DMA_ENGINE
 	bool
 
@@ -406,6 +422,7 @@ config DMA_ACPI
 config DMA_OF
 	def_bool y
 	depends on OF
+	select DMA_ENGINE
 
 comment "DMA Clients"
 	depends on DMA_ENGINE
diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile
index bd9e7fa928bd..c6adb925f0b9 100644
--- a/drivers/dma/Makefile
+++ b/drivers/dma/Makefile
@@ -1,5 +1,5 @@
-ccflags-$(CONFIG_DMADEVICES_DEBUG)  := -DDEBUG
-ccflags-$(CONFIG_DMADEVICES_VDEBUG) += -DVERBOSE_DEBUG
+subdir-ccflags-$(CONFIG_DMADEVICES_DEBUG)  := -DDEBUG
+subdir-ccflags-$(CONFIG_DMADEVICES_VDEBUG) += -DVERBOSE_DEBUG
 
 obj-$(CONFIG_DMA_ENGINE) += dmaengine.o
 obj-$(CONFIG_DMA_VIRTUAL_CHANNELS) += virt-dma.o
@@ -48,3 +48,5 @@ obj-$(CONFIG_FSL_EDMA) += fsl-edma.o
 obj-$(CONFIG_QCOM_BAM_DMA) += qcom_bam_dma.o
 obj-y += xilinx/
 obj-$(CONFIG_INTEL_MIC_X100_DMA) += mic_x100_dma.o
+obj-$(CONFIG_NBPFAXI_DMA) += nbpfaxi.o
+obj-$(CONFIG_DMA_SUN6I) += sun6i-dma.o
diff --git a/drivers/dma/TODO b/drivers/dma/TODO
index 734ed0206cd5..b8045cd42ee1 100644
--- a/drivers/dma/TODO
+++ b/drivers/dma/TODO
@@ -7,7 +7,6 @@ TODO for slave dma
 	- imx-dma
 	- imx-sdma
 	- mxs-dma.c
-	- dw_dmac
 	- intel_mid_dma
 4. Check other subsystems for dma drivers and merge/move to dmaengine
 5. Remove dma_slave_config's dma direction.
diff --git a/drivers/dma/amba-pl08x.c b/drivers/dma/amba-pl08x.c
index 8114731a1c62..e34024b000a4 100644
--- a/drivers/dma/amba-pl08x.c
+++ b/drivers/dma/amba-pl08x.c
@@ -1040,7 +1040,7 @@ static int pl08x_fill_llis_for_desc(struct pl08x_driver_data *pl08x,
 
 		if (early_bytes) {
 			dev_vdbg(&pl08x->adev->dev,
-				"%s byte width LLIs (remain 0x%08x)\n",
+				"%s byte width LLIs (remain 0x%08zx)\n",
 				__func__, bd.remainder);
 			prep_byte_width_lli(pl08x, &bd, &cctl, early_bytes,
 				num_llis++, &total_bytes);
@@ -1653,7 +1653,7 @@ static struct dma_async_tx_descriptor *pl08x_prep_slave_sg(
 static struct dma_async_tx_descriptor *pl08x_prep_dma_cyclic(
 		struct dma_chan *chan, dma_addr_t buf_addr, size_t buf_len,
 		size_t period_len, enum dma_transfer_direction direction,
-		unsigned long flags, void *context)
+		unsigned long flags)
 {
 	struct pl08x_dma_chan *plchan = to_pl08x_chan(chan);
 	struct pl08x_driver_data *pl08x = plchan->host;
@@ -1662,7 +1662,7 @@ static struct dma_async_tx_descriptor *pl08x_prep_dma_cyclic(
 	dma_addr_t slave_addr;
 
 	dev_dbg(&pl08x->adev->dev,
-		"%s prepare cyclic transaction of %d/%d bytes %s %s\n",
+		"%s prepare cyclic transaction of %zd/%zd bytes %s %s\n",
 		__func__, period_len, buf_len,
 		direction == DMA_MEM_TO_DEV ? "to" : "from",
 		plchan->name);
diff --git a/drivers/dma/at_hdmac.c b/drivers/dma/at_hdmac.c
index c13a3bb0f594..ca9dd2613283 100644
--- a/drivers/dma/at_hdmac.c
+++ b/drivers/dma/at_hdmac.c
@@ -294,14 +294,16 @@ static int atc_get_bytes_left(struct dma_chan *chan)
 			ret = -EINVAL;
 			goto out;
 		}
-		atchan->remain_desc -= (desc_cur->lli.ctrla & ATC_BTSIZE_MAX)
-						<< (desc_first->tx_width);
-		if (atchan->remain_desc < 0) {
+
+		count = (desc_cur->lli.ctrla & ATC_BTSIZE_MAX)
+			<< desc_first->tx_width;
+		if (atchan->remain_desc < count) {
 			ret = -EINVAL;
 			goto out;
-		} else {
-			ret = atchan->remain_desc;
 		}
+
+		atchan->remain_desc -= count;
+		ret = atchan->remain_desc;
 	} else {
 		/*
 		 * Get residual bytes when current
@@ -893,12 +895,11 @@ atc_dma_cyclic_fill_desc(struct dma_chan *chan, struct at_desc *desc,
  * @period_len: number of bytes for each period
  * @direction: transfer direction, to or from device
  * @flags: tx descriptor status flags
- * @context: transfer context (ignored)
  */
 static struct dma_async_tx_descriptor *
 atc_prep_dma_cyclic(struct dma_chan *chan, dma_addr_t buf_addr, size_t buf_len,
 		size_t period_len, enum dma_transfer_direction direction,
-		unsigned long flags, void *context)
+		unsigned long flags)
 {
 	struct at_dma_chan	*atchan = to_at_dma_chan(chan);
 	struct at_dma_slave	*atslave = chan->private;
diff --git a/drivers/dma/bcm2835-dma.c b/drivers/dma/bcm2835-dma.c
index a03602164e3e..68007974961a 100644
--- a/drivers/dma/bcm2835-dma.c
+++ b/drivers/dma/bcm2835-dma.c
@@ -335,7 +335,7 @@ static void bcm2835_dma_issue_pending(struct dma_chan *chan)
 static struct dma_async_tx_descriptor *bcm2835_dma_prep_dma_cyclic(
 	struct dma_chan *chan, dma_addr_t buf_addr, size_t buf_len,
 	size_t period_len, enum dma_transfer_direction direction,
-	unsigned long flags, void *context)
+	unsigned long flags)
 {
 	struct bcm2835_chan *c = to_bcm2835_dma_chan(chan);
 	enum dma_slave_buswidth dev_width;
diff --git a/drivers/dma/dma-jz4740.c b/drivers/dma/dma-jz4740.c
index 94c380f07538..6a9d89c93b1f 100644
--- a/drivers/dma/dma-jz4740.c
+++ b/drivers/dma/dma-jz4740.c
@@ -433,7 +433,7 @@ static struct dma_async_tx_descriptor *jz4740_dma_prep_slave_sg(
 static struct dma_async_tx_descriptor *jz4740_dma_prep_dma_cyclic(
 	struct dma_chan *c, dma_addr_t buf_addr, size_t buf_len,
 	size_t period_len, enum dma_transfer_direction direction,
-	unsigned long flags, void *context)
+	unsigned long flags)
 {
 	struct jz4740_dmaengine_chan *chan = to_jz4740_dma_chan(c);
 	struct jz4740_dma_desc *desc;
@@ -614,4 +614,4 @@ module_platform_driver(jz4740_dma_driver);
 
 MODULE_AUTHOR("Lars-Peter Clausen <lars@metafoo.de>");
 MODULE_DESCRIPTION("JZ4740 DMA driver");
-MODULE_LICENSE("GPLv2");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/dma/dw/core.c b/drivers/dma/dw/core.c
index a27ded53ab4f..1af731b83b3f 100644
--- a/drivers/dma/dw/core.c
+++ b/drivers/dma/dw/core.c
@@ -279,6 +279,19 @@ static void dwc_dostart(struct dw_dma_chan *dwc, struct dw_desc *first)
 	channel_set_bit(dw, CH_EN, dwc->mask);
 }
 
+static void dwc_dostart_first_queued(struct dw_dma_chan *dwc)
+{
+	struct dw_desc *desc;
+
+	if (list_empty(&dwc->queue))
+		return;
+
+	list_move(dwc->queue.next, &dwc->active_list);
+	desc = dwc_first_active(dwc);
+	dev_vdbg(chan2dev(&dwc->chan), "%s: started %u\n", __func__, desc->txd.cookie);
+	dwc_dostart(dwc, desc);
+}
+
 /*----------------------------------------------------------------------*/
 
 static void
@@ -335,10 +348,7 @@ static void dwc_complete_all(struct dw_dma *dw, struct dw_dma_chan *dwc)
 	 * the completed ones.
 	 */
 	list_splice_init(&dwc->active_list, &list);
-	if (!list_empty(&dwc->queue)) {
-		list_move(dwc->queue.next, &dwc->active_list);
-		dwc_dostart(dwc, dwc_first_active(dwc));
-	}
+	dwc_dostart_first_queued(dwc);
 
 	spin_unlock_irqrestore(&dwc->lock, flags);
 
@@ -467,10 +477,7 @@ static void dwc_scan_descriptors(struct dw_dma *dw, struct dw_dma_chan *dwc)
 	/* Try to continue after resetting the channel... */
 	dwc_chan_disable(dw, dwc);
 
-	if (!list_empty(&dwc->queue)) {
-		list_move(dwc->queue.next, &dwc->active_list);
-		dwc_dostart(dwc, dwc_first_active(dwc));
-	}
+	dwc_dostart_first_queued(dwc);
 	spin_unlock_irqrestore(&dwc->lock, flags);
 }
 
@@ -677,17 +684,9 @@ static dma_cookie_t dwc_tx_submit(struct dma_async_tx_descriptor *tx)
 	 * possible, perhaps even appending to those already submitted
 	 * for DMA. But this is hard to do in a race-free manner.
 	 */
-	if (list_empty(&dwc->active_list)) {
-		dev_vdbg(chan2dev(tx->chan), "%s: started %u\n", __func__,
-				desc->txd.cookie);
-		list_add_tail(&desc->desc_node, &dwc->active_list);
-		dwc_dostart(dwc, dwc_first_active(dwc));
-	} else {
-		dev_vdbg(chan2dev(tx->chan), "%s: queued %u\n", __func__,
-				desc->txd.cookie);
 
-		list_add_tail(&desc->desc_node, &dwc->queue);
-	}
+	dev_vdbg(chan2dev(tx->chan), "%s: queued %u\n", __func__, desc->txd.cookie);
+	list_add_tail(&desc->desc_node, &dwc->queue);
 
 	spin_unlock_irqrestore(&dwc->lock, flags);
 
@@ -1092,9 +1091,12 @@ dwc_tx_status(struct dma_chan *chan,
 static void dwc_issue_pending(struct dma_chan *chan)
 {
 	struct dw_dma_chan	*dwc = to_dw_dma_chan(chan);
+	unsigned long		flags;
 
-	if (!list_empty(&dwc->queue))
-		dwc_scan_descriptors(to_dw_dma(chan->device), dwc);
+	spin_lock_irqsave(&dwc->lock, flags);
+	if (list_empty(&dwc->active_list))
+		dwc_dostart_first_queued(dwc);
+	spin_unlock_irqrestore(&dwc->lock, flags);
 }
 
 static int dwc_alloc_chan_resources(struct dma_chan *chan)
diff --git a/drivers/dma/edma.c b/drivers/dma/edma.c
index b512caf46944..7b65633f495e 100644
--- a/drivers/dma/edma.c
+++ b/drivers/dma/edma.c
@@ -23,6 +23,7 @@
 #include <linux/platform_device.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
+#include <linux/of.h>
 
 #include <linux/platform_data/edma.h>
 
@@ -256,8 +257,13 @@ static int edma_terminate_all(struct edma_chan *echan)
 	 * echan->edesc is NULL and exit.)
 	 */
 	if (echan->edesc) {
+		int cyclic = echan->edesc->cyclic;
 		echan->edesc = NULL;
 		edma_stop(echan->ch_num);
+		/* Move the cyclic channel back to default queue */
+		if (cyclic)
+			edma_assign_channel_eventq(echan->ch_num,
+						   EVENTQ_DEFAULT);
 	}
 
 	vchan_get_all_descriptors(&echan->vchan, &head);
@@ -592,7 +598,7 @@ struct dma_async_tx_descriptor *edma_prep_dma_memcpy(
 static struct dma_async_tx_descriptor *edma_prep_dma_cyclic(
 	struct dma_chan *chan, dma_addr_t buf_addr, size_t buf_len,
 	size_t period_len, enum dma_transfer_direction direction,
-	unsigned long tx_flags, void *context)
+	unsigned long tx_flags)
 {
 	struct edma_chan *echan = to_edma_chan(chan);
 	struct device *dev = chan->device->dev;
@@ -718,12 +724,15 @@ static struct dma_async_tx_descriptor *edma_prep_dma_cyclic(
 		edesc->absync = ret;
 
 		/*
-		 * Enable interrupts for every period because callback
-		 * has to be called for every period.
+		 * Enable period interrupt only if it is requested
 		 */
-		edesc->pset[i].param.opt |= TCINTEN;
+		if (tx_flags & DMA_PREP_INTERRUPT)
+			edesc->pset[i].param.opt |= TCINTEN;
 	}
 
+	/* Place the cyclic channel to highest priority queue */
+	edma_assign_channel_eventq(echan->ch_num, EVENTQ_0);
+
 	return vchan_tx_prep(&echan->vchan, &edesc->vdesc, tx_flags);
 }
 
@@ -993,7 +1002,7 @@ static int edma_dma_device_slave_caps(struct dma_chan *dchan,
 	caps->directions = BIT(DMA_DEV_TO_MEM) | BIT(DMA_MEM_TO_DEV);
 	caps->cmd_pause = true;
 	caps->cmd_terminate = true;
-	caps->residue_granularity = DMA_RESIDUE_GRANULARITY_DESCRIPTOR;
+	caps->residue_granularity = DMA_RESIDUE_GRANULARITY_BURST;
 
 	return 0;
 }
@@ -1040,7 +1049,7 @@ static int edma_probe(struct platform_device *pdev)
 	ecc->dummy_slot = edma_alloc_slot(ecc->ctlr, EDMA_SLOT_ANY);
 	if (ecc->dummy_slot < 0) {
 		dev_err(&pdev->dev, "Can't allocate PaRAM dummy slot\n");
-		return -EIO;
+		return ecc->dummy_slot;
 	}
 
 	dma_cap_zero(ecc->dma_slave.cap_mask);
@@ -1125,7 +1134,7 @@ static int edma_init(void)
 		}
 	}
 
-	if (EDMA_CTLRS == 2) {
+	if (!of_have_populated_dt() && EDMA_CTLRS == 2) {
 		pdev1 = platform_device_register_full(&edma_dev_info1);
 		if (IS_ERR(pdev1)) {
 			platform_driver_unregister(&edma_driver);
diff --git a/drivers/dma/ep93xx_dma.c b/drivers/dma/ep93xx_dma.c
index cb4bf682a708..7650470196c4 100644
--- a/drivers/dma/ep93xx_dma.c
+++ b/drivers/dma/ep93xx_dma.c
@@ -1092,7 +1092,6 @@ fail:
  * @period_len: length of a single period
  * @dir: direction of the operation
  * @flags: tx descriptor status flags
- * @context: operation context (ignored)
  *
  * Prepares a descriptor for cyclic DMA operation. This means that once the
  * descriptor is submitted, we will be submitting in a @period_len sized
@@ -1105,8 +1104,7 @@ fail:
 static struct dma_async_tx_descriptor *
 ep93xx_dma_prep_dma_cyclic(struct dma_chan *chan, dma_addr_t dma_addr,
 			   size_t buf_len, size_t period_len,
-			   enum dma_transfer_direction dir, unsigned long flags,
-			   void *context)
+			   enum dma_transfer_direction dir, unsigned long flags)
 {
 	struct ep93xx_dma_chan *edmac = to_ep93xx_dma_chan(chan);
 	struct ep93xx_dma_desc *desc, *first;
diff --git a/drivers/dma/fsl-edma.c b/drivers/dma/fsl-edma.c
index b396a7fb53ab..3c5711d5fe97 100644
--- a/drivers/dma/fsl-edma.c
+++ b/drivers/dma/fsl-edma.c
@@ -248,11 +248,12 @@ static void fsl_edma_chan_mux(struct fsl_edma_chan *fsl_chan,
 			unsigned int slot, bool enable)
 {
 	u32 ch = fsl_chan->vchan.chan.chan_id;
-	void __iomem *muxaddr = fsl_chan->edma->muxbase[ch / DMAMUX_NR];
+	void __iomem *muxaddr;
 	unsigned chans_per_mux, ch_off;
 
 	chans_per_mux = fsl_chan->edma->n_chans / DMAMUX_NR;
 	ch_off = fsl_chan->vchan.chan.chan_id % chans_per_mux;
+	muxaddr = fsl_chan->edma->muxbase[ch / chans_per_mux];
 
 	if (enable)
 		edma_writeb(fsl_chan->edma,
@@ -516,7 +517,7 @@ err:
 static struct dma_async_tx_descriptor *fsl_edma_prep_dma_cyclic(
 		struct dma_chan *chan, dma_addr_t dma_addr, size_t buf_len,
 		size_t period_len, enum dma_transfer_direction direction,
-		unsigned long flags, void *context)
+		unsigned long flags)
 {
 	struct fsl_edma_chan *fsl_chan = to_fsl_edma_chan(chan);
 	struct fsl_edma_desc *fsl_desc;
@@ -724,6 +725,7 @@ static struct dma_chan *fsl_edma_xlate(struct of_phandle_args *dma_spec,
 {
 	struct fsl_edma_engine *fsl_edma = ofdma->of_dma_data;
 	struct dma_chan *chan, *_chan;
+	unsigned long chans_per_mux = fsl_edma->n_chans / DMAMUX_NR;
 
 	if (dma_spec->args_count != 2)
 		return NULL;
@@ -732,7 +734,7 @@ static struct dma_chan *fsl_edma_xlate(struct of_phandle_args *dma_spec,
 	list_for_each_entry_safe(chan, _chan, &fsl_edma->dma_dev.channels, device_node) {
 		if (chan->client_count)
 			continue;
-		if ((chan->chan_id / DMAMUX_NR) == dma_spec->args[0]) {
+		if ((chan->chan_id / chans_per_mux) == dma_spec->args[0]) {
 			chan = dma_get_slave_channel(chan);
 			if (chan) {
 				chan->device->privatecnt++;
diff --git a/drivers/dma/fsldma.c b/drivers/dma/fsldma.c
index e0fec68aed25..d5d6885ab341 100644
--- a/drivers/dma/fsldma.c
+++ b/drivers/dma/fsldma.c
@@ -396,10 +396,17 @@ static dma_cookie_t fsl_dma_tx_submit(struct dma_async_tx_descriptor *tx)
 	struct fsldma_chan *chan = to_fsl_chan(tx->chan);
 	struct fsl_desc_sw *desc = tx_to_fsl_desc(tx);
 	struct fsl_desc_sw *child;
-	unsigned long flags;
 	dma_cookie_t cookie = -EINVAL;
 
-	spin_lock_irqsave(&chan->desc_lock, flags);
+	spin_lock_bh(&chan->desc_lock);
+
+#ifdef CONFIG_PM
+	if (unlikely(chan->pm_state != RUNNING)) {
+		chan_dbg(chan, "cannot submit due to suspend\n");
+		spin_unlock_bh(&chan->desc_lock);
+		return -1;
+	}
+#endif
 
 	/*
 	 * assign cookies to all of the software descriptors
@@ -412,7 +419,7 @@ static dma_cookie_t fsl_dma_tx_submit(struct dma_async_tx_descriptor *tx)
 	/* put this transaction onto the tail of the pending queue */
 	append_ld_queue(chan, desc);
 
-	spin_unlock_irqrestore(&chan->desc_lock, flags);
+	spin_unlock_bh(&chan->desc_lock);
 
 	return cookie;
 }
@@ -459,6 +466,88 @@ static struct fsl_desc_sw *fsl_dma_alloc_descriptor(struct fsldma_chan *chan)
 }
 
 /**
+ * fsldma_clean_completed_descriptor - free all descriptors which
+ * has been completed and acked
+ * @chan: Freescale DMA channel
+ *
+ * This function is used on all completed and acked descriptors.
+ * All descriptors should only be freed in this function.
+ */
+static void fsldma_clean_completed_descriptor(struct fsldma_chan *chan)
+{
+	struct fsl_desc_sw *desc, *_desc;
+
+	/* Run the callback for each descriptor, in order */
+	list_for_each_entry_safe(desc, _desc, &chan->ld_completed, node)
+		if (async_tx_test_ack(&desc->async_tx))
+			fsl_dma_free_descriptor(chan, desc);
+}
+
+/**
+ * fsldma_run_tx_complete_actions - cleanup a single link descriptor
+ * @chan: Freescale DMA channel
+ * @desc: descriptor to cleanup and free
+ * @cookie: Freescale DMA transaction identifier
+ *
+ * This function is used on a descriptor which has been executed by the DMA
+ * controller. It will run any callbacks, submit any dependencies.
+ */
+static dma_cookie_t fsldma_run_tx_complete_actions(struct fsldma_chan *chan,
+		struct fsl_desc_sw *desc, dma_cookie_t cookie)
+{
+	struct dma_async_tx_descriptor *txd = &desc->async_tx;
+	dma_cookie_t ret = cookie;
+
+	BUG_ON(txd->cookie < 0);
+
+	if (txd->cookie > 0) {
+		ret = txd->cookie;
+
+		/* Run the link descriptor callback function */
+		if (txd->callback) {
+			chan_dbg(chan, "LD %p callback\n", desc);
+			txd->callback(txd->callback_param);
+		}
+	}
+
+	/* Run any dependencies */
+	dma_run_dependencies(txd);
+
+	return ret;
+}
+
+/**
+ * fsldma_clean_running_descriptor - move the completed descriptor from
+ * ld_running to ld_completed
+ * @chan: Freescale DMA channel
+ * @desc: the descriptor which is completed
+ *
+ * Free the descriptor directly if acked by async_tx api, or move it to
+ * queue ld_completed.
+ */
+static void fsldma_clean_running_descriptor(struct fsldma_chan *chan,
+		struct fsl_desc_sw *desc)
+{
+	/* Remove from the list of transactions */
+	list_del(&desc->node);
+
+	/*
+	 * the client is allowed to attach dependent operations
+	 * until 'ack' is set
+	 */
+	if (!async_tx_test_ack(&desc->async_tx)) {
+		/*
+		 * Move this descriptor to the list of descriptors which is
+		 * completed, but still awaiting the 'ack' bit to be set.
+		 */
+		list_add_tail(&desc->node, &chan->ld_completed);
+		return;
+	}
+
+	dma_pool_free(chan->desc_pool, desc, desc->async_tx.phys);
+}
+
+/**
  * fsl_chan_xfer_ld_queue - transfer any pending transactions
  * @chan : Freescale DMA channel
  *
@@ -526,31 +615,58 @@ static void fsl_chan_xfer_ld_queue(struct fsldma_chan *chan)
 }
 
 /**
- * fsldma_cleanup_descriptor - cleanup and free a single link descriptor
+ * fsldma_cleanup_descriptors - cleanup link descriptors which are completed
+ * and move them to ld_completed to free until flag 'ack' is set
  * @chan: Freescale DMA channel
- * @desc: descriptor to cleanup and free
  *
- * This function is used on a descriptor which has been executed by the DMA
- * controller. It will run any callbacks, submit any dependencies, and then
- * free the descriptor.
+ * This function is used on descriptors which have been executed by the DMA
+ * controller. It will run any callbacks, submit any dependencies, then
+ * free these descriptors if flag 'ack' is set.
  */
-static void fsldma_cleanup_descriptor(struct fsldma_chan *chan,
-				      struct fsl_desc_sw *desc)
+static void fsldma_cleanup_descriptors(struct fsldma_chan *chan)
 {
-	struct dma_async_tx_descriptor *txd = &desc->async_tx;
+	struct fsl_desc_sw *desc, *_desc;
+	dma_cookie_t cookie = 0;
+	dma_addr_t curr_phys = get_cdar(chan);
+	int seen_current = 0;
+
+	fsldma_clean_completed_descriptor(chan);
+
+	/* Run the callback for each descriptor, in order */
+	list_for_each_entry_safe(desc, _desc, &chan->ld_running, node) {
+		/*
+		 * do not advance past the current descriptor loaded into the
+		 * hardware channel, subsequent descriptors are either in
+		 * process or have not been submitted
+		 */
+		if (seen_current)
+			break;
+
+		/*
+		 * stop the search if we reach the current descriptor and the
+		 * channel is busy
+		 */
+		if (desc->async_tx.phys == curr_phys) {
+			seen_current = 1;
+			if (!dma_is_idle(chan))
+				break;
+		}
+
+		cookie = fsldma_run_tx_complete_actions(chan, desc, cookie);
 
-	/* Run the link descriptor callback function */
-	if (txd->callback) {
-		chan_dbg(chan, "LD %p callback\n", desc);
-		txd->callback(txd->callback_param);
+		fsldma_clean_running_descriptor(chan, desc);
 	}
 
-	/* Run any dependencies */
-	dma_run_dependencies(txd);
+	/*
+	 * Start any pending transactions automatically
+	 *
+	 * In the ideal case, we keep the DMA controller busy while we go
+	 * ahead and free the descriptors below.
+	 */
+	fsl_chan_xfer_ld_queue(chan);
 
-	dma_descriptor_unmap(txd);
-	chan_dbg(chan, "LD %p free\n", desc);
-	dma_pool_free(chan->desc_pool, desc, txd->phys);
+	if (cookie > 0)
+		chan->common.completed_cookie = cookie;
 }
 
 /**
@@ -617,13 +733,14 @@ static void fsldma_free_desc_list_reverse(struct fsldma_chan *chan,
 static void fsl_dma_free_chan_resources(struct dma_chan *dchan)
 {
 	struct fsldma_chan *chan = to_fsl_chan(dchan);
-	unsigned long flags;
 
 	chan_dbg(chan, "free all channel resources\n");
-	spin_lock_irqsave(&chan->desc_lock, flags);
+	spin_lock_bh(&chan->desc_lock);
+	fsldma_cleanup_descriptors(chan);
 	fsldma_free_desc_list(chan, &chan->ld_pending);
 	fsldma_free_desc_list(chan, &chan->ld_running);
-	spin_unlock_irqrestore(&chan->desc_lock, flags);
+	fsldma_free_desc_list(chan, &chan->ld_completed);
+	spin_unlock_bh(&chan->desc_lock);
 
 	dma_pool_destroy(chan->desc_pool);
 	chan->desc_pool = NULL;
@@ -842,7 +959,6 @@ static int fsl_dma_device_control(struct dma_chan *dchan,
 {
 	struct dma_slave_config *config;
 	struct fsldma_chan *chan;
-	unsigned long flags;
 	int size;
 
 	if (!dchan)
@@ -852,7 +968,7 @@ static int fsl_dma_device_control(struct dma_chan *dchan,
 
 	switch (cmd) {
 	case DMA_TERMINATE_ALL:
-		spin_lock_irqsave(&chan->desc_lock, flags);
+		spin_lock_bh(&chan->desc_lock);
 
 		/* Halt the DMA engine */
 		dma_halt(chan);
@@ -860,9 +976,10 @@ static int fsl_dma_device_control(struct dma_chan *dchan,
 		/* Remove and free all of the descriptors in the LD queue */
 		fsldma_free_desc_list(chan, &chan->ld_pending);
 		fsldma_free_desc_list(chan, &chan->ld_running);
+		fsldma_free_desc_list(chan, &chan->ld_completed);
 		chan->idle = true;
 
-		spin_unlock_irqrestore(&chan->desc_lock, flags);
+		spin_unlock_bh(&chan->desc_lock);
 		return 0;
 
 	case DMA_SLAVE_CONFIG:
@@ -904,11 +1021,10 @@ static int fsl_dma_device_control(struct dma_chan *dchan,
 static void fsl_dma_memcpy_issue_pending(struct dma_chan *dchan)
 {
 	struct fsldma_chan *chan = to_fsl_chan(dchan);
-	unsigned long flags;
 
-	spin_lock_irqsave(&chan->desc_lock, flags);
+	spin_lock_bh(&chan->desc_lock);
 	fsl_chan_xfer_ld_queue(chan);
-	spin_unlock_irqrestore(&chan->desc_lock, flags);
+	spin_unlock_bh(&chan->desc_lock);
 }
 
 /**
@@ -919,6 +1035,17 @@ static enum dma_status fsl_tx_status(struct dma_chan *dchan,
 					dma_cookie_t cookie,
 					struct dma_tx_state *txstate)
 {
+	struct fsldma_chan *chan = to_fsl_chan(dchan);
+	enum dma_status ret;
+
+	ret = dma_cookie_status(dchan, cookie, txstate);
+	if (ret == DMA_COMPLETE)
+		return ret;
+
+	spin_lock_bh(&chan->desc_lock);
+	fsldma_cleanup_descriptors(chan);
+	spin_unlock_bh(&chan->desc_lock);
+
 	return dma_cookie_status(dchan, cookie, txstate);
 }
 
@@ -996,52 +1123,18 @@ static irqreturn_t fsldma_chan_irq(int irq, void *data)
 static void dma_do_tasklet(unsigned long data)
 {
 	struct fsldma_chan *chan = (struct fsldma_chan *)data;
-	struct fsl_desc_sw *desc, *_desc;
-	LIST_HEAD(ld_cleanup);
-	unsigned long flags;
 
 	chan_dbg(chan, "tasklet entry\n");
 
-	spin_lock_irqsave(&chan->desc_lock, flags);
-
-	/* update the cookie if we have some descriptors to cleanup */
-	if (!list_empty(&chan->ld_running)) {
-		dma_cookie_t cookie;
-
-		desc = to_fsl_desc(chan->ld_running.prev);
-		cookie = desc->async_tx.cookie;
-		dma_cookie_complete(&desc->async_tx);
-
-		chan_dbg(chan, "completed_cookie=%d\n", cookie);
-	}
-
-	/*
-	 * move the descriptors to a temporary list so we can drop the lock
-	 * during the entire cleanup operation
-	 */
-	list_splice_tail_init(&chan->ld_running, &ld_cleanup);
+	spin_lock_bh(&chan->desc_lock);
 
 	/* the hardware is now idle and ready for more */
 	chan->idle = true;
 
-	/*
-	 * Start any pending transactions automatically
-	 *
-	 * In the ideal case, we keep the DMA controller busy while we go
-	 * ahead and free the descriptors below.
-	 */
-	fsl_chan_xfer_ld_queue(chan);
-	spin_unlock_irqrestore(&chan->desc_lock, flags);
-
-	/* Run the callback for each descriptor, in order */
-	list_for_each_entry_safe(desc, _desc, &ld_cleanup, node) {
-
-		/* Remove from the list of transactions */
-		list_del(&desc->node);
+	/* Run all cleanup for descriptors which have been completed */
+	fsldma_cleanup_descriptors(chan);
 
-		/* Run all cleanup for this descriptor */
-		fsldma_cleanup_descriptor(chan, desc);
-	}
+	spin_unlock_bh(&chan->desc_lock);
 
 	chan_dbg(chan, "tasklet exit\n");
 }
@@ -1225,7 +1318,11 @@ static int fsl_dma_chan_probe(struct fsldma_device *fdev,
 	spin_lock_init(&chan->desc_lock);
 	INIT_LIST_HEAD(&chan->ld_pending);
 	INIT_LIST_HEAD(&chan->ld_running);
+	INIT_LIST_HEAD(&chan->ld_completed);
 	chan->idle = true;
+#ifdef CONFIG_PM
+	chan->pm_state = RUNNING;
+#endif
 
 	chan->common.device = &fdev->common;
 	dma_cookie_init(&chan->common);
@@ -1365,6 +1462,69 @@ static int fsldma_of_remove(struct platform_device *op)
 	return 0;
 }
 
+#ifdef CONFIG_PM
+static int fsldma_suspend_late(struct device *dev)
+{
+	struct platform_device *pdev = to_platform_device(dev);
+	struct fsldma_device *fdev = platform_get_drvdata(pdev);
+	struct fsldma_chan *chan;
+	int i;
+
+	for (i = 0; i < FSL_DMA_MAX_CHANS_PER_DEVICE; i++) {
+		chan = fdev->chan[i];
+		if (!chan)
+			continue;
+
+		spin_lock_bh(&chan->desc_lock);
+		if (unlikely(!chan->idle))
+			goto out;
+		chan->regs_save.mr = get_mr(chan);
+		chan->pm_state = SUSPENDED;
+		spin_unlock_bh(&chan->desc_lock);
+	}
+	return 0;
+
+out:
+	for (; i >= 0; i--) {
+		chan = fdev->chan[i];
+		if (!chan)
+			continue;
+		chan->pm_state = RUNNING;
+		spin_unlock_bh(&chan->desc_lock);
+	}
+	return -EBUSY;
+}
+
+static int fsldma_resume_early(struct device *dev)
+{
+	struct platform_device *pdev = to_platform_device(dev);
+	struct fsldma_device *fdev = platform_get_drvdata(pdev);
+	struct fsldma_chan *chan;
+	u32 mode;
+	int i;
+
+	for (i = 0; i < FSL_DMA_MAX_CHANS_PER_DEVICE; i++) {
+		chan = fdev->chan[i];
+		if (!chan)
+			continue;
+
+		spin_lock_bh(&chan->desc_lock);
+		mode = chan->regs_save.mr
+			& ~FSL_DMA_MR_CS & ~FSL_DMA_MR_CC & ~FSL_DMA_MR_CA;
+		set_mr(chan, mode);
+		chan->pm_state = RUNNING;
+		spin_unlock_bh(&chan->desc_lock);
+	}
+
+	return 0;
+}
+
+static const struct dev_pm_ops fsldma_pm_ops = {
+	.suspend_late	= fsldma_suspend_late,
+	.resume_early	= fsldma_resume_early,
+};
+#endif
+
 static const struct of_device_id fsldma_of_ids[] = {
 	{ .compatible = "fsl,elo3-dma", },
 	{ .compatible = "fsl,eloplus-dma", },
@@ -1377,6 +1537,9 @@ static struct platform_driver fsldma_of_driver = {
 		.name = "fsl-elo-dma",
 		.owner = THIS_MODULE,
 		.of_match_table = fsldma_of_ids,
+#ifdef CONFIG_PM
+		.pm = &fsldma_pm_ops,
+#endif
 	},
 	.probe = fsldma_of_probe,
 	.remove = fsldma_of_remove,
diff --git a/drivers/dma/fsldma.h b/drivers/dma/fsldma.h
index d56e83599825..239c20c84382 100644
--- a/drivers/dma/fsldma.h
+++ b/drivers/dma/fsldma.h
@@ -134,12 +134,36 @@ struct fsldma_device {
 #define FSL_DMA_CHAN_PAUSE_EXT	0x00001000
 #define FSL_DMA_CHAN_START_EXT	0x00002000
 
+#ifdef CONFIG_PM
+struct fsldma_chan_regs_save {
+	u32 mr;
+};
+
+enum fsldma_pm_state {
+	RUNNING = 0,
+	SUSPENDED,
+};
+#endif
+
 struct fsldma_chan {
 	char name[8];			/* Channel name */
 	struct fsldma_chan_regs __iomem *regs;
 	spinlock_t desc_lock;		/* Descriptor operation lock */
-	struct list_head ld_pending;	/* Link descriptors queue */
-	struct list_head ld_running;	/* Link descriptors queue */
+	/*
+	 * Descriptors which are queued to run, but have not yet been
+	 * submitted to the hardware for execution
+	 */
+	struct list_head ld_pending;
+	/*
+	 * Descriptors which are currently being executed by the hardware
+	 */
+	struct list_head ld_running;
+	/*
+	 * Descriptors which have finished execution by the hardware. These
+	 * descriptors have already had their cleanup actions run. They are
+	 * waiting for the ACK bit to be set by the async_tx API.
+	 */
+	struct list_head ld_completed;	/* Link descriptors queue */
 	struct dma_chan common;		/* DMA common channel */
 	struct dma_pool *desc_pool;	/* Descriptors pool */
 	struct device *dev;		/* Channel device */
@@ -148,6 +172,10 @@ struct fsldma_chan {
 	struct tasklet_struct tasklet;
 	u32 feature;
 	bool idle;			/* DMA controller is idle */
+#ifdef CONFIG_PM
+	struct fsldma_chan_regs_save regs_save;
+	enum fsldma_pm_state pm_state;
+#endif
 
 	void (*toggle_ext_pause)(struct fsldma_chan *fsl_chan, int enable);
 	void (*toggle_ext_start)(struct fsldma_chan *fsl_chan, int enable);
diff --git a/drivers/dma/imx-dma.c b/drivers/dma/imx-dma.c
index 286660a12cc6..9d2c9e7374dc 100644
--- a/drivers/dma/imx-dma.c
+++ b/drivers/dma/imx-dma.c
@@ -866,7 +866,7 @@ static struct dma_async_tx_descriptor *imxdma_prep_slave_sg(
 static struct dma_async_tx_descriptor *imxdma_prep_dma_cyclic(
 		struct dma_chan *chan, dma_addr_t dma_addr, size_t buf_len,
 		size_t period_len, enum dma_transfer_direction direction,
-		unsigned long flags, void *context)
+		unsigned long flags)
 {
 	struct imxdma_channel *imxdmac = to_imxdma_chan(chan);
 	struct imxdma_engine *imxdma = imxdmac->imxdma;
diff --git a/drivers/dma/imx-sdma.c b/drivers/dma/imx-sdma.c
index 14867e3ac8ff..f7626e37d0b8 100644
--- a/drivers/dma/imx-sdma.c
+++ b/drivers/dma/imx-sdma.c
@@ -271,6 +271,7 @@ struct sdma_channel {
 	unsigned int			chn_count;
 	unsigned int			chn_real_count;
 	struct tasklet_struct		tasklet;
+	struct imx_dma_data		data;
 };
 
 #define IMX_DMA_SG_LOOP		BIT(0)
@@ -749,6 +750,11 @@ static void sdma_get_pc(struct sdma_channel *sdmac,
 		emi_2_per = sdma->script_addrs->asrc_2_mcu_addr;
 		per_2_per = sdma->script_addrs->per_2_per_addr;
 		break;
+	case IMX_DMATYPE_ASRC_SP:
+		per_2_emi = sdma->script_addrs->shp_2_mcu_addr;
+		emi_2_per = sdma->script_addrs->mcu_2_shp_addr;
+		per_2_per = sdma->script_addrs->per_2_per_addr;
+		break;
 	case IMX_DMATYPE_MSHC:
 		per_2_emi = sdma->script_addrs->mshc_2_mcu_addr;
 		emi_2_per = sdma->script_addrs->mcu_2_mshc_addr;
@@ -911,14 +917,13 @@ static int sdma_request_channel(struct sdma_channel *sdmac)
 	int channel = sdmac->channel;
 	int ret = -EBUSY;
 
-	sdmac->bd = dma_alloc_coherent(NULL, PAGE_SIZE, &sdmac->bd_phys, GFP_KERNEL);
+	sdmac->bd = dma_zalloc_coherent(NULL, PAGE_SIZE, &sdmac->bd_phys,
+					GFP_KERNEL);
 	if (!sdmac->bd) {
 		ret = -ENOMEM;
 		goto out;
 	}
 
-	memset(sdmac->bd, 0, PAGE_SIZE);
-
 	sdma->channel_control[channel].base_bd_ptr = sdmac->bd_phys;
 	sdma->channel_control[channel].current_bd_ptr = sdmac->bd_phys;
 
@@ -1120,7 +1125,7 @@ err_out:
 static struct dma_async_tx_descriptor *sdma_prep_dma_cyclic(
 		struct dma_chan *chan, dma_addr_t dma_addr, size_t buf_len,
 		size_t period_len, enum dma_transfer_direction direction,
-		unsigned long flags, void *context)
+		unsigned long flags)
 {
 	struct sdma_channel *sdmac = to_sdma_chan(chan);
 	struct sdma_engine *sdma = sdmac->sdma;
@@ -1414,12 +1419,14 @@ err_dma_alloc:
 
 static bool sdma_filter_fn(struct dma_chan *chan, void *fn_param)
 {
+	struct sdma_channel *sdmac = to_sdma_chan(chan);
 	struct imx_dma_data *data = fn_param;
 
 	if (!imx_dma_is_general_purpose(chan))
 		return false;
 
-	chan->private = data;
+	sdmac->data = *data;
+	chan->private = &sdmac->data;
 
 	return true;
 }
diff --git a/drivers/dma/ipu/ipu_idmac.c b/drivers/dma/ipu/ipu_idmac.c
index 128ca143486d..bbf62927bd72 100644
--- a/drivers/dma/ipu/ipu_idmac.c
+++ b/drivers/dma/ipu/ipu_idmac.c
@@ -1532,11 +1532,17 @@ static int idmac_alloc_chan_resources(struct dma_chan *chan)
 #ifdef DEBUG
 	if (chan->chan_id == IDMAC_IC_7) {
 		ic_sof = ipu_irq_map(69);
-		if (ic_sof > 0)
-			request_irq(ic_sof, ic_sof_irq, 0, "IC SOF", ichan);
+		if (ic_sof > 0) {
+			ret = request_irq(ic_sof, ic_sof_irq, 0, "IC SOF", ichan);
+			if (ret)
+				dev_err(&chan->dev->device, "request irq failed for IC SOF");
+		}
 		ic_eof = ipu_irq_map(70);
-		if (ic_eof > 0)
-			request_irq(ic_eof, ic_eof_irq, 0, "IC EOF", ichan);
+		if (ic_eof > 0) {
+			ret = request_irq(ic_eof, ic_eof_irq, 0, "IC EOF", ichan);
+			if (ret)
+				dev_err(&chan->dev->device, "request irq failed for IC EOF");
+		}
 	}
 #endif
 
diff --git a/drivers/dma/mmp_pdma.c b/drivers/dma/mmp_pdma.c
index a7b186d536b3..a1a4db5721b8 100644
--- a/drivers/dma/mmp_pdma.c
+++ b/drivers/dma/mmp_pdma.c
@@ -601,7 +601,7 @@ static struct dma_async_tx_descriptor *
 mmp_pdma_prep_dma_cyclic(struct dma_chan *dchan,
 			 dma_addr_t buf_addr, size_t len, size_t period_len,
 			 enum dma_transfer_direction direction,
-			 unsigned long flags, void *context)
+			 unsigned long flags)
 {
 	struct mmp_pdma_chan *chan;
 	struct mmp_pdma_desc_sw *first = NULL, *prev = NULL, *new;
diff --git a/drivers/dma/mmp_tdma.c b/drivers/dma/mmp_tdma.c
index 724f7f4c9720..6ad30e2c5038 100644
--- a/drivers/dma/mmp_tdma.c
+++ b/drivers/dma/mmp_tdma.c
@@ -389,7 +389,7 @@ struct mmp_tdma_desc *mmp_tdma_alloc_descriptor(struct mmp_tdma_chan *tdmac)
 static struct dma_async_tx_descriptor *mmp_tdma_prep_dma_cyclic(
 		struct dma_chan *chan, dma_addr_t dma_addr, size_t buf_len,
 		size_t period_len, enum dma_transfer_direction direction,
-		unsigned long flags, void *context)
+		unsigned long flags)
 {
 	struct mmp_tdma_chan *tdmac = to_mmp_tdma_chan(chan);
 	struct mmp_tdma_desc *desc;
diff --git a/drivers/dma/mpc512x_dma.c b/drivers/dma/mpc512x_dma.c
index 2ad43738ac8b..881db2bcb48b 100644
--- a/drivers/dma/mpc512x_dma.c
+++ b/drivers/dma/mpc512x_dma.c
@@ -53,6 +53,7 @@
 #include <linux/of_address.h>
 #include <linux/of_device.h>
 #include <linux/of_irq.h>
+#include <linux/of_dma.h>
 #include <linux/of_platform.h>
 
 #include <linux/random.h>
@@ -1036,7 +1037,15 @@ static int mpc_dma_probe(struct platform_device *op)
 	if (retval)
 		goto err_free2;
 
-	return retval;
+	/* Register with OF helpers for DMA lookups (nonfatal) */
+	if (dev->of_node) {
+		retval = of_dma_controller_register(dev->of_node,
+						of_dma_xlate_by_chan_id, mdma);
+		if (retval)
+			dev_warn(dev, "Could not register for OF lookup\n");
+	}
+
+	return 0;
 
 err_free2:
 	if (mdma->is_mpc8308)
@@ -1057,6 +1066,8 @@ static int mpc_dma_remove(struct platform_device *op)
 	struct device *dev = &op->dev;
 	struct mpc_dma *mdma = dev_get_drvdata(dev);
 
+	if (dev->of_node)
+		of_dma_controller_free(dev->of_node);
 	dma_async_device_unregister(&mdma->dma);
 	if (mdma->is_mpc8308) {
 		free_irq(mdma->irq2, mdma);
diff --git a/drivers/dma/mxs-dma.c b/drivers/dma/mxs-dma.c
index ead491346da7..5ea61201dbf0 100644
--- a/drivers/dma/mxs-dma.c
+++ b/drivers/dma/mxs-dma.c
@@ -413,16 +413,14 @@ static int mxs_dma_alloc_chan_resources(struct dma_chan *chan)
 	struct mxs_dma_engine *mxs_dma = mxs_chan->mxs_dma;
 	int ret;
 
-	mxs_chan->ccw = dma_alloc_coherent(mxs_dma->dma_device.dev,
-				CCW_BLOCK_SIZE, &mxs_chan->ccw_phys,
-				GFP_KERNEL);
+	mxs_chan->ccw = dma_zalloc_coherent(mxs_dma->dma_device.dev,
+					    CCW_BLOCK_SIZE,
+					    &mxs_chan->ccw_phys, GFP_KERNEL);
 	if (!mxs_chan->ccw) {
 		ret = -ENOMEM;
 		goto err_alloc;
 	}
 
-	memset(mxs_chan->ccw, 0, CCW_BLOCK_SIZE);
-
 	if (mxs_chan->chan_irq != NO_IRQ) {
 		ret = request_irq(mxs_chan->chan_irq, mxs_dma_int_handler,
 					0, "mxs-dma", mxs_dma);
@@ -591,7 +589,7 @@ err_out:
 static struct dma_async_tx_descriptor *mxs_dma_prep_dma_cyclic(
 		struct dma_chan *chan, dma_addr_t dma_addr, size_t buf_len,
 		size_t period_len, enum dma_transfer_direction direction,
-		unsigned long flags, void *context)
+		unsigned long flags)
 {
 	struct mxs_dma_chan *mxs_chan = to_mxs_dma_chan(chan);
 	struct mxs_dma_engine *mxs_dma = mxs_chan->mxs_dma;
diff --git a/drivers/dma/nbpfaxi.c b/drivers/dma/nbpfaxi.c
new file mode 100644
index 000000000000..5aeada56a442
--- /dev/null
+++ b/drivers/dma/nbpfaxi.c
@@ -0,0 +1,1517 @@
+/*
+ * Copyright (C) 2013-2014 Renesas Electronics Europe Ltd.
+ * Author: Guennadi Liakhovetski <g.liakhovetski@gmx.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/bitmap.h>
+#include <linux/bitops.h>
+#include <linux/clk.h>
+#include <linux/dma-mapping.h>
+#include <linux/dmaengine.h>
+#include <linux/err.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/log2.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/of_device.h>
+#include <linux/of_dma.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+
+#include <dt-bindings/dma/nbpfaxi.h>
+
+#include "dmaengine.h"
+
+#define NBPF_REG_CHAN_OFFSET	0
+#define NBPF_REG_CHAN_SIZE	0x40
+
+/* Channel Current Transaction Byte register */
+#define NBPF_CHAN_CUR_TR_BYTE	0x20
+
+/* Channel Status register */
+#define NBPF_CHAN_STAT	0x24
+#define NBPF_CHAN_STAT_EN	1
+#define NBPF_CHAN_STAT_TACT	4
+#define NBPF_CHAN_STAT_ERR	0x10
+#define NBPF_CHAN_STAT_END	0x20
+#define NBPF_CHAN_STAT_TC	0x40
+#define NBPF_CHAN_STAT_DER	0x400
+
+/* Channel Control register */
+#define NBPF_CHAN_CTRL	0x28
+#define NBPF_CHAN_CTRL_SETEN	1
+#define NBPF_CHAN_CTRL_CLREN	2
+#define NBPF_CHAN_CTRL_STG	4
+#define NBPF_CHAN_CTRL_SWRST	8
+#define NBPF_CHAN_CTRL_CLRRQ	0x10
+#define NBPF_CHAN_CTRL_CLREND	0x20
+#define NBPF_CHAN_CTRL_CLRTC	0x40
+#define NBPF_CHAN_CTRL_SETSUS	0x100
+#define NBPF_CHAN_CTRL_CLRSUS	0x200
+
+/* Channel Configuration register */
+#define NBPF_CHAN_CFG	0x2c
+#define NBPF_CHAN_CFG_SEL	7		/* terminal SELect: 0..7 */
+#define NBPF_CHAN_CFG_REQD	8		/* REQuest Direction: DMAREQ is 0: input, 1: output */
+#define NBPF_CHAN_CFG_LOEN	0x10		/* LOw ENable: low DMA request line is: 0: inactive, 1: active */
+#define NBPF_CHAN_CFG_HIEN	0x20		/* HIgh ENable: high DMA request line is: 0: inactive, 1: active */
+#define NBPF_CHAN_CFG_LVL	0x40		/* LeVeL: DMA request line is sensed as 0: edge, 1: level */
+#define NBPF_CHAN_CFG_AM	0x700		/* ACK Mode: 0: Pulse mode, 1: Level mode, b'1x: Bus Cycle */
+#define NBPF_CHAN_CFG_SDS	0xf000		/* Source Data Size: 0: 8 bits,... , 7: 1024 bits */
+#define NBPF_CHAN_CFG_DDS	0xf0000		/* Destination Data Size: as above */
+#define NBPF_CHAN_CFG_SAD	0x100000	/* Source ADdress counting: 0: increment, 1: fixed */
+#define NBPF_CHAN_CFG_DAD	0x200000	/* Destination ADdress counting: 0: increment, 1: fixed */
+#define NBPF_CHAN_CFG_TM	0x400000	/* Transfer Mode: 0: single, 1: block TM */
+#define NBPF_CHAN_CFG_DEM	0x1000000	/* DMAEND interrupt Mask */
+#define NBPF_CHAN_CFG_TCM	0x2000000	/* DMATCO interrupt Mask */
+#define NBPF_CHAN_CFG_SBE	0x8000000	/* Sweep Buffer Enable */
+#define NBPF_CHAN_CFG_RSEL	0x10000000	/* RM: Register Set sELect */
+#define NBPF_CHAN_CFG_RSW	0x20000000	/* RM: Register Select sWitch */
+#define NBPF_CHAN_CFG_REN	0x40000000	/* RM: Register Set Enable */
+#define NBPF_CHAN_CFG_DMS	0x80000000	/* 0: register mode (RM), 1: link mode (LM) */
+
+#define NBPF_CHAN_NXLA	0x38
+#define NBPF_CHAN_CRLA	0x3c
+
+/* Link Header field */
+#define NBPF_HEADER_LV	1
+#define NBPF_HEADER_LE	2
+#define NBPF_HEADER_WBD	4
+#define NBPF_HEADER_DIM	8
+
+#define NBPF_CTRL	0x300
+#define NBPF_CTRL_PR	1		/* 0: fixed priority, 1: round robin */
+#define NBPF_CTRL_LVINT	2		/* DMAEND and DMAERR signalling: 0: pulse, 1: level */
+
+#define NBPF_DSTAT_ER	0x314
+#define NBPF_DSTAT_END	0x318
+
+#define NBPF_DMA_BUSWIDTHS \
+	(BIT(DMA_SLAVE_BUSWIDTH_UNDEFINED) | \
+	 BIT(DMA_SLAVE_BUSWIDTH_1_BYTE) | \
+	 BIT(DMA_SLAVE_BUSWIDTH_2_BYTES) | \
+	 BIT(DMA_SLAVE_BUSWIDTH_4_BYTES) | \
+	 BIT(DMA_SLAVE_BUSWIDTH_8_BYTES))
+
+struct nbpf_config {
+	int num_channels;
+	int buffer_size;
+};
+
+/*
+ * We've got 3 types of objects, used to describe DMA transfers:
+ * 1. high-level descriptor, containing a struct dma_async_tx_descriptor object
+ *	in it, used to communicate with the user
+ * 2. hardware DMA link descriptors, that we pass to DMAC for DMA transfer
+ *	queuing, these must be DMAable, using either the streaming DMA API or
+ *	allocated from coherent memory - one per SG segment
+ * 3. one per SG segment descriptors, used to manage HW link descriptors from
+ *	(2). They do not have to be DMAable. They can either be (a) allocated
+ *	together with link descriptors as mixed (DMA / CPU) objects, or (b)
+ *	separately. Even if allocated separately it would be best to link them
+ *	to link descriptors once during channel resource allocation and always
+ *	use them as a single object.
+ * Therefore for both cases (a) and (b) at run-time objects (2) and (3) shall be
+ * treated as a single SG segment descriptor.
+ */
+
+struct nbpf_link_reg {
+	u32	header;
+	u32	src_addr;
+	u32	dst_addr;
+	u32	transaction_size;
+	u32	config;
+	u32	interval;
+	u32	extension;
+	u32	next;
+} __packed;
+
+struct nbpf_device;
+struct nbpf_channel;
+struct nbpf_desc;
+
+struct nbpf_link_desc {
+	struct nbpf_link_reg *hwdesc;
+	dma_addr_t hwdesc_dma_addr;
+	struct nbpf_desc *desc;
+	struct list_head node;
+};
+
+/**
+ * struct nbpf_desc - DMA transfer descriptor
+ * @async_tx:	dmaengine object
+ * @user_wait:	waiting for a user ack
+ * @length:	total transfer length
+ * @sg:		list of hardware descriptors, represented by struct nbpf_link_desc
+ * @node:	member in channel descriptor lists
+ */
+struct nbpf_desc {
+	struct dma_async_tx_descriptor async_tx;
+	bool user_wait;
+	size_t length;
+	struct nbpf_channel *chan;
+	struct list_head sg;
+	struct list_head node;
+};
+
+/* Take a wild guess: allocate 4 segments per descriptor */
+#define NBPF_SEGMENTS_PER_DESC 4
+#define NBPF_DESCS_PER_PAGE ((PAGE_SIZE - sizeof(struct list_head)) /	\
+	(sizeof(struct nbpf_desc) +					\
+	 NBPF_SEGMENTS_PER_DESC *					\
+	 (sizeof(struct nbpf_link_desc) + sizeof(struct nbpf_link_reg))))
+#define NBPF_SEGMENTS_PER_PAGE (NBPF_SEGMENTS_PER_DESC * NBPF_DESCS_PER_PAGE)
+
+struct nbpf_desc_page {
+	struct list_head node;
+	struct nbpf_desc desc[NBPF_DESCS_PER_PAGE];
+	struct nbpf_link_desc ldesc[NBPF_SEGMENTS_PER_PAGE];
+	struct nbpf_link_reg hwdesc[NBPF_SEGMENTS_PER_PAGE];
+};
+
+/**
+ * struct nbpf_channel - one DMAC channel
+ * @dma_chan:	standard dmaengine channel object
+ * @base:	register address base
+ * @nbpf:	DMAC
+ * @name:	IRQ name
+ * @irq:	IRQ number
+ * @slave_addr:	address for slave DMA
+ * @slave_width:slave data size in bytes
+ * @slave_burst:maximum slave burst size in bytes
+ * @terminal:	DMA terminal, assigned to this channel
+ * @dmarq_cfg:	DMA request line configuration - high / low, edge / level for NBPF_CHAN_CFG
+ * @flags:	configuration flags from DT
+ * @lock:	protect descriptor lists
+ * @free_links:	list of free link descriptors
+ * @free:	list of free descriptors
+ * @queued:	list of queued descriptors
+ * @active:	list of descriptors, scheduled for processing
+ * @done:	list of completed descriptors, waiting post-processing
+ * @desc_page:	list of additionally allocated descriptor pages - if any
+ */
+struct nbpf_channel {
+	struct dma_chan dma_chan;
+	struct tasklet_struct tasklet;
+	void __iomem *base;
+	struct nbpf_device *nbpf;
+	char name[16];
+	int irq;
+	dma_addr_t slave_src_addr;
+	size_t slave_src_width;
+	size_t slave_src_burst;
+	dma_addr_t slave_dst_addr;
+	size_t slave_dst_width;
+	size_t slave_dst_burst;
+	unsigned int terminal;
+	u32 dmarq_cfg;
+	unsigned long flags;
+	spinlock_t lock;
+	struct list_head free_links;
+	struct list_head free;
+	struct list_head queued;
+	struct list_head active;
+	struct list_head done;
+	struct list_head desc_page;
+	struct nbpf_desc *running;
+	bool paused;
+};
+
+struct nbpf_device {
+	struct dma_device dma_dev;
+	void __iomem *base;
+	struct clk *clk;
+	const struct nbpf_config *config;
+	struct nbpf_channel chan[];
+};
+
+enum nbpf_model {
+	NBPF1B4,
+	NBPF1B8,
+	NBPF1B16,
+	NBPF4B4,
+	NBPF4B8,
+	NBPF4B16,
+	NBPF8B4,
+	NBPF8B8,
+	NBPF8B16,
+};
+
+static struct nbpf_config nbpf_cfg[] = {
+	[NBPF1B4] = {
+		.num_channels = 1,
+		.buffer_size = 4,
+	},
+	[NBPF1B8] = {
+		.num_channels = 1,
+		.buffer_size = 8,
+	},
+	[NBPF1B16] = {
+		.num_channels = 1,
+		.buffer_size = 16,
+	},
+	[NBPF4B4] = {
+		.num_channels = 4,
+		.buffer_size = 4,
+	},
+	[NBPF4B8] = {
+		.num_channels = 4,
+		.buffer_size = 8,
+	},
+	[NBPF4B16] = {
+		.num_channels = 4,
+		.buffer_size = 16,
+	},
+	[NBPF8B4] = {
+		.num_channels = 8,
+		.buffer_size = 4,
+	},
+	[NBPF8B8] = {
+		.num_channels = 8,
+		.buffer_size = 8,
+	},
+	[NBPF8B16] = {
+		.num_channels = 8,
+		.buffer_size = 16,
+	},
+};
+
+#define nbpf_to_chan(d) container_of(d, struct nbpf_channel, dma_chan)
+
+/*
+ * dmaengine drivers seem to have a lot in common and instead of sharing more
+ * code, they reimplement those common algorithms independently. In this driver
+ * we try to separate the hardware-specific part from the (largely) generic
+ * part. This improves code readability and makes it possible in the future to
+ * reuse the generic code in form of a helper library. That generic code should
+ * be suitable for various DMA controllers, using transfer descriptors in RAM
+ * and pushing one SG list at a time to the DMA controller.
+ */
+
+/*		Hardware-specific part		*/
+
+static inline u32 nbpf_chan_read(struct nbpf_channel *chan,
+				 unsigned int offset)
+{
+	u32 data = ioread32(chan->base + offset);
+	dev_dbg(chan->dma_chan.device->dev, "%s(0x%p + 0x%x) = 0x%x\n",
+		__func__, chan->base, offset, data);
+	return data;
+}
+
+static inline void nbpf_chan_write(struct nbpf_channel *chan,
+				   unsigned int offset, u32 data)
+{
+	iowrite32(data, chan->base + offset);
+	dev_dbg(chan->dma_chan.device->dev, "%s(0x%p + 0x%x) = 0x%x\n",
+		__func__, chan->base, offset, data);
+}
+
+static inline u32 nbpf_read(struct nbpf_device *nbpf,
+			    unsigned int offset)
+{
+	u32 data = ioread32(nbpf->base + offset);
+	dev_dbg(nbpf->dma_dev.dev, "%s(0x%p + 0x%x) = 0x%x\n",
+		__func__, nbpf->base, offset, data);
+	return data;
+}
+
+static inline void nbpf_write(struct nbpf_device *nbpf,
+			      unsigned int offset, u32 data)
+{
+	iowrite32(data, nbpf->base + offset);
+	dev_dbg(nbpf->dma_dev.dev, "%s(0x%p + 0x%x) = 0x%x\n",
+		__func__, nbpf->base, offset, data);
+}
+
+static void nbpf_chan_halt(struct nbpf_channel *chan)
+{
+	nbpf_chan_write(chan, NBPF_CHAN_CTRL, NBPF_CHAN_CTRL_CLREN);
+}
+
+static bool nbpf_status_get(struct nbpf_channel *chan)
+{
+	u32 status = nbpf_read(chan->nbpf, NBPF_DSTAT_END);
+
+	return status & BIT(chan - chan->nbpf->chan);
+}
+
+static void nbpf_status_ack(struct nbpf_channel *chan)
+{
+	nbpf_chan_write(chan, NBPF_CHAN_CTRL, NBPF_CHAN_CTRL_CLREND);
+}
+
+static u32 nbpf_error_get(struct nbpf_device *nbpf)
+{
+	return nbpf_read(nbpf, NBPF_DSTAT_ER);
+}
+
+static struct nbpf_channel *nbpf_error_get_channel(struct nbpf_device *nbpf, u32 error)
+{
+	return nbpf->chan + __ffs(error);
+}
+
+static void nbpf_error_clear(struct nbpf_channel *chan)
+{
+	u32 status;
+	int i;
+
+	/* Stop the channel, make sure DMA has been aborted */
+	nbpf_chan_halt(chan);
+
+	for (i = 1000; i; i--) {
+		status = nbpf_chan_read(chan, NBPF_CHAN_STAT);
+		if (!(status & NBPF_CHAN_STAT_TACT))
+			break;
+		cpu_relax();
+	}
+
+	if (!i)
+		dev_err(chan->dma_chan.device->dev,
+			"%s(): abort timeout, channel status 0x%x\n", __func__, status);
+
+	nbpf_chan_write(chan, NBPF_CHAN_CTRL, NBPF_CHAN_CTRL_SWRST);
+}
+
+static int nbpf_start(struct nbpf_desc *desc)
+{
+	struct nbpf_channel *chan = desc->chan;
+	struct nbpf_link_desc *ldesc = list_first_entry(&desc->sg, struct nbpf_link_desc, node);
+
+	nbpf_chan_write(chan, NBPF_CHAN_NXLA, (u32)ldesc->hwdesc_dma_addr);
+	nbpf_chan_write(chan, NBPF_CHAN_CTRL, NBPF_CHAN_CTRL_SETEN | NBPF_CHAN_CTRL_CLRSUS);
+	chan->paused = false;
+
+	/* Software trigger MEMCPY - only MEMCPY uses the block mode */
+	if (ldesc->hwdesc->config & NBPF_CHAN_CFG_TM)
+		nbpf_chan_write(chan, NBPF_CHAN_CTRL, NBPF_CHAN_CTRL_STG);
+
+	dev_dbg(chan->nbpf->dma_dev.dev, "%s(): next 0x%x, cur 0x%x\n", __func__,
+		nbpf_chan_read(chan, NBPF_CHAN_NXLA), nbpf_chan_read(chan, NBPF_CHAN_CRLA));
+
+	return 0;
+}
+
+static void nbpf_chan_prepare(struct nbpf_channel *chan)
+{
+	chan->dmarq_cfg = (chan->flags & NBPF_SLAVE_RQ_HIGH ? NBPF_CHAN_CFG_HIEN : 0) |
+		(chan->flags & NBPF_SLAVE_RQ_LOW ? NBPF_CHAN_CFG_LOEN : 0) |
+		(chan->flags & NBPF_SLAVE_RQ_LEVEL ?
+		 NBPF_CHAN_CFG_LVL | (NBPF_CHAN_CFG_AM & 0x200) : 0) |
+		chan->terminal;
+}
+
+static void nbpf_chan_prepare_default(struct nbpf_channel *chan)
+{
+	/* Don't output DMAACK */
+	chan->dmarq_cfg = NBPF_CHAN_CFG_AM & 0x400;
+	chan->terminal = 0;
+	chan->flags = 0;
+}
+
+static void nbpf_chan_configure(struct nbpf_channel *chan)
+{
+	/*
+	 * We assume, that only the link mode and DMA request line configuration
+	 * have to be set in the configuration register manually. Dynamic
+	 * per-transfer configuration will be loaded from transfer descriptors.
+	 */
+	nbpf_chan_write(chan, NBPF_CHAN_CFG, NBPF_CHAN_CFG_DMS | chan->dmarq_cfg);
+}
+
+static u32 nbpf_xfer_ds(struct nbpf_device *nbpf, size_t size)
+{
+	/* Maximum supported bursts depend on the buffer size */
+	return min_t(int, __ffs(size), ilog2(nbpf->config->buffer_size * 8));
+}
+
+static size_t nbpf_xfer_size(struct nbpf_device *nbpf,
+			     enum dma_slave_buswidth width, u32 burst)
+{
+	size_t size;
+
+	if (!burst)
+		burst = 1;
+
+	switch (width) {
+	case DMA_SLAVE_BUSWIDTH_8_BYTES:
+		size = 8 * burst;
+		break;
+
+	case DMA_SLAVE_BUSWIDTH_4_BYTES:
+		size = 4 * burst;
+		break;
+
+	case DMA_SLAVE_BUSWIDTH_2_BYTES:
+		size = 2 * burst;
+		break;
+
+	default:
+		pr_warn("%s(): invalid bus width %u\n", __func__, width);
+	case DMA_SLAVE_BUSWIDTH_1_BYTE:
+		size = burst;
+	}
+
+	return nbpf_xfer_ds(nbpf, size);
+}
+
+/*
+ * We need a way to recognise slaves, whose data is sent "raw" over the bus,
+ * i.e. it isn't known in advance how many bytes will be received. Therefore
+ * the slave driver has to provide a "large enough" buffer and either read the
+ * buffer, when it is full, or detect, that some data has arrived, then wait for
+ * a timeout, if no more data arrives - receive what's already there. We want to
+ * handle such slaves in a special way to allow an optimised mode for other
+ * users, for whom the amount of data is known in advance. So far there's no way
+ * to recognise such slaves. We use a data-width check to distinguish between
+ * the SD host and the PL011 UART.
+ */
+
+static int nbpf_prep_one(struct nbpf_link_desc *ldesc,
+			 enum dma_transfer_direction direction,
+			 dma_addr_t src, dma_addr_t dst, size_t size, bool last)
+{
+	struct nbpf_link_reg *hwdesc = ldesc->hwdesc;
+	struct nbpf_desc *desc = ldesc->desc;
+	struct nbpf_channel *chan = desc->chan;
+	struct device *dev = chan->dma_chan.device->dev;
+	size_t mem_xfer, slave_xfer;
+	bool can_burst;
+
+	hwdesc->header = NBPF_HEADER_WBD | NBPF_HEADER_LV |
+		(last ? NBPF_HEADER_LE : 0);
+
+	hwdesc->src_addr = src;
+	hwdesc->dst_addr = dst;
+	hwdesc->transaction_size = size;
+
+	/*
+	 * set config: SAD, DAD, DDS, SDS, etc.
+	 * Note on transfer sizes: the DMAC can perform unaligned DMA transfers,
+	 * but it is important to have transaction size a multiple of both
+	 * receiver and transmitter transfer sizes. It is also possible to use
+	 * different RAM and device transfer sizes, and it does work well with
+	 * some devices, e.g. with V08R07S01E SD host controllers, which can use
+	 * 128 byte transfers. But this doesn't work with other devices,
+	 * especially when the transaction size is unknown. This is the case,
+	 * e.g. with serial drivers like amba-pl011.c. For reception it sets up
+	 * the transaction size of 4K and if fewer bytes are received, it
+	 * pauses DMA and reads out data received via DMA as well as those left
+	 * in the Rx FIFO. For this to work with the RAM side using burst
+	 * transfers we enable the SBE bit and terminate the transfer in our
+	 * DMA_PAUSE handler.
+	 */
+	mem_xfer = nbpf_xfer_ds(chan->nbpf, size);
+
+	switch (direction) {
+	case DMA_DEV_TO_MEM:
+		can_burst = chan->slave_src_width >= 3;
+		slave_xfer = min(mem_xfer, can_burst ?
+				 chan->slave_src_burst : chan->slave_src_width);
+		/*
+		 * Is the slave narrower than 64 bits, i.e. isn't using the full
+		 * bus width and cannot use bursts?
+		 */
+		if (mem_xfer > chan->slave_src_burst && !can_burst)
+			mem_xfer = chan->slave_src_burst;
+		/* Device-to-RAM DMA is unreliable without REQD set */
+		hwdesc->config = NBPF_CHAN_CFG_SAD | (NBPF_CHAN_CFG_DDS & (mem_xfer << 16)) |
+			(NBPF_CHAN_CFG_SDS & (slave_xfer << 12)) | NBPF_CHAN_CFG_REQD |
+			NBPF_CHAN_CFG_SBE;
+		break;
+
+	case DMA_MEM_TO_DEV:
+		slave_xfer = min(mem_xfer, chan->slave_dst_width >= 3 ?
+				 chan->slave_dst_burst : chan->slave_dst_width);
+		hwdesc->config = NBPF_CHAN_CFG_DAD | (NBPF_CHAN_CFG_SDS & (mem_xfer << 12)) |
+			(NBPF_CHAN_CFG_DDS & (slave_xfer << 16)) | NBPF_CHAN_CFG_REQD;
+		break;
+
+	case DMA_MEM_TO_MEM:
+		hwdesc->config = NBPF_CHAN_CFG_TCM | NBPF_CHAN_CFG_TM |
+			(NBPF_CHAN_CFG_SDS & (mem_xfer << 12)) |
+			(NBPF_CHAN_CFG_DDS & (mem_xfer << 16));
+		break;
+
+	default:
+		return -EINVAL;
+	}
+
+	hwdesc->config |= chan->dmarq_cfg | (last ? 0 : NBPF_CHAN_CFG_DEM) |
+		NBPF_CHAN_CFG_DMS;
+
+	dev_dbg(dev, "%s(): desc @ %pad: hdr 0x%x, cfg 0x%x, %zu @ %pad -> %pad\n",
+		__func__, &ldesc->hwdesc_dma_addr, hwdesc->header,
+		hwdesc->config, size, &src, &dst);
+
+	dma_sync_single_for_device(dev, ldesc->hwdesc_dma_addr, sizeof(*hwdesc),
+				   DMA_TO_DEVICE);
+
+	return 0;
+}
+
+static size_t nbpf_bytes_left(struct nbpf_channel *chan)
+{
+	return nbpf_chan_read(chan, NBPF_CHAN_CUR_TR_BYTE);
+}
+
+static void nbpf_configure(struct nbpf_device *nbpf)
+{
+	nbpf_write(nbpf, NBPF_CTRL, NBPF_CTRL_LVINT);
+}
+
+static void nbpf_pause(struct nbpf_channel *chan)
+{
+	nbpf_chan_write(chan, NBPF_CHAN_CTRL, NBPF_CHAN_CTRL_SETSUS);
+	/* See comment in nbpf_prep_one() */
+	nbpf_chan_write(chan, NBPF_CHAN_CTRL, NBPF_CHAN_CTRL_CLREN);
+}
+
+/*		Generic part			*/
+
+/* DMA ENGINE functions */
+static void nbpf_issue_pending(struct dma_chan *dchan)
+{
+	struct nbpf_channel *chan = nbpf_to_chan(dchan);
+	unsigned long flags;
+
+	dev_dbg(dchan->device->dev, "Entry %s()\n", __func__);
+
+	spin_lock_irqsave(&chan->lock, flags);
+	if (list_empty(&chan->queued))
+		goto unlock;
+
+	list_splice_tail_init(&chan->queued, &chan->active);
+
+	if (!chan->running) {
+		struct nbpf_desc *desc = list_first_entry(&chan->active,
+						struct nbpf_desc, node);
+		if (!nbpf_start(desc))
+			chan->running = desc;
+	}
+
+unlock:
+	spin_unlock_irqrestore(&chan->lock, flags);
+}
+
+static enum dma_status nbpf_tx_status(struct dma_chan *dchan,
+		dma_cookie_t cookie, struct dma_tx_state *state)
+{
+	struct nbpf_channel *chan = nbpf_to_chan(dchan);
+	enum dma_status status = dma_cookie_status(dchan, cookie, state);
+
+	if (state) {
+		dma_cookie_t running;
+		unsigned long flags;
+
+		spin_lock_irqsave(&chan->lock, flags);
+		running = chan->running ? chan->running->async_tx.cookie : -EINVAL;
+
+		if (cookie == running) {
+			state->residue = nbpf_bytes_left(chan);
+			dev_dbg(dchan->device->dev, "%s(): residue %u\n", __func__,
+				state->residue);
+		} else if (status == DMA_IN_PROGRESS) {
+			struct nbpf_desc *desc;
+			bool found = false;
+
+			list_for_each_entry(desc, &chan->active, node)
+				if (desc->async_tx.cookie == cookie) {
+					found = true;
+					break;
+				}
+
+			if (!found)
+				list_for_each_entry(desc, &chan->queued, node)
+					if (desc->async_tx.cookie == cookie) {
+						found = true;
+						break;
+
+					}
+
+			state->residue = found ? desc->length : 0;
+		}
+
+		spin_unlock_irqrestore(&chan->lock, flags);
+	}
+
+	if (chan->paused)
+		status = DMA_PAUSED;
+
+	return status;
+}
+
+static dma_cookie_t nbpf_tx_submit(struct dma_async_tx_descriptor *tx)
+{
+	struct nbpf_desc *desc = container_of(tx, struct nbpf_desc, async_tx);
+	struct nbpf_channel *chan = desc->chan;
+	unsigned long flags;
+	dma_cookie_t cookie;
+
+	spin_lock_irqsave(&chan->lock, flags);
+	cookie = dma_cookie_assign(tx);
+	list_add_tail(&desc->node, &chan->queued);
+	spin_unlock_irqrestore(&chan->lock, flags);
+
+	dev_dbg(chan->dma_chan.device->dev, "Entry %s(%d)\n", __func__, cookie);
+
+	return cookie;
+}
+
+static int nbpf_desc_page_alloc(struct nbpf_channel *chan)
+{
+	struct dma_chan *dchan = &chan->dma_chan;
+	struct nbpf_desc_page *dpage = (void *)get_zeroed_page(GFP_KERNEL | GFP_DMA);
+	struct nbpf_link_desc *ldesc;
+	struct nbpf_link_reg *hwdesc;
+	struct nbpf_desc *desc;
+	LIST_HEAD(head);
+	LIST_HEAD(lhead);
+	int i;
+	struct device *dev = dchan->device->dev;
+
+	if (!dpage)
+		return -ENOMEM;
+
+	dev_dbg(dev, "%s(): alloc %lu descriptors, %lu segments, total alloc %zu\n",
+		__func__, NBPF_DESCS_PER_PAGE, NBPF_SEGMENTS_PER_PAGE, sizeof(*dpage));
+
+	for (i = 0, ldesc = dpage->ldesc, hwdesc = dpage->hwdesc;
+	     i < ARRAY_SIZE(dpage->ldesc);
+	     i++, ldesc++, hwdesc++) {
+		ldesc->hwdesc = hwdesc;
+		list_add_tail(&ldesc->node, &lhead);
+		ldesc->hwdesc_dma_addr = dma_map_single(dchan->device->dev,
+					hwdesc, sizeof(*hwdesc), DMA_TO_DEVICE);
+
+		dev_dbg(dev, "%s(): mapped 0x%p to %pad\n", __func__,
+			hwdesc, &ldesc->hwdesc_dma_addr);
+	}
+
+	for (i = 0, desc = dpage->desc;
+	     i < ARRAY_SIZE(dpage->desc);
+	     i++, desc++) {
+		dma_async_tx_descriptor_init(&desc->async_tx, dchan);
+		desc->async_tx.tx_submit = nbpf_tx_submit;
+		desc->chan = chan;
+		INIT_LIST_HEAD(&desc->sg);
+		list_add_tail(&desc->node, &head);
+	}
+
+	/*
+	 * This function cannot be called from interrupt context, so, no need to
+	 * save flags
+	 */
+	spin_lock_irq(&chan->lock);
+	list_splice_tail(&lhead, &chan->free_links);
+	list_splice_tail(&head, &chan->free);
+	list_add(&dpage->node, &chan->desc_page);
+	spin_unlock_irq(&chan->lock);
+
+	return ARRAY_SIZE(dpage->desc);
+}
+
+static void nbpf_desc_put(struct nbpf_desc *desc)
+{
+	struct nbpf_channel *chan = desc->chan;
+	struct nbpf_link_desc *ldesc, *tmp;
+	unsigned long flags;
+
+	spin_lock_irqsave(&chan->lock, flags);
+	list_for_each_entry_safe(ldesc, tmp, &desc->sg, node)
+		list_move(&ldesc->node, &chan->free_links);
+
+	list_add(&desc->node, &chan->free);
+	spin_unlock_irqrestore(&chan->lock, flags);
+}
+
+static void nbpf_scan_acked(struct nbpf_channel *chan)
+{
+	struct nbpf_desc *desc, *tmp;
+	unsigned long flags;
+	LIST_HEAD(head);
+
+	spin_lock_irqsave(&chan->lock, flags);
+	list_for_each_entry_safe(desc, tmp, &chan->done, node)
+		if (async_tx_test_ack(&desc->async_tx) && desc->user_wait) {
+			list_move(&desc->node, &head);
+			desc->user_wait = false;
+		}
+	spin_unlock_irqrestore(&chan->lock, flags);
+
+	list_for_each_entry_safe(desc, tmp, &head, node) {
+		list_del(&desc->node);
+		nbpf_desc_put(desc);
+	}
+}
+
+/*
+ * We have to allocate descriptors with the channel lock dropped. This means,
+ * before we re-acquire the lock buffers can be taken already, so we have to
+ * re-check after re-acquiring the lock and possibly retry, if buffers are gone
+ * again.
+ */
+static struct nbpf_desc *nbpf_desc_get(struct nbpf_channel *chan, size_t len)
+{
+	struct nbpf_desc *desc = NULL;
+	struct nbpf_link_desc *ldesc, *prev = NULL;
+
+	nbpf_scan_acked(chan);
+
+	spin_lock_irq(&chan->lock);
+
+	do {
+		int i = 0, ret;
+
+		if (list_empty(&chan->free)) {
+			/* No more free descriptors */
+			spin_unlock_irq(&chan->lock);
+			ret = nbpf_desc_page_alloc(chan);
+			if (ret < 0)
+				return NULL;
+			spin_lock_irq(&chan->lock);
+			continue;
+		}
+		desc = list_first_entry(&chan->free, struct nbpf_desc, node);
+		list_del(&desc->node);
+
+		do {
+			if (list_empty(&chan->free_links)) {
+				/* No more free link descriptors */
+				spin_unlock_irq(&chan->lock);
+				ret = nbpf_desc_page_alloc(chan);
+				if (ret < 0) {
+					nbpf_desc_put(desc);
+					return NULL;
+				}
+				spin_lock_irq(&chan->lock);
+				continue;
+			}
+
+			ldesc = list_first_entry(&chan->free_links,
+						 struct nbpf_link_desc, node);
+			ldesc->desc = desc;
+			if (prev)
+				prev->hwdesc->next = (u32)ldesc->hwdesc_dma_addr;
+
+			prev = ldesc;
+			list_move_tail(&ldesc->node, &desc->sg);
+
+			i++;
+		} while (i < len);
+	} while (!desc);
+
+	prev->hwdesc->next = 0;
+
+	spin_unlock_irq(&chan->lock);
+
+	return desc;
+}
+
+static void nbpf_chan_idle(struct nbpf_channel *chan)
+{
+	struct nbpf_desc *desc, *tmp;
+	unsigned long flags;
+	LIST_HEAD(head);
+
+	spin_lock_irqsave(&chan->lock, flags);
+
+	list_splice_init(&chan->done, &head);
+	list_splice_init(&chan->active, &head);
+	list_splice_init(&chan->queued, &head);
+
+	chan->running = NULL;
+
+	spin_unlock_irqrestore(&chan->lock, flags);
+
+	list_for_each_entry_safe(desc, tmp, &head, node) {
+		dev_dbg(chan->nbpf->dma_dev.dev, "%s(): force-free desc %p cookie %d\n",
+			__func__, desc, desc->async_tx.cookie);
+		list_del(&desc->node);
+		nbpf_desc_put(desc);
+	}
+}
+
+static int nbpf_control(struct dma_chan *dchan, enum dma_ctrl_cmd cmd,
+			unsigned long arg)
+{
+	struct nbpf_channel *chan = nbpf_to_chan(dchan);
+	struct dma_slave_config *config;
+
+	dev_dbg(dchan->device->dev, "Entry %s(%d)\n", __func__, cmd);
+
+	switch (cmd) {
+	case DMA_TERMINATE_ALL:
+		dev_dbg(dchan->device->dev, "Terminating\n");
+		nbpf_chan_halt(chan);
+		nbpf_chan_idle(chan);
+		break;
+
+	case DMA_SLAVE_CONFIG:
+		if (!arg)
+			return -EINVAL;
+		config = (struct dma_slave_config *)arg;
+
+		/*
+		 * We could check config->slave_id to match chan->terminal here,
+		 * but with DT they would be coming from the same source, so
+		 * such a check would be superflous
+		 */
+
+		chan->slave_dst_addr = config->dst_addr;
+		chan->slave_dst_width = nbpf_xfer_size(chan->nbpf,
+						       config->dst_addr_width, 1);
+		chan->slave_dst_burst = nbpf_xfer_size(chan->nbpf,
+						       config->dst_addr_width,
+						       config->dst_maxburst);
+		chan->slave_src_addr = config->src_addr;
+		chan->slave_src_width = nbpf_xfer_size(chan->nbpf,
+						       config->src_addr_width, 1);
+		chan->slave_src_burst = nbpf_xfer_size(chan->nbpf,
+						       config->src_addr_width,
+						       config->src_maxburst);
+		break;
+
+	case DMA_PAUSE:
+		chan->paused = true;
+		nbpf_pause(chan);
+		break;
+
+	default:
+		return -ENXIO;
+	}
+
+	return 0;
+}
+
+static struct dma_async_tx_descriptor *nbpf_prep_sg(struct nbpf_channel *chan,
+		struct scatterlist *src_sg, struct scatterlist *dst_sg,
+		size_t len, enum dma_transfer_direction direction,
+		unsigned long flags)
+{
+	struct nbpf_link_desc *ldesc;
+	struct scatterlist *mem_sg;
+	struct nbpf_desc *desc;
+	bool inc_src, inc_dst;
+	size_t data_len = 0;
+	int i = 0;
+
+	switch (direction) {
+	case DMA_DEV_TO_MEM:
+		mem_sg = dst_sg;
+		inc_src = false;
+		inc_dst = true;
+		break;
+
+	case DMA_MEM_TO_DEV:
+		mem_sg = src_sg;
+		inc_src = true;
+		inc_dst = false;
+		break;
+
+	default:
+	case DMA_MEM_TO_MEM:
+		mem_sg = src_sg;
+		inc_src = true;
+		inc_dst = true;
+	}
+
+	desc = nbpf_desc_get(chan, len);
+	if (!desc)
+		return NULL;
+
+	desc->async_tx.flags = flags;
+	desc->async_tx.cookie = -EBUSY;
+	desc->user_wait = false;
+
+	/*
+	 * This is a private descriptor list, and we own the descriptor. No need
+	 * to lock.
+	 */
+	list_for_each_entry(ldesc, &desc->sg, node) {
+		int ret = nbpf_prep_one(ldesc, direction,
+					sg_dma_address(src_sg),
+					sg_dma_address(dst_sg),
+					sg_dma_len(mem_sg),
+					i == len - 1);
+		if (ret < 0) {
+			nbpf_desc_put(desc);
+			return NULL;
+		}
+		data_len += sg_dma_len(mem_sg);
+		if (inc_src)
+			src_sg = sg_next(src_sg);
+		if (inc_dst)
+			dst_sg = sg_next(dst_sg);
+		mem_sg = direction == DMA_DEV_TO_MEM ? dst_sg : src_sg;
+		i++;
+	}
+
+	desc->length = data_len;
+
+	/* The user has to return the descriptor to us ASAP via .tx_submit() */
+	return &desc->async_tx;
+}
+
+static struct dma_async_tx_descriptor *nbpf_prep_memcpy(
+	struct dma_chan *dchan, dma_addr_t dst, dma_addr_t src,
+	size_t len, unsigned long flags)
+{
+	struct nbpf_channel *chan = nbpf_to_chan(dchan);
+	struct scatterlist dst_sg;
+	struct scatterlist src_sg;
+
+	sg_init_table(&dst_sg, 1);
+	sg_init_table(&src_sg, 1);
+
+	sg_dma_address(&dst_sg) = dst;
+	sg_dma_address(&src_sg) = src;
+
+	sg_dma_len(&dst_sg) = len;
+	sg_dma_len(&src_sg) = len;
+
+	dev_dbg(dchan->device->dev, "%s(): %zu @ %pad -> %pad\n",
+		__func__, len, &src, &dst);
+
+	return nbpf_prep_sg(chan, &src_sg, &dst_sg, 1,
+			    DMA_MEM_TO_MEM, flags);
+}
+
+static struct dma_async_tx_descriptor *nbpf_prep_memcpy_sg(
+	struct dma_chan *dchan,
+	struct scatterlist *dst_sg, unsigned int dst_nents,
+	struct scatterlist *src_sg, unsigned int src_nents,
+	unsigned long flags)
+{
+	struct nbpf_channel *chan = nbpf_to_chan(dchan);
+
+	if (dst_nents != src_nents)
+		return NULL;
+
+	return nbpf_prep_sg(chan, src_sg, dst_sg, src_nents,
+			    DMA_MEM_TO_MEM, flags);
+}
+
+static struct dma_async_tx_descriptor *nbpf_prep_slave_sg(
+	struct dma_chan *dchan, struct scatterlist *sgl, unsigned int sg_len,
+	enum dma_transfer_direction direction, unsigned long flags, void *context)
+{
+	struct nbpf_channel *chan = nbpf_to_chan(dchan);
+	struct scatterlist slave_sg;
+
+	dev_dbg(dchan->device->dev, "Entry %s()\n", __func__);
+
+	sg_init_table(&slave_sg, 1);
+
+	switch (direction) {
+	case DMA_MEM_TO_DEV:
+		sg_dma_address(&slave_sg) = chan->slave_dst_addr;
+		return nbpf_prep_sg(chan, sgl, &slave_sg, sg_len,
+				    direction, flags);
+
+	case DMA_DEV_TO_MEM:
+		sg_dma_address(&slave_sg) = chan->slave_src_addr;
+		return nbpf_prep_sg(chan, &slave_sg, sgl, sg_len,
+				    direction, flags);
+
+	default:
+		return NULL;
+	}
+}
+
+static int nbpf_alloc_chan_resources(struct dma_chan *dchan)
+{
+	struct nbpf_channel *chan = nbpf_to_chan(dchan);
+	int ret;
+
+	INIT_LIST_HEAD(&chan->free);
+	INIT_LIST_HEAD(&chan->free_links);
+	INIT_LIST_HEAD(&chan->queued);
+	INIT_LIST_HEAD(&chan->active);
+	INIT_LIST_HEAD(&chan->done);
+
+	ret = nbpf_desc_page_alloc(chan);
+	if (ret < 0)
+		return ret;
+
+	dev_dbg(dchan->device->dev, "Entry %s(): terminal %u\n", __func__,
+		chan->terminal);
+
+	nbpf_chan_configure(chan);
+
+	return ret;
+}
+
+static void nbpf_free_chan_resources(struct dma_chan *dchan)
+{
+	struct nbpf_channel *chan = nbpf_to_chan(dchan);
+	struct nbpf_desc_page *dpage, *tmp;
+
+	dev_dbg(dchan->device->dev, "Entry %s()\n", __func__);
+
+	nbpf_chan_halt(chan);
+	nbpf_chan_idle(chan);
+	/* Clean up for if a channel is re-used for MEMCPY after slave DMA */
+	nbpf_chan_prepare_default(chan);
+
+	list_for_each_entry_safe(dpage, tmp, &chan->desc_page, node) {
+		struct nbpf_link_desc *ldesc;
+		int i;
+		list_del(&dpage->node);
+		for (i = 0, ldesc = dpage->ldesc;
+		     i < ARRAY_SIZE(dpage->ldesc);
+		     i++, ldesc++)
+			dma_unmap_single(dchan->device->dev, ldesc->hwdesc_dma_addr,
+					 sizeof(*ldesc->hwdesc), DMA_TO_DEVICE);
+		free_page((unsigned long)dpage);
+	}
+}
+
+static int nbpf_slave_caps(struct dma_chan *dchan,
+			   struct dma_slave_caps *caps)
+{
+	caps->src_addr_widths = NBPF_DMA_BUSWIDTHS;
+	caps->dstn_addr_widths = NBPF_DMA_BUSWIDTHS;
+	caps->directions = BIT(DMA_DEV_TO_MEM) | BIT(DMA_MEM_TO_DEV);
+	caps->cmd_pause = false;
+	caps->cmd_terminate = true;
+
+	return 0;
+}
+
+static struct dma_chan *nbpf_of_xlate(struct of_phandle_args *dma_spec,
+				      struct of_dma *ofdma)
+{
+	struct nbpf_device *nbpf = ofdma->of_dma_data;
+	struct dma_chan *dchan;
+	struct nbpf_channel *chan;
+
+	if (dma_spec->args_count != 2)
+		return NULL;
+
+	dchan = dma_get_any_slave_channel(&nbpf->dma_dev);
+	if (!dchan)
+		return NULL;
+
+	dev_dbg(dchan->device->dev, "Entry %s(%s)\n", __func__,
+		dma_spec->np->name);
+
+	chan = nbpf_to_chan(dchan);
+
+	chan->terminal = dma_spec->args[0];
+	chan->flags = dma_spec->args[1];
+
+	nbpf_chan_prepare(chan);
+	nbpf_chan_configure(chan);
+
+	return dchan;
+}
+
+static void nbpf_chan_tasklet(unsigned long data)
+{
+	struct nbpf_channel *chan = (struct nbpf_channel *)data;
+	struct nbpf_desc *desc, *tmp;
+	dma_async_tx_callback callback;
+	void *param;
+
+	while (!list_empty(&chan->done)) {
+		bool found = false, must_put, recycling = false;
+
+		spin_lock_irq(&chan->lock);
+
+		list_for_each_entry_safe(desc, tmp, &chan->done, node) {
+			if (!desc->user_wait) {
+				/* Newly completed descriptor, have to process */
+				found = true;
+				break;
+			} else if (async_tx_test_ack(&desc->async_tx)) {
+				/*
+				 * This descriptor was waiting for a user ACK,
+				 * it can be recycled now.
+				 */
+				list_del(&desc->node);
+				spin_unlock_irq(&chan->lock);
+				nbpf_desc_put(desc);
+				recycling = true;
+				break;
+			}
+		}
+
+		if (recycling)
+			continue;
+
+		if (!found) {
+			/* This can happen if TERMINATE_ALL has been called */
+			spin_unlock_irq(&chan->lock);
+			break;
+		}
+
+		dma_cookie_complete(&desc->async_tx);
+
+		/*
+		 * With released lock we cannot dereference desc, maybe it's
+		 * still on the "done" list
+		 */
+		if (async_tx_test_ack(&desc->async_tx)) {
+			list_del(&desc->node);
+			must_put = true;
+		} else {
+			desc->user_wait = true;
+			must_put = false;
+		}
+
+		callback = desc->async_tx.callback;
+		param = desc->async_tx.callback_param;
+
+		/* ack and callback completed descriptor */
+		spin_unlock_irq(&chan->lock);
+
+		if (callback)
+			callback(param);
+
+		if (must_put)
+			nbpf_desc_put(desc);
+	}
+}
+
+static irqreturn_t nbpf_chan_irq(int irq, void *dev)
+{
+	struct nbpf_channel *chan = dev;
+	bool done = nbpf_status_get(chan);
+	struct nbpf_desc *desc;
+	irqreturn_t ret;
+	bool bh = false;
+
+	if (!done)
+		return IRQ_NONE;
+
+	nbpf_status_ack(chan);
+
+	dev_dbg(&chan->dma_chan.dev->device, "%s()\n", __func__);
+
+	spin_lock(&chan->lock);
+	desc = chan->running;
+	if (WARN_ON(!desc)) {
+		ret = IRQ_NONE;
+		goto unlock;
+	} else {
+		ret = IRQ_HANDLED;
+		bh = true;
+	}
+
+	list_move_tail(&desc->node, &chan->done);
+	chan->running = NULL;
+
+	if (!list_empty(&chan->active)) {
+		desc = list_first_entry(&chan->active,
+					struct nbpf_desc, node);
+		if (!nbpf_start(desc))
+			chan->running = desc;
+	}
+
+unlock:
+	spin_unlock(&chan->lock);
+
+	if (bh)
+		tasklet_schedule(&chan->tasklet);
+
+	return ret;
+}
+
+static irqreturn_t nbpf_err_irq(int irq, void *dev)
+{
+	struct nbpf_device *nbpf = dev;
+	u32 error = nbpf_error_get(nbpf);
+
+	dev_warn(nbpf->dma_dev.dev, "DMA error IRQ %u\n", irq);
+
+	if (!error)
+		return IRQ_NONE;
+
+	do {
+		struct nbpf_channel *chan = nbpf_error_get_channel(nbpf, error);
+		/* On error: abort all queued transfers, no callback */
+		nbpf_error_clear(chan);
+		nbpf_chan_idle(chan);
+		error = nbpf_error_get(nbpf);
+	} while (error);
+
+	return IRQ_HANDLED;
+}
+
+static int nbpf_chan_probe(struct nbpf_device *nbpf, int n)
+{
+	struct dma_device *dma_dev = &nbpf->dma_dev;
+	struct nbpf_channel *chan = nbpf->chan + n;
+	int ret;
+
+	chan->nbpf = nbpf;
+	chan->base = nbpf->base + NBPF_REG_CHAN_OFFSET + NBPF_REG_CHAN_SIZE * n;
+	INIT_LIST_HEAD(&chan->desc_page);
+	spin_lock_init(&chan->lock);
+	chan->dma_chan.device = dma_dev;
+	dma_cookie_init(&chan->dma_chan);
+	nbpf_chan_prepare_default(chan);
+
+	dev_dbg(dma_dev->dev, "%s(): channel %d: -> %p\n", __func__, n, chan->base);
+
+	snprintf(chan->name, sizeof(chan->name), "nbpf %d", n);
+
+	tasklet_init(&chan->tasklet, nbpf_chan_tasklet, (unsigned long)chan);
+	ret = devm_request_irq(dma_dev->dev, chan->irq,
+			nbpf_chan_irq, IRQF_SHARED,
+			chan->name, chan);
+	if (ret < 0)
+		return ret;
+
+	/* Add the channel to DMA device channel list */
+	list_add_tail(&chan->dma_chan.device_node,
+		      &dma_dev->channels);
+
+	return 0;
+}
+
+static const struct of_device_id nbpf_match[] = {
+	{.compatible = "renesas,nbpfaxi64dmac1b4",	.data = &nbpf_cfg[NBPF1B4]},
+	{.compatible = "renesas,nbpfaxi64dmac1b8",	.data = &nbpf_cfg[NBPF1B8]},
+	{.compatible = "renesas,nbpfaxi64dmac1b16",	.data = &nbpf_cfg[NBPF1B16]},
+	{.compatible = "renesas,nbpfaxi64dmac4b4",	.data = &nbpf_cfg[NBPF4B4]},
+	{.compatible = "renesas,nbpfaxi64dmac4b8",	.data = &nbpf_cfg[NBPF4B8]},
+	{.compatible = "renesas,nbpfaxi64dmac4b16",	.data = &nbpf_cfg[NBPF4B16]},
+	{.compatible = "renesas,nbpfaxi64dmac8b4",	.data = &nbpf_cfg[NBPF8B4]},
+	{.compatible = "renesas,nbpfaxi64dmac8b8",	.data = &nbpf_cfg[NBPF8B8]},
+	{.compatible = "renesas,nbpfaxi64dmac8b16",	.data = &nbpf_cfg[NBPF8B16]},
+	{}
+};
+MODULE_DEVICE_TABLE(of, nbpf_match);
+
+static int nbpf_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	const struct of_device_id *of_id = of_match_device(nbpf_match, dev);
+	struct device_node *np = dev->of_node;
+	struct nbpf_device *nbpf;
+	struct dma_device *dma_dev;
+	struct resource *iomem, *irq_res;
+	const struct nbpf_config *cfg;
+	int num_channels;
+	int ret, irq, eirq, i;
+	int irqbuf[9] /* maximum 8 channels + error IRQ */;
+	unsigned int irqs = 0;
+
+	BUILD_BUG_ON(sizeof(struct nbpf_desc_page) > PAGE_SIZE);
+
+	/* DT only */
+	if (!np || !of_id || !of_id->data)
+		return -ENODEV;
+
+	cfg = of_id->data;
+	num_channels = cfg->num_channels;
+
+	nbpf = devm_kzalloc(dev, sizeof(*nbpf) + num_channels *
+			    sizeof(nbpf->chan[0]), GFP_KERNEL);
+	if (!nbpf) {
+		dev_err(dev, "Memory allocation failed\n");
+		return -ENOMEM;
+	}
+	dma_dev = &nbpf->dma_dev;
+	dma_dev->dev = dev;
+
+	iomem = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	nbpf->base = devm_ioremap_resource(dev, iomem);
+	if (IS_ERR(nbpf->base))
+		return PTR_ERR(nbpf->base);
+
+	nbpf->clk = devm_clk_get(dev, NULL);
+	if (IS_ERR(nbpf->clk))
+		return PTR_ERR(nbpf->clk);
+
+	nbpf->config = cfg;
+
+	for (i = 0; irqs < ARRAY_SIZE(irqbuf); i++) {
+		irq_res = platform_get_resource(pdev, IORESOURCE_IRQ, i);
+		if (!irq_res)
+			break;
+
+		for (irq = irq_res->start; irq <= irq_res->end;
+		     irq++, irqs++)
+			irqbuf[irqs] = irq;
+	}
+
+	/*
+	 * 3 IRQ resource schemes are supported:
+	 * 1. 1 shared IRQ for error and all channels
+	 * 2. 2 IRQs: one for error and one shared for all channels
+	 * 3. 1 IRQ for error and an own IRQ for each channel
+	 */
+	if (irqs != 1 && irqs != 2 && irqs != num_channels + 1)
+		return -ENXIO;
+
+	if (irqs == 1) {
+		eirq = irqbuf[0];
+
+		for (i = 0; i <= num_channels; i++)
+			nbpf->chan[i].irq = irqbuf[0];
+	} else {
+		eirq = platform_get_irq_byname(pdev, "error");
+		if (eirq < 0)
+			return eirq;
+
+		if (irqs == num_channels + 1) {
+			struct nbpf_channel *chan;
+
+			for (i = 0, chan = nbpf->chan; i <= num_channels;
+			     i++, chan++) {
+				/* Skip the error IRQ */
+				if (irqbuf[i] == eirq)
+					i++;
+				chan->irq = irqbuf[i];
+			}
+
+			if (chan != nbpf->chan + num_channels)
+				return -EINVAL;
+		} else {
+			/* 2 IRQs and more than one channel */
+			if (irqbuf[0] == eirq)
+				irq = irqbuf[1];
+			else
+				irq = irqbuf[0];
+
+			for (i = 0; i <= num_channels; i++)
+				nbpf->chan[i].irq = irq;
+		}
+	}
+
+	ret = devm_request_irq(dev, eirq, nbpf_err_irq,
+			       IRQF_SHARED, "dma error", nbpf);
+	if (ret < 0)
+		return ret;
+
+	INIT_LIST_HEAD(&dma_dev->channels);
+
+	/* Create DMA Channel */
+	for (i = 0; i < num_channels; i++) {
+		ret = nbpf_chan_probe(nbpf, i);
+		if (ret < 0)
+			return ret;
+	}
+
+	dma_cap_set(DMA_MEMCPY, dma_dev->cap_mask);
+	dma_cap_set(DMA_SLAVE, dma_dev->cap_mask);
+	dma_cap_set(DMA_PRIVATE, dma_dev->cap_mask);
+	dma_cap_set(DMA_SG, dma_dev->cap_mask);
+
+	/* Common and MEMCPY operations */
+	dma_dev->device_alloc_chan_resources
+		= nbpf_alloc_chan_resources;
+	dma_dev->device_free_chan_resources = nbpf_free_chan_resources;
+	dma_dev->device_prep_dma_sg = nbpf_prep_memcpy_sg;
+	dma_dev->device_prep_dma_memcpy = nbpf_prep_memcpy;
+	dma_dev->device_tx_status = nbpf_tx_status;
+	dma_dev->device_issue_pending = nbpf_issue_pending;
+	dma_dev->device_slave_caps = nbpf_slave_caps;
+
+	/*
+	 * If we drop support for unaligned MEMCPY buffer addresses and / or
+	 * lengths by setting
+	 * dma_dev->copy_align = 4;
+	 * then we can set transfer length to 4 bytes in nbpf_prep_one() for
+	 * DMA_MEM_TO_MEM
+	 */
+
+	/* Compulsory for DMA_SLAVE fields */
+	dma_dev->device_prep_slave_sg = nbpf_prep_slave_sg;
+	dma_dev->device_control = nbpf_control;
+
+	platform_set_drvdata(pdev, nbpf);
+
+	ret = clk_prepare_enable(nbpf->clk);
+	if (ret < 0)
+		return ret;
+
+	nbpf_configure(nbpf);
+
+	ret = dma_async_device_register(dma_dev);
+	if (ret < 0)
+		goto e_clk_off;
+
+	ret = of_dma_controller_register(np, nbpf_of_xlate, nbpf);
+	if (ret < 0)
+		goto e_dma_dev_unreg;
+
+	return 0;
+
+e_dma_dev_unreg:
+	dma_async_device_unregister(dma_dev);
+e_clk_off:
+	clk_disable_unprepare(nbpf->clk);
+
+	return ret;
+}
+
+static int nbpf_remove(struct platform_device *pdev)
+{
+	struct nbpf_device *nbpf = platform_get_drvdata(pdev);
+
+	of_dma_controller_free(pdev->dev.of_node);
+	dma_async_device_unregister(&nbpf->dma_dev);
+	clk_disable_unprepare(nbpf->clk);
+
+	return 0;
+}
+
+static struct platform_device_id nbpf_ids[] = {
+	{"nbpfaxi64dmac1b4",	(kernel_ulong_t)&nbpf_cfg[NBPF1B4]},
+	{"nbpfaxi64dmac1b8",	(kernel_ulong_t)&nbpf_cfg[NBPF1B8]},
+	{"nbpfaxi64dmac1b16",	(kernel_ulong_t)&nbpf_cfg[NBPF1B16]},
+	{"nbpfaxi64dmac4b4",	(kernel_ulong_t)&nbpf_cfg[NBPF4B4]},
+	{"nbpfaxi64dmac4b8",	(kernel_ulong_t)&nbpf_cfg[NBPF4B8]},
+	{"nbpfaxi64dmac4b16",	(kernel_ulong_t)&nbpf_cfg[NBPF4B16]},
+	{"nbpfaxi64dmac8b4",	(kernel_ulong_t)&nbpf_cfg[NBPF8B4]},
+	{"nbpfaxi64dmac8b8",	(kernel_ulong_t)&nbpf_cfg[NBPF8B8]},
+	{"nbpfaxi64dmac8b16",	(kernel_ulong_t)&nbpf_cfg[NBPF8B16]},
+	{},
+};
+MODULE_DEVICE_TABLE(platform, nbpf_ids);
+
+#ifdef CONFIG_PM_RUNTIME
+static int nbpf_runtime_suspend(struct device *dev)
+{
+	struct nbpf_device *nbpf = platform_get_drvdata(to_platform_device(dev));
+	clk_disable_unprepare(nbpf->clk);
+	return 0;
+}
+
+static int nbpf_runtime_resume(struct device *dev)
+{
+	struct nbpf_device *nbpf = platform_get_drvdata(to_platform_device(dev));
+	return clk_prepare_enable(nbpf->clk);
+}
+#endif
+
+static const struct dev_pm_ops nbpf_pm_ops = {
+	SET_RUNTIME_PM_OPS(nbpf_runtime_suspend, nbpf_runtime_resume, NULL)
+};
+
+static struct platform_driver nbpf_driver = {
+	.driver = {
+		.owner = THIS_MODULE,
+		.name = "dma-nbpf",
+		.of_match_table = nbpf_match,
+		.pm = &nbpf_pm_ops,
+	},
+	.id_table = nbpf_ids,
+	.probe = nbpf_probe,
+	.remove = nbpf_remove,
+};
+
+module_platform_driver(nbpf_driver);
+
+MODULE_AUTHOR("Guennadi Liakhovetski <g.liakhovetski@gmx.de>");
+MODULE_DESCRIPTION("dmaengine driver for NBPFAXI64* DMACs");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/dma/of-dma.c b/drivers/dma/of-dma.c
index e8fe9dc455f4..d5fbeaa1e7ba 100644
--- a/drivers/dma/of-dma.c
+++ b/drivers/dma/of-dma.c
@@ -218,3 +218,38 @@ struct dma_chan *of_dma_simple_xlate(struct of_phandle_args *dma_spec,
 			&dma_spec->args[0]);
 }
 EXPORT_SYMBOL_GPL(of_dma_simple_xlate);
+
+/**
+ * of_dma_xlate_by_chan_id - Translate dt property to DMA channel by channel id
+ * @dma_spec:	pointer to DMA specifier as found in the device tree
+ * @of_dma:	pointer to DMA controller data
+ *
+ * This function can be used as the of xlate callback for DMA driver which wants
+ * to match the channel based on the channel id. When using this xlate function
+ * the #dma-cells propety of the DMA controller dt node needs to be set to 1.
+ * The data parameter of of_dma_controller_register must be a pointer to the
+ * dma_device struct the function should match upon.
+ *
+ * Returns pointer to appropriate dma channel on success or NULL on error.
+ */
+struct dma_chan *of_dma_xlate_by_chan_id(struct of_phandle_args *dma_spec,
+					 struct of_dma *ofdma)
+{
+	struct dma_device *dev = ofdma->of_dma_data;
+	struct dma_chan *chan, *candidate = NULL;
+
+	if (!dev || dma_spec->args_count != 1)
+		return NULL;
+
+	list_for_each_entry(chan, &dev->channels, device_node)
+		if (chan->chan_id == dma_spec->args[0]) {
+			candidate = chan;
+			break;
+		}
+
+	if (!candidate)
+		return NULL;
+
+	return dma_get_slave_channel(candidate);
+}
+EXPORT_SYMBOL_GPL(of_dma_xlate_by_chan_id);
diff --git a/drivers/dma/omap-dma.c b/drivers/dma/omap-dma.c
index b19f04f4390b..4cf7d9a950d7 100644
--- a/drivers/dma/omap-dma.c
+++ b/drivers/dma/omap-dma.c
@@ -853,8 +853,7 @@ static struct dma_async_tx_descriptor *omap_dma_prep_slave_sg(
 
 static struct dma_async_tx_descriptor *omap_dma_prep_dma_cyclic(
 	struct dma_chan *chan, dma_addr_t buf_addr, size_t buf_len,
-	size_t period_len, enum dma_transfer_direction dir, unsigned long flags,
-	void *context)
+	size_t period_len, enum dma_transfer_direction dir, unsigned long flags)
 {
 	struct omap_dmadev *od = to_omap_dma_dev(chan->device);
 	struct omap_chan *c = to_omap_dma_chan(chan);
diff --git a/drivers/dma/pl330.c b/drivers/dma/pl330.c
index 73fa9b7a10ab..d5149aacd2fe 100644
--- a/drivers/dma/pl330.c
+++ b/drivers/dma/pl330.c
@@ -33,26 +33,15 @@
 #define PL330_MAX_IRQS		32
 #define PL330_MAX_PERI		32
 
-enum pl330_srccachectrl {
-	SCCTRL0,	/* Noncacheable and nonbufferable */
-	SCCTRL1,	/* Bufferable only */
-	SCCTRL2,	/* Cacheable, but do not allocate */
-	SCCTRL3,	/* Cacheable and bufferable, but do not allocate */
-	SINVALID1,
-	SINVALID2,
-	SCCTRL6,	/* Cacheable write-through, allocate on reads only */
-	SCCTRL7,	/* Cacheable write-back, allocate on reads only */
-};
-
-enum pl330_dstcachectrl {
-	DCCTRL0,	/* Noncacheable and nonbufferable */
-	DCCTRL1,	/* Bufferable only */
-	DCCTRL2,	/* Cacheable, but do not allocate */
-	DCCTRL3,	/* Cacheable and bufferable, but do not allocate */
-	DINVALID1,	/* AWCACHE = 0x1000 */
-	DINVALID2,
-	DCCTRL6,	/* Cacheable write-through, allocate on writes only */
-	DCCTRL7,	/* Cacheable write-back, allocate on writes only */
+enum pl330_cachectrl {
+	CCTRL0,		/* Noncacheable and nonbufferable */
+	CCTRL1,		/* Bufferable only */
+	CCTRL2,		/* Cacheable, but do not allocate */
+	CCTRL3,		/* Cacheable and bufferable, but do not allocate */
+	INVALID1,	/* AWCACHE = 0x1000 */
+	INVALID2,
+	CCTRL6,		/* Cacheable write-through, allocate on writes only */
+	CCTRL7,		/* Cacheable write-back, allocate on writes only */
 };
 
 enum pl330_byteswap {
@@ -63,13 +52,6 @@ enum pl330_byteswap {
 	SWAP_16,
 };
 
-enum pl330_reqtype {
-	MEMTOMEM,
-	MEMTODEV,
-	DEVTOMEM,
-	DEVTODEV,
-};
-
 /* Register and Bit field Definitions */
 #define DS			0x0
 #define DS_ST_STOP		0x0
@@ -263,9 +245,6 @@ enum pl330_reqtype {
  */
 #define MCODE_BUFF_PER_REQ	256
 
-/* If the _pl330_req is available to the client */
-#define IS_FREE(req)	(*((u8 *)((req)->mc_cpu)) == CMD_DMAEND)
-
 /* Use this _only_ to wait on transient states */
 #define UNTIL(t, s)	while (!(_state(t) & (s))) cpu_relax();
 
@@ -300,27 +279,6 @@ struct pl330_config {
 	u32		irq_ns;
 };
 
-/* Handle to the DMAC provided to the PL330 core */
-struct pl330_info {
-	/* Owning device */
-	struct device *dev;
-	/* Size of MicroCode buffers for each channel. */
-	unsigned mcbufsz;
-	/* ioremap'ed address of PL330 registers. */
-	void __iomem	*base;
-	/* Client can freely use it. */
-	void	*client_data;
-	/* PL330 core data, Client must not touch it. */
-	void	*pl330_data;
-	/* Populated by the PL330 core driver during pl330_add */
-	struct pl330_config	pcfg;
-	/*
-	 * If the DMAC has some reset mechanism, then the
-	 * client may want to provide pointer to the method.
-	 */
-	void (*dmac_reset)(struct pl330_info *pi);
-};
-
 /**
  * Request Configuration.
  * The PL330 core does not modify this and uses the last
@@ -344,8 +302,8 @@ struct pl330_reqcfg {
 	unsigned brst_len:5;
 	unsigned brst_size:3; /* in power of 2 */
 
-	enum pl330_dstcachectrl dcctl;
-	enum pl330_srccachectrl scctl;
+	enum pl330_cachectrl dcctl;
+	enum pl330_cachectrl scctl;
 	enum pl330_byteswap swap;
 	struct pl330_config *pcfg;
 };
@@ -359,11 +317,6 @@ struct pl330_xfer {
 	u32 dst_addr;
 	/* Size to xfer */
 	u32 bytes;
-	/*
-	 * Pointer to next xfer in the list.
-	 * The last xfer in the req must point to NULL.
-	 */
-	struct pl330_xfer *next;
 };
 
 /* The xfer callbacks are made with one of these arguments. */
@@ -376,67 +329,6 @@ enum pl330_op_err {
 	PL330_ERR_FAIL,
 };
 
-/* A request defining Scatter-Gather List ending with NULL xfer. */
-struct pl330_req {
-	enum pl330_reqtype rqtype;
-	/* Index of peripheral for the xfer. */
-	unsigned peri:5;
-	/* Unique token for this xfer, set by the client. */
-	void *token;
-	/* Callback to be called after xfer. */
-	void (*xfer_cb)(void *token, enum pl330_op_err err);
-	/* If NULL, req will be done at last set parameters. */
-	struct pl330_reqcfg *cfg;
-	/* Pointer to first xfer in the request. */
-	struct pl330_xfer *x;
-	/* Hook to attach to DMAC's list of reqs with due callback */
-	struct list_head rqd;
-};
-
-/*
- * To know the status of the channel and DMAC, the client
- * provides a pointer to this structure. The PL330 core
- * fills it with current information.
- */
-struct pl330_chanstatus {
-	/*
-	 * If the DMAC engine halted due to some error,
-	 * the client should remove-add DMAC.
-	 */
-	bool dmac_halted;
-	/*
-	 * If channel is halted due to some error,
-	 * the client should ABORT/FLUSH and START the channel.
-	 */
-	bool faulting;
-	/* Location of last load */
-	u32 src_addr;
-	/* Location of last store */
-	u32 dst_addr;
-	/*
-	 * Pointer to the currently active req, NULL if channel is
-	 * inactive, even though the requests may be present.
-	 */
-	struct pl330_req *top_req;
-	/* Pointer to req waiting second in the queue if any. */
-	struct pl330_req *wait_req;
-};
-
-enum pl330_chan_op {
-	/* Start the channel */
-	PL330_OP_START,
-	/* Abort the active xfer */
-	PL330_OP_ABORT,
-	/* Stop xfer and flush queue */
-	PL330_OP_FLUSH,
-};
-
-struct _xfer_spec {
-	u32 ccr;
-	struct pl330_req *r;
-	struct pl330_xfer *x;
-};
-
 enum dmamov_dst {
 	SAR = 0,
 	CCR,
@@ -454,12 +346,12 @@ enum pl330_cond {
 	ALWAYS,
 };
 
+struct dma_pl330_desc;
+
 struct _pl330_req {
 	u32 mc_bus;
 	void *mc_cpu;
-	/* Number of bytes taken to setup MC for the req */
-	u32 mc_len;
-	struct pl330_req *r;
+	struct dma_pl330_desc *desc;
 };
 
 /* ToBeDone for tasklet */
@@ -491,30 +383,6 @@ enum pl330_dmac_state {
 	DYING,
 };
 
-/* A DMAC */
-struct pl330_dmac {
-	spinlock_t		lock;
-	/* Holds list of reqs with due callbacks */
-	struct list_head	req_done;
-	/* Pointer to platform specific stuff */
-	struct pl330_info	*pinfo;
-	/* Maximum possible events/irqs */
-	int			events[32];
-	/* BUS address of MicroCode buffer */
-	dma_addr_t		mcode_bus;
-	/* CPU address of MicroCode buffer */
-	void			*mcode_cpu;
-	/* List of all Channel threads */
-	struct pl330_thread	*channels;
-	/* Pointer to the MANAGER thread */
-	struct pl330_thread	*manager;
-	/* To handle bad news in interrupt */
-	struct tasklet_struct	tasks;
-	struct _pl330_tbd	dmac_tbd;
-	/* State of DMAC operation */
-	enum pl330_dmac_state	state;
-};
-
 enum desc_status {
 	/* In the DMAC pool */
 	FREE,
@@ -555,15 +423,16 @@ struct dma_pl330_chan {
 	 * As the parent, this DMAC also provides descriptors
 	 * to the channel.
 	 */
-	struct dma_pl330_dmac *dmac;
+	struct pl330_dmac *dmac;
 
 	/* To protect channel manipulation */
 	spinlock_t lock;
 
-	/* Token of a hardware channel thread of PL330 DMAC
-	 * NULL if the channel is available to be acquired.
+	/*
+	 * Hardware channel thread of PL330 DMAC. NULL if the channel is
+	 * available.
 	 */
-	void *pl330_chid;
+	struct pl330_thread *thread;
 
 	/* For D-to-M and M-to-D channels */
 	int burst_sz; /* the peripheral fifo width */
@@ -574,9 +443,7 @@ struct dma_pl330_chan {
 	bool cyclic;
 };
 
-struct dma_pl330_dmac {
-	struct pl330_info pif;
-
+struct pl330_dmac {
 	/* DMA-Engine Device */
 	struct dma_device ddma;
 
@@ -588,6 +455,32 @@ struct dma_pl330_dmac {
 	/* To protect desc_pool manipulation */
 	spinlock_t pool_lock;
 
+	/* Size of MicroCode buffers for each channel. */
+	unsigned mcbufsz;
+	/* ioremap'ed address of PL330 registers. */
+	void __iomem	*base;
+	/* Populated by the PL330 core driver during pl330_add */
+	struct pl330_config	pcfg;
+
+	spinlock_t		lock;
+	/* Maximum possible events/irqs */
+	int			events[32];
+	/* BUS address of MicroCode buffer */
+	dma_addr_t		mcode_bus;
+	/* CPU address of MicroCode buffer */
+	void			*mcode_cpu;
+	/* List of all Channel threads */
+	struct pl330_thread	*channels;
+	/* Pointer to the MANAGER thread */
+	struct pl330_thread	*manager;
+	/* To handle bad news in interrupt */
+	struct tasklet_struct	tasks;
+	struct _pl330_tbd	dmac_tbd;
+	/* State of DMAC operation */
+	enum pl330_dmac_state	state;
+	/* Holds list of reqs with due callbacks */
+	struct list_head        req_done;
+
 	/* Peripheral channels connected to this DMAC */
 	unsigned int num_peripherals;
 	struct dma_pl330_chan *peripherals; /* keep at end */
@@ -604,49 +497,43 @@ struct dma_pl330_desc {
 	struct pl330_xfer px;
 
 	struct pl330_reqcfg rqcfg;
-	struct pl330_req req;
 
 	enum desc_status status;
 
 	/* The channel which currently holds this desc */
 	struct dma_pl330_chan *pchan;
+
+	enum dma_transfer_direction rqtype;
+	/* Index of peripheral for the xfer. */
+	unsigned peri:5;
+	/* Hook to attach to DMAC's list of reqs with due callback */
+	struct list_head rqd;
 };
 
-static inline void _callback(struct pl330_req *r, enum pl330_op_err err)
-{
-	if (r && r->xfer_cb)
-		r->xfer_cb(r->token, err);
-}
+struct _xfer_spec {
+	u32 ccr;
+	struct dma_pl330_desc *desc;
+};
 
 static inline bool _queue_empty(struct pl330_thread *thrd)
 {
-	return (IS_FREE(&thrd->req[0]) && IS_FREE(&thrd->req[1]))
-		? true : false;
+	return thrd->req[0].desc == NULL && thrd->req[1].desc == NULL;
 }
 
 static inline bool _queue_full(struct pl330_thread *thrd)
 {
-	return (IS_FREE(&thrd->req[0]) || IS_FREE(&thrd->req[1]))
-		? false : true;
+	return thrd->req[0].desc != NULL && thrd->req[1].desc != NULL;
 }
 
 static inline bool is_manager(struct pl330_thread *thrd)
 {
-	struct pl330_dmac *pl330 = thrd->dmac;
-
-	/* MANAGER is indexed at the end */
-	if (thrd->id == pl330->pinfo->pcfg.num_chan)
-		return true;
-	else
-		return false;
+	return thrd->dmac->manager == thrd;
 }
 
 /* If manager of the thread is in Non-Secure mode */
 static inline bool _manager_ns(struct pl330_thread *thrd)
 {
-	struct pl330_dmac *pl330 = thrd->dmac;
-
-	return (pl330->pinfo->pcfg.mode & DMAC_MODE_NS) ? true : false;
+	return (thrd->dmac->pcfg.mode & DMAC_MODE_NS) ? true : false;
 }
 
 static inline u32 get_revision(u32 periph_id)
@@ -1004,7 +891,7 @@ static inline u32 _emit_GO(unsigned dry_run, u8 buf[],
 /* Returns Time-Out */
 static bool _until_dmac_idle(struct pl330_thread *thrd)
 {
-	void __iomem *regs = thrd->dmac->pinfo->base;
+	void __iomem *regs = thrd->dmac->base;
 	unsigned long loops = msecs_to_loops(5);
 
 	do {
@@ -1024,7 +911,7 @@ static bool _until_dmac_idle(struct pl330_thread *thrd)
 static inline void _execute_DBGINSN(struct pl330_thread *thrd,
 		u8 insn[], bool as_manager)
 {
-	void __iomem *regs = thrd->dmac->pinfo->base;
+	void __iomem *regs = thrd->dmac->base;
 	u32 val;
 
 	val = (insn[0] << 16) | (insn[1] << 24);
@@ -1039,7 +926,7 @@ static inline void _execute_DBGINSN(struct pl330_thread *thrd,
 
 	/* If timed out due to halted state-machine */
 	if (_until_dmac_idle(thrd)) {
-		dev_err(thrd->dmac->pinfo->dev, "DMAC halted!\n");
+		dev_err(thrd->dmac->ddma.dev, "DMAC halted!\n");
 		return;
 	}
 
@@ -1047,25 +934,9 @@ static inline void _execute_DBGINSN(struct pl330_thread *thrd,
 	writel(0, regs + DBGCMD);
 }
 
-/*
- * Mark a _pl330_req as free.
- * We do it by writing DMAEND as the first instruction
- * because no valid request is going to have DMAEND as
- * its first instruction to execute.
- */
-static void mark_free(struct pl330_thread *thrd, int idx)
-{
-	struct _pl330_req *req = &thrd->req[idx];
-
-	_emit_END(0, req->mc_cpu);
-	req->mc_len = 0;
-
-	thrd->req_running = -1;
-}
-
 static inline u32 _state(struct pl330_thread *thrd)
 {
-	void __iomem *regs = thrd->dmac->pinfo->base;
+	void __iomem *regs = thrd->dmac->base;
 	u32 val;
 
 	if (is_manager(thrd))
@@ -1123,7 +994,7 @@ static inline u32 _state(struct pl330_thread *thrd)
 
 static void _stop(struct pl330_thread *thrd)
 {
-	void __iomem *regs = thrd->dmac->pinfo->base;
+	void __iomem *regs = thrd->dmac->base;
 	u8 insn[6] = {0, 0, 0, 0, 0, 0};
 
 	if (_state(thrd) == PL330_STATE_FAULT_COMPLETING)
@@ -1146,9 +1017,9 @@ static void _stop(struct pl330_thread *thrd)
 /* Start doing req 'idx' of thread 'thrd' */
 static bool _trigger(struct pl330_thread *thrd)
 {
-	void __iomem *regs = thrd->dmac->pinfo->base;
+	void __iomem *regs = thrd->dmac->base;
 	struct _pl330_req *req;
-	struct pl330_req *r;
+	struct dma_pl330_desc *desc;
 	struct _arg_GO go;
 	unsigned ns;
 	u8 insn[6] = {0, 0, 0, 0, 0, 0};
@@ -1159,32 +1030,27 @@ static bool _trigger(struct pl330_thread *thrd)
 		return true;
 
 	idx = 1 - thrd->lstenq;
-	if (!IS_FREE(&thrd->req[idx]))
+	if (thrd->req[idx].desc != NULL) {
 		req = &thrd->req[idx];
-	else {
+	} else {
 		idx = thrd->lstenq;
-		if (!IS_FREE(&thrd->req[idx]))
+		if (thrd->req[idx].desc != NULL)
 			req = &thrd->req[idx];
 		else
 			req = NULL;
 	}
 
 	/* Return if no request */
-	if (!req || !req->r)
+	if (!req)
 		return true;
 
-	r = req->r;
+	desc = req->desc;
 
-	if (r->cfg)
-		ns = r->cfg->nonsecure ? 1 : 0;
-	else if (readl(regs + CS(thrd->id)) & CS_CNS)
-		ns = 1;
-	else
-		ns = 0;
+	ns = desc->rqcfg.nonsecure ? 1 : 0;
 
 	/* See 'Abort Sources' point-4 at Page 2-25 */
 	if (_manager_ns(thrd) && !ns)
-		dev_info(thrd->dmac->pinfo->dev, "%s:%d Recipe for ABORT!\n",
+		dev_info(thrd->dmac->ddma.dev, "%s:%d Recipe for ABORT!\n",
 			__func__, __LINE__);
 
 	go.chan = thrd->id;
@@ -1240,7 +1106,7 @@ static inline int _ldst_memtomem(unsigned dry_run, u8 buf[],
 		const struct _xfer_spec *pxs, int cyc)
 {
 	int off = 0;
-	struct pl330_config *pcfg = pxs->r->cfg->pcfg;
+	struct pl330_config *pcfg = pxs->desc->rqcfg.pcfg;
 
 	/* check lock-up free version */
 	if (get_revision(pcfg->periph_id) >= PERIPH_REV_R1P0) {
@@ -1266,10 +1132,10 @@ static inline int _ldst_devtomem(unsigned dry_run, u8 buf[],
 	int off = 0;
 
 	while (cyc--) {
-		off += _emit_WFP(dry_run, &buf[off], SINGLE, pxs->r->peri);
-		off += _emit_LDP(dry_run, &buf[off], SINGLE, pxs->r->peri);
+		off += _emit_WFP(dry_run, &buf[off], SINGLE, pxs->desc->peri);
+		off += _emit_LDP(dry_run, &buf[off], SINGLE, pxs->desc->peri);
 		off += _emit_ST(dry_run, &buf[off], ALWAYS);
-		off += _emit_FLUSHP(dry_run, &buf[off], pxs->r->peri);
+		off += _emit_FLUSHP(dry_run, &buf[off], pxs->desc->peri);
 	}
 
 	return off;
@@ -1281,10 +1147,10 @@ static inline int _ldst_memtodev(unsigned dry_run, u8 buf[],
 	int off = 0;
 
 	while (cyc--) {
-		off += _emit_WFP(dry_run, &buf[off], SINGLE, pxs->r->peri);
+		off += _emit_WFP(dry_run, &buf[off], SINGLE, pxs->desc->peri);
 		off += _emit_LD(dry_run, &buf[off], ALWAYS);
-		off += _emit_STP(dry_run, &buf[off], SINGLE, pxs->r->peri);
-		off += _emit_FLUSHP(dry_run, &buf[off], pxs->r->peri);
+		off += _emit_STP(dry_run, &buf[off], SINGLE, pxs->desc->peri);
+		off += _emit_FLUSHP(dry_run, &buf[off], pxs->desc->peri);
 	}
 
 	return off;
@@ -1295,14 +1161,14 @@ static int _bursts(unsigned dry_run, u8 buf[],
 {
 	int off = 0;
 
-	switch (pxs->r->rqtype) {
-	case MEMTODEV:
+	switch (pxs->desc->rqtype) {
+	case DMA_MEM_TO_DEV:
 		off += _ldst_memtodev(dry_run, &buf[off], pxs, cyc);
 		break;
-	case DEVTOMEM:
+	case DMA_DEV_TO_MEM:
 		off += _ldst_devtomem(dry_run, &buf[off], pxs, cyc);
 		break;
-	case MEMTOMEM:
+	case DMA_MEM_TO_MEM:
 		off += _ldst_memtomem(dry_run, &buf[off], pxs, cyc);
 		break;
 	default:
@@ -1395,7 +1261,7 @@ static inline int _loop(unsigned dry_run, u8 buf[],
 static inline int _setup_loops(unsigned dry_run, u8 buf[],
 		const struct _xfer_spec *pxs)
 {
-	struct pl330_xfer *x = pxs->x;
+	struct pl330_xfer *x = &pxs->desc->px;
 	u32 ccr = pxs->ccr;
 	unsigned long c, bursts = BYTE_TO_BURST(x->bytes, ccr);
 	int off = 0;
@@ -1412,7 +1278,7 @@ static inline int _setup_loops(unsigned dry_run, u8 buf[],
 static inline int _setup_xfer(unsigned dry_run, u8 buf[],
 		const struct _xfer_spec *pxs)
 {
-	struct pl330_xfer *x = pxs->x;
+	struct pl330_xfer *x = &pxs->desc->px;
 	int off = 0;
 
 	/* DMAMOV SAR, x->src_addr */
@@ -1443,17 +1309,12 @@ static int _setup_req(unsigned dry_run, struct pl330_thread *thrd,
 	/* DMAMOV CCR, ccr */
 	off += _emit_MOV(dry_run, &buf[off], CCR, pxs->ccr);
 
-	x = pxs->r->x;
-	do {
-		/* Error if xfer length is not aligned at burst size */
-		if (x->bytes % (BRST_SIZE(pxs->ccr) * BRST_LEN(pxs->ccr)))
-			return -EINVAL;
-
-		pxs->x = x;
-		off += _setup_xfer(dry_run, &buf[off], pxs);
+	x = &pxs->desc->px;
+	/* Error if xfer length is not aligned at burst size */
+	if (x->bytes % (BRST_SIZE(pxs->ccr) * BRST_LEN(pxs->ccr)))
+		return -EINVAL;
 
-		x = x->next;
-	} while (x);
+	off += _setup_xfer(dry_run, &buf[off], pxs);
 
 	/* DMASEV peripheral/event */
 	off += _emit_SEV(dry_run, &buf[off], thrd->ev);
@@ -1495,31 +1356,15 @@ static inline u32 _prepare_ccr(const struct pl330_reqcfg *rqc)
 	return ccr;
 }
 
-static inline bool _is_valid(u32 ccr)
-{
-	enum pl330_dstcachectrl dcctl;
-	enum pl330_srccachectrl scctl;
-
-	dcctl = (ccr >> CC_DSTCCTRL_SHFT) & CC_DRCCCTRL_MASK;
-	scctl = (ccr >> CC_SRCCCTRL_SHFT) & CC_SRCCCTRL_MASK;
-
-	if (dcctl == DINVALID1 || dcctl == DINVALID2
-			|| scctl == SINVALID1 || scctl == SINVALID2)
-		return false;
-	else
-		return true;
-}
-
 /*
  * Submit a list of xfers after which the client wants notification.
  * Client is not notified after each xfer unit, just once after all
  * xfer units are done or some error occurs.
  */
-static int pl330_submit_req(void *ch_id, struct pl330_req *r)
+static int pl330_submit_req(struct pl330_thread *thrd,
+	struct dma_pl330_desc *desc)
 {
-	struct pl330_thread *thrd = ch_id;
-	struct pl330_dmac *pl330;
-	struct pl330_info *pi;
+	struct pl330_dmac *pl330 = thrd->dmac;
 	struct _xfer_spec xs;
 	unsigned long flags;
 	void __iomem *regs;
@@ -1528,25 +1373,24 @@ static int pl330_submit_req(void *ch_id, struct pl330_req *r)
 	int ret = 0;
 
 	/* No Req or Unacquired Channel or DMAC */
-	if (!r || !thrd || thrd->free)
+	if (!desc || !thrd || thrd->free)
 		return -EINVAL;
 
-	pl330 = thrd->dmac;
-	pi = pl330->pinfo;
-	regs = pi->base;
+	regs = thrd->dmac->base;
 
 	if (pl330->state == DYING
 		|| pl330->dmac_tbd.reset_chan & (1 << thrd->id)) {
-		dev_info(thrd->dmac->pinfo->dev, "%s:%d\n",
+		dev_info(thrd->dmac->ddma.dev, "%s:%d\n",
 			__func__, __LINE__);
 		return -EAGAIN;
 	}
 
 	/* If request for non-existing peripheral */
-	if (r->rqtype != MEMTOMEM && r->peri >= pi->pcfg.num_peri) {
-		dev_info(thrd->dmac->pinfo->dev,
+	if (desc->rqtype != DMA_MEM_TO_MEM &&
+	    desc->peri >= pl330->pcfg.num_peri) {
+		dev_info(thrd->dmac->ddma.dev,
 				"%s:%d Invalid peripheral(%u)!\n",
-				__func__, __LINE__, r->peri);
+				__func__, __LINE__, desc->peri);
 		return -EINVAL;
 	}
 
@@ -1557,41 +1401,26 @@ static int pl330_submit_req(void *ch_id, struct pl330_req *r)
 		goto xfer_exit;
 	}
 
+	/* Prefer Secure Channel */
+	if (!_manager_ns(thrd))
+		desc->rqcfg.nonsecure = 0;
+	else
+		desc->rqcfg.nonsecure = 1;
 
-	/* Use last settings, if not provided */
-	if (r->cfg) {
-		/* Prefer Secure Channel */
-		if (!_manager_ns(thrd))
-			r->cfg->nonsecure = 0;
-		else
-			r->cfg->nonsecure = 1;
-
-		ccr = _prepare_ccr(r->cfg);
-	} else {
-		ccr = readl(regs + CC(thrd->id));
-	}
-
-	/* If this req doesn't have valid xfer settings */
-	if (!_is_valid(ccr)) {
-		ret = -EINVAL;
-		dev_info(thrd->dmac->pinfo->dev, "%s:%d Invalid CCR(%x)!\n",
-			__func__, __LINE__, ccr);
-		goto xfer_exit;
-	}
+	ccr = _prepare_ccr(&desc->rqcfg);
 
-	idx = IS_FREE(&thrd->req[0]) ? 0 : 1;
+	idx = thrd->req[0].desc == NULL ? 0 : 1;
 
 	xs.ccr = ccr;
-	xs.r = r;
+	xs.desc = desc;
 
 	/* First dry run to check if req is acceptable */
 	ret = _setup_req(1, thrd, idx, &xs);
 	if (ret < 0)
 		goto xfer_exit;
 
-	if (ret > pi->mcbufsz / 2) {
-		dev_info(thrd->dmac->pinfo->dev,
-			"%s:%d Trying increasing mcbufsz\n",
+	if (ret > pl330->mcbufsz / 2) {
+		dev_info(pl330->ddma.dev, "%s:%d Trying increasing mcbufsz\n",
 				__func__, __LINE__);
 		ret = -ENOMEM;
 		goto xfer_exit;
@@ -1599,8 +1428,8 @@ static int pl330_submit_req(void *ch_id, struct pl330_req *r)
 
 	/* Hook the request */
 	thrd->lstenq = idx;
-	thrd->req[idx].mc_len = _setup_req(0, thrd, idx, &xs);
-	thrd->req[idx].r = r;
+	thrd->req[idx].desc = desc;
+	_setup_req(0, thrd, idx, &xs);
 
 	ret = 0;
 
@@ -1610,10 +1439,32 @@ xfer_exit:
 	return ret;
 }
 
+static void dma_pl330_rqcb(struct dma_pl330_desc *desc, enum pl330_op_err err)
+{
+	struct dma_pl330_chan *pch;
+	unsigned long flags;
+
+	if (!desc)
+		return;
+
+	pch = desc->pchan;
+
+	/* If desc aborted */
+	if (!pch)
+		return;
+
+	spin_lock_irqsave(&pch->lock, flags);
+
+	desc->status = DONE;
+
+	spin_unlock_irqrestore(&pch->lock, flags);
+
+	tasklet_schedule(&pch->task);
+}
+
 static void pl330_dotask(unsigned long data)
 {
 	struct pl330_dmac *pl330 = (struct pl330_dmac *) data;
-	struct pl330_info *pi = pl330->pinfo;
 	unsigned long flags;
 	int i;
 
@@ -1631,16 +1482,16 @@ static void pl330_dotask(unsigned long data)
 	if (pl330->dmac_tbd.reset_mngr) {
 		_stop(pl330->manager);
 		/* Reset all channels */
-		pl330->dmac_tbd.reset_chan = (1 << pi->pcfg.num_chan) - 1;
+		pl330->dmac_tbd.reset_chan = (1 << pl330->pcfg.num_chan) - 1;
 		/* Clear the reset flag */
 		pl330->dmac_tbd.reset_mngr = false;
 	}
 
-	for (i = 0; i < pi->pcfg.num_chan; i++) {
+	for (i = 0; i < pl330->pcfg.num_chan; i++) {
 
 		if (pl330->dmac_tbd.reset_chan & (1 << i)) {
 			struct pl330_thread *thrd = &pl330->channels[i];
-			void __iomem *regs = pi->base;
+			void __iomem *regs = pl330->base;
 			enum pl330_op_err err;
 
 			_stop(thrd);
@@ -1651,16 +1502,13 @@ static void pl330_dotask(unsigned long data)
 				err = PL330_ERR_ABORT;
 
 			spin_unlock_irqrestore(&pl330->lock, flags);
-
-			_callback(thrd->req[1 - thrd->lstenq].r, err);
-			_callback(thrd->req[thrd->lstenq].r, err);
-
+			dma_pl330_rqcb(thrd->req[1 - thrd->lstenq].desc, err);
+			dma_pl330_rqcb(thrd->req[thrd->lstenq].desc, err);
 			spin_lock_irqsave(&pl330->lock, flags);
 
-			thrd->req[0].r = NULL;
-			thrd->req[1].r = NULL;
-			mark_free(thrd, 0);
-			mark_free(thrd, 1);
+			thrd->req[0].desc = NULL;
+			thrd->req[1].desc = NULL;
+			thrd->req_running = -1;
 
 			/* Clear the reset flag */
 			pl330->dmac_tbd.reset_chan &= ~(1 << i);
@@ -1673,20 +1521,15 @@ static void pl330_dotask(unsigned long data)
 }
 
 /* Returns 1 if state was updated, 0 otherwise */
-static int pl330_update(const struct pl330_info *pi)
+static int pl330_update(struct pl330_dmac *pl330)
 {
-	struct pl330_req *rqdone, *tmp;
-	struct pl330_dmac *pl330;
+	struct dma_pl330_desc *descdone, *tmp;
 	unsigned long flags;
 	void __iomem *regs;
 	u32 val;
 	int id, ev, ret = 0;
 
-	if (!pi || !pi->pl330_data)
-		return 0;
-
-	regs = pi->base;
-	pl330 = pi->pl330_data;
+	regs = pl330->base;
 
 	spin_lock_irqsave(&pl330->lock, flags);
 
@@ -1696,13 +1539,13 @@ static int pl330_update(const struct pl330_info *pi)
 	else
 		pl330->dmac_tbd.reset_mngr = false;
 
-	val = readl(regs + FSC) & ((1 << pi->pcfg.num_chan) - 1);
+	val = readl(regs + FSC) & ((1 << pl330->pcfg.num_chan) - 1);
 	pl330->dmac_tbd.reset_chan |= val;
 	if (val) {
 		int i = 0;
-		while (i < pi->pcfg.num_chan) {
+		while (i < pl330->pcfg.num_chan) {
 			if (val & (1 << i)) {
-				dev_info(pi->dev,
+				dev_info(pl330->ddma.dev,
 					"Reset Channel-%d\t CS-%x FTC-%x\n",
 						i, readl(regs + CS(i)),
 						readl(regs + FTC(i)));
@@ -1714,15 +1557,16 @@ static int pl330_update(const struct pl330_info *pi)
 
 	/* Check which event happened i.e, thread notified */
 	val = readl(regs + ES);
-	if (pi->pcfg.num_events < 32
-			&& val & ~((1 << pi->pcfg.num_events) - 1)) {
+	if (pl330->pcfg.num_events < 32
+			&& val & ~((1 << pl330->pcfg.num_events) - 1)) {
 		pl330->dmac_tbd.reset_dmac = true;
-		dev_err(pi->dev, "%s:%d Unexpected!\n", __func__, __LINE__);
+		dev_err(pl330->ddma.dev, "%s:%d Unexpected!\n", __func__,
+			__LINE__);
 		ret = 1;
 		goto updt_exit;
 	}
 
-	for (ev = 0; ev < pi->pcfg.num_events; ev++) {
+	for (ev = 0; ev < pl330->pcfg.num_events; ev++) {
 		if (val & (1 << ev)) { /* Event occurred */
 			struct pl330_thread *thrd;
 			u32 inten = readl(regs + INTEN);
@@ -1743,25 +1587,22 @@ static int pl330_update(const struct pl330_info *pi)
 				continue;
 
 			/* Detach the req */
-			rqdone = thrd->req[active].r;
-			thrd->req[active].r = NULL;
-
-			mark_free(thrd, active);
+			descdone = thrd->req[active].desc;
+			thrd->req[active].desc = NULL;
 
 			/* Get going again ASAP */
 			_start(thrd);
 
 			/* For now, just make a list of callbacks to be done */
-			list_add_tail(&rqdone->rqd, &pl330->req_done);
+			list_add_tail(&descdone->rqd, &pl330->req_done);
 		}
 	}
 
 	/* Now that we are in no hurry, do the callbacks */
-	list_for_each_entry_safe(rqdone, tmp, &pl330->req_done, rqd) {
-		list_del(&rqdone->rqd);
-
+	list_for_each_entry_safe(descdone, tmp, &pl330->req_done, rqd) {
+		list_del(&descdone->rqd);
 		spin_unlock_irqrestore(&pl330->lock, flags);
-		_callback(rqdone, PL330_ERR_NONE);
+		dma_pl330_rqcb(descdone, PL330_ERR_NONE);
 		spin_lock_irqsave(&pl330->lock, flags);
 	}
 
@@ -1778,65 +1619,13 @@ updt_exit:
 	return ret;
 }
 
-static int pl330_chan_ctrl(void *ch_id, enum pl330_chan_op op)
-{
-	struct pl330_thread *thrd = ch_id;
-	struct pl330_dmac *pl330;
-	unsigned long flags;
-	int ret = 0, active;
-
-	if (!thrd || thrd->free || thrd->dmac->state == DYING)
-		return -EINVAL;
-
-	pl330 = thrd->dmac;
-	active = thrd->req_running;
-
-	spin_lock_irqsave(&pl330->lock, flags);
-
-	switch (op) {
-	case PL330_OP_FLUSH:
-		/* Make sure the channel is stopped */
-		_stop(thrd);
-
-		thrd->req[0].r = NULL;
-		thrd->req[1].r = NULL;
-		mark_free(thrd, 0);
-		mark_free(thrd, 1);
-		break;
-
-	case PL330_OP_ABORT:
-		/* Make sure the channel is stopped */
-		_stop(thrd);
-
-		/* ABORT is only for the active req */
-		if (active == -1)
-			break;
-
-		thrd->req[active].r = NULL;
-		mark_free(thrd, active);
-
-		/* Start the next */
-	case PL330_OP_START:
-		if ((active == -1) && !_start(thrd))
-			ret = -EIO;
-		break;
-
-	default:
-		ret = -EINVAL;
-	}
-
-	spin_unlock_irqrestore(&pl330->lock, flags);
-	return ret;
-}
-
 /* Reserve an event */
 static inline int _alloc_event(struct pl330_thread *thrd)
 {
 	struct pl330_dmac *pl330 = thrd->dmac;
-	struct pl330_info *pi = pl330->pinfo;
 	int ev;
 
-	for (ev = 0; ev < pi->pcfg.num_events; ev++)
+	for (ev = 0; ev < pl330->pcfg.num_events; ev++)
 		if (pl330->events[ev] == -1) {
 			pl330->events[ev] = thrd->id;
 			return ev;
@@ -1845,45 +1634,38 @@ static inline int _alloc_event(struct pl330_thread *thrd)
 	return -1;
 }
 
-static bool _chan_ns(const struct pl330_info *pi, int i)
+static bool _chan_ns(const struct pl330_dmac *pl330, int i)
 {
-	return pi->pcfg.irq_ns & (1 << i);
+	return pl330->pcfg.irq_ns & (1 << i);
 }
 
 /* Upon success, returns IdentityToken for the
  * allocated channel, NULL otherwise.
  */
-static void *pl330_request_channel(const struct pl330_info *pi)
+static struct pl330_thread *pl330_request_channel(struct pl330_dmac *pl330)
 {
 	struct pl330_thread *thrd = NULL;
-	struct pl330_dmac *pl330;
 	unsigned long flags;
 	int chans, i;
 
-	if (!pi || !pi->pl330_data)
-		return NULL;
-
-	pl330 = pi->pl330_data;
-
 	if (pl330->state == DYING)
 		return NULL;
 
-	chans = pi->pcfg.num_chan;
+	chans = pl330->pcfg.num_chan;
 
 	spin_lock_irqsave(&pl330->lock, flags);
 
 	for (i = 0; i < chans; i++) {
 		thrd = &pl330->channels[i];
 		if ((thrd->free) && (!_manager_ns(thrd) ||
-					_chan_ns(pi, i))) {
+					_chan_ns(pl330, i))) {
 			thrd->ev = _alloc_event(thrd);
 			if (thrd->ev >= 0) {
 				thrd->free = false;
 				thrd->lstenq = 1;
-				thrd->req[0].r = NULL;
-				mark_free(thrd, 0);
-				thrd->req[1].r = NULL;
-				mark_free(thrd, 1);
+				thrd->req[0].desc = NULL;
+				thrd->req[1].desc = NULL;
+				thrd->req_running = -1;
 				break;
 			}
 		}
@@ -1899,17 +1681,15 @@ static void *pl330_request_channel(const struct pl330_info *pi)
 static inline void _free_event(struct pl330_thread *thrd, int ev)
 {
 	struct pl330_dmac *pl330 = thrd->dmac;
-	struct pl330_info *pi = pl330->pinfo;
 
 	/* If the event is valid and was held by the thread */
-	if (ev >= 0 && ev < pi->pcfg.num_events
+	if (ev >= 0 && ev < pl330->pcfg.num_events
 			&& pl330->events[ev] == thrd->id)
 		pl330->events[ev] = -1;
 }
 
-static void pl330_release_channel(void *ch_id)
+static void pl330_release_channel(struct pl330_thread *thrd)
 {
-	struct pl330_thread *thrd = ch_id;
 	struct pl330_dmac *pl330;
 	unsigned long flags;
 
@@ -1918,8 +1698,8 @@ static void pl330_release_channel(void *ch_id)
 
 	_stop(thrd);
 
-	_callback(thrd->req[1 - thrd->lstenq].r, PL330_ERR_ABORT);
-	_callback(thrd->req[thrd->lstenq].r, PL330_ERR_ABORT);
+	dma_pl330_rqcb(thrd->req[1 - thrd->lstenq].desc, PL330_ERR_ABORT);
+	dma_pl330_rqcb(thrd->req[thrd->lstenq].desc, PL330_ERR_ABORT);
 
 	pl330 = thrd->dmac;
 
@@ -1932,72 +1712,70 @@ static void pl330_release_channel(void *ch_id)
 /* Initialize the structure for PL330 configuration, that can be used
  * by the client driver the make best use of the DMAC
  */
-static void read_dmac_config(struct pl330_info *pi)
+static void read_dmac_config(struct pl330_dmac *pl330)
 {
-	void __iomem *regs = pi->base;
+	void __iomem *regs = pl330->base;
 	u32 val;
 
 	val = readl(regs + CRD) >> CRD_DATA_WIDTH_SHIFT;
 	val &= CRD_DATA_WIDTH_MASK;
-	pi->pcfg.data_bus_width = 8 * (1 << val);
+	pl330->pcfg.data_bus_width = 8 * (1 << val);
 
 	val = readl(regs + CRD) >> CRD_DATA_BUFF_SHIFT;
 	val &= CRD_DATA_BUFF_MASK;
-	pi->pcfg.data_buf_dep = val + 1;
+	pl330->pcfg.data_buf_dep = val + 1;
 
 	val = readl(regs + CR0) >> CR0_NUM_CHANS_SHIFT;
 	val &= CR0_NUM_CHANS_MASK;
 	val += 1;
-	pi->pcfg.num_chan = val;
+	pl330->pcfg.num_chan = val;
 
 	val = readl(regs + CR0);
 	if (val & CR0_PERIPH_REQ_SET) {
 		val = (val >> CR0_NUM_PERIPH_SHIFT) & CR0_NUM_PERIPH_MASK;
 		val += 1;
-		pi->pcfg.num_peri = val;
-		pi->pcfg.peri_ns = readl(regs + CR4);
+		pl330->pcfg.num_peri = val;
+		pl330->pcfg.peri_ns = readl(regs + CR4);
 	} else {
-		pi->pcfg.num_peri = 0;
+		pl330->pcfg.num_peri = 0;
 	}
 
 	val = readl(regs + CR0);
 	if (val & CR0_BOOT_MAN_NS)
-		pi->pcfg.mode |= DMAC_MODE_NS;
+		pl330->pcfg.mode |= DMAC_MODE_NS;
 	else
-		pi->pcfg.mode &= ~DMAC_MODE_NS;
+		pl330->pcfg.mode &= ~DMAC_MODE_NS;
 
 	val = readl(regs + CR0) >> CR0_NUM_EVENTS_SHIFT;
 	val &= CR0_NUM_EVENTS_MASK;
 	val += 1;
-	pi->pcfg.num_events = val;
+	pl330->pcfg.num_events = val;
 
-	pi->pcfg.irq_ns = readl(regs + CR3);
+	pl330->pcfg.irq_ns = readl(regs + CR3);
 }
 
 static inline void _reset_thread(struct pl330_thread *thrd)
 {
 	struct pl330_dmac *pl330 = thrd->dmac;
-	struct pl330_info *pi = pl330->pinfo;
 
 	thrd->req[0].mc_cpu = pl330->mcode_cpu
-				+ (thrd->id * pi->mcbufsz);
+				+ (thrd->id * pl330->mcbufsz);
 	thrd->req[0].mc_bus = pl330->mcode_bus
-				+ (thrd->id * pi->mcbufsz);
-	thrd->req[0].r = NULL;
-	mark_free(thrd, 0);
+				+ (thrd->id * pl330->mcbufsz);
+	thrd->req[0].desc = NULL;
 
 	thrd->req[1].mc_cpu = thrd->req[0].mc_cpu
-				+ pi->mcbufsz / 2;
+				+ pl330->mcbufsz / 2;
 	thrd->req[1].mc_bus = thrd->req[0].mc_bus
-				+ pi->mcbufsz / 2;
-	thrd->req[1].r = NULL;
-	mark_free(thrd, 1);
+				+ pl330->mcbufsz / 2;
+	thrd->req[1].desc = NULL;
+
+	thrd->req_running = -1;
 }
 
 static int dmac_alloc_threads(struct pl330_dmac *pl330)
 {
-	struct pl330_info *pi = pl330->pinfo;
-	int chans = pi->pcfg.num_chan;
+	int chans = pl330->pcfg.num_chan;
 	struct pl330_thread *thrd;
 	int i;
 
@@ -2028,29 +1806,28 @@ static int dmac_alloc_threads(struct pl330_dmac *pl330)
 
 static int dmac_alloc_resources(struct pl330_dmac *pl330)
 {
-	struct pl330_info *pi = pl330->pinfo;
-	int chans = pi->pcfg.num_chan;
+	int chans = pl330->pcfg.num_chan;
 	int ret;
 
 	/*
 	 * Alloc MicroCode buffer for 'chans' Channel threads.
 	 * A channel's buffer offset is (Channel_Id * MCODE_BUFF_PERCHAN)
 	 */
-	pl330->mcode_cpu = dma_alloc_coherent(pi->dev,
-				chans * pi->mcbufsz,
+	pl330->mcode_cpu = dma_alloc_coherent(pl330->ddma.dev,
+				chans * pl330->mcbufsz,
 				&pl330->mcode_bus, GFP_KERNEL);
 	if (!pl330->mcode_cpu) {
-		dev_err(pi->dev, "%s:%d Can't allocate memory!\n",
+		dev_err(pl330->ddma.dev, "%s:%d Can't allocate memory!\n",
 			__func__, __LINE__);
 		return -ENOMEM;
 	}
 
 	ret = dmac_alloc_threads(pl330);
 	if (ret) {
-		dev_err(pi->dev, "%s:%d Can't to create channels for DMAC!\n",
+		dev_err(pl330->ddma.dev, "%s:%d Can't to create channels for DMAC!\n",
 			__func__, __LINE__);
-		dma_free_coherent(pi->dev,
-				chans * pi->mcbufsz,
+		dma_free_coherent(pl330->ddma.dev,
+				chans * pl330->mcbufsz,
 				pl330->mcode_cpu, pl330->mcode_bus);
 		return ret;
 	}
@@ -2058,71 +1835,45 @@ static int dmac_alloc_resources(struct pl330_dmac *pl330)
 	return 0;
 }
 
-static int pl330_add(struct pl330_info *pi)
+static int pl330_add(struct pl330_dmac *pl330)
 {
-	struct pl330_dmac *pl330;
 	void __iomem *regs;
 	int i, ret;
 
-	if (!pi || !pi->dev)
-		return -EINVAL;
-
-	/* If already added */
-	if (pi->pl330_data)
-		return -EINVAL;
-
-	/*
-	 * If the SoC can perform reset on the DMAC, then do it
-	 * before reading its configuration.
-	 */
-	if (pi->dmac_reset)
-		pi->dmac_reset(pi);
-
-	regs = pi->base;
+	regs = pl330->base;
 
 	/* Check if we can handle this DMAC */
-	if ((pi->pcfg.periph_id & 0xfffff) != PERIPH_ID_VAL) {
-		dev_err(pi->dev, "PERIPH_ID 0x%x !\n", pi->pcfg.periph_id);
+	if ((pl330->pcfg.periph_id & 0xfffff) != PERIPH_ID_VAL) {
+		dev_err(pl330->ddma.dev, "PERIPH_ID 0x%x !\n",
+			pl330->pcfg.periph_id);
 		return -EINVAL;
 	}
 
 	/* Read the configuration of the DMAC */
-	read_dmac_config(pi);
+	read_dmac_config(pl330);
 
-	if (pi->pcfg.num_events == 0) {
-		dev_err(pi->dev, "%s:%d Can't work without events!\n",
+	if (pl330->pcfg.num_events == 0) {
+		dev_err(pl330->ddma.dev, "%s:%d Can't work without events!\n",
 			__func__, __LINE__);
 		return -EINVAL;
 	}
 
-	pl330 = kzalloc(sizeof(*pl330), GFP_KERNEL);
-	if (!pl330) {
-		dev_err(pi->dev, "%s:%d Can't allocate memory!\n",
-			__func__, __LINE__);
-		return -ENOMEM;
-	}
-
-	/* Assign the info structure and private data */
-	pl330->pinfo = pi;
-	pi->pl330_data = pl330;
-
 	spin_lock_init(&pl330->lock);
 
 	INIT_LIST_HEAD(&pl330->req_done);
 
 	/* Use default MC buffer size if not provided */
-	if (!pi->mcbufsz)
-		pi->mcbufsz = MCODE_BUFF_PER_REQ * 2;
+	if (!pl330->mcbufsz)
+		pl330->mcbufsz = MCODE_BUFF_PER_REQ * 2;
 
 	/* Mark all events as free */
-	for (i = 0; i < pi->pcfg.num_events; i++)
+	for (i = 0; i < pl330->pcfg.num_events; i++)
 		pl330->events[i] = -1;
 
 	/* Allocate resources needed by the DMAC */
 	ret = dmac_alloc_resources(pl330);
 	if (ret) {
-		dev_err(pi->dev, "Unable to create channels for DMAC\n");
-		kfree(pl330);
+		dev_err(pl330->ddma.dev, "Unable to create channels for DMAC\n");
 		return ret;
 	}
 
@@ -2135,15 +1886,13 @@ static int pl330_add(struct pl330_info *pi)
 
 static int dmac_free_threads(struct pl330_dmac *pl330)
 {
-	struct pl330_info *pi = pl330->pinfo;
-	int chans = pi->pcfg.num_chan;
 	struct pl330_thread *thrd;
 	int i;
 
 	/* Release Channel threads */
-	for (i = 0; i < chans; i++) {
+	for (i = 0; i < pl330->pcfg.num_chan; i++) {
 		thrd = &pl330->channels[i];
-		pl330_release_channel((void *)thrd);
+		pl330_release_channel(thrd);
 	}
 
 	/* Free memory */
@@ -2152,35 +1901,18 @@ static int dmac_free_threads(struct pl330_dmac *pl330)
 	return 0;
 }
 
-static void dmac_free_resources(struct pl330_dmac *pl330)
+static void pl330_del(struct pl330_dmac *pl330)
 {
-	struct pl330_info *pi = pl330->pinfo;
-	int chans = pi->pcfg.num_chan;
-
-	dmac_free_threads(pl330);
-
-	dma_free_coherent(pi->dev, chans * pi->mcbufsz,
-				pl330->mcode_cpu, pl330->mcode_bus);
-}
-
-static void pl330_del(struct pl330_info *pi)
-{
-	struct pl330_dmac *pl330;
-
-	if (!pi || !pi->pl330_data)
-		return;
-
-	pl330 = pi->pl330_data;
-
 	pl330->state = UNINIT;
 
 	tasklet_kill(&pl330->tasks);
 
 	/* Free DMAC resources */
-	dmac_free_resources(pl330);
+	dmac_free_threads(pl330);
 
-	kfree(pl330);
-	pi->pl330_data = NULL;
+	dma_free_coherent(pl330->ddma.dev,
+		pl330->pcfg.num_chan * pl330->mcbufsz, pl330->mcode_cpu,
+		pl330->mcode_bus);
 }
 
 /* forward declaration */
@@ -2212,8 +1944,7 @@ static inline void fill_queue(struct dma_pl330_chan *pch)
 		if (desc->status == BUSY)
 			continue;
 
-		ret = pl330_submit_req(pch->pl330_chid,
-						&desc->req);
+		ret = pl330_submit_req(pch->thread, desc);
 		if (!ret) {
 			desc->status = BUSY;
 		} else if (ret == -EAGAIN) {
@@ -2222,7 +1953,7 @@ static inline void fill_queue(struct dma_pl330_chan *pch)
 		} else {
 			/* Unacceptable request */
 			desc->status = DONE;
-			dev_err(pch->dmac->pif.dev, "%s:%d Bad Desc(%d)\n",
+			dev_err(pch->dmac->ddma.dev, "%s:%d Bad Desc(%d)\n",
 					__func__, __LINE__, desc->txd.cookie);
 			tasklet_schedule(&pch->task);
 		}
@@ -2249,7 +1980,9 @@ static void pl330_tasklet(unsigned long data)
 	fill_queue(pch);
 
 	/* Make sure the PL330 Channel thread is active */
-	pl330_chan_ctrl(pch->pl330_chid, PL330_OP_START);
+	spin_lock(&pch->thread->dmac->lock);
+	_start(pch->thread);
+	spin_unlock(&pch->thread->dmac->lock);
 
 	while (!list_empty(&pch->completed_list)) {
 		dma_async_tx_callback callback;
@@ -2280,25 +2013,6 @@ static void pl330_tasklet(unsigned long data)
 	spin_unlock_irqrestore(&pch->lock, flags);
 }
 
-static void dma_pl330_rqcb(void *token, enum pl330_op_err err)
-{
-	struct dma_pl330_desc *desc = token;
-	struct dma_pl330_chan *pch = desc->pchan;
-	unsigned long flags;
-
-	/* If desc aborted */
-	if (!pch)
-		return;
-
-	spin_lock_irqsave(&pch->lock, flags);
-
-	desc->status = DONE;
-
-	spin_unlock_irqrestore(&pch->lock, flags);
-
-	tasklet_schedule(&pch->task);
-}
-
 bool pl330_filter(struct dma_chan *chan, void *param)
 {
 	u8 *peri_id;
@@ -2315,23 +2029,26 @@ static struct dma_chan *of_dma_pl330_xlate(struct of_phandle_args *dma_spec,
 						struct of_dma *ofdma)
 {
 	int count = dma_spec->args_count;
-	struct dma_pl330_dmac *pdmac = ofdma->of_dma_data;
+	struct pl330_dmac *pl330 = ofdma->of_dma_data;
 	unsigned int chan_id;
 
+	if (!pl330)
+		return NULL;
+
 	if (count != 1)
 		return NULL;
 
 	chan_id = dma_spec->args[0];
-	if (chan_id >= pdmac->num_peripherals)
+	if (chan_id >= pl330->num_peripherals)
 		return NULL;
 
-	return dma_get_slave_channel(&pdmac->peripherals[chan_id].chan);
+	return dma_get_slave_channel(&pl330->peripherals[chan_id].chan);
 }
 
 static int pl330_alloc_chan_resources(struct dma_chan *chan)
 {
 	struct dma_pl330_chan *pch = to_pchan(chan);
-	struct dma_pl330_dmac *pdmac = pch->dmac;
+	struct pl330_dmac *pl330 = pch->dmac;
 	unsigned long flags;
 
 	spin_lock_irqsave(&pch->lock, flags);
@@ -2339,8 +2056,8 @@ static int pl330_alloc_chan_resources(struct dma_chan *chan)
 	dma_cookie_init(chan);
 	pch->cyclic = false;
 
-	pch->pl330_chid = pl330_request_channel(&pdmac->pif);
-	if (!pch->pl330_chid) {
+	pch->thread = pl330_request_channel(pl330);
+	if (!pch->thread) {
 		spin_unlock_irqrestore(&pch->lock, flags);
 		return -ENOMEM;
 	}
@@ -2357,7 +2074,7 @@ static int pl330_control(struct dma_chan *chan, enum dma_ctrl_cmd cmd, unsigned
 	struct dma_pl330_chan *pch = to_pchan(chan);
 	struct dma_pl330_desc *desc;
 	unsigned long flags;
-	struct dma_pl330_dmac *pdmac = pch->dmac;
+	struct pl330_dmac *pl330 = pch->dmac;
 	struct dma_slave_config *slave_config;
 	LIST_HEAD(list);
 
@@ -2365,8 +2082,13 @@ static int pl330_control(struct dma_chan *chan, enum dma_ctrl_cmd cmd, unsigned
 	case DMA_TERMINATE_ALL:
 		spin_lock_irqsave(&pch->lock, flags);
 
-		/* FLUSH the PL330 Channel thread */
-		pl330_chan_ctrl(pch->pl330_chid, PL330_OP_FLUSH);
+		spin_lock(&pl330->lock);
+		_stop(pch->thread);
+		spin_unlock(&pl330->lock);
+
+		pch->thread->req[0].desc = NULL;
+		pch->thread->req[1].desc = NULL;
+		pch->thread->req_running = -1;
 
 		/* Mark all desc done */
 		list_for_each_entry(desc, &pch->submitted_list, node) {
@@ -2384,9 +2106,9 @@ static int pl330_control(struct dma_chan *chan, enum dma_ctrl_cmd cmd, unsigned
 			dma_cookie_complete(&desc->txd);
 		}
 
-		list_splice_tail_init(&pch->submitted_list, &pdmac->desc_pool);
-		list_splice_tail_init(&pch->work_list, &pdmac->desc_pool);
-		list_splice_tail_init(&pch->completed_list, &pdmac->desc_pool);
+		list_splice_tail_init(&pch->submitted_list, &pl330->desc_pool);
+		list_splice_tail_init(&pch->work_list, &pl330->desc_pool);
+		list_splice_tail_init(&pch->completed_list, &pl330->desc_pool);
 		spin_unlock_irqrestore(&pch->lock, flags);
 		break;
 	case DMA_SLAVE_CONFIG:
@@ -2409,7 +2131,7 @@ static int pl330_control(struct dma_chan *chan, enum dma_ctrl_cmd cmd, unsigned
 		}
 		break;
 	default:
-		dev_err(pch->dmac->pif.dev, "Not supported command.\n");
+		dev_err(pch->dmac->ddma.dev, "Not supported command.\n");
 		return -ENXIO;
 	}
 
@@ -2425,8 +2147,8 @@ static void pl330_free_chan_resources(struct dma_chan *chan)
 
 	spin_lock_irqsave(&pch->lock, flags);
 
-	pl330_release_channel(pch->pl330_chid);
-	pch->pl330_chid = NULL;
+	pl330_release_channel(pch->thread);
+	pch->thread = NULL;
 
 	if (pch->cyclic)
 		list_splice_tail_init(&pch->work_list, &pch->dmac->desc_pool);
@@ -2489,57 +2211,46 @@ static dma_cookie_t pl330_tx_submit(struct dma_async_tx_descriptor *tx)
 
 static inline void _init_desc(struct dma_pl330_desc *desc)
 {
-	desc->req.x = &desc->px;
-	desc->req.token = desc;
 	desc->rqcfg.swap = SWAP_NO;
-	desc->rqcfg.scctl = SCCTRL0;
-	desc->rqcfg.dcctl = DCCTRL0;
-	desc->req.cfg = &desc->rqcfg;
-	desc->req.xfer_cb = dma_pl330_rqcb;
+	desc->rqcfg.scctl = CCTRL0;
+	desc->rqcfg.dcctl = CCTRL0;
 	desc->txd.tx_submit = pl330_tx_submit;
 
 	INIT_LIST_HEAD(&desc->node);
 }
 
 /* Returns the number of descriptors added to the DMAC pool */
-static int add_desc(struct dma_pl330_dmac *pdmac, gfp_t flg, int count)
+static int add_desc(struct pl330_dmac *pl330, gfp_t flg, int count)
 {
 	struct dma_pl330_desc *desc;
 	unsigned long flags;
 	int i;
 
-	if (!pdmac)
-		return 0;
-
 	desc = kcalloc(count, sizeof(*desc), flg);
 	if (!desc)
 		return 0;
 
-	spin_lock_irqsave(&pdmac->pool_lock, flags);
+	spin_lock_irqsave(&pl330->pool_lock, flags);
 
 	for (i = 0; i < count; i++) {
 		_init_desc(&desc[i]);
-		list_add_tail(&desc[i].node, &pdmac->desc_pool);
+		list_add_tail(&desc[i].node, &pl330->desc_pool);
 	}
 
-	spin_unlock_irqrestore(&pdmac->pool_lock, flags);
+	spin_unlock_irqrestore(&pl330->pool_lock, flags);
 
 	return count;
 }
 
-static struct dma_pl330_desc *
-pluck_desc(struct dma_pl330_dmac *pdmac)
+static struct dma_pl330_desc *pluck_desc(struct pl330_dmac *pl330)
 {
 	struct dma_pl330_desc *desc = NULL;
 	unsigned long flags;
 
-	if (!pdmac)
-		return NULL;
-
-	spin_lock_irqsave(&pdmac->pool_lock, flags);
+	spin_lock_irqsave(&pl330->pool_lock, flags);
 
-	if (!list_empty(&pdmac->desc_pool)) {
-		desc = list_entry(pdmac->desc_pool.next,
+	if (!list_empty(&pl330->desc_pool)) {
+		desc = list_entry(pl330->desc_pool.next,
 				struct dma_pl330_desc, node);
 
 		list_del_init(&desc->node);
@@ -2548,29 +2259,29 @@ pluck_desc(struct dma_pl330_dmac *pdmac)
 		desc->txd.callback = NULL;
 	}
 
-	spin_unlock_irqrestore(&pdmac->pool_lock, flags);
+	spin_unlock_irqrestore(&pl330->pool_lock, flags);
 
 	return desc;
 }
 
 static struct dma_pl330_desc *pl330_get_desc(struct dma_pl330_chan *pch)
 {
-	struct dma_pl330_dmac *pdmac = pch->dmac;
+	struct pl330_dmac *pl330 = pch->dmac;
 	u8 *peri_id = pch->chan.private;
 	struct dma_pl330_desc *desc;
 
 	/* Pluck one desc from the pool of DMAC */
-	desc = pluck_desc(pdmac);
+	desc = pluck_desc(pl330);
 
 	/* If the DMAC pool is empty, alloc new */
 	if (!desc) {
-		if (!add_desc(pdmac, GFP_ATOMIC, 1))
+		if (!add_desc(pl330, GFP_ATOMIC, 1))
 			return NULL;
 
 		/* Try again */
-		desc = pluck_desc(pdmac);
+		desc = pluck_desc(pl330);
 		if (!desc) {
-			dev_err(pch->dmac->pif.dev,
+			dev_err(pch->dmac->ddma.dev,
 				"%s:%d ALERT!\n", __func__, __LINE__);
 			return NULL;
 		}
@@ -2581,8 +2292,8 @@ static struct dma_pl330_desc *pl330_get_desc(struct dma_pl330_chan *pch)
 	desc->txd.cookie = 0;
 	async_tx_ack(&desc->txd);
 
-	desc->req.peri = peri_id ? pch->chan.chan_id : 0;
-	desc->rqcfg.pcfg = &pch->dmac->pif.pcfg;
+	desc->peri = peri_id ? pch->chan.chan_id : 0;
+	desc->rqcfg.pcfg = &pch->dmac->pcfg;
 
 	dma_async_tx_descriptor_init(&desc->txd, &pch->chan);
 
@@ -2592,7 +2303,6 @@ static struct dma_pl330_desc *pl330_get_desc(struct dma_pl330_chan *pch)
 static inline void fill_px(struct pl330_xfer *px,
 		dma_addr_t dst, dma_addr_t src, size_t len)
 {
-	px->next = NULL;
 	px->bytes = len;
 	px->dst_addr = dst;
 	px->src_addr = src;
@@ -2605,7 +2315,7 @@ __pl330_prep_dma_memcpy(struct dma_pl330_chan *pch, dma_addr_t dst,
 	struct dma_pl330_desc *desc = pl330_get_desc(pch);
 
 	if (!desc) {
-		dev_err(pch->dmac->pif.dev, "%s:%d Unable to fetch desc\n",
+		dev_err(pch->dmac->ddma.dev, "%s:%d Unable to fetch desc\n",
 			__func__, __LINE__);
 		return NULL;
 	}
@@ -2629,11 +2339,11 @@ __pl330_prep_dma_memcpy(struct dma_pl330_chan *pch, dma_addr_t dst,
 static inline int get_burst_len(struct dma_pl330_desc *desc, size_t len)
 {
 	struct dma_pl330_chan *pch = desc->pchan;
-	struct pl330_info *pi = &pch->dmac->pif;
+	struct pl330_dmac *pl330 = pch->dmac;
 	int burst_len;
 
-	burst_len = pi->pcfg.data_bus_width / 8;
-	burst_len *= pi->pcfg.data_buf_dep;
+	burst_len = pl330->pcfg.data_bus_width / 8;
+	burst_len *= pl330->pcfg.data_buf_dep;
 	burst_len >>= desc->rqcfg.brst_size;
 
 	/* src/dst_burst_len can't be more than 16 */
@@ -2652,11 +2362,11 @@ static inline int get_burst_len(struct dma_pl330_desc *desc, size_t len)
 static struct dma_async_tx_descriptor *pl330_prep_dma_cyclic(
 		struct dma_chan *chan, dma_addr_t dma_addr, size_t len,
 		size_t period_len, enum dma_transfer_direction direction,
-		unsigned long flags, void *context)
+		unsigned long flags)
 {
 	struct dma_pl330_desc *desc = NULL, *first = NULL;
 	struct dma_pl330_chan *pch = to_pchan(chan);
-	struct dma_pl330_dmac *pdmac = pch->dmac;
+	struct pl330_dmac *pl330 = pch->dmac;
 	unsigned int i;
 	dma_addr_t dst;
 	dma_addr_t src;
@@ -2665,7 +2375,7 @@ static struct dma_async_tx_descriptor *pl330_prep_dma_cyclic(
 		return NULL;
 
 	if (!is_slave_direction(direction)) {
-		dev_err(pch->dmac->pif.dev, "%s:%d Invalid dma direction\n",
+		dev_err(pch->dmac->ddma.dev, "%s:%d Invalid dma direction\n",
 		__func__, __LINE__);
 		return NULL;
 	}
@@ -2673,23 +2383,23 @@ static struct dma_async_tx_descriptor *pl330_prep_dma_cyclic(
 	for (i = 0; i < len / period_len; i++) {
 		desc = pl330_get_desc(pch);
 		if (!desc) {
-			dev_err(pch->dmac->pif.dev, "%s:%d Unable to fetch desc\n",
+			dev_err(pch->dmac->ddma.dev, "%s:%d Unable to fetch desc\n",
 				__func__, __LINE__);
 
 			if (!first)
 				return NULL;
 
-			spin_lock_irqsave(&pdmac->pool_lock, flags);
+			spin_lock_irqsave(&pl330->pool_lock, flags);
 
 			while (!list_empty(&first->node)) {
 				desc = list_entry(first->node.next,
 						struct dma_pl330_desc, node);
-				list_move_tail(&desc->node, &pdmac->desc_pool);
+				list_move_tail(&desc->node, &pl330->desc_pool);
 			}
 
-			list_move_tail(&first->node, &pdmac->desc_pool);
+			list_move_tail(&first->node, &pl330->desc_pool);
 
-			spin_unlock_irqrestore(&pdmac->pool_lock, flags);
+			spin_unlock_irqrestore(&pl330->pool_lock, flags);
 
 			return NULL;
 		}
@@ -2698,14 +2408,12 @@ static struct dma_async_tx_descriptor *pl330_prep_dma_cyclic(
 		case DMA_MEM_TO_DEV:
 			desc->rqcfg.src_inc = 1;
 			desc->rqcfg.dst_inc = 0;
-			desc->req.rqtype = MEMTODEV;
 			src = dma_addr;
 			dst = pch->fifo_addr;
 			break;
 		case DMA_DEV_TO_MEM:
 			desc->rqcfg.src_inc = 0;
 			desc->rqcfg.dst_inc = 1;
-			desc->req.rqtype = DEVTOMEM;
 			src = pch->fifo_addr;
 			dst = dma_addr;
 			break;
@@ -2713,6 +2421,7 @@ static struct dma_async_tx_descriptor *pl330_prep_dma_cyclic(
 			break;
 		}
 
+		desc->rqtype = direction;
 		desc->rqcfg.brst_size = pch->burst_sz;
 		desc->rqcfg.brst_len = 1;
 		fill_px(&desc->px, dst, src, period_len);
@@ -2740,24 +2449,22 @@ pl330_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dst,
 {
 	struct dma_pl330_desc *desc;
 	struct dma_pl330_chan *pch = to_pchan(chan);
-	struct pl330_info *pi;
+	struct pl330_dmac *pl330 = pch->dmac;
 	int burst;
 
 	if (unlikely(!pch || !len))
 		return NULL;
 
-	pi = &pch->dmac->pif;
-
 	desc = __pl330_prep_dma_memcpy(pch, dst, src, len);
 	if (!desc)
 		return NULL;
 
 	desc->rqcfg.src_inc = 1;
 	desc->rqcfg.dst_inc = 1;
-	desc->req.rqtype = MEMTOMEM;
+	desc->rqtype = DMA_MEM_TO_MEM;
 
 	/* Select max possible burst size */
-	burst = pi->pcfg.data_bus_width / 8;
+	burst = pl330->pcfg.data_bus_width / 8;
 
 	while (burst > 1) {
 		if (!(len % burst))
@@ -2776,7 +2483,7 @@ pl330_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dst,
 	return &desc->txd;
 }
 
-static void __pl330_giveback_desc(struct dma_pl330_dmac *pdmac,
+static void __pl330_giveback_desc(struct pl330_dmac *pl330,
 				  struct dma_pl330_desc *first)
 {
 	unsigned long flags;
@@ -2785,17 +2492,17 @@ static void __pl330_giveback_desc(struct dma_pl330_dmac *pdmac,
 	if (!first)
 		return;
 
-	spin_lock_irqsave(&pdmac->pool_lock, flags);
+	spin_lock_irqsave(&pl330->pool_lock, flags);
 
 	while (!list_empty(&first->node)) {
 		desc = list_entry(first->node.next,
 				struct dma_pl330_desc, node);
-		list_move_tail(&desc->node, &pdmac->desc_pool);
+		list_move_tail(&desc->node, &pl330->desc_pool);
 	}
 
-	list_move_tail(&first->node, &pdmac->desc_pool);
+	list_move_tail(&first->node, &pl330->desc_pool);
 
-	spin_unlock_irqrestore(&pdmac->pool_lock, flags);
+	spin_unlock_irqrestore(&pl330->pool_lock, flags);
 }
 
 static struct dma_async_tx_descriptor *
@@ -2820,12 +2527,12 @@ pl330_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
 
 		desc = pl330_get_desc(pch);
 		if (!desc) {
-			struct dma_pl330_dmac *pdmac = pch->dmac;
+			struct pl330_dmac *pl330 = pch->dmac;
 
-			dev_err(pch->dmac->pif.dev,
+			dev_err(pch->dmac->ddma.dev,
 				"%s:%d Unable to fetch desc\n",
 				__func__, __LINE__);
-			__pl330_giveback_desc(pdmac, first);
+			__pl330_giveback_desc(pl330, first);
 
 			return NULL;
 		}
@@ -2838,19 +2545,18 @@ pl330_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
 		if (direction == DMA_MEM_TO_DEV) {
 			desc->rqcfg.src_inc = 1;
 			desc->rqcfg.dst_inc = 0;
-			desc->req.rqtype = MEMTODEV;
 			fill_px(&desc->px,
 				addr, sg_dma_address(sg), sg_dma_len(sg));
 		} else {
 			desc->rqcfg.src_inc = 0;
 			desc->rqcfg.dst_inc = 1;
-			desc->req.rqtype = DEVTOMEM;
 			fill_px(&desc->px,
 				sg_dma_address(sg), addr, sg_dma_len(sg));
 		}
 
 		desc->rqcfg.brst_size = pch->burst_sz;
 		desc->rqcfg.brst_len = 1;
+		desc->rqtype = direction;
 	}
 
 	/* Return the last desc in the chain */
@@ -2890,9 +2596,9 @@ static int
 pl330_probe(struct amba_device *adev, const struct amba_id *id)
 {
 	struct dma_pl330_platdata *pdat;
-	struct dma_pl330_dmac *pdmac;
+	struct pl330_config *pcfg;
+	struct pl330_dmac *pl330;
 	struct dma_pl330_chan *pch, *_p;
-	struct pl330_info *pi;
 	struct dma_device *pd;
 	struct resource *res;
 	int i, ret, irq;
@@ -2905,30 +2611,27 @@ pl330_probe(struct amba_device *adev, const struct amba_id *id)
 		return ret;
 
 	/* Allocate a new DMAC and its Channels */
-	pdmac = devm_kzalloc(&adev->dev, sizeof(*pdmac), GFP_KERNEL);
-	if (!pdmac) {
+	pl330 = devm_kzalloc(&adev->dev, sizeof(*pl330), GFP_KERNEL);
+	if (!pl330) {
 		dev_err(&adev->dev, "unable to allocate mem\n");
 		return -ENOMEM;
 	}
 
-	pi = &pdmac->pif;
-	pi->dev = &adev->dev;
-	pi->pl330_data = NULL;
-	pi->mcbufsz = pdat ? pdat->mcbuf_sz : 0;
+	pl330->mcbufsz = pdat ? pdat->mcbuf_sz : 0;
 
 	res = &adev->res;
-	pi->base = devm_ioremap_resource(&adev->dev, res);
-	if (IS_ERR(pi->base))
-		return PTR_ERR(pi->base);
+	pl330->base = devm_ioremap_resource(&adev->dev, res);
+	if (IS_ERR(pl330->base))
+		return PTR_ERR(pl330->base);
 
-	amba_set_drvdata(adev, pdmac);
+	amba_set_drvdata(adev, pl330);
 
 	for (i = 0; i < AMBA_NR_IRQS; i++) {
 		irq = adev->irq[i];
 		if (irq) {
 			ret = devm_request_irq(&adev->dev, irq,
 					       pl330_irq_handler, 0,
-					       dev_name(&adev->dev), pi);
+					       dev_name(&adev->dev), pl330);
 			if (ret)
 				return ret;
 		} else {
@@ -2936,38 +2639,40 @@ pl330_probe(struct amba_device *adev, const struct amba_id *id)
 		}
 	}
 
-	pi->pcfg.periph_id = adev->periphid;
-	ret = pl330_add(pi);
+	pcfg = &pl330->pcfg;
+
+	pcfg->periph_id = adev->periphid;
+	ret = pl330_add(pl330);
 	if (ret)
 		return ret;
 
-	INIT_LIST_HEAD(&pdmac->desc_pool);
-	spin_lock_init(&pdmac->pool_lock);
+	INIT_LIST_HEAD(&pl330->desc_pool);
+	spin_lock_init(&pl330->pool_lock);
 
 	/* Create a descriptor pool of default size */
-	if (!add_desc(pdmac, GFP_KERNEL, NR_DEFAULT_DESC))
+	if (!add_desc(pl330, GFP_KERNEL, NR_DEFAULT_DESC))
 		dev_warn(&adev->dev, "unable to allocate desc\n");
 
-	pd = &pdmac->ddma;
+	pd = &pl330->ddma;
 	INIT_LIST_HEAD(&pd->channels);
 
 	/* Initialize channel parameters */
 	if (pdat)
-		num_chan = max_t(int, pdat->nr_valid_peri, pi->pcfg.num_chan);
+		num_chan = max_t(int, pdat->nr_valid_peri, pcfg->num_chan);
 	else
-		num_chan = max_t(int, pi->pcfg.num_peri, pi->pcfg.num_chan);
+		num_chan = max_t(int, pcfg->num_peri, pcfg->num_chan);
 
-	pdmac->num_peripherals = num_chan;
+	pl330->num_peripherals = num_chan;
 
-	pdmac->peripherals = kzalloc(num_chan * sizeof(*pch), GFP_KERNEL);
-	if (!pdmac->peripherals) {
+	pl330->peripherals = kzalloc(num_chan * sizeof(*pch), GFP_KERNEL);
+	if (!pl330->peripherals) {
 		ret = -ENOMEM;
-		dev_err(&adev->dev, "unable to allocate pdmac->peripherals\n");
+		dev_err(&adev->dev, "unable to allocate pl330->peripherals\n");
 		goto probe_err2;
 	}
 
 	for (i = 0; i < num_chan; i++) {
-		pch = &pdmac->peripherals[i];
+		pch = &pl330->peripherals[i];
 		if (!adev->dev.of_node)
 			pch->chan.private = pdat ? &pdat->peri_id[i] : NULL;
 		else
@@ -2977,9 +2682,9 @@ pl330_probe(struct amba_device *adev, const struct amba_id *id)
 		INIT_LIST_HEAD(&pch->work_list);
 		INIT_LIST_HEAD(&pch->completed_list);
 		spin_lock_init(&pch->lock);
-		pch->pl330_chid = NULL;
+		pch->thread = NULL;
 		pch->chan.device = pd;
-		pch->dmac = pdmac;
+		pch->dmac = pl330;
 
 		/* Add the channel to the DMAC list */
 		list_add_tail(&pch->chan.device_node, &pd->channels);
@@ -2990,7 +2695,7 @@ pl330_probe(struct amba_device *adev, const struct amba_id *id)
 		pd->cap_mask = pdat->cap_mask;
 	} else {
 		dma_cap_set(DMA_MEMCPY, pd->cap_mask);
-		if (pi->pcfg.num_peri) {
+		if (pcfg->num_peri) {
 			dma_cap_set(DMA_SLAVE, pd->cap_mask);
 			dma_cap_set(DMA_CYCLIC, pd->cap_mask);
 			dma_cap_set(DMA_PRIVATE, pd->cap_mask);
@@ -3015,14 +2720,14 @@ pl330_probe(struct amba_device *adev, const struct amba_id *id)
 
 	if (adev->dev.of_node) {
 		ret = of_dma_controller_register(adev->dev.of_node,
-					 of_dma_pl330_xlate, pdmac);
+					 of_dma_pl330_xlate, pl330);
 		if (ret) {
 			dev_err(&adev->dev,
 			"unable to register DMA to the generic DT DMA helpers\n");
 		}
 	}
 
-	adev->dev.dma_parms = &pdmac->dma_parms;
+	adev->dev.dma_parms = &pl330->dma_parms;
 
 	/*
 	 * This is the limit for transfers with a buswidth of 1, larger
@@ -3037,14 +2742,13 @@ pl330_probe(struct amba_device *adev, const struct amba_id *id)
 		"Loaded driver for PL330 DMAC-%d\n", adev->periphid);
 	dev_info(&adev->dev,
 		"\tDBUFF-%ux%ubytes Num_Chans-%u Num_Peri-%u Num_Events-%u\n",
-		pi->pcfg.data_buf_dep,
-		pi->pcfg.data_bus_width / 8, pi->pcfg.num_chan,
-		pi->pcfg.num_peri, pi->pcfg.num_events);
+		pcfg->data_buf_dep, pcfg->data_bus_width / 8, pcfg->num_chan,
+		pcfg->num_peri, pcfg->num_events);
 
 	return 0;
 probe_err3:
 	/* Idle the DMAC */
-	list_for_each_entry_safe(pch, _p, &pdmac->ddma.channels,
+	list_for_each_entry_safe(pch, _p, &pl330->ddma.channels,
 			chan.device_node) {
 
 		/* Remove the channel */
@@ -3055,27 +2759,23 @@ probe_err3:
 		pl330_free_chan_resources(&pch->chan);
 	}
 probe_err2:
-	pl330_del(pi);
+	pl330_del(pl330);
 
 	return ret;
 }
 
 static int pl330_remove(struct amba_device *adev)
 {
-	struct dma_pl330_dmac *pdmac = amba_get_drvdata(adev);
+	struct pl330_dmac *pl330 = amba_get_drvdata(adev);
 	struct dma_pl330_chan *pch, *_p;
-	struct pl330_info *pi;
-
-	if (!pdmac)
-		return 0;
 
 	if (adev->dev.of_node)
 		of_dma_controller_free(adev->dev.of_node);
 
-	dma_async_device_unregister(&pdmac->ddma);
+	dma_async_device_unregister(&pl330->ddma);
 
 	/* Idle the DMAC */
-	list_for_each_entry_safe(pch, _p, &pdmac->ddma.channels,
+	list_for_each_entry_safe(pch, _p, &pl330->ddma.channels,
 			chan.device_node) {
 
 		/* Remove the channel */
@@ -3086,9 +2786,7 @@ static int pl330_remove(struct amba_device *adev)
 		pl330_free_chan_resources(&pch->chan);
 	}
 
-	pi = &pdmac->pif;
-
-	pl330_del(pi);
+	pl330_del(pl330);
 
 	return 0;
 }
diff --git a/drivers/dma/qcom_bam_dma.c b/drivers/dma/qcom_bam_dma.c
index 82c923146e49..7a4bbb0f80a5 100644
--- a/drivers/dma/qcom_bam_dma.c
+++ b/drivers/dma/qcom_bam_dma.c
@@ -61,12 +61,17 @@ struct bam_desc_hw {
 #define DESC_FLAG_INT BIT(15)
 #define DESC_FLAG_EOT BIT(14)
 #define DESC_FLAG_EOB BIT(13)
+#define DESC_FLAG_NWD BIT(12)
 
 struct bam_async_desc {
 	struct virt_dma_desc vd;
 
 	u32 num_desc;
 	u32 xfer_len;
+
+	/* transaction flags, EOT|EOB|NWD */
+	u16 flags;
+
 	struct bam_desc_hw *curr_desc;
 
 	enum dma_transfer_direction dir;
@@ -490,6 +495,14 @@ static struct dma_async_tx_descriptor *bam_prep_slave_sg(struct dma_chan *chan,
 	if (!async_desc)
 		goto err_out;
 
+	if (flags & DMA_PREP_FENCE)
+		async_desc->flags |= DESC_FLAG_NWD;
+
+	if (flags & DMA_PREP_INTERRUPT)
+		async_desc->flags |= DESC_FLAG_EOT;
+	else
+		async_desc->flags |= DESC_FLAG_INT;
+
 	async_desc->num_desc = num_alloc;
 	async_desc->curr_desc = async_desc->desc;
 	async_desc->dir = direction;
@@ -793,8 +806,11 @@ static void bam_start_dma(struct bam_chan *bchan)
 	else
 		async_desc->xfer_len = async_desc->num_desc;
 
-	/* set INT on last descriptor */
-	desc[async_desc->xfer_len - 1].flags |= DESC_FLAG_INT;
+	/* set any special flags on the last descriptor */
+	if (async_desc->num_desc == async_desc->xfer_len)
+		desc[async_desc->xfer_len - 1].flags = async_desc->flags;
+	else
+		desc[async_desc->xfer_len - 1].flags |= DESC_FLAG_INT;
 
 	if (bchan->tail + async_desc->xfer_len > MAX_DESCRIPTORS) {
 		u32 partial = MAX_DESCRIPTORS - bchan->tail;
diff --git a/drivers/dma/s3c24xx-dma.c b/drivers/dma/s3c24xx-dma.c
index 012520c9fd79..7416572d1e40 100644
--- a/drivers/dma/s3c24xx-dma.c
+++ b/drivers/dma/s3c24xx-dma.c
@@ -889,8 +889,7 @@ static struct dma_async_tx_descriptor *s3c24xx_dma_prep_memcpy(
 
 static struct dma_async_tx_descriptor *s3c24xx_dma_prep_dma_cyclic(
 	struct dma_chan *chan, dma_addr_t addr, size_t size, size_t period,
-	enum dma_transfer_direction direction, unsigned long flags,
-	void *context)
+	enum dma_transfer_direction direction, unsigned long flags)
 {
 	struct s3c24xx_dma_chan *s3cchan = to_s3c24xx_dma_chan(chan);
 	struct s3c24xx_dma_engine *s3cdma = s3cchan->host;
diff --git a/drivers/dma/sa11x0-dma.c b/drivers/dma/sa11x0-dma.c
index 5ebdfbc1051e..4b0ef043729a 100644
--- a/drivers/dma/sa11x0-dma.c
+++ b/drivers/dma/sa11x0-dma.c
@@ -612,7 +612,7 @@ static struct dma_async_tx_descriptor *sa11x0_dma_prep_slave_sg(
 
 static struct dma_async_tx_descriptor *sa11x0_dma_prep_dma_cyclic(
 	struct dma_chan *chan, dma_addr_t addr, size_t size, size_t period,
-	enum dma_transfer_direction dir, unsigned long flags, void *context)
+	enum dma_transfer_direction dir, unsigned long flags)
 {
 	struct sa11x0_dma_chan *c = to_sa11x0_dma_chan(chan);
 	struct sa11x0_dma_desc *txd;
diff --git a/drivers/dma/sh/Kconfig b/drivers/dma/sh/Kconfig
index 0f719816c91b..0349125a2e20 100644
--- a/drivers/dma/sh/Kconfig
+++ b/drivers/dma/sh/Kconfig
@@ -2,21 +2,39 @@
 # DMA engine configuration for sh
 #
 
+#
+# DMA Engine Helpers
+#
+
 config SH_DMAE_BASE
 	bool "Renesas SuperH DMA Engine support"
-	depends on (SUPERH && SH_DMA) || ARCH_SHMOBILE || COMPILE_TEST
+	depends on SUPERH || ARCH_SHMOBILE || COMPILE_TEST
+	depends on !SUPERH || SH_DMA
 	depends on !SH_DMA_API
 	default y
 	select DMA_ENGINE
 	help
 	  Enable support for the Renesas SuperH DMA controllers.
 
+#
+# DMA Controllers
+#
+
 config SH_DMAE
 	tristate "Renesas SuperH DMAC support"
 	depends on SH_DMAE_BASE
 	help
 	  Enable support for the Renesas SuperH DMA controllers.
 
+if SH_DMAE
+
+config SH_DMAE_R8A73A4
+	def_bool y
+	depends on ARCH_R8A73A4
+	depends on OF
+
+endif
+
 config SUDMAC
 	tristate "Renesas SUDMAC support"
 	depends on SH_DMAE_BASE
@@ -34,7 +52,3 @@ config RCAR_AUDMAC_PP
 	depends on SH_DMAE_BASE
 	help
 	  Enable support for the Renesas R-Car Audio DMAC Peripheral Peripheral controllers.
-
-config SHDMA_R8A73A4
-	def_bool y
-	depends on ARCH_R8A73A4 && SH_DMAE != n
diff --git a/drivers/dma/sh/Makefile b/drivers/dma/sh/Makefile
index 1ce88b28cfc6..0a5cfdb76e45 100644
--- a/drivers/dma/sh/Makefile
+++ b/drivers/dma/sh/Makefile
@@ -1,10 +1,18 @@
+#
+# DMA Engine Helpers
+#
+
 obj-$(CONFIG_SH_DMAE_BASE) += shdma-base.o shdma-of.o
-obj-$(CONFIG_SH_DMAE) += shdma.o
+
+#
+# DMA Controllers
+#
+
 shdma-y := shdmac.o
-ifeq ($(CONFIG_OF),y)
-shdma-$(CONFIG_SHDMA_R8A73A4) += shdma-r8a73a4.o
-endif
+shdma-$(CONFIG_SH_DMAE_R8A73A4) += shdma-r8a73a4.o
 shdma-objs := $(shdma-y)
+obj-$(CONFIG_SH_DMAE) += shdma.o
+
 obj-$(CONFIG_SUDMAC) += sudmac.o
 obj-$(CONFIG_RCAR_HPB_DMAE) += rcar-hpbdma.o
 obj-$(CONFIG_RCAR_AUDMAC_PP) += rcar-audmapp.o
diff --git a/drivers/dma/sh/rcar-audmapp.c b/drivers/dma/sh/rcar-audmapp.c
index 2de77289a2e9..dabbf0aba2e9 100644
--- a/drivers/dma/sh/rcar-audmapp.c
+++ b/drivers/dma/sh/rcar-audmapp.c
@@ -22,6 +22,7 @@
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/dmaengine.h>
+#include <linux/of_dma.h>
 #include <linux/platform_data/dma-rcar-audmapp.h>
 #include <linux/platform_device.h>
 #include <linux/shdma-base.h>
@@ -45,8 +46,9 @@
 
 struct audmapp_chan {
 	struct shdma_chan shdma_chan;
-	struct audmapp_slave_config *config;
 	void __iomem *base;
+	dma_addr_t slave_addr;
+	u32 chcr;
 };
 
 struct audmapp_device {
@@ -56,7 +58,16 @@ struct audmapp_device {
 	void __iomem *chan_reg;
 };
 
+struct audmapp_desc {
+	struct shdma_desc shdma_desc;
+	dma_addr_t src;
+	dma_addr_t dst;
+};
+
+#define to_shdma_chan(c) container_of(c, struct shdma_chan, dma_chan)
+
 #define to_chan(chan) container_of(chan, struct audmapp_chan, shdma_chan)
+#define to_desc(sdesc) container_of(sdesc, struct audmapp_desc, shdma_desc)
 #define to_dev(chan) container_of(chan->shdma_chan.dma_chan.device,	\
 				  struct audmapp_device, shdma_dev.dma_dev)
 
@@ -90,70 +101,82 @@ static void audmapp_halt(struct shdma_chan *schan)
 }
 
 static void audmapp_start_xfer(struct shdma_chan *schan,
-			       struct shdma_desc *sdecs)
+			       struct shdma_desc *sdesc)
 {
 	struct audmapp_chan *auchan = to_chan(schan);
 	struct audmapp_device *audev = to_dev(auchan);
-	struct audmapp_slave_config *cfg = auchan->config;
+	struct audmapp_desc *desc = to_desc(sdesc);
 	struct device *dev = audev->dev;
-	u32 chcr = cfg->chcr | PDMACHCR_DE;
+	u32 chcr = auchan->chcr | PDMACHCR_DE;
 
-	dev_dbg(dev, "src/dst/chcr = %pad/%pad/%x\n",
-		&cfg->src, &cfg->dst, cfg->chcr);
+	dev_dbg(dev, "src/dst/chcr = %pad/%pad/%08x\n",
+		&desc->src, &desc->dst, chcr);
 
-	audmapp_write(auchan, cfg->src,	PDMASAR);
-	audmapp_write(auchan, cfg->dst,	PDMADAR);
+	audmapp_write(auchan, desc->src,	PDMASAR);
+	audmapp_write(auchan, desc->dst,	PDMADAR);
 	audmapp_write(auchan, chcr,	PDMACHCR);
 }
 
-static struct audmapp_slave_config *
-audmapp_find_slave(struct audmapp_chan *auchan, int slave_id)
+static void audmapp_get_config(struct audmapp_chan *auchan, int slave_id,
+			      u32 *chcr, dma_addr_t *dst)
 {
 	struct audmapp_device *audev = to_dev(auchan);
 	struct audmapp_pdata *pdata = audev->pdata;
 	struct audmapp_slave_config *cfg;
 	int i;
 
+	*chcr	= 0;
+	*dst	= 0;
+
+	if (!pdata) { /* DT */
+		*chcr = ((u32)slave_id) << 16;
+		auchan->shdma_chan.slave_id = (slave_id) >> 8;
+		return;
+	}
+
+	/* non-DT */
+
 	if (slave_id >= AUDMAPP_SLAVE_NUMBER)
-		return NULL;
+		return;
 
 	for (i = 0, cfg = pdata->slave; i < pdata->slave_num; i++, cfg++)
-		if (cfg->slave_id == slave_id)
-			return cfg;
-
-	return NULL;
+		if (cfg->slave_id == slave_id) {
+			*chcr	= cfg->chcr;
+			*dst	= cfg->dst;
+			break;
+		}
 }
 
 static int audmapp_set_slave(struct shdma_chan *schan, int slave_id,
 			     dma_addr_t slave_addr, bool try)
 {
 	struct audmapp_chan *auchan = to_chan(schan);
-	struct audmapp_slave_config *cfg =
-		audmapp_find_slave(auchan, slave_id);
+	u32 chcr;
+	dma_addr_t dst;
+
+	audmapp_get_config(auchan, slave_id, &chcr, &dst);
 
-	if (!cfg)
-		return -ENODEV;
 	if (try)
 		return 0;
 
-	auchan->config	= cfg;
+	auchan->chcr		= chcr;
+	auchan->slave_addr	= slave_addr ? : dst;
 
 	return 0;
 }
 
 static int audmapp_desc_setup(struct shdma_chan *schan,
-			      struct shdma_desc *sdecs,
+			      struct shdma_desc *sdesc,
 			      dma_addr_t src, dma_addr_t dst, size_t *len)
 {
-	struct audmapp_chan *auchan = to_chan(schan);
-	struct audmapp_slave_config *cfg = auchan->config;
-
-	if (!cfg)
-		return -ENODEV;
+	struct audmapp_desc *desc = to_desc(sdesc);
 
 	if (*len > (size_t)AUDMAPP_LEN_MAX)
 		*len = (size_t)AUDMAPP_LEN_MAX;
 
+	desc->src = src;
+	desc->dst = dst;
+
 	return 0;
 }
 
@@ -164,7 +187,9 @@ static void audmapp_setup_xfer(struct shdma_chan *schan,
 
 static dma_addr_t audmapp_slave_addr(struct shdma_chan *schan)
 {
-	return 0; /* always fixed address */
+	struct audmapp_chan *auchan = to_chan(schan);
+
+	return auchan->slave_addr;
 }
 
 static bool audmapp_channel_busy(struct shdma_chan *schan)
@@ -183,7 +208,7 @@ static bool audmapp_desc_completed(struct shdma_chan *schan,
 
 static struct shdma_desc *audmapp_embedded_desc(void *buf, int i)
 {
-	return &((struct shdma_desc *)buf)[i];
+	return &((struct audmapp_desc *)buf)[i].shdma_desc;
 }
 
 static const struct shdma_ops audmapp_shdma_ops = {
@@ -234,16 +259,39 @@ static void audmapp_chan_remove(struct audmapp_device *audev)
 	dma_dev->chancnt = 0;
 }
 
+static struct dma_chan *audmapp_of_xlate(struct of_phandle_args *dma_spec,
+					 struct of_dma *ofdma)
+{
+	dma_cap_mask_t mask;
+	struct dma_chan *chan;
+	u32 chcr = dma_spec->args[0];
+
+	if (dma_spec->args_count != 1)
+		return NULL;
+
+	dma_cap_zero(mask);
+	dma_cap_set(DMA_SLAVE, mask);
+
+	chan = dma_request_channel(mask, shdma_chan_filter, NULL);
+	if (chan)
+		to_shdma_chan(chan)->hw_req = chcr;
+
+	return chan;
+}
+
 static int audmapp_probe(struct platform_device *pdev)
 {
 	struct audmapp_pdata *pdata = pdev->dev.platform_data;
+	struct device_node *np = pdev->dev.of_node;
 	struct audmapp_device *audev;
 	struct shdma_dev *sdev;
 	struct dma_device *dma_dev;
 	struct resource *res;
 	int err, i;
 
-	if (!pdata)
+	if (np)
+		of_dma_controller_register(np, audmapp_of_xlate, pdev);
+	else if (!pdata)
 		return -ENODEV;
 
 	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
@@ -260,7 +308,7 @@ static int audmapp_probe(struct platform_device *pdev)
 
 	sdev		= &audev->shdma_dev;
 	sdev->ops	= &audmapp_shdma_ops;
-	sdev->desc_size	= sizeof(struct shdma_desc);
+	sdev->desc_size	= sizeof(struct audmapp_desc);
 
 	dma_dev			= &sdev->dma_dev;
 	dma_dev->copy_align	= LOG2_DEFAULT_XFER_SIZE;
@@ -305,12 +353,18 @@ static int audmapp_remove(struct platform_device *pdev)
 	return 0;
 }
 
+static const struct of_device_id audmapp_of_match[] = {
+	{ .compatible = "renesas,rcar-audmapp", },
+	{},
+};
+
 static struct platform_driver audmapp_driver = {
 	.probe		= audmapp_probe,
 	.remove		= audmapp_remove,
 	.driver		= {
 		.owner	= THIS_MODULE,
 		.name	= "rcar-audmapp-engine",
+		.of_match_table = audmapp_of_match,
 	},
 };
 module_platform_driver(audmapp_driver);
diff --git a/drivers/dma/sh/shdma-arm.h b/drivers/dma/sh/shdma-arm.h
index a2b8258426c9..a1b0ef45d6a2 100644
--- a/drivers/dma/sh/shdma-arm.h
+++ b/drivers/dma/sh/shdma-arm.h
@@ -45,7 +45,7 @@ enum {
 	((((i) & TS_LOW_BIT) << TS_LOW_SHIFT) |\
 	 (((i) & TS_HI_BIT)  << TS_HI_SHIFT))
 
-#define CHCR_TX(xmit_sz) (DM_FIX | SM_INC | 0x800 | TS_INDEX2VAL((xmit_sz)))
-#define CHCR_RX(xmit_sz) (DM_INC | SM_FIX | 0x800 | TS_INDEX2VAL((xmit_sz)))
+#define CHCR_TX(xmit_sz) (DM_FIX | SM_INC | RS_ERS | TS_INDEX2VAL((xmit_sz)))
+#define CHCR_RX(xmit_sz) (DM_INC | SM_FIX | RS_ERS | TS_INDEX2VAL((xmit_sz)))
 
 #endif
diff --git a/drivers/dma/sh/shdma-base.c b/drivers/dma/sh/shdma-base.c
index b35007e21e6b..42d497416196 100644
--- a/drivers/dma/sh/shdma-base.c
+++ b/drivers/dma/sh/shdma-base.c
@@ -206,45 +206,6 @@ static int shdma_setup_slave(struct shdma_chan *schan, int slave_id,
 	return 0;
 }
 
-/*
- * This is the standard shdma filter function to be used as a replacement to the
- * "old" method, using the .private pointer. If for some reason you allocate a
- * channel without slave data, use something like ERR_PTR(-EINVAL) as a filter
- * parameter. If this filter is used, the slave driver, after calling
- * dma_request_channel(), will also have to call dmaengine_slave_config() with
- * .slave_id, .direction, and either .src_addr or .dst_addr set.
- * NOTE: this filter doesn't support multiple DMAC drivers with the DMA_SLAVE
- * capability! If this becomes a requirement, hardware glue drivers, using this
- * services would have to provide their own filters, which first would check
- * the device driver, similar to how other DMAC drivers, e.g., sa11x0-dma.c, do
- * this, and only then, in case of a match, call this common filter.
- * NOTE 2: This filter function is also used in the DT case by shdma_of_xlate().
- * In that case the MID-RID value is used for slave channel filtering and is
- * passed to this function in the "arg" parameter.
- */
-bool shdma_chan_filter(struct dma_chan *chan, void *arg)
-{
-	struct shdma_chan *schan = to_shdma_chan(chan);
-	struct shdma_dev *sdev = to_shdma_dev(schan->dma_chan.device);
-	const struct shdma_ops *ops = sdev->ops;
-	int match = (long)arg;
-	int ret;
-
-	if (match < 0)
-		/* No slave requested - arbitrary channel */
-		return true;
-
-	if (!schan->dev->of_node && match >= slave_num)
-		return false;
-
-	ret = ops->set_slave(schan, match, 0, true);
-	if (ret < 0)
-		return false;
-
-	return true;
-}
-EXPORT_SYMBOL(shdma_chan_filter);
-
 static int shdma_alloc_chan_resources(struct dma_chan *chan)
 {
 	struct shdma_chan *schan = to_shdma_chan(chan);
@@ -295,6 +256,51 @@ esetslave:
 	return ret;
 }
 
+/*
+ * This is the standard shdma filter function to be used as a replacement to the
+ * "old" method, using the .private pointer. If for some reason you allocate a
+ * channel without slave data, use something like ERR_PTR(-EINVAL) as a filter
+ * parameter. If this filter is used, the slave driver, after calling
+ * dma_request_channel(), will also have to call dmaengine_slave_config() with
+ * .slave_id, .direction, and either .src_addr or .dst_addr set.
+ * NOTE: this filter doesn't support multiple DMAC drivers with the DMA_SLAVE
+ * capability! If this becomes a requirement, hardware glue drivers, using this
+ * services would have to provide their own filters, which first would check
+ * the device driver, similar to how other DMAC drivers, e.g., sa11x0-dma.c, do
+ * this, and only then, in case of a match, call this common filter.
+ * NOTE 2: This filter function is also used in the DT case by shdma_of_xlate().
+ * In that case the MID-RID value is used for slave channel filtering and is
+ * passed to this function in the "arg" parameter.
+ */
+bool shdma_chan_filter(struct dma_chan *chan, void *arg)
+{
+	struct shdma_chan *schan;
+	struct shdma_dev *sdev;
+	int match = (long)arg;
+	int ret;
+
+	/* Only support channels handled by this driver. */
+	if (chan->device->device_alloc_chan_resources !=
+	    shdma_alloc_chan_resources)
+		return false;
+
+	if (match < 0)
+		/* No slave requested - arbitrary channel */
+		return true;
+
+	schan = to_shdma_chan(chan);
+	if (!schan->dev->of_node && match >= slave_num)
+		return false;
+
+	sdev = to_shdma_dev(schan->dma_chan.device);
+	ret = sdev->ops->set_slave(schan, match, 0, true);
+	if (ret < 0)
+		return false;
+
+	return true;
+}
+EXPORT_SYMBOL(shdma_chan_filter);
+
 static dma_async_tx_callback __ld_cleanup(struct shdma_chan *schan, bool all)
 {
 	struct shdma_desc *desc, *_desc;
@@ -662,15 +668,16 @@ static struct dma_async_tx_descriptor *shdma_prep_slave_sg(
 static struct dma_async_tx_descriptor *shdma_prep_dma_cyclic(
 	struct dma_chan *chan, dma_addr_t buf_addr, size_t buf_len,
 	size_t period_len, enum dma_transfer_direction direction,
-	unsigned long flags, void *context)
+	unsigned long flags)
 {
 	struct shdma_chan *schan = to_shdma_chan(chan);
 	struct shdma_dev *sdev = to_shdma_dev(schan->dma_chan.device);
+	struct dma_async_tx_descriptor *desc;
 	const struct shdma_ops *ops = sdev->ops;
 	unsigned int sg_len = buf_len / period_len;
 	int slave_id = schan->slave_id;
 	dma_addr_t slave_addr;
-	struct scatterlist sgl[SHDMA_MAX_SG_LEN];
+	struct scatterlist *sgl;
 	int i;
 
 	if (!chan)
@@ -694,7 +701,16 @@ static struct dma_async_tx_descriptor *shdma_prep_dma_cyclic(
 
 	slave_addr = ops->slave_addr(schan);
 
+	/*
+	 * Allocate the sg list dynamically as it would consumer too much stack
+	 * space.
+	 */
+	sgl = kcalloc(sg_len, sizeof(*sgl), GFP_KERNEL);
+	if (!sgl)
+		return NULL;
+
 	sg_init_table(sgl, sg_len);
+
 	for (i = 0; i < sg_len; i++) {
 		dma_addr_t src = buf_addr + (period_len * i);
 
@@ -704,8 +720,11 @@ static struct dma_async_tx_descriptor *shdma_prep_dma_cyclic(
 		sg_dma_len(&sgl[i]) = period_len;
 	}
 
-	return shdma_prep_sg(schan, sgl, sg_len, &slave_addr,
+	desc = shdma_prep_sg(schan, sgl, sg_len, &slave_addr,
 			     direction, flags, true);
+
+	kfree(sgl);
+	return desc;
 }
 
 static int shdma_control(struct dma_chan *chan, enum dma_ctrl_cmd cmd,
diff --git a/drivers/dma/sh/shdma.h b/drivers/dma/sh/shdma.h
index 758a57b51875..2c0a969adc9f 100644
--- a/drivers/dma/sh/shdma.h
+++ b/drivers/dma/sh/shdma.h
@@ -62,7 +62,7 @@ struct sh_dmae_desc {
 #define to_sh_dev(chan) container_of(chan->shdma_chan.dma_chan.device,\
 				     struct sh_dmae_device, shdma_dev.dma_dev)
 
-#ifdef CONFIG_SHDMA_R8A73A4
+#ifdef CONFIG_SH_DMAE_R8A73A4
 extern const struct sh_dmae_pdata r8a73a4_dma_pdata;
 #define r8a73a4_shdma_devid (&r8a73a4_dma_pdata)
 #else
diff --git a/drivers/dma/sh/shdmac.c b/drivers/dma/sh/shdmac.c
index 146d5df926db..58eb85770eba 100644
--- a/drivers/dma/sh/shdmac.c
+++ b/drivers/dma/sh/shdmac.c
@@ -38,12 +38,12 @@
 #include "../dmaengine.h"
 #include "shdma.h"
 
-/* DMA register */
-#define SAR	0x00
-#define DAR	0x04
-#define TCR	0x08
-#define CHCR	0x0C
-#define DMAOR	0x40
+/* DMA registers */
+#define SAR	0x00	/* Source Address Register */
+#define DAR	0x04	/* Destination Address Register */
+#define TCR	0x08	/* Transfer Count Register */
+#define CHCR	0x0C	/* Channel Control Register */
+#define DMAOR	0x40	/* DMA Operation Register */
 
 #define TEND	0x18 /* USB-DMAC */
 
@@ -239,9 +239,8 @@ static void dmae_init(struct sh_dmae_chan *sh_chan)
 {
 	/*
 	 * Default configuration for dual address memory-memory transfer.
-	 * 0x400 represents auto-request.
 	 */
-	u32 chcr = DM_INC | SM_INC | 0x400 | log2size_to_chcr(sh_chan,
+	u32 chcr = DM_INC | SM_INC | RS_AUTO | log2size_to_chcr(sh_chan,
 						   LOG2_DEFAULT_XFER_SIZE);
 	sh_chan->xmit_shift = calc_xmit_shift(sh_chan, chcr);
 	chcr_write(sh_chan, chcr);
diff --git a/drivers/dma/sirf-dma.c b/drivers/dma/sirf-dma.c
index 03f7820fa333..aac03ab10c54 100644
--- a/drivers/dma/sirf-dma.c
+++ b/drivers/dma/sirf-dma.c
@@ -580,7 +580,7 @@ err_dir:
 static struct dma_async_tx_descriptor *
 sirfsoc_dma_prep_cyclic(struct dma_chan *chan, dma_addr_t addr,
 	size_t buf_len, size_t period_len,
-	enum dma_transfer_direction direction, unsigned long flags, void *context)
+	enum dma_transfer_direction direction, unsigned long flags)
 {
 	struct sirfsoc_dma_chan *schan = dma_chan_to_sirfsoc_dma_chan(chan);
 	struct sirfsoc_dma_desc *sdesc = NULL;
diff --git a/drivers/dma/ste_dma40.c b/drivers/dma/ste_dma40.c
index c7984459ede7..5fe59335e247 100644
--- a/drivers/dma/ste_dma40.c
+++ b/drivers/dma/ste_dma40.c
@@ -2531,8 +2531,7 @@ d40_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
 static struct dma_async_tx_descriptor *
 dma40_prep_dma_cyclic(struct dma_chan *chan, dma_addr_t dma_addr,
 		     size_t buf_len, size_t period_len,
-		     enum dma_transfer_direction direction, unsigned long flags,
-		     void *context)
+		     enum dma_transfer_direction direction, unsigned long flags)
 {
 	unsigned int periods = buf_len / period_len;
 	struct dma_async_tx_descriptor *txd;
diff --git a/drivers/dma/sun6i-dma.c b/drivers/dma/sun6i-dma.c
new file mode 100644
index 000000000000..1f92a56fd2b6
--- /dev/null
+++ b/drivers/dma/sun6i-dma.c
@@ -0,0 +1,1053 @@
+/*
+ * Copyright (C) 2013-2014 Allwinner Tech Co., Ltd
+ * Author: Sugar <shuge@allwinnertech.com>
+ *
+ * Copyright (C) 2014 Maxime Ripard
+ * Maxime Ripard <maxime.ripard@free-electrons.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/clk.h>
+#include <linux/delay.h>
+#include <linux/dmaengine.h>
+#include <linux/dmapool.h>
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/of_dma.h>
+#include <linux/platform_device.h>
+#include <linux/reset.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+
+#include "virt-dma.h"
+
+/*
+ * There's 16 physical channels that can work in parallel.
+ *
+ * However we have 30 different endpoints for our requests.
+ *
+ * Since the channels are able to handle only an unidirectional
+ * transfer, we need to allocate more virtual channels so that
+ * everyone can grab one channel.
+ *
+ * Some devices can't work in both direction (mostly because it
+ * wouldn't make sense), so we have a bit fewer virtual channels than
+ * 2 channels per endpoints.
+ */
+
+#define NR_MAX_CHANNELS		16
+#define NR_MAX_REQUESTS		30
+#define NR_MAX_VCHANS		53
+
+/*
+ * Common registers
+ */
+#define DMA_IRQ_EN(x)		((x) * 0x04)
+#define DMA_IRQ_HALF			BIT(0)
+#define DMA_IRQ_PKG			BIT(1)
+#define DMA_IRQ_QUEUE			BIT(2)
+
+#define DMA_IRQ_CHAN_NR			8
+#define DMA_IRQ_CHAN_WIDTH		4
+
+
+#define DMA_IRQ_STAT(x)		((x) * 0x04 + 0x10)
+
+#define DMA_STAT		0x30
+
+/*
+ * Channels specific registers
+ */
+#define DMA_CHAN_ENABLE		0x00
+#define DMA_CHAN_ENABLE_START		BIT(0)
+#define DMA_CHAN_ENABLE_STOP		0
+
+#define DMA_CHAN_PAUSE		0x04
+#define DMA_CHAN_PAUSE_PAUSE		BIT(1)
+#define DMA_CHAN_PAUSE_RESUME		0
+
+#define DMA_CHAN_LLI_ADDR	0x08
+
+#define DMA_CHAN_CUR_CFG	0x0c
+#define DMA_CHAN_CFG_SRC_DRQ(x)		((x) & 0x1f)
+#define DMA_CHAN_CFG_SRC_IO_MODE	BIT(5)
+#define DMA_CHAN_CFG_SRC_LINEAR_MODE	(0 << 5)
+#define DMA_CHAN_CFG_SRC_BURST(x)	(((x) & 0x3) << 7)
+#define DMA_CHAN_CFG_SRC_WIDTH(x)	(((x) & 0x3) << 9)
+
+#define DMA_CHAN_CFG_DST_DRQ(x)		(DMA_CHAN_CFG_SRC_DRQ(x) << 16)
+#define DMA_CHAN_CFG_DST_IO_MODE	(DMA_CHAN_CFG_SRC_IO_MODE << 16)
+#define DMA_CHAN_CFG_DST_LINEAR_MODE	(DMA_CHAN_CFG_SRC_LINEAR_MODE << 16)
+#define DMA_CHAN_CFG_DST_BURST(x)	(DMA_CHAN_CFG_SRC_BURST(x) << 16)
+#define DMA_CHAN_CFG_DST_WIDTH(x)	(DMA_CHAN_CFG_SRC_WIDTH(x) << 16)
+
+#define DMA_CHAN_CUR_SRC	0x10
+
+#define DMA_CHAN_CUR_DST	0x14
+
+#define DMA_CHAN_CUR_CNT	0x18
+
+#define DMA_CHAN_CUR_PARA	0x1c
+
+
+/*
+ * Various hardware related defines
+ */
+#define LLI_LAST_ITEM	0xfffff800
+#define NORMAL_WAIT	8
+#define DRQ_SDRAM	1
+
+/*
+ * Hardware representation of the LLI
+ *
+ * The hardware will be fed the physical address of this structure,
+ * and read its content in order to start the transfer.
+ */
+struct sun6i_dma_lli {
+	u32			cfg;
+	u32			src;
+	u32			dst;
+	u32			len;
+	u32			para;
+	u32			p_lli_next;
+
+	/*
+	 * This field is not used by the DMA controller, but will be
+	 * used by the CPU to go through the list (mostly for dumping
+	 * or freeing it).
+	 */
+	struct sun6i_dma_lli	*v_lli_next;
+};
+
+
+struct sun6i_desc {
+	struct virt_dma_desc	vd;
+	dma_addr_t		p_lli;
+	struct sun6i_dma_lli	*v_lli;
+};
+
+struct sun6i_pchan {
+	u32			idx;
+	void __iomem		*base;
+	struct sun6i_vchan	*vchan;
+	struct sun6i_desc	*desc;
+	struct sun6i_desc	*done;
+};
+
+struct sun6i_vchan {
+	struct virt_dma_chan	vc;
+	struct list_head	node;
+	struct dma_slave_config	cfg;
+	struct sun6i_pchan	*phy;
+	u8			port;
+};
+
+struct sun6i_dma_dev {
+	struct dma_device	slave;
+	void __iomem		*base;
+	struct clk		*clk;
+	int			irq;
+	spinlock_t		lock;
+	struct reset_control	*rstc;
+	struct tasklet_struct	task;
+	atomic_t		tasklet_shutdown;
+	struct list_head	pending;
+	struct dma_pool		*pool;
+	struct sun6i_pchan	*pchans;
+	struct sun6i_vchan	*vchans;
+};
+
+static struct device *chan2dev(struct dma_chan *chan)
+{
+	return &chan->dev->device;
+}
+
+static inline struct sun6i_dma_dev *to_sun6i_dma_dev(struct dma_device *d)
+{
+	return container_of(d, struct sun6i_dma_dev, slave);
+}
+
+static inline struct sun6i_vchan *to_sun6i_vchan(struct dma_chan *chan)
+{
+	return container_of(chan, struct sun6i_vchan, vc.chan);
+}
+
+static inline struct sun6i_desc *
+to_sun6i_desc(struct dma_async_tx_descriptor *tx)
+{
+	return container_of(tx, struct sun6i_desc, vd.tx);
+}
+
+static inline void sun6i_dma_dump_com_regs(struct sun6i_dma_dev *sdev)
+{
+	dev_dbg(sdev->slave.dev, "Common register:\n"
+		"\tmask0(%04x): 0x%08x\n"
+		"\tmask1(%04x): 0x%08x\n"
+		"\tpend0(%04x): 0x%08x\n"
+		"\tpend1(%04x): 0x%08x\n"
+		"\tstats(%04x): 0x%08x\n",
+		DMA_IRQ_EN(0), readl(sdev->base + DMA_IRQ_EN(0)),
+		DMA_IRQ_EN(1), readl(sdev->base + DMA_IRQ_EN(1)),
+		DMA_IRQ_STAT(0), readl(sdev->base + DMA_IRQ_STAT(0)),
+		DMA_IRQ_STAT(1), readl(sdev->base + DMA_IRQ_STAT(1)),
+		DMA_STAT, readl(sdev->base + DMA_STAT));
+}
+
+static inline void sun6i_dma_dump_chan_regs(struct sun6i_dma_dev *sdev,
+					    struct sun6i_pchan *pchan)
+{
+	phys_addr_t reg = virt_to_phys(pchan->base);
+
+	dev_dbg(sdev->slave.dev, "Chan %d reg: %pa\n"
+		"\t___en(%04x): \t0x%08x\n"
+		"\tpause(%04x): \t0x%08x\n"
+		"\tstart(%04x): \t0x%08x\n"
+		"\t__cfg(%04x): \t0x%08x\n"
+		"\t__src(%04x): \t0x%08x\n"
+		"\t__dst(%04x): \t0x%08x\n"
+		"\tcount(%04x): \t0x%08x\n"
+		"\t_para(%04x): \t0x%08x\n\n",
+		pchan->idx, &reg,
+		DMA_CHAN_ENABLE,
+		readl(pchan->base + DMA_CHAN_ENABLE),
+		DMA_CHAN_PAUSE,
+		readl(pchan->base + DMA_CHAN_PAUSE),
+		DMA_CHAN_LLI_ADDR,
+		readl(pchan->base + DMA_CHAN_LLI_ADDR),
+		DMA_CHAN_CUR_CFG,
+		readl(pchan->base + DMA_CHAN_CUR_CFG),
+		DMA_CHAN_CUR_SRC,
+		readl(pchan->base + DMA_CHAN_CUR_SRC),
+		DMA_CHAN_CUR_DST,
+		readl(pchan->base + DMA_CHAN_CUR_DST),
+		DMA_CHAN_CUR_CNT,
+		readl(pchan->base + DMA_CHAN_CUR_CNT),
+		DMA_CHAN_CUR_PARA,
+		readl(pchan->base + DMA_CHAN_CUR_PARA));
+}
+
+static inline int convert_burst(u32 maxburst, u8 *burst)
+{
+	switch (maxburst) {
+	case 1:
+		*burst = 0;
+		break;
+	case 8:
+		*burst = 2;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static inline int convert_buswidth(enum dma_slave_buswidth addr_width, u8 *width)
+{
+	if ((addr_width < DMA_SLAVE_BUSWIDTH_1_BYTE) ||
+	    (addr_width > DMA_SLAVE_BUSWIDTH_4_BYTES))
+		return -EINVAL;
+
+	*width = addr_width >> 1;
+	return 0;
+}
+
+static void *sun6i_dma_lli_add(struct sun6i_dma_lli *prev,
+			       struct sun6i_dma_lli *next,
+			       dma_addr_t next_phy,
+			       struct sun6i_desc *txd)
+{
+	if ((!prev && !txd) || !next)
+		return NULL;
+
+	if (!prev) {
+		txd->p_lli = next_phy;
+		txd->v_lli = next;
+	} else {
+		prev->p_lli_next = next_phy;
+		prev->v_lli_next = next;
+	}
+
+	next->p_lli_next = LLI_LAST_ITEM;
+	next->v_lli_next = NULL;
+
+	return next;
+}
+
+static inline int sun6i_dma_cfg_lli(struct sun6i_dma_lli *lli,
+				    dma_addr_t src,
+				    dma_addr_t dst, u32 len,
+				    struct dma_slave_config *config)
+{
+	u8 src_width, dst_width, src_burst, dst_burst;
+	int ret;
+
+	if (!config)
+		return -EINVAL;
+
+	ret = convert_burst(config->src_maxburst, &src_burst);
+	if (ret)
+		return ret;
+
+	ret = convert_burst(config->dst_maxburst, &dst_burst);
+	if (ret)
+		return ret;
+
+	ret = convert_buswidth(config->src_addr_width, &src_width);
+	if (ret)
+		return ret;
+
+	ret = convert_buswidth(config->dst_addr_width, &dst_width);
+	if (ret)
+		return ret;
+
+	lli->cfg = DMA_CHAN_CFG_SRC_BURST(src_burst) |
+		DMA_CHAN_CFG_SRC_WIDTH(src_width) |
+		DMA_CHAN_CFG_DST_BURST(dst_burst) |
+		DMA_CHAN_CFG_DST_WIDTH(dst_width);
+
+	lli->src = src;
+	lli->dst = dst;
+	lli->len = len;
+	lli->para = NORMAL_WAIT;
+
+	return 0;
+}
+
+static inline void sun6i_dma_dump_lli(struct sun6i_vchan *vchan,
+				      struct sun6i_dma_lli *lli)
+{
+	phys_addr_t p_lli = virt_to_phys(lli);
+
+	dev_dbg(chan2dev(&vchan->vc.chan),
+		"\n\tdesc:   p - %pa v - 0x%p\n"
+		"\t\tc - 0x%08x s - 0x%08x d - 0x%08x\n"
+		"\t\tl - 0x%08x p - 0x%08x n - 0x%08x\n",
+		&p_lli, lli,
+		lli->cfg, lli->src, lli->dst,
+		lli->len, lli->para, lli->p_lli_next);
+}
+
+static void sun6i_dma_free_desc(struct virt_dma_desc *vd)
+{
+	struct sun6i_desc *txd = to_sun6i_desc(&vd->tx);
+	struct sun6i_dma_dev *sdev = to_sun6i_dma_dev(vd->tx.chan->device);
+	struct sun6i_dma_lli *v_lli, *v_next;
+	dma_addr_t p_lli, p_next;
+
+	if (unlikely(!txd))
+		return;
+
+	p_lli = txd->p_lli;
+	v_lli = txd->v_lli;
+
+	while (v_lli) {
+		v_next = v_lli->v_lli_next;
+		p_next = v_lli->p_lli_next;
+
+		dma_pool_free(sdev->pool, v_lli, p_lli);
+
+		v_lli = v_next;
+		p_lli = p_next;
+	}
+
+	kfree(txd);
+}
+
+static int sun6i_dma_terminate_all(struct sun6i_vchan *vchan)
+{
+	struct sun6i_dma_dev *sdev = to_sun6i_dma_dev(vchan->vc.chan.device);
+	struct sun6i_pchan *pchan = vchan->phy;
+	unsigned long flags;
+	LIST_HEAD(head);
+
+	spin_lock(&sdev->lock);
+	list_del_init(&vchan->node);
+	spin_unlock(&sdev->lock);
+
+	spin_lock_irqsave(&vchan->vc.lock, flags);
+
+	vchan_get_all_descriptors(&vchan->vc, &head);
+
+	if (pchan) {
+		writel(DMA_CHAN_ENABLE_STOP, pchan->base + DMA_CHAN_ENABLE);
+		writel(DMA_CHAN_PAUSE_RESUME, pchan->base + DMA_CHAN_PAUSE);
+
+		vchan->phy = NULL;
+		pchan->vchan = NULL;
+		pchan->desc = NULL;
+		pchan->done = NULL;
+	}
+
+	spin_unlock_irqrestore(&vchan->vc.lock, flags);
+
+	vchan_dma_desc_free_list(&vchan->vc, &head);
+
+	return 0;
+}
+
+static int sun6i_dma_start_desc(struct sun6i_vchan *vchan)
+{
+	struct sun6i_dma_dev *sdev = to_sun6i_dma_dev(vchan->vc.chan.device);
+	struct virt_dma_desc *desc = vchan_next_desc(&vchan->vc);
+	struct sun6i_pchan *pchan = vchan->phy;
+	u32 irq_val, irq_reg, irq_offset;
+
+	if (!pchan)
+		return -EAGAIN;
+
+	if (!desc) {
+		pchan->desc = NULL;
+		pchan->done = NULL;
+		return -EAGAIN;
+	}
+
+	list_del(&desc->node);
+
+	pchan->desc = to_sun6i_desc(&desc->tx);
+	pchan->done = NULL;
+
+	sun6i_dma_dump_lli(vchan, pchan->desc->v_lli);
+
+	irq_reg = pchan->idx / DMA_IRQ_CHAN_NR;
+	irq_offset = pchan->idx % DMA_IRQ_CHAN_NR;
+
+	irq_val = readl(sdev->base + DMA_IRQ_EN(irq_offset));
+	irq_val |= DMA_IRQ_QUEUE << (irq_offset * DMA_IRQ_CHAN_WIDTH);
+	writel(irq_val, sdev->base + DMA_IRQ_EN(irq_offset));
+
+	writel(pchan->desc->p_lli, pchan->base + DMA_CHAN_LLI_ADDR);
+	writel(DMA_CHAN_ENABLE_START, pchan->base + DMA_CHAN_ENABLE);
+
+	sun6i_dma_dump_com_regs(sdev);
+	sun6i_dma_dump_chan_regs(sdev, pchan);
+
+	return 0;
+}
+
+static void sun6i_dma_tasklet(unsigned long data)
+{
+	struct sun6i_dma_dev *sdev = (struct sun6i_dma_dev *)data;
+	struct sun6i_vchan *vchan;
+	struct sun6i_pchan *pchan;
+	unsigned int pchan_alloc = 0;
+	unsigned int pchan_idx;
+
+	list_for_each_entry(vchan, &sdev->slave.channels, vc.chan.device_node) {
+		spin_lock_irq(&vchan->vc.lock);
+
+		pchan = vchan->phy;
+
+		if (pchan && pchan->done) {
+			if (sun6i_dma_start_desc(vchan)) {
+				/*
+				 * No current txd associated with this channel
+				 */
+				dev_dbg(sdev->slave.dev, "pchan %u: free\n",
+					pchan->idx);
+
+				/* Mark this channel free */
+				vchan->phy = NULL;
+				pchan->vchan = NULL;
+			}
+		}
+		spin_unlock_irq(&vchan->vc.lock);
+	}
+
+	spin_lock_irq(&sdev->lock);
+	for (pchan_idx = 0; pchan_idx < NR_MAX_CHANNELS; pchan_idx++) {
+		pchan = &sdev->pchans[pchan_idx];
+
+		if (pchan->vchan || list_empty(&sdev->pending))
+			continue;
+
+		vchan = list_first_entry(&sdev->pending,
+					 struct sun6i_vchan, node);
+
+		/* Remove from pending channels */
+		list_del_init(&vchan->node);
+		pchan_alloc |= BIT(pchan_idx);
+
+		/* Mark this channel allocated */
+		pchan->vchan = vchan;
+		vchan->phy = pchan;
+		dev_dbg(sdev->slave.dev, "pchan %u: alloc vchan %p\n",
+			pchan->idx, &vchan->vc);
+	}
+	spin_unlock_irq(&sdev->lock);
+
+	for (pchan_idx = 0; pchan_idx < NR_MAX_CHANNELS; pchan_idx++) {
+		if (!(pchan_alloc & BIT(pchan_idx)))
+			continue;
+
+		pchan = sdev->pchans + pchan_idx;
+		vchan = pchan->vchan;
+		if (vchan) {
+			spin_lock_irq(&vchan->vc.lock);
+			sun6i_dma_start_desc(vchan);
+			spin_unlock_irq(&vchan->vc.lock);
+		}
+	}
+}
+
+static irqreturn_t sun6i_dma_interrupt(int irq, void *dev_id)
+{
+	struct sun6i_dma_dev *sdev = dev_id;
+	struct sun6i_vchan *vchan;
+	struct sun6i_pchan *pchan;
+	int i, j, ret = IRQ_NONE;
+	u32 status;
+
+	for (i = 0; i < 2; i++) {
+		status = readl(sdev->base + DMA_IRQ_STAT(i));
+		if (!status)
+			continue;
+
+		dev_dbg(sdev->slave.dev, "DMA irq status %s: 0x%x\n",
+			i ? "high" : "low", status);
+
+		writel(status, sdev->base + DMA_IRQ_STAT(i));
+
+		for (j = 0; (j < 8) && status; j++) {
+			if (status & DMA_IRQ_QUEUE) {
+				pchan = sdev->pchans + j;
+				vchan = pchan->vchan;
+
+				if (vchan) {
+					spin_lock(&vchan->vc.lock);
+					vchan_cookie_complete(&pchan->desc->vd);
+					pchan->done = pchan->desc;
+					spin_unlock(&vchan->vc.lock);
+				}
+			}
+
+			status = status >> 4;
+		}
+
+		if (!atomic_read(&sdev->tasklet_shutdown))
+			tasklet_schedule(&sdev->task);
+		ret = IRQ_HANDLED;
+	}
+
+	return ret;
+}
+
+static struct dma_async_tx_descriptor *sun6i_dma_prep_dma_memcpy(
+		struct dma_chan *chan, dma_addr_t dest, dma_addr_t src,
+		size_t len, unsigned long flags)
+{
+	struct sun6i_dma_dev *sdev = to_sun6i_dma_dev(chan->device);
+	struct sun6i_vchan *vchan = to_sun6i_vchan(chan);
+	struct dma_slave_config *sconfig = &vchan->cfg;
+	struct sun6i_dma_lli *v_lli;
+	struct sun6i_desc *txd;
+	dma_addr_t p_lli;
+	int ret;
+
+	dev_dbg(chan2dev(chan),
+		"%s; chan: %d, dest: %pad, src: %pad, len: %zu. flags: 0x%08lx\n",
+		__func__, vchan->vc.chan.chan_id, &dest, &src, len, flags);
+
+	if (!len)
+		return NULL;
+
+	txd = kzalloc(sizeof(*txd), GFP_NOWAIT);
+	if (!txd)
+		return NULL;
+
+	v_lli = dma_pool_alloc(sdev->pool, GFP_NOWAIT, &p_lli);
+	if (!v_lli) {
+		dev_err(sdev->slave.dev, "Failed to alloc lli memory\n");
+		goto err_txd_free;
+	}
+
+	ret = sun6i_dma_cfg_lli(v_lli, src, dest, len, sconfig);
+	if (ret)
+		goto err_dma_free;
+
+	v_lli->cfg |= DMA_CHAN_CFG_SRC_DRQ(DRQ_SDRAM) |
+		DMA_CHAN_CFG_DST_DRQ(DRQ_SDRAM) |
+		DMA_CHAN_CFG_DST_LINEAR_MODE |
+		DMA_CHAN_CFG_SRC_LINEAR_MODE;
+
+	sun6i_dma_lli_add(NULL, v_lli, p_lli, txd);
+
+	sun6i_dma_dump_lli(vchan, v_lli);
+
+	return vchan_tx_prep(&vchan->vc, &txd->vd, flags);
+
+err_dma_free:
+	dma_pool_free(sdev->pool, v_lli, p_lli);
+err_txd_free:
+	kfree(txd);
+	return NULL;
+}
+
+static struct dma_async_tx_descriptor *sun6i_dma_prep_slave_sg(
+		struct dma_chan *chan, struct scatterlist *sgl,
+		unsigned int sg_len, enum dma_transfer_direction dir,
+		unsigned long flags, void *context)
+{
+	struct sun6i_dma_dev *sdev = to_sun6i_dma_dev(chan->device);
+	struct sun6i_vchan *vchan = to_sun6i_vchan(chan);
+	struct dma_slave_config *sconfig = &vchan->cfg;
+	struct sun6i_dma_lli *v_lli, *prev = NULL;
+	struct sun6i_desc *txd;
+	struct scatterlist *sg;
+	dma_addr_t p_lli;
+	int i, ret;
+
+	if (!sgl)
+		return NULL;
+
+	if (!is_slave_direction(dir)) {
+		dev_err(chan2dev(chan), "Invalid DMA direction\n");
+		return NULL;
+	}
+
+	txd = kzalloc(sizeof(*txd), GFP_NOWAIT);
+	if (!txd)
+		return NULL;
+
+	for_each_sg(sgl, sg, sg_len, i) {
+		v_lli = dma_pool_alloc(sdev->pool, GFP_NOWAIT, &p_lli);
+		if (!v_lli)
+			goto err_lli_free;
+
+		if (dir == DMA_MEM_TO_DEV) {
+			ret = sun6i_dma_cfg_lli(v_lli, sg_dma_address(sg),
+						sconfig->dst_addr, sg_dma_len(sg),
+						sconfig);
+			if (ret)
+				goto err_cur_lli_free;
+
+			v_lli->cfg |= DMA_CHAN_CFG_DST_IO_MODE |
+				DMA_CHAN_CFG_SRC_LINEAR_MODE |
+				DMA_CHAN_CFG_SRC_DRQ(DRQ_SDRAM) |
+				DMA_CHAN_CFG_DST_DRQ(vchan->port);
+
+			dev_dbg(chan2dev(chan),
+				"%s; chan: %d, dest: %pad, src: %pad, len: %u. flags: 0x%08lx\n",
+				__func__, vchan->vc.chan.chan_id,
+				&sconfig->dst_addr, &sg_dma_address(sg),
+				sg_dma_len(sg), flags);
+
+		} else {
+			ret = sun6i_dma_cfg_lli(v_lli, sconfig->src_addr,
+						sg_dma_address(sg), sg_dma_len(sg),
+						sconfig);
+			if (ret)
+				goto err_cur_lli_free;
+
+			v_lli->cfg |= DMA_CHAN_CFG_DST_LINEAR_MODE |
+				DMA_CHAN_CFG_SRC_IO_MODE |
+				DMA_CHAN_CFG_DST_DRQ(DRQ_SDRAM) |
+				DMA_CHAN_CFG_SRC_DRQ(vchan->port);
+
+			dev_dbg(chan2dev(chan),
+				"%s; chan: %d, dest: %pad, src: %pad, len: %u. flags: 0x%08lx\n",
+				__func__, vchan->vc.chan.chan_id,
+				&sg_dma_address(sg), &sconfig->src_addr,
+				sg_dma_len(sg), flags);
+		}
+
+		prev = sun6i_dma_lli_add(prev, v_lli, p_lli, txd);
+	}
+
+	dev_dbg(chan2dev(chan), "First: %pad\n", &txd->p_lli);
+	for (prev = txd->v_lli; prev; prev = prev->v_lli_next)
+		sun6i_dma_dump_lli(vchan, prev);
+
+	return vchan_tx_prep(&vchan->vc, &txd->vd, flags);
+
+err_cur_lli_free:
+	dma_pool_free(sdev->pool, v_lli, p_lli);
+err_lli_free:
+	for (prev = txd->v_lli; prev; prev = prev->v_lli_next)
+		dma_pool_free(sdev->pool, prev, virt_to_phys(prev));
+	kfree(txd);
+	return NULL;
+}
+
+static int sun6i_dma_control(struct dma_chan *chan, enum dma_ctrl_cmd cmd,
+		       unsigned long arg)
+{
+	struct sun6i_dma_dev *sdev = to_sun6i_dma_dev(chan->device);
+	struct sun6i_vchan *vchan = to_sun6i_vchan(chan);
+	struct sun6i_pchan *pchan = vchan->phy;
+	unsigned long flags;
+	int ret = 0;
+
+	switch (cmd) {
+	case DMA_RESUME:
+		dev_dbg(chan2dev(chan), "vchan %p: resume\n", &vchan->vc);
+
+		spin_lock_irqsave(&vchan->vc.lock, flags);
+
+		if (pchan) {
+			writel(DMA_CHAN_PAUSE_RESUME,
+			       pchan->base + DMA_CHAN_PAUSE);
+		} else if (!list_empty(&vchan->vc.desc_issued)) {
+			spin_lock(&sdev->lock);
+			list_add_tail(&vchan->node, &sdev->pending);
+			spin_unlock(&sdev->lock);
+		}
+
+		spin_unlock_irqrestore(&vchan->vc.lock, flags);
+		break;
+
+	case DMA_PAUSE:
+		dev_dbg(chan2dev(chan), "vchan %p: pause\n", &vchan->vc);
+
+		if (pchan) {
+			writel(DMA_CHAN_PAUSE_PAUSE,
+			       pchan->base + DMA_CHAN_PAUSE);
+		} else {
+			spin_lock(&sdev->lock);
+			list_del_init(&vchan->node);
+			spin_unlock(&sdev->lock);
+		}
+		break;
+
+	case DMA_TERMINATE_ALL:
+		ret = sun6i_dma_terminate_all(vchan);
+		break;
+	case DMA_SLAVE_CONFIG:
+		memcpy(&vchan->cfg, (void *)arg, sizeof(struct dma_slave_config));
+		break;
+	default:
+		ret = -ENXIO;
+		break;
+	}
+	return ret;
+}
+
+static enum dma_status sun6i_dma_tx_status(struct dma_chan *chan,
+					   dma_cookie_t cookie,
+					   struct dma_tx_state *state)
+{
+	struct sun6i_vchan *vchan = to_sun6i_vchan(chan);
+	struct sun6i_pchan *pchan = vchan->phy;
+	struct sun6i_dma_lli *lli;
+	struct virt_dma_desc *vd;
+	struct sun6i_desc *txd;
+	enum dma_status ret;
+	unsigned long flags;
+	size_t bytes = 0;
+
+	ret = dma_cookie_status(chan, cookie, state);
+	if (ret == DMA_COMPLETE)
+		return ret;
+
+	spin_lock_irqsave(&vchan->vc.lock, flags);
+
+	vd = vchan_find_desc(&vchan->vc, cookie);
+	txd = to_sun6i_desc(&vd->tx);
+
+	if (vd) {
+		for (lli = txd->v_lli; lli != NULL; lli = lli->v_lli_next)
+			bytes += lli->len;
+	} else if (!pchan || !pchan->desc) {
+		bytes = 0;
+	} else {
+		bytes = readl(pchan->base + DMA_CHAN_CUR_CNT);
+	}
+
+	spin_unlock_irqrestore(&vchan->vc.lock, flags);
+
+	dma_set_residue(state, bytes);
+
+	return ret;
+}
+
+static void sun6i_dma_issue_pending(struct dma_chan *chan)
+{
+	struct sun6i_dma_dev *sdev = to_sun6i_dma_dev(chan->device);
+	struct sun6i_vchan *vchan = to_sun6i_vchan(chan);
+	unsigned long flags;
+
+	spin_lock_irqsave(&vchan->vc.lock, flags);
+
+	if (vchan_issue_pending(&vchan->vc)) {
+		spin_lock(&sdev->lock);
+
+		if (!vchan->phy && list_empty(&vchan->node)) {
+			list_add_tail(&vchan->node, &sdev->pending);
+			tasklet_schedule(&sdev->task);
+			dev_dbg(chan2dev(chan), "vchan %p: issued\n",
+				&vchan->vc);
+		}
+
+		spin_unlock(&sdev->lock);
+	} else {
+		dev_dbg(chan2dev(chan), "vchan %p: nothing to issue\n",
+			&vchan->vc);
+	}
+
+	spin_unlock_irqrestore(&vchan->vc.lock, flags);
+}
+
+static int sun6i_dma_alloc_chan_resources(struct dma_chan *chan)
+{
+	return 0;
+}
+
+static void sun6i_dma_free_chan_resources(struct dma_chan *chan)
+{
+	struct sun6i_dma_dev *sdev = to_sun6i_dma_dev(chan->device);
+	struct sun6i_vchan *vchan = to_sun6i_vchan(chan);
+	unsigned long flags;
+
+	spin_lock_irqsave(&sdev->lock, flags);
+	list_del_init(&vchan->node);
+	spin_unlock_irqrestore(&sdev->lock, flags);
+
+	vchan_free_chan_resources(&vchan->vc);
+}
+
+static struct dma_chan *sun6i_dma_of_xlate(struct of_phandle_args *dma_spec,
+					   struct of_dma *ofdma)
+{
+	struct sun6i_dma_dev *sdev = ofdma->of_dma_data;
+	struct sun6i_vchan *vchan;
+	struct dma_chan *chan;
+	u8 port = dma_spec->args[0];
+
+	if (port > NR_MAX_REQUESTS)
+		return NULL;
+
+	chan = dma_get_any_slave_channel(&sdev->slave);
+	if (!chan)
+		return NULL;
+
+	vchan = to_sun6i_vchan(chan);
+	vchan->port = port;
+
+	return chan;
+}
+
+static inline void sun6i_kill_tasklet(struct sun6i_dma_dev *sdev)
+{
+	/* Disable all interrupts from DMA */
+	writel(0, sdev->base + DMA_IRQ_EN(0));
+	writel(0, sdev->base + DMA_IRQ_EN(1));
+
+	/* Prevent spurious interrupts from scheduling the tasklet */
+	atomic_inc(&sdev->tasklet_shutdown);
+
+	/* Make sure we won't have any further interrupts */
+	devm_free_irq(sdev->slave.dev, sdev->irq, sdev);
+
+	/* Actually prevent the tasklet from being scheduled */
+	tasklet_kill(&sdev->task);
+}
+
+static inline void sun6i_dma_free(struct sun6i_dma_dev *sdev)
+{
+	int i;
+
+	for (i = 0; i < NR_MAX_VCHANS; i++) {
+		struct sun6i_vchan *vchan = &sdev->vchans[i];
+
+		list_del(&vchan->vc.chan.device_node);
+		tasklet_kill(&vchan->vc.task);
+	}
+}
+
+static int sun6i_dma_probe(struct platform_device *pdev)
+{
+	struct sun6i_dma_dev *sdc;
+	struct resource *res;
+	struct clk *mux, *pll6;
+	int ret, i;
+
+	sdc = devm_kzalloc(&pdev->dev, sizeof(*sdc), GFP_KERNEL);
+	if (!sdc)
+		return -ENOMEM;
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	sdc->base = devm_ioremap_resource(&pdev->dev, res);
+	if (IS_ERR(sdc->base))
+		return PTR_ERR(sdc->base);
+
+	sdc->irq = platform_get_irq(pdev, 0);
+	if (sdc->irq < 0) {
+		dev_err(&pdev->dev, "Cannot claim IRQ\n");
+		return sdc->irq;
+	}
+
+	sdc->clk = devm_clk_get(&pdev->dev, NULL);
+	if (IS_ERR(sdc->clk)) {
+		dev_err(&pdev->dev, "No clock specified\n");
+		return PTR_ERR(sdc->clk);
+	}
+
+	mux = clk_get(NULL, "ahb1_mux");
+	if (IS_ERR(mux)) {
+		dev_err(&pdev->dev, "Couldn't get AHB1 Mux\n");
+		return PTR_ERR(mux);
+	}
+
+	pll6 = clk_get(NULL, "pll6");
+	if (IS_ERR(pll6)) {
+		dev_err(&pdev->dev, "Couldn't get PLL6\n");
+		clk_put(mux);
+		return PTR_ERR(pll6);
+	}
+
+	ret = clk_set_parent(mux, pll6);
+	clk_put(pll6);
+	clk_put(mux);
+
+	if (ret) {
+		dev_err(&pdev->dev, "Couldn't reparent AHB1 on PLL6\n");
+		return ret;
+	}
+
+	sdc->rstc = devm_reset_control_get(&pdev->dev, NULL);
+	if (IS_ERR(sdc->rstc)) {
+		dev_err(&pdev->dev, "No reset controller specified\n");
+		return PTR_ERR(sdc->rstc);
+	}
+
+	sdc->pool = dmam_pool_create(dev_name(&pdev->dev), &pdev->dev,
+				     sizeof(struct sun6i_dma_lli), 4, 0);
+	if (!sdc->pool) {
+		dev_err(&pdev->dev, "No memory for descriptors dma pool\n");
+		return -ENOMEM;
+	}
+
+	platform_set_drvdata(pdev, sdc);
+	INIT_LIST_HEAD(&sdc->pending);
+	spin_lock_init(&sdc->lock);
+
+	dma_cap_set(DMA_PRIVATE, sdc->slave.cap_mask);
+	dma_cap_set(DMA_MEMCPY, sdc->slave.cap_mask);
+	dma_cap_set(DMA_SLAVE, sdc->slave.cap_mask);
+
+	INIT_LIST_HEAD(&sdc->slave.channels);
+	sdc->slave.device_alloc_chan_resources	= sun6i_dma_alloc_chan_resources;
+	sdc->slave.device_free_chan_resources	= sun6i_dma_free_chan_resources;
+	sdc->slave.device_tx_status		= sun6i_dma_tx_status;
+	sdc->slave.device_issue_pending		= sun6i_dma_issue_pending;
+	sdc->slave.device_prep_slave_sg		= sun6i_dma_prep_slave_sg;
+	sdc->slave.device_prep_dma_memcpy	= sun6i_dma_prep_dma_memcpy;
+	sdc->slave.device_control		= sun6i_dma_control;
+	sdc->slave.chancnt			= NR_MAX_VCHANS;
+
+	sdc->slave.dev = &pdev->dev;
+
+	sdc->pchans = devm_kcalloc(&pdev->dev, NR_MAX_CHANNELS,
+				   sizeof(struct sun6i_pchan), GFP_KERNEL);
+	if (!sdc->pchans)
+		return -ENOMEM;
+
+	sdc->vchans = devm_kcalloc(&pdev->dev, NR_MAX_VCHANS,
+				   sizeof(struct sun6i_vchan), GFP_KERNEL);
+	if (!sdc->vchans)
+		return -ENOMEM;
+
+	tasklet_init(&sdc->task, sun6i_dma_tasklet, (unsigned long)sdc);
+
+	for (i = 0; i < NR_MAX_CHANNELS; i++) {
+		struct sun6i_pchan *pchan = &sdc->pchans[i];
+
+		pchan->idx = i;
+		pchan->base = sdc->base + 0x100 + i * 0x40;
+	}
+
+	for (i = 0; i < NR_MAX_VCHANS; i++) {
+		struct sun6i_vchan *vchan = &sdc->vchans[i];
+
+		INIT_LIST_HEAD(&vchan->node);
+		vchan->vc.desc_free = sun6i_dma_free_desc;
+		vchan_init(&vchan->vc, &sdc->slave);
+	}
+
+	ret = reset_control_deassert(sdc->rstc);
+	if (ret) {
+		dev_err(&pdev->dev, "Couldn't deassert the device from reset\n");
+		goto err_chan_free;
+	}
+
+	ret = clk_prepare_enable(sdc->clk);
+	if (ret) {
+		dev_err(&pdev->dev, "Couldn't enable the clock\n");
+		goto err_reset_assert;
+	}
+
+	ret = devm_request_irq(&pdev->dev, sdc->irq, sun6i_dma_interrupt, 0,
+			       dev_name(&pdev->dev), sdc);
+	if (ret) {
+		dev_err(&pdev->dev, "Cannot request IRQ\n");
+		goto err_clk_disable;
+	}
+
+	ret = dma_async_device_register(&sdc->slave);
+	if (ret) {
+		dev_warn(&pdev->dev, "Failed to register DMA engine device\n");
+		goto err_irq_disable;
+	}
+
+	ret = of_dma_controller_register(pdev->dev.of_node, sun6i_dma_of_xlate,
+					 sdc);
+	if (ret) {
+		dev_err(&pdev->dev, "of_dma_controller_register failed\n");
+		goto err_dma_unregister;
+	}
+
+	return 0;
+
+err_dma_unregister:
+	dma_async_device_unregister(&sdc->slave);
+err_irq_disable:
+	sun6i_kill_tasklet(sdc);
+err_clk_disable:
+	clk_disable_unprepare(sdc->clk);
+err_reset_assert:
+	reset_control_assert(sdc->rstc);
+err_chan_free:
+	sun6i_dma_free(sdc);
+	return ret;
+}
+
+static int sun6i_dma_remove(struct platform_device *pdev)
+{
+	struct sun6i_dma_dev *sdc = platform_get_drvdata(pdev);
+
+	of_dma_controller_free(pdev->dev.of_node);
+	dma_async_device_unregister(&sdc->slave);
+
+	sun6i_kill_tasklet(sdc);
+
+	clk_disable_unprepare(sdc->clk);
+	reset_control_assert(sdc->rstc);
+
+	sun6i_dma_free(sdc);
+
+	return 0;
+}
+
+static struct of_device_id sun6i_dma_match[] = {
+	{ .compatible = "allwinner,sun6i-a31-dma" },
+	{ /* sentinel */ }
+};
+
+static struct platform_driver sun6i_dma_driver = {
+	.probe		= sun6i_dma_probe,
+	.remove		= sun6i_dma_remove,
+	.driver = {
+		.name		= "sun6i-dma",
+		.of_match_table	= sun6i_dma_match,
+	},
+};
+module_platform_driver(sun6i_dma_driver);
+
+MODULE_DESCRIPTION("Allwinner A31 DMA Controller Driver");
+MODULE_AUTHOR("Sugar <shuge@allwinnertech.com>");
+MODULE_AUTHOR("Maxime Ripard <maxime.ripard@free-electrons.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/dma/tegra20-apb-dma.c b/drivers/dma/tegra20-apb-dma.c
index 03ad64ecaaf0..16efa603ff65 100644
--- a/drivers/dma/tegra20-apb-dma.c
+++ b/drivers/dma/tegra20-apb-dma.c
@@ -1055,7 +1055,7 @@ static struct dma_async_tx_descriptor *tegra_dma_prep_slave_sg(
 static struct dma_async_tx_descriptor *tegra_dma_prep_dma_cyclic(
 	struct dma_chan *dc, dma_addr_t buf_addr, size_t buf_len,
 	size_t period_len, enum dma_transfer_direction direction,
-	unsigned long flags, void *context)
+	unsigned long flags)
 {
 	struct tegra_dma_channel *tdc = to_tegra_dma_chan(dc);
 	struct tegra_dma_desc *dma_desc = NULL;
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index 443d03fbac47..8eeab72b93e2 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -331,7 +331,7 @@ static int bch_allocator_thread(void *arg)
 				mutex_unlock(&ca->set->bucket_lock);
 				blkdev_issue_discard(ca->bdev,
 					bucket_to_sector(ca->set, bucket),
-					ca->sb.block_size, GFP_KERNEL, 0);
+					ca->sb.bucket_size, GFP_KERNEL, 0);
 				mutex_lock(&ca->set->bucket_lock);
 			}
 
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index d2ebcf323094..04f7bc28ef83 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -477,9 +477,13 @@ struct gc_stat {
  * CACHE_SET_STOPPING always gets set first when we're closing down a cache set;
  * we'll continue to run normally for awhile with CACHE_SET_STOPPING set (i.e.
  * flushing dirty data).
+ *
+ * CACHE_SET_RUNNING means all cache devices have been registered and journal
+ * replay is complete.
  */
 #define CACHE_SET_UNREGISTERING		0
 #define	CACHE_SET_STOPPING		1
+#define	CACHE_SET_RUNNING		2
 
 struct cache_set {
 	struct closure		cl;
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index 545416415305..646fe85261c1 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -1182,7 +1182,7 @@ static void __btree_sort(struct btree_keys *b, struct btree_iter *iter,
 {
 	uint64_t start_time;
 	bool used_mempool = false;
-	struct bset *out = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOIO,
+	struct bset *out = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOWAIT,
 						     order);
 	if (!out) {
 		struct page *outp;
diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h
index 5f6728d5d4dd..ae964624efb2 100644
--- a/drivers/md/bcache/bset.h
+++ b/drivers/md/bcache/bset.h
@@ -453,7 +453,7 @@ static inline bool bch_bkey_equal_header(const struct bkey *l,
 {
 	return (KEY_DIRTY(l) == KEY_DIRTY(r) &&
 		KEY_PTRS(l) == KEY_PTRS(r) &&
-		KEY_CSUM(l) == KEY_CSUM(l));
+		KEY_CSUM(l) == KEY_CSUM(r));
 }
 
 /* Keylists */
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 7347b6100961..00cde40db572 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -117,9 +117,9 @@
 ({									\
 	int _r, l = (b)->level - 1;					\
 	bool _w = l <= (op)->lock;					\
-	struct btree *_child = bch_btree_node_get((b)->c, op, key, l, _w);\
+	struct btree *_child = bch_btree_node_get((b)->c, op, key, l,	\
+						  _w, b);		\
 	if (!IS_ERR(_child)) {						\
-		_child->parent = (b);					\
 		_r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__);	\
 		rw_unlock(_w, _child);					\
 	} else								\
@@ -142,7 +142,6 @@
 		rw_lock(_w, _b, _b->level);				\
 		if (_b == (c)->root &&					\
 		    _w == insert_lock(op, _b)) {			\
-			_b->parent = NULL;				\
 			_r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__);	\
 		}							\
 		rw_unlock(_w, _b);					\
@@ -202,7 +201,7 @@ void bch_btree_node_read_done(struct btree *b)
 	struct bset *i = btree_bset_first(b);
 	struct btree_iter *iter;
 
-	iter = mempool_alloc(b->c->fill_iter, GFP_NOWAIT);
+	iter = mempool_alloc(b->c->fill_iter, GFP_NOIO);
 	iter->size = b->c->sb.bucket_size / b->c->sb.block_size;
 	iter->used = 0;
 
@@ -421,7 +420,7 @@ static void do_btree_node_write(struct btree *b)
 	SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) +
 		       bset_sector_offset(&b->keys, i));
 
-	if (!bio_alloc_pages(b->bio, GFP_NOIO)) {
+	if (!bio_alloc_pages(b->bio, __GFP_NOWARN|GFP_NOWAIT)) {
 		int j;
 		struct bio_vec *bv;
 		void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1));
@@ -967,7 +966,8 @@ err:
  * level and op->lock.
  */
 struct btree *bch_btree_node_get(struct cache_set *c, struct btree_op *op,
-				 struct bkey *k, int level, bool write)
+				 struct bkey *k, int level, bool write,
+				 struct btree *parent)
 {
 	int i = 0;
 	struct btree *b;
@@ -1002,6 +1002,7 @@ retry:
 		BUG_ON(b->level != level);
 	}
 
+	b->parent = parent;
 	b->accessed = 1;
 
 	for (; i <= b->keys.nsets && b->keys.set[i].size; i++) {
@@ -1022,15 +1023,16 @@ retry:
 	return b;
 }
 
-static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level)
+static void btree_node_prefetch(struct btree *parent, struct bkey *k)
 {
 	struct btree *b;
 
-	mutex_lock(&c->bucket_lock);
-	b = mca_alloc(c, NULL, k, level);
-	mutex_unlock(&c->bucket_lock);
+	mutex_lock(&parent->c->bucket_lock);
+	b = mca_alloc(parent->c, NULL, k, parent->level - 1);
+	mutex_unlock(&parent->c->bucket_lock);
 
 	if (!IS_ERR_OR_NULL(b)) {
+		b->parent = parent;
 		bch_btree_node_read(b);
 		rw_unlock(true, b);
 	}
@@ -1060,15 +1062,16 @@ static void btree_node_free(struct btree *b)
 	mutex_unlock(&b->c->bucket_lock);
 }
 
-struct btree *bch_btree_node_alloc(struct cache_set *c, struct btree_op *op,
-				   int level)
+struct btree *__bch_btree_node_alloc(struct cache_set *c, struct btree_op *op,
+				     int level, bool wait,
+				     struct btree *parent)
 {
 	BKEY_PADDED(key) k;
 	struct btree *b = ERR_PTR(-EAGAIN);
 
 	mutex_lock(&c->bucket_lock);
 retry:
-	if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, op != NULL))
+	if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, wait))
 		goto err;
 
 	bkey_put(c, &k.key);
@@ -1085,6 +1088,7 @@ retry:
 	}
 
 	b->accessed = 1;
+	b->parent = parent;
 	bch_bset_init_next(&b->keys, b->keys.set->data, bset_magic(&b->c->sb));
 
 	mutex_unlock(&c->bucket_lock);
@@ -1096,14 +1100,21 @@ err_free:
 err:
 	mutex_unlock(&c->bucket_lock);
 
-	trace_bcache_btree_node_alloc_fail(b);
+	trace_bcache_btree_node_alloc_fail(c);
 	return b;
 }
 
+static struct btree *bch_btree_node_alloc(struct cache_set *c,
+					  struct btree_op *op, int level,
+					  struct btree *parent)
+{
+	return __bch_btree_node_alloc(c, op, level, op != NULL, parent);
+}
+
 static struct btree *btree_node_alloc_replacement(struct btree *b,
 						  struct btree_op *op)
 {
-	struct btree *n = bch_btree_node_alloc(b->c, op, b->level);
+	struct btree *n = bch_btree_node_alloc(b->c, op, b->level, b->parent);
 	if (!IS_ERR_OR_NULL(n)) {
 		mutex_lock(&n->write_lock);
 		bch_btree_sort_into(&b->keys, &n->keys, &b->c->sort);
@@ -1403,6 +1414,7 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
 	BUG_ON(btree_bset_first(new_nodes[0])->keys);
 	btree_node_free(new_nodes[0]);
 	rw_unlock(true, new_nodes[0]);
+	new_nodes[0] = NULL;
 
 	for (i = 0; i < nodes; i++) {
 		if (__bch_keylist_realloc(&keylist, bkey_u64s(&r[i].b->key)))
@@ -1516,7 +1528,7 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
 		k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad);
 		if (k) {
 			r->b = bch_btree_node_get(b->c, op, k, b->level - 1,
-						  true);
+						  true, b);
 			if (IS_ERR(r->b)) {
 				ret = PTR_ERR(r->b);
 				break;
@@ -1811,7 +1823,7 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op)
 			k = bch_btree_iter_next_filter(&iter, &b->keys,
 						       bch_ptr_bad);
 			if (k)
-				btree_node_prefetch(b->c, k, b->level - 1);
+				btree_node_prefetch(b, k);
 
 			if (p)
 				ret = btree(check_recurse, p, b, op);
@@ -1976,12 +1988,12 @@ static int btree_split(struct btree *b, struct btree_op *op,
 
 		trace_bcache_btree_node_split(b, btree_bset_first(n1)->keys);
 
-		n2 = bch_btree_node_alloc(b->c, op, b->level);
+		n2 = bch_btree_node_alloc(b->c, op, b->level, b->parent);
 		if (IS_ERR(n2))
 			goto err_free1;
 
 		if (!b->parent) {
-			n3 = bch_btree_node_alloc(b->c, op, b->level + 1);
+			n3 = bch_btree_node_alloc(b->c, op, b->level + 1, NULL);
 			if (IS_ERR(n3))
 				goto err_free2;
 		}
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index 91dfa5e69685..5c391fa01bed 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -242,9 +242,10 @@ void __bch_btree_node_write(struct btree *, struct closure *);
 void bch_btree_node_write(struct btree *, struct closure *);
 
 void bch_btree_set_root(struct btree *);
-struct btree *bch_btree_node_alloc(struct cache_set *, struct btree_op *, int);
+struct btree *__bch_btree_node_alloc(struct cache_set *, struct btree_op *,
+				     int, bool, struct btree *);
 struct btree *bch_btree_node_get(struct cache_set *, struct btree_op *,
-				 struct bkey *, int, bool);
+				 struct bkey *, int, bool, struct btree *);
 
 int bch_btree_insert_check_key(struct btree *, struct btree_op *,
 			       struct bkey *);
diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c
index 3a0de4cf9771..243de0bf15cd 100644
--- a/drivers/md/bcache/extents.c
+++ b/drivers/md/bcache/extents.c
@@ -474,9 +474,8 @@ out:
 	return false;
 }
 
-static bool bch_extent_invalid(struct btree_keys *bk, const struct bkey *k)
+bool __bch_extent_invalid(struct cache_set *c, const struct bkey *k)
 {
-	struct btree *b = container_of(bk, struct btree, keys);
 	char buf[80];
 
 	if (!KEY_SIZE(k))
@@ -485,16 +484,22 @@ static bool bch_extent_invalid(struct btree_keys *bk, const struct bkey *k)
 	if (KEY_SIZE(k) > KEY_OFFSET(k))
 		goto bad;
 
-	if (__ptr_invalid(b->c, k))
+	if (__ptr_invalid(c, k))
 		goto bad;
 
 	return false;
 bad:
 	bch_extent_to_text(buf, sizeof(buf), k);
-	cache_bug(b->c, "spotted extent %s: %s", buf, bch_ptr_status(b->c, k));
+	cache_bug(c, "spotted extent %s: %s", buf, bch_ptr_status(c, k));
 	return true;
 }
 
+static bool bch_extent_invalid(struct btree_keys *bk, const struct bkey *k)
+{
+	struct btree *b = container_of(bk, struct btree, keys);
+	return __bch_extent_invalid(b->c, k);
+}
+
 static bool bch_extent_bad_expensive(struct btree *b, const struct bkey *k,
 				     unsigned ptr)
 {
diff --git a/drivers/md/bcache/extents.h b/drivers/md/bcache/extents.h
index e4e23409782d..e2ed54054e7a 100644
--- a/drivers/md/bcache/extents.h
+++ b/drivers/md/bcache/extents.h
@@ -9,5 +9,6 @@ struct cache_set;
 
 void bch_extent_to_text(char *, size_t, const struct bkey *);
 bool __bch_btree_ptr_invalid(struct cache_set *, const struct bkey *);
+bool __bch_extent_invalid(struct cache_set *, const struct bkey *);
 
 #endif /* _BCACHE_EXTENTS_H */
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 59e82021b5bb..fe080ad0e558 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -7,6 +7,7 @@
 #include "bcache.h"
 #include "btree.h"
 #include "debug.h"
+#include "extents.h"
 
 #include <trace/events/bcache.h>
 
@@ -189,11 +190,15 @@ int bch_journal_read(struct cache_set *c, struct list_head *list)
 			if (read_bucket(l))
 				goto bsearch;
 
-		if (list_empty(list))
+		/* no journal entries on this device? */
+		if (l == ca->sb.njournal_buckets)
 			continue;
 bsearch:
+		BUG_ON(list_empty(list));
+
 		/* Binary search */
-		m = r = find_next_bit(bitmap, ca->sb.njournal_buckets, l + 1);
+		m = l;
+		r = find_next_bit(bitmap, ca->sb.njournal_buckets, l + 1);
 		pr_debug("starting binary search, l %u r %u", l, r);
 
 		while (l + 1 < r) {
@@ -291,15 +296,16 @@ void bch_journal_mark(struct cache_set *c, struct list_head *list)
 
 		for (k = i->j.start;
 		     k < bset_bkey_last(&i->j);
-		     k = bkey_next(k)) {
-			unsigned j;
+		     k = bkey_next(k))
+			if (!__bch_extent_invalid(c, k)) {
+				unsigned j;
 
-			for (j = 0; j < KEY_PTRS(k); j++)
-				if (ptr_available(c, k, j))
-					atomic_inc(&PTR_BUCKET(c, k, j)->pin);
+				for (j = 0; j < KEY_PTRS(k); j++)
+					if (ptr_available(c, k, j))
+						atomic_inc(&PTR_BUCKET(c, k, j)->pin);
 
-			bch_initial_mark_key(c, 0, k);
-		}
+				bch_initial_mark_key(c, 0, k);
+			}
 	}
 }
 
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 15fff4f68a7c..62e6e98186b5 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -311,7 +311,8 @@ void bch_data_insert(struct closure *cl)
 {
 	struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
 
-	trace_bcache_write(op->bio, op->writeback, op->bypass);
+	trace_bcache_write(op->c, op->inode, op->bio,
+			   op->writeback, op->bypass);
 
 	bch_keylist_init(&op->insert_keys);
 	bio_get(op->bio);
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 926ded8ccbf5..d4713d098a39 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -733,8 +733,6 @@ static void bcache_device_detach(struct bcache_device *d)
 static void bcache_device_attach(struct bcache_device *d, struct cache_set *c,
 				 unsigned id)
 {
-	BUG_ON(test_bit(CACHE_SET_STOPPING, &c->flags));
-
 	d->id = id;
 	d->c = c;
 	c->devices[id] = d;
@@ -927,6 +925,7 @@ static void cached_dev_detach_finish(struct work_struct *w)
 	list_move(&dc->list, &uncached_devices);
 
 	clear_bit(BCACHE_DEV_DETACHING, &dc->disk.flags);
+	clear_bit(BCACHE_DEV_UNLINK_DONE, &dc->disk.flags);
 
 	mutex_unlock(&bch_register_lock);
 
@@ -1041,6 +1040,9 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
 	 */
 	atomic_set(&dc->count, 1);
 
+	if (bch_cached_dev_writeback_start(dc))
+		return -ENOMEM;
+
 	if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
 		bch_sectors_dirty_init(dc);
 		atomic_set(&dc->has_dirty, 1);
@@ -1070,7 +1072,8 @@ static void cached_dev_free(struct closure *cl)
 	struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
 
 	cancel_delayed_work_sync(&dc->writeback_rate_update);
-	kthread_stop(dc->writeback_thread);
+	if (!IS_ERR_OR_NULL(dc->writeback_thread))
+		kthread_stop(dc->writeback_thread);
 
 	mutex_lock(&bch_register_lock);
 
@@ -1081,12 +1084,8 @@ static void cached_dev_free(struct closure *cl)
 
 	mutex_unlock(&bch_register_lock);
 
-	if (!IS_ERR_OR_NULL(dc->bdev)) {
-		if (dc->bdev->bd_disk)
-			blk_sync_queue(bdev_get_queue(dc->bdev));
-
+	if (!IS_ERR_OR_NULL(dc->bdev))
 		blkdev_put(dc->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
-	}
 
 	wake_up(&unregister_wait);
 
@@ -1213,7 +1212,9 @@ void bch_flash_dev_release(struct kobject *kobj)
 static void flash_dev_free(struct closure *cl)
 {
 	struct bcache_device *d = container_of(cl, struct bcache_device, cl);
+	mutex_lock(&bch_register_lock);
 	bcache_device_free(d);
+	mutex_unlock(&bch_register_lock);
 	kobject_put(&d->kobj);
 }
 
@@ -1221,7 +1222,9 @@ static void flash_dev_flush(struct closure *cl)
 {
 	struct bcache_device *d = container_of(cl, struct bcache_device, cl);
 
+	mutex_lock(&bch_register_lock);
 	bcache_device_unlink(d);
+	mutex_unlock(&bch_register_lock);
 	kobject_del(&d->kobj);
 	continue_at(cl, flash_dev_free, system_wq);
 }
@@ -1277,6 +1280,9 @@ int bch_flash_dev_create(struct cache_set *c, uint64_t size)
 	if (test_bit(CACHE_SET_STOPPING, &c->flags))
 		return -EINTR;
 
+	if (!test_bit(CACHE_SET_RUNNING, &c->flags))
+		return -EPERM;
+
 	u = uuid_find_empty(c);
 	if (!u) {
 		pr_err("Can't create volume, no room for UUID");
@@ -1346,8 +1352,11 @@ static void cache_set_free(struct closure *cl)
 	bch_journal_free(c);
 
 	for_each_cache(ca, c, i)
-		if (ca)
+		if (ca) {
+			ca->set = NULL;
+			c->cache[ca->sb.nr_this_dev] = NULL;
 			kobject_put(&ca->kobj);
+		}
 
 	bch_bset_sort_state_free(&c->sort);
 	free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c)));
@@ -1405,9 +1414,11 @@ static void cache_set_flush(struct closure *cl)
 		if (ca->alloc_thread)
 			kthread_stop(ca->alloc_thread);
 
-	cancel_delayed_work_sync(&c->journal.work);
-	/* flush last journal entry if needed */
-	c->journal.work.work.func(&c->journal.work.work);
+	if (c->journal.cur) {
+		cancel_delayed_work_sync(&c->journal.work);
+		/* flush last journal entry if needed */
+		c->journal.work.work.func(&c->journal.work.work);
+	}
 
 	closure_return(cl);
 }
@@ -1586,7 +1597,7 @@ static void run_cache_set(struct cache_set *c)
 			goto err;
 
 		err = "error reading btree root";
-		c->root = bch_btree_node_get(c, NULL, k, j->btree_level, true);
+		c->root = bch_btree_node_get(c, NULL, k, j->btree_level, true, NULL);
 		if (IS_ERR_OR_NULL(c->root))
 			goto err;
 
@@ -1661,7 +1672,7 @@ static void run_cache_set(struct cache_set *c)
 			goto err;
 
 		err = "cannot allocate new btree root";
-		c->root = bch_btree_node_alloc(c, NULL, 0);
+		c->root = __bch_btree_node_alloc(c, NULL, 0, true, NULL);
 		if (IS_ERR_OR_NULL(c->root))
 			goto err;
 
@@ -1697,6 +1708,7 @@ static void run_cache_set(struct cache_set *c)
 
 	flash_devs_run(c);
 
+	set_bit(CACHE_SET_RUNNING, &c->flags);
 	return;
 err:
 	closure_sync(&cl);
@@ -1760,6 +1772,7 @@ found:
 		pr_debug("set version = %llu", c->sb.version);
 	}
 
+	kobject_get(&ca->kobj);
 	ca->set = c;
 	ca->set->cache[ca->sb.nr_this_dev] = ca;
 	c->cache_by_alloc[c->caches_loaded++] = ca;
@@ -1780,8 +1793,10 @@ void bch_cache_release(struct kobject *kobj)
 	struct cache *ca = container_of(kobj, struct cache, kobj);
 	unsigned i;
 
-	if (ca->set)
+	if (ca->set) {
+		BUG_ON(ca->set->cache[ca->sb.nr_this_dev] != ca);
 		ca->set->cache[ca->sb.nr_this_dev] = NULL;
+	}
 
 	bio_split_pool_free(&ca->bio_split_hook);
 
@@ -1798,10 +1813,8 @@ void bch_cache_release(struct kobject *kobj)
 	if (ca->sb_bio.bi_inline_vecs[0].bv_page)
 		put_page(ca->sb_bio.bi_io_vec[0].bv_page);
 
-	if (!IS_ERR_OR_NULL(ca->bdev)) {
-		blk_sync_queue(bdev_get_queue(ca->bdev));
+	if (!IS_ERR_OR_NULL(ca->bdev))
 		blkdev_put(ca->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
-	}
 
 	kfree(ca);
 	module_put(THIS_MODULE);
@@ -1844,7 +1857,7 @@ static int cache_alloc(struct cache_sb *sb, struct cache *ca)
 }
 
 static void register_cache(struct cache_sb *sb, struct page *sb_page,
-				  struct block_device *bdev, struct cache *ca)
+				struct block_device *bdev, struct cache *ca)
 {
 	char name[BDEVNAME_SIZE];
 	const char *err = "cannot allocate memory";
@@ -1877,10 +1890,12 @@ static void register_cache(struct cache_sb *sb, struct page *sb_page,
 		goto err;
 
 	pr_info("registered cache device %s", bdevname(bdev, name));
+out:
+	kobject_put(&ca->kobj);
 	return;
 err:
 	pr_notice("error opening %s: %s", bdevname(bdev, name), err);
-	kobject_put(&ca->kobj);
+	goto out;
 }
 
 /* Global interfaces/init */
@@ -1945,10 +1960,12 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
 	if (IS_ERR(bdev)) {
 		if (bdev == ERR_PTR(-EBUSY)) {
 			bdev = lookup_bdev(strim(path));
+			mutex_lock(&bch_register_lock);
 			if (!IS_ERR(bdev) && bch_is_open(bdev))
 				err = "device already registered";
 			else
 				err = "device busy";
+			mutex_unlock(&bch_register_lock);
 		}
 		goto err;
 	}
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index ac7d0d1f70d7..98df7572b5f7 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -416,8 +416,8 @@ do {									\
 			  average_frequency,	frequency_units);	\
 	__print_time_stat(stats, name,					\
 			  average_duration,	duration_units);	\
-	__print_time_stat(stats, name,					\
-			  max_duration,		duration_units);	\
+	sysfs_print(name ## _ ##max_duration ## _ ## duration_units,	\
+			div_u64((stats)->max_duration, NSEC_PER_ ## duration_units));\
 									\
 	sysfs_print(name ## _last_ ## frequency_units, (stats)->last	\
 		    ? div_s64(local_clock() - (stats)->last,		\
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index f4300e4c0114..f1986bcd1bf0 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -239,7 +239,7 @@ static void read_dirty(struct cached_dev *dc)
 		if (KEY_START(&w->key) != dc->last_read ||
 		    jiffies_to_msecs(delay) > 50)
 			while (!kthread_should_stop() && delay)
-				delay = schedule_timeout_uninterruptible(delay);
+				delay = schedule_timeout_interruptible(delay);
 
 		dc->last_read	= KEY_OFFSET(&w->key);
 
@@ -436,7 +436,7 @@ static int bch_writeback_thread(void *arg)
 			while (delay &&
 			       !kthread_should_stop() &&
 			       !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))
-				delay = schedule_timeout_uninterruptible(delay);
+				delay = schedule_timeout_interruptible(delay);
 		}
 	}
 
@@ -478,7 +478,7 @@ void bch_sectors_dirty_init(struct cached_dev *dc)
 	dc->disk.sectors_dirty_last = bcache_dev_sectors_dirty(&dc->disk);
 }
 
-int bch_cached_dev_writeback_init(struct cached_dev *dc)
+void bch_cached_dev_writeback_init(struct cached_dev *dc)
 {
 	sema_init(&dc->in_flight, 64);
 	init_rwsem(&dc->writeback_lock);
@@ -494,14 +494,20 @@ int bch_cached_dev_writeback_init(struct cached_dev *dc)
 	dc->writeback_rate_d_term	= 30;
 	dc->writeback_rate_p_term_inverse = 6000;
 
+	INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate);
+}
+
+int bch_cached_dev_writeback_start(struct cached_dev *dc)
+{
 	dc->writeback_thread = kthread_create(bch_writeback_thread, dc,
 					      "bcache_writeback");
 	if (IS_ERR(dc->writeback_thread))
 		return PTR_ERR(dc->writeback_thread);
 
-	INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate);
 	schedule_delayed_work(&dc->writeback_rate_update,
 			      dc->writeback_rate_update_seconds * HZ);
 
+	bch_writeback_queue(dc);
+
 	return 0;
 }
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index e2f8598937ac..0a9dab187b79 100644
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -85,6 +85,7 @@ static inline void bch_writeback_add(struct cached_dev *dc)
 void bcache_dev_sectors_dirty_add(struct cache_set *, unsigned, uint64_t, int);
 
 void bch_sectors_dirty_init(struct cached_dev *dc);
-int bch_cached_dev_writeback_init(struct cached_dev *);
+void bch_cached_dev_writeback_init(struct cached_dev *);
+int bch_cached_dev_writeback_start(struct cached_dev *);
 
 #endif
diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c
index d2899e7eb3aa..06709257adde 100644
--- a/drivers/md/dm-cache-metadata.c
+++ b/drivers/md/dm-cache-metadata.c
@@ -330,7 +330,7 @@ static int __write_initial_superblock(struct dm_cache_metadata *cmd)
 	disk_super->discard_root = cpu_to_le64(cmd->discard_root);
 	disk_super->discard_block_size = cpu_to_le64(cmd->discard_block_size);
 	disk_super->discard_nr_blocks = cpu_to_le64(from_oblock(cmd->discard_nr_blocks));
-	disk_super->metadata_block_size = cpu_to_le32(DM_CACHE_METADATA_BLOCK_SIZE >> SECTOR_SHIFT);
+	disk_super->metadata_block_size = cpu_to_le32(DM_CACHE_METADATA_BLOCK_SIZE);
 	disk_super->data_block_size = cpu_to_le32(cmd->data_block_size);
 	disk_super->cache_blocks = cpu_to_le32(0);
 
@@ -478,7 +478,7 @@ static int __create_persistent_data_objects(struct dm_cache_metadata *cmd,
 					    bool may_format_device)
 {
 	int r;
-	cmd->bm = dm_block_manager_create(cmd->bdev, DM_CACHE_METADATA_BLOCK_SIZE,
+	cmd->bm = dm_block_manager_create(cmd->bdev, DM_CACHE_METADATA_BLOCK_SIZE << SECTOR_SHIFT,
 					  CACHE_METADATA_CACHE_SIZE,
 					  CACHE_MAX_CONCURRENT_LOCKS);
 	if (IS_ERR(cmd->bm)) {
diff --git a/drivers/md/dm-cache-metadata.h b/drivers/md/dm-cache-metadata.h
index cd70a78623a3..7383c90ccdb8 100644
--- a/drivers/md/dm-cache-metadata.h
+++ b/drivers/md/dm-cache-metadata.h
@@ -9,19 +9,17 @@
 
 #include "dm-cache-block-types.h"
 #include "dm-cache-policy-internal.h"
+#include "persistent-data/dm-space-map-metadata.h"
 
 /*----------------------------------------------------------------*/
 
-#define DM_CACHE_METADATA_BLOCK_SIZE 4096
+#define DM_CACHE_METADATA_BLOCK_SIZE DM_SM_METADATA_BLOCK_SIZE
 
 /* FIXME: remove this restriction */
 /*
  * The metadata device is currently limited in size.
- *
- * We have one block of index, which can hold 255 index entries.  Each
- * index entry contains allocation info about 16k metadata blocks.
  */
-#define DM_CACHE_METADATA_MAX_SECTORS (255 * (1 << 14) * (DM_CACHE_METADATA_BLOCK_SIZE / (1 << SECTOR_SHIFT)))
+#define DM_CACHE_METADATA_MAX_SECTORS DM_SM_METADATA_MAX_SECTORS
 
 /*
  * A metadata device larger than 16GB triggers a warning.
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 2c63326638b6..1af40ee209e2 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -718,6 +718,22 @@ static int bio_triggers_commit(struct cache *cache, struct bio *bio)
 	return bio->bi_rw & (REQ_FLUSH | REQ_FUA);
 }
 
+/*
+ * You must increment the deferred set whilst the prison cell is held.  To
+ * encourage this, we ask for 'cell' to be passed in.
+ */
+static void inc_ds(struct cache *cache, struct bio *bio,
+		   struct dm_bio_prison_cell *cell)
+{
+	size_t pb_data_size = get_per_bio_data_size(cache);
+	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
+
+	BUG_ON(!cell);
+	BUG_ON(pb->all_io_entry);
+
+	pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
+}
+
 static void issue(struct cache *cache, struct bio *bio)
 {
 	unsigned long flags;
@@ -737,6 +753,12 @@ static void issue(struct cache *cache, struct bio *bio)
 	spin_unlock_irqrestore(&cache->lock, flags);
 }
 
+static void inc_and_issue(struct cache *cache, struct bio *bio, struct dm_bio_prison_cell *cell)
+{
+	inc_ds(cache, bio, cell);
+	issue(cache, bio);
+}
+
 static void defer_writethrough_bio(struct cache *cache, struct bio *bio)
 {
 	unsigned long flags;
@@ -1015,6 +1037,11 @@ static void issue_overwrite(struct dm_cache_migration *mg, struct bio *bio)
 
 	dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg);
 	remap_to_cache_dirty(mg->cache, bio, mg->new_oblock, mg->cblock);
+
+	/*
+	 * No need to inc_ds() here, since the cell will be held for the
+	 * duration of the io.
+	 */
 	generic_make_request(bio);
 }
 
@@ -1115,8 +1142,7 @@ static void check_for_quiesced_migrations(struct cache *cache,
 		return;
 
 	INIT_LIST_HEAD(&work);
-	if (pb->all_io_entry)
-		dm_deferred_entry_dec(pb->all_io_entry, &work);
+	dm_deferred_entry_dec(pb->all_io_entry, &work);
 
 	if (!list_empty(&work))
 		queue_quiesced_migrations(cache, &work);
@@ -1252,6 +1278,11 @@ static void process_flush_bio(struct cache *cache, struct bio *bio)
 	else
 		remap_to_cache(cache, bio, 0);
 
+	/*
+	 * REQ_FLUSH is not directed at any particular block so we don't
+	 * need to inc_ds().  REQ_FUA's are split into a write + REQ_FLUSH
+	 * by dm-core.
+	 */
 	issue(cache, bio);
 }
 
@@ -1301,15 +1332,6 @@ static void inc_miss_counter(struct cache *cache, struct bio *bio)
 		   &cache->stats.read_miss : &cache->stats.write_miss);
 }
 
-static void issue_cache_bio(struct cache *cache, struct bio *bio,
-			    struct per_bio_data *pb,
-			    dm_oblock_t oblock, dm_cblock_t cblock)
-{
-	pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
-	remap_to_cache_dirty(cache, bio, oblock, cblock);
-	issue(cache, bio);
-}
-
 static void process_bio(struct cache *cache, struct prealloc *structs,
 			struct bio *bio)
 {
@@ -1318,8 +1340,6 @@ static void process_bio(struct cache *cache, struct prealloc *structs,
 	dm_oblock_t block = get_bio_block(cache, bio);
 	struct dm_bio_prison_cell *cell_prealloc, *old_ocell, *new_ocell;
 	struct policy_result lookup_result;
-	size_t pb_data_size = get_per_bio_data_size(cache);
-	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
 	bool discarded_block = is_discarded_oblock(cache, block);
 	bool passthrough = passthrough_mode(&cache->features);
 	bool can_migrate = !passthrough && (discarded_block || spare_migration_bandwidth(cache));
@@ -1359,9 +1379,8 @@ static void process_bio(struct cache *cache, struct prealloc *structs,
 
 			} else {
 				/* FIXME: factor out issue_origin() */
-				pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
 				remap_to_origin_clear_discard(cache, bio, block);
-				issue(cache, bio);
+				inc_and_issue(cache, bio, new_ocell);
 			}
 		} else {
 			inc_hit_counter(cache, bio);
@@ -1369,20 +1388,21 @@ static void process_bio(struct cache *cache, struct prealloc *structs,
 			if (bio_data_dir(bio) == WRITE &&
 			    writethrough_mode(&cache->features) &&
 			    !is_dirty(cache, lookup_result.cblock)) {
-				pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
 				remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
-				issue(cache, bio);
-			} else
-				issue_cache_bio(cache, bio, pb, block, lookup_result.cblock);
+				inc_and_issue(cache, bio, new_ocell);
+
+			} else  {
+				remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
+				inc_and_issue(cache, bio, new_ocell);
+			}
 		}
 
 		break;
 
 	case POLICY_MISS:
 		inc_miss_counter(cache, bio);
-		pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
 		remap_to_origin_clear_discard(cache, bio, block);
-		issue(cache, bio);
+		inc_and_issue(cache, bio, new_ocell);
 		break;
 
 	case POLICY_NEW:
@@ -1501,6 +1521,9 @@ static void process_deferred_flush_bios(struct cache *cache, bool submit_bios)
 	bio_list_init(&cache->deferred_flush_bios);
 	spin_unlock_irqrestore(&cache->lock, flags);
 
+	/*
+	 * These bios have already been through inc_ds()
+	 */
 	while ((bio = bio_list_pop(&bios)))
 		submit_bios ? generic_make_request(bio) : bio_io_error(bio);
 }
@@ -1518,6 +1541,9 @@ static void process_deferred_writethrough_bios(struct cache *cache)
 	bio_list_init(&cache->deferred_writethrough_bios);
 	spin_unlock_irqrestore(&cache->lock, flags);
 
+	/*
+	 * These bios have already been through inc_ds()
+	 */
 	while ((bio = bio_list_pop(&bios)))
 		generic_make_request(bio);
 }
@@ -1694,6 +1720,7 @@ static void do_worker(struct work_struct *ws)
 
 		if (commit_if_needed(cache)) {
 			process_deferred_flush_bios(cache, false);
+			process_migrations(cache, &cache->need_commit_migrations, migration_failure);
 
 			/*
 			 * FIXME: rollback metadata or just go into a
@@ -2406,16 +2433,13 @@ out:
 	return r;
 }
 
-static int cache_map(struct dm_target *ti, struct bio *bio)
+static int __cache_map(struct cache *cache, struct bio *bio, struct dm_bio_prison_cell **cell)
 {
-	struct cache *cache = ti->private;
-
 	int r;
 	dm_oblock_t block = get_bio_block(cache, bio);
 	size_t pb_data_size = get_per_bio_data_size(cache);
 	bool can_migrate = false;
 	bool discarded_block;
-	struct dm_bio_prison_cell *cell;
 	struct policy_result lookup_result;
 	struct per_bio_data *pb = init_per_bio_data(bio, pb_data_size);
 
@@ -2437,15 +2461,15 @@ static int cache_map(struct dm_target *ti, struct bio *bio)
 	/*
 	 * Check to see if that block is currently migrating.
 	 */
-	cell = alloc_prison_cell(cache);
-	if (!cell) {
+	*cell = alloc_prison_cell(cache);
+	if (!*cell) {
 		defer_bio(cache, bio);
 		return DM_MAPIO_SUBMITTED;
 	}
 
-	r = bio_detain(cache, block, bio, cell,
+	r = bio_detain(cache, block, bio, *cell,
 		       (cell_free_fn) free_prison_cell,
-		       cache, &cell);
+		       cache, cell);
 	if (r) {
 		if (r < 0)
 			defer_bio(cache, bio);
@@ -2458,11 +2482,12 @@ static int cache_map(struct dm_target *ti, struct bio *bio)
 	r = policy_map(cache->policy, block, false, can_migrate, discarded_block,
 		       bio, &lookup_result);
 	if (r == -EWOULDBLOCK) {
-		cell_defer(cache, cell, true);
+		cell_defer(cache, *cell, true);
 		return DM_MAPIO_SUBMITTED;
 
 	} else if (r) {
 		DMERR_LIMIT("Unexpected return from cache replacement policy: %d", r);
+		cell_defer(cache, *cell, false);
 		bio_io_error(bio);
 		return DM_MAPIO_SUBMITTED;
 	}
@@ -2476,52 +2501,44 @@ static int cache_map(struct dm_target *ti, struct bio *bio)
 				 * We need to invalidate this block, so
 				 * defer for the worker thread.
 				 */
-				cell_defer(cache, cell, true);
+				cell_defer(cache, *cell, true);
 				r = DM_MAPIO_SUBMITTED;
 
 			} else {
-				pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
 				inc_miss_counter(cache, bio);
 				remap_to_origin_clear_discard(cache, bio, block);
-
-				cell_defer(cache, cell, false);
 			}
 
 		} else {
 			inc_hit_counter(cache, bio);
-			pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
-
 			if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) &&
 			    !is_dirty(cache, lookup_result.cblock))
 				remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
 			else
 				remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
-
-			cell_defer(cache, cell, false);
 		}
 		break;
 
 	case POLICY_MISS:
 		inc_miss_counter(cache, bio);
-		pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
-
 		if (pb->req_nr != 0) {
 			/*
 			 * This is a duplicate writethrough io that is no
 			 * longer needed because the block has been demoted.
 			 */
 			bio_endio(bio, 0);
-			cell_defer(cache, cell, false);
-			return DM_MAPIO_SUBMITTED;
-		} else {
+			cell_defer(cache, *cell, false);
+			r = DM_MAPIO_SUBMITTED;
+
+		} else
 			remap_to_origin_clear_discard(cache, bio, block);
-			cell_defer(cache, cell, false);
-		}
+
 		break;
 
 	default:
 		DMERR_LIMIT("%s: erroring bio: unknown policy op: %u", __func__,
 			    (unsigned) lookup_result.op);
+		cell_defer(cache, *cell, false);
 		bio_io_error(bio);
 		r = DM_MAPIO_SUBMITTED;
 	}
@@ -2529,6 +2546,21 @@ static int cache_map(struct dm_target *ti, struct bio *bio)
 	return r;
 }
 
+static int cache_map(struct dm_target *ti, struct bio *bio)
+{
+	int r;
+	struct dm_bio_prison_cell *cell;
+	struct cache *cache = ti->private;
+
+	r = __cache_map(cache, bio, &cell);
+	if (r == DM_MAPIO_REMAPPED) {
+		inc_ds(cache, bio, cell);
+		cell_defer(cache, cell, false);
+	}
+
+	return r;
+}
+
 static int cache_end_io(struct dm_target *ti, struct bio *bio, int error)
 {
 	struct cache *cache = ti->private;
@@ -2808,7 +2840,7 @@ static void cache_status(struct dm_target *ti, status_type_t type,
 		residency = policy_residency(cache->policy);
 
 		DMEMIT("%u %llu/%llu %u %llu/%llu %u %u %u %u %u %u %lu ",
-		       (unsigned)(DM_CACHE_METADATA_BLOCK_SIZE >> SECTOR_SHIFT),
+		       (unsigned)DM_CACHE_METADATA_BLOCK_SIZE,
 		       (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
 		       (unsigned long long)nr_blocks_metadata,
 		       cache->sectors_per_block,
@@ -3062,7 +3094,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
 	 */
 	if (io_opt_sectors < cache->sectors_per_block ||
 	    do_div(io_opt_sectors, cache->sectors_per_block)) {
-		blk_limits_io_min(limits, 0);
+		blk_limits_io_min(limits, cache->sectors_per_block << SECTOR_SHIFT);
 		blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT);
 	}
 	set_discard_limits(cache, limits);
@@ -3072,7 +3104,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
 
 static struct target_type cache_target = {
 	.name = "cache",
-	.version = {1, 4, 0},
+	.version = {1, 5, 0},
 	.module = THIS_MODULE,
 	.ctr = cache_ctr,
 	.dtr = cache_dtr,
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 4cba2d808afb..2785007e0e46 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -59,7 +59,7 @@ struct dm_crypt_io {
 	int error;
 	sector_t sector;
 	struct dm_crypt_io *base_io;
-};
+} CRYPTO_MINALIGN_ATTR;
 
 struct dm_crypt_request {
 	struct convert_context *ctx;
@@ -162,6 +162,8 @@ struct crypt_config {
 	 */
 	unsigned int dmreq_start;
 
+	unsigned int per_bio_data_size;
+
 	unsigned long flags;
 	unsigned int key_size;
 	unsigned int key_parts;      /* independent parts in key buffer */
@@ -895,6 +897,15 @@ static void crypt_alloc_req(struct crypt_config *cc,
 	    kcryptd_async_done, dmreq_of_req(cc, ctx->req));
 }
 
+static void crypt_free_req(struct crypt_config *cc,
+			   struct ablkcipher_request *req, struct bio *base_bio)
+{
+	struct dm_crypt_io *io = dm_per_bio_data(base_bio, cc->per_bio_data_size);
+
+	if ((struct ablkcipher_request *)(io + 1) != req)
+		mempool_free(req, cc->req_pool);
+}
+
 /*
  * Encrypt / decrypt data from one bio to another one (can be the same one)
  */
@@ -1008,12 +1019,9 @@ static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone)
 	}
 }
 
-static struct dm_crypt_io *crypt_io_alloc(struct crypt_config *cc,
-					  struct bio *bio, sector_t sector)
+static void crypt_io_init(struct dm_crypt_io *io, struct crypt_config *cc,
+			  struct bio *bio, sector_t sector)
 {
-	struct dm_crypt_io *io;
-
-	io = mempool_alloc(cc->io_pool, GFP_NOIO);
 	io->cc = cc;
 	io->base_bio = bio;
 	io->sector = sector;
@@ -1021,8 +1029,6 @@ static struct dm_crypt_io *crypt_io_alloc(struct crypt_config *cc,
 	io->base_io = NULL;
 	io->ctx.req = NULL;
 	atomic_set(&io->io_pending, 0);
-
-	return io;
 }
 
 static void crypt_inc_pending(struct dm_crypt_io *io)
@@ -1046,8 +1052,9 @@ static void crypt_dec_pending(struct dm_crypt_io *io)
 		return;
 
 	if (io->ctx.req)
-		mempool_free(io->ctx.req, cc->req_pool);
-	mempool_free(io, cc->io_pool);
+		crypt_free_req(cc, io->ctx.req, base_bio);
+	if (io != dm_per_bio_data(base_bio, cc->per_bio_data_size))
+		mempool_free(io, cc->io_pool);
 
 	if (likely(!base_io))
 		bio_endio(base_bio, error);
@@ -1255,8 +1262,8 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
 		 * between fragments, so switch to a new dm_crypt_io structure.
 		 */
 		if (unlikely(!crypt_finished && remaining)) {
-			new_io = crypt_io_alloc(io->cc, io->base_bio,
-						sector);
+			new_io = mempool_alloc(cc->io_pool, GFP_NOIO);
+			crypt_io_init(new_io, io->cc, io->base_bio, sector);
 			crypt_inc_pending(new_io);
 			crypt_convert_init(cc, &new_io->ctx, NULL,
 					   io->base_bio, sector);
@@ -1325,7 +1332,7 @@ static void kcryptd_async_done(struct crypto_async_request *async_req,
 	if (error < 0)
 		io->error = -EIO;
 
-	mempool_free(req_of_dmreq(cc, dmreq), cc->req_pool);
+	crypt_free_req(cc, req_of_dmreq(cc, dmreq), io->base_bio);
 
 	if (!atomic_dec_and_test(&ctx->cc_pending))
 		return;
@@ -1728,6 +1735,10 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		goto bad;
 	}
 
+	cc->per_bio_data_size = ti->per_bio_data_size =
+				sizeof(struct dm_crypt_io) + cc->dmreq_start +
+				sizeof(struct dm_crypt_request) + cc->iv_size;
+
 	cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0);
 	if (!cc->page_pool) {
 		ti->error = "Cannot allocate page mempool";
@@ -1824,7 +1835,9 @@ static int crypt_map(struct dm_target *ti, struct bio *bio)
 		return DM_MAPIO_REMAPPED;
 	}
 
-	io = crypt_io_alloc(cc, bio, dm_target_offset(ti, bio->bi_iter.bi_sector));
+	io = dm_per_bio_data(bio, cc->per_bio_data_size);
+	crypt_io_init(io, cc, bio, dm_target_offset(ti, bio->bi_iter.bi_sector));
+	io->ctx.req = (struct ablkcipher_request *)(io + 1);
 
 	if (bio_data_dir(io->base_bio) == READ) {
 		if (kcryptd_io_read(io, GFP_NOWAIT))
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index db404a0f7e2c..c09359db3a90 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -33,7 +33,6 @@ struct dm_io_client {
 struct io {
 	unsigned long error_bits;
 	atomic_t count;
-	struct completion *wait;
 	struct dm_io_client *client;
 	io_notify_fn callback;
 	void *context;
@@ -112,28 +111,27 @@ static void retrieve_io_and_region_from_bio(struct bio *bio, struct io **io,
  * We need an io object to keep track of the number of bios that
  * have been dispatched for a particular io.
  *---------------------------------------------------------------*/
-static void dec_count(struct io *io, unsigned int region, int error)
+static void complete_io(struct io *io)
 {
-	if (error)
-		set_bit(region, &io->error_bits);
+	unsigned long error_bits = io->error_bits;
+	io_notify_fn fn = io->callback;
+	void *context = io->context;
 
-	if (atomic_dec_and_test(&io->count)) {
-		if (io->vma_invalidate_size)
-			invalidate_kernel_vmap_range(io->vma_invalidate_address,
-						     io->vma_invalidate_size);
+	if (io->vma_invalidate_size)
+		invalidate_kernel_vmap_range(io->vma_invalidate_address,
+					     io->vma_invalidate_size);
 
-		if (io->wait)
-			complete(io->wait);
+	mempool_free(io, io->client->pool);
+	fn(error_bits, context);
+}
 
-		else {
-			unsigned long r = io->error_bits;
-			io_notify_fn fn = io->callback;
-			void *context = io->context;
+static void dec_count(struct io *io, unsigned int region, int error)
+{
+	if (error)
+		set_bit(region, &io->error_bits);
 
-			mempool_free(io, io->client->pool);
-			fn(r, context);
-		}
-	}
+	if (atomic_dec_and_test(&io->count))
+		complete_io(io);
 }
 
 static void endio(struct bio *bio, int error)
@@ -376,41 +374,51 @@ static void dispatch_io(int rw, unsigned int num_regions,
 	dec_count(io, 0, 0);
 }
 
+struct sync_io {
+	unsigned long error_bits;
+	struct completion wait;
+};
+
+static void sync_io_complete(unsigned long error, void *context)
+{
+	struct sync_io *sio = context;
+
+	sio->error_bits = error;
+	complete(&sio->wait);
+}
+
 static int sync_io(struct dm_io_client *client, unsigned int num_regions,
 		   struct dm_io_region *where, int rw, struct dpages *dp,
 		   unsigned long *error_bits)
 {
-	/*
-	 * gcc <= 4.3 can't do the alignment for stack variables, so we must
-	 * align it on our own.
-	 * volatile prevents the optimizer from removing or reusing
-	 * "io_" field from the stack frame (allowed in ANSI C).
-	 */
-	volatile char io_[sizeof(struct io) + __alignof__(struct io) - 1];
-	struct io *io = (struct io *)PTR_ALIGN(&io_, __alignof__(struct io));
-	DECLARE_COMPLETION_ONSTACK(wait);
+	struct io *io;
+	struct sync_io sio;
 
 	if (num_regions > 1 && (rw & RW_MASK) != WRITE) {
 		WARN_ON(1);
 		return -EIO;
 	}
 
+	init_completion(&sio.wait);
+
+	io = mempool_alloc(client->pool, GFP_NOIO);
 	io->error_bits = 0;
 	atomic_set(&io->count, 1); /* see dispatch_io() */
-	io->wait = &wait;
 	io->client = client;
+	io->callback = sync_io_complete;
+	io->context = &sio;
 
 	io->vma_invalidate_address = dp->vma_invalidate_address;
 	io->vma_invalidate_size = dp->vma_invalidate_size;
 
 	dispatch_io(rw, num_regions, where, dp, io, 1);
 
-	wait_for_completion_io(&wait);
+	wait_for_completion_io(&sio.wait);
 
 	if (error_bits)
-		*error_bits = io->error_bits;
+		*error_bits = sio.error_bits;
 
-	return io->error_bits ? -EIO : 0;
+	return sio.error_bits ? -EIO : 0;
 }
 
 static int async_io(struct dm_io_client *client, unsigned int num_regions,
@@ -428,7 +436,6 @@ static int async_io(struct dm_io_client *client, unsigned int num_regions,
 	io = mempool_alloc(client->pool, GFP_NOIO);
 	io->error_bits = 0;
 	atomic_set(&io->count, 1); /* see dispatch_io() */
-	io->wait = NULL;
 	io->client = client;
 	io->callback = fn;
 	io->context = context;
@@ -481,9 +488,9 @@ static int dp_init(struct dm_io_request *io_req, struct dpages *dp,
  * New collapsed (a)synchronous interface.
  *
  * If the IO is asynchronous (i.e. it has notify.fn), you must either unplug
- * the queue with blk_unplug() some time later or set REQ_SYNC in
-io_req->bi_rw. If you fail to do one of these, the IO will be submitted to
- * the disk after q->unplug_delay, which defaults to 3ms in blk-settings.c.
+ * the queue with blk_unplug() some time later or set REQ_SYNC in io_req->bi_rw.
+ * If you fail to do one of these, the IO will be submitted to the disk after
+ * q->unplug_delay, which defaults to 3ms in blk-settings.c.
  */
 int dm_io(struct dm_io_request *io_req, unsigned num_regions,
 	  struct dm_io_region *where, unsigned long *sync_error_bits)
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index f4167b013d99..833d7e752f06 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -373,8 +373,6 @@ static int __must_push_back(struct multipath *m)
 		 dm_noflush_suspending(m->ti)));
 }
 
-#define pg_ready(m) (!(m)->queue_io && !(m)->pg_init_required)
-
 /*
  * Map cloned requests
  */
@@ -402,11 +400,11 @@ static int multipath_map(struct dm_target *ti, struct request *clone,
 		if (!__must_push_back(m))
 			r = -EIO;	/* Failed */
 		goto out_unlock;
-	}
-	if (!pg_ready(m)) {
+	} else if (m->queue_io || m->pg_init_required) {
 		__pg_init_all_paths(m);
 		goto out_unlock;
 	}
+
 	if (set_mapinfo(m, map_context) < 0)
 		/* ENOMEM, requeue */
 		goto out_unlock;
diff --git a/drivers/md/dm-switch.c b/drivers/md/dm-switch.c
index 09a688b3d48c..50fca469cafd 100644
--- a/drivers/md/dm-switch.c
+++ b/drivers/md/dm-switch.c
@@ -137,13 +137,23 @@ static void switch_get_position(struct switch_ctx *sctx, unsigned long region_nr
 	*bit *= sctx->region_table_entry_bits;
 }
 
+static unsigned switch_region_table_read(struct switch_ctx *sctx, unsigned long region_nr)
+{
+	unsigned long region_index;
+	unsigned bit;
+
+	switch_get_position(sctx, region_nr, &region_index, &bit);
+
+	return (ACCESS_ONCE(sctx->region_table[region_index]) >> bit) &
+		((1 << sctx->region_table_entry_bits) - 1);
+}
+
 /*
  * Find which path to use at given offset.
  */
 static unsigned switch_get_path_nr(struct switch_ctx *sctx, sector_t offset)
 {
-	unsigned long region_index;
-	unsigned bit, path_nr;
+	unsigned path_nr;
 	sector_t p;
 
 	p = offset;
@@ -152,9 +162,7 @@ static unsigned switch_get_path_nr(struct switch_ctx *sctx, sector_t offset)
 	else
 		sector_div(p, sctx->region_size);
 
-	switch_get_position(sctx, p, &region_index, &bit);
-	path_nr = (ACCESS_ONCE(sctx->region_table[region_index]) >> bit) &
-	       ((1 << sctx->region_table_entry_bits) - 1);
+	path_nr = switch_region_table_read(sctx, p);
 
 	/* This can only happen if the processor uses non-atomic stores. */
 	if (unlikely(path_nr >= sctx->nr_paths))
@@ -363,7 +371,7 @@ static __always_inline unsigned long parse_hex(const char **string)
 }
 
 static int process_set_region_mappings(struct switch_ctx *sctx,
-			     unsigned argc, char **argv)
+				       unsigned argc, char **argv)
 {
 	unsigned i;
 	unsigned long region_index = 0;
@@ -372,6 +380,51 @@ static int process_set_region_mappings(struct switch_ctx *sctx,
 		unsigned long path_nr;
 		const char *string = argv[i];
 
+		if ((*string & 0xdf) == 'R') {
+			unsigned long cycle_length, num_write;
+
+			string++;
+			if (unlikely(*string == ',')) {
+				DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
+				return -EINVAL;
+			}
+			cycle_length = parse_hex(&string);
+			if (unlikely(*string != ',')) {
+				DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
+				return -EINVAL;
+			}
+			string++;
+			if (unlikely(!*string)) {
+				DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
+				return -EINVAL;
+			}
+			num_write = parse_hex(&string);
+			if (unlikely(*string)) {
+				DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
+				return -EINVAL;
+			}
+
+			if (unlikely(!cycle_length) || unlikely(cycle_length - 1 > region_index)) {
+				DMWARN("invalid set_region_mappings cycle length: %lu > %lu",
+				       cycle_length - 1, region_index);
+				return -EINVAL;
+			}
+			if (unlikely(region_index + num_write < region_index) ||
+			    unlikely(region_index + num_write >= sctx->nr_regions)) {
+				DMWARN("invalid set_region_mappings region number: %lu + %lu >= %lu",
+				       region_index, num_write, sctx->nr_regions);
+				return -EINVAL;
+			}
+
+			while (num_write--) {
+				region_index++;
+				path_nr = switch_region_table_read(sctx, region_index - cycle_length);
+				switch_region_table_write(sctx, region_index, path_nr);
+			}
+
+			continue;
+		}
+
 		if (*string == ':')
 			region_index++;
 		else {
@@ -500,7 +553,7 @@ static int switch_iterate_devices(struct dm_target *ti,
 
 static struct target_type switch_target = {
 	.name = "switch",
-	.version = {1, 0, 0},
+	.version = {1, 1, 0},
 	.module = THIS_MODULE,
 	.ctr = switch_ctr,
 	.dtr = switch_dtr,
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 5f59f1e3e5b1..f9c6cb8dbcf8 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1386,6 +1386,14 @@ static int device_is_not_random(struct dm_target *ti, struct dm_dev *dev,
 	return q && !blk_queue_add_random(q);
 }
 
+static int queue_supports_sg_merge(struct dm_target *ti, struct dm_dev *dev,
+				   sector_t start, sector_t len, void *data)
+{
+	struct request_queue *q = bdev_get_queue(dev->bdev);
+
+	return q && !test_bit(QUEUE_FLAG_NO_SG_MERGE, &q->queue_flags);
+}
+
 static bool dm_table_all_devices_attribute(struct dm_table *t,
 					   iterate_devices_callout_fn func)
 {
@@ -1430,6 +1438,43 @@ static bool dm_table_supports_write_same(struct dm_table *t)
 	return true;
 }
 
+static int device_discard_capable(struct dm_target *ti, struct dm_dev *dev,
+				  sector_t start, sector_t len, void *data)
+{
+	struct request_queue *q = bdev_get_queue(dev->bdev);
+
+	return q && blk_queue_discard(q);
+}
+
+static bool dm_table_supports_discards(struct dm_table *t)
+{
+	struct dm_target *ti;
+	unsigned i = 0;
+
+	/*
+	 * Unless any target used by the table set discards_supported,
+	 * require at least one underlying device to support discards.
+	 * t->devices includes internal dm devices such as mirror logs
+	 * so we need to use iterate_devices here, which targets
+	 * supporting discard selectively must provide.
+	 */
+	while (i < dm_table_get_num_targets(t)) {
+		ti = dm_table_get_target(t, i++);
+
+		if (!ti->num_discard_bios)
+			continue;
+
+		if (ti->discards_supported)
+			return 1;
+
+		if (ti->type->iterate_devices &&
+		    ti->type->iterate_devices(ti, device_discard_capable, NULL))
+			return 1;
+	}
+
+	return 0;
+}
+
 void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
 			       struct queue_limits *limits)
 {
@@ -1464,6 +1509,11 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
 	if (!dm_table_supports_write_same(t))
 		q->limits.max_write_same_sectors = 0;
 
+	if (dm_table_all_devices_attribute(t, queue_supports_sg_merge))
+		queue_flag_clear_unlocked(QUEUE_FLAG_NO_SG_MERGE, q);
+	else
+		queue_flag_set_unlocked(QUEUE_FLAG_NO_SG_MERGE, q);
+
 	dm_table_set_integrity(t);
 
 	/*
@@ -1636,39 +1686,3 @@ void dm_table_run_md_queue_async(struct dm_table *t)
 }
 EXPORT_SYMBOL(dm_table_run_md_queue_async);
 
-static int device_discard_capable(struct dm_target *ti, struct dm_dev *dev,
-				  sector_t start, sector_t len, void *data)
-{
-	struct request_queue *q = bdev_get_queue(dev->bdev);
-
-	return q && blk_queue_discard(q);
-}
-
-bool dm_table_supports_discards(struct dm_table *t)
-{
-	struct dm_target *ti;
-	unsigned i = 0;
-
-	/*
-	 * Unless any target used by the table set discards_supported,
-	 * require at least one underlying device to support discards.
-	 * t->devices includes internal dm devices such as mirror logs
-	 * so we need to use iterate_devices here, which targets
-	 * supporting discard selectively must provide.
-	 */
-	while (i < dm_table_get_num_targets(t)) {
-		ti = dm_table_get_target(t, i++);
-
-		if (!ti->num_discard_bios)
-			continue;
-
-		if (ti->discards_supported)
-			return 1;
-
-		if (ti->type->iterate_devices &&
-		    ti->type->iterate_devices(ti, device_discard_capable, NULL))
-			return 1;
-	}
-
-	return 0;
-}
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index fc9c848a60c9..4843801173fe 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -227,6 +227,7 @@ struct thin_c {
 	struct list_head list;
 	struct dm_dev *pool_dev;
 	struct dm_dev *origin_dev;
+	sector_t origin_size;
 	dm_thin_id dev_id;
 
 	struct pool *pool;
@@ -554,11 +555,16 @@ static void remap_and_issue(struct thin_c *tc, struct bio *bio,
 struct dm_thin_new_mapping {
 	struct list_head list;
 
-	bool quiesced:1;
-	bool prepared:1;
 	bool pass_discard:1;
 	bool definitely_not_shared:1;
 
+	/*
+	 * Track quiescing, copying and zeroing preparation actions.  When this
+	 * counter hits zero the block is prepared and can be inserted into the
+	 * btree.
+	 */
+	atomic_t prepare_actions;
+
 	int err;
 	struct thin_c *tc;
 	dm_block_t virt_block;
@@ -575,43 +581,41 @@ struct dm_thin_new_mapping {
 	bio_end_io_t *saved_bi_end_io;
 };
 
-static void __maybe_add_mapping(struct dm_thin_new_mapping *m)
+static void __complete_mapping_preparation(struct dm_thin_new_mapping *m)
 {
 	struct pool *pool = m->tc->pool;
 
-	if (m->quiesced && m->prepared) {
+	if (atomic_dec_and_test(&m->prepare_actions)) {
 		list_add_tail(&m->list, &pool->prepared_mappings);
 		wake_worker(pool);
 	}
 }
 
-static void copy_complete(int read_err, unsigned long write_err, void *context)
+static void complete_mapping_preparation(struct dm_thin_new_mapping *m)
 {
 	unsigned long flags;
-	struct dm_thin_new_mapping *m = context;
 	struct pool *pool = m->tc->pool;
 
-	m->err = read_err || write_err ? -EIO : 0;
-
 	spin_lock_irqsave(&pool->lock, flags);
-	m->prepared = true;
-	__maybe_add_mapping(m);
+	__complete_mapping_preparation(m);
 	spin_unlock_irqrestore(&pool->lock, flags);
 }
 
+static void copy_complete(int read_err, unsigned long write_err, void *context)
+{
+	struct dm_thin_new_mapping *m = context;
+
+	m->err = read_err || write_err ? -EIO : 0;
+	complete_mapping_preparation(m);
+}
+
 static void overwrite_endio(struct bio *bio, int err)
 {
-	unsigned long flags;
 	struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
 	struct dm_thin_new_mapping *m = h->overwrite_mapping;
-	struct pool *pool = m->tc->pool;
 
 	m->err = err;
-
-	spin_lock_irqsave(&pool->lock, flags);
-	m->prepared = true;
-	__maybe_add_mapping(m);
-	spin_unlock_irqrestore(&pool->lock, flags);
+	complete_mapping_preparation(m);
 }
 
 /*----------------------------------------------------------------*/
@@ -821,10 +825,31 @@ static struct dm_thin_new_mapping *get_next_mapping(struct pool *pool)
 	return m;
 }
 
+static void ll_zero(struct thin_c *tc, struct dm_thin_new_mapping *m,
+		    sector_t begin, sector_t end)
+{
+	int r;
+	struct dm_io_region to;
+
+	to.bdev = tc->pool_dev->bdev;
+	to.sector = begin;
+	to.count = end - begin;
+
+	r = dm_kcopyd_zero(tc->pool->copier, 1, &to, 0, copy_complete, m);
+	if (r < 0) {
+		DMERR_LIMIT("dm_kcopyd_zero() failed");
+		copy_complete(1, 1, m);
+	}
+}
+
+/*
+ * A partial copy also needs to zero the uncopied region.
+ */
 static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
 			  struct dm_dev *origin, dm_block_t data_origin,
 			  dm_block_t data_dest,
-			  struct dm_bio_prison_cell *cell, struct bio *bio)
+			  struct dm_bio_prison_cell *cell, struct bio *bio,
+			  sector_t len)
 {
 	int r;
 	struct pool *pool = tc->pool;
@@ -835,8 +860,15 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
 	m->data_block = data_dest;
 	m->cell = cell;
 
+	/*
+	 * quiesce action + copy action + an extra reference held for the
+	 * duration of this function (we may need to inc later for a
+	 * partial zero).
+	 */
+	atomic_set(&m->prepare_actions, 3);
+
 	if (!dm_deferred_set_add_work(pool->shared_read_ds, &m->list))
-		m->quiesced = true;
+		complete_mapping_preparation(m); /* already quiesced */
 
 	/*
 	 * IO to pool_dev remaps to the pool target's data_dev.
@@ -857,20 +889,38 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
 
 		from.bdev = origin->bdev;
 		from.sector = data_origin * pool->sectors_per_block;
-		from.count = pool->sectors_per_block;
+		from.count = len;
 
 		to.bdev = tc->pool_dev->bdev;
 		to.sector = data_dest * pool->sectors_per_block;
-		to.count = pool->sectors_per_block;
+		to.count = len;
 
 		r = dm_kcopyd_copy(pool->copier, &from, 1, &to,
 				   0, copy_complete, m);
 		if (r < 0) {
-			mempool_free(m, pool->mapping_pool);
 			DMERR_LIMIT("dm_kcopyd_copy() failed");
-			cell_error(pool, cell);
+			copy_complete(1, 1, m);
+
+			/*
+			 * We allow the zero to be issued, to simplify the
+			 * error path.  Otherwise we'd need to start
+			 * worrying about decrementing the prepare_actions
+			 * counter.
+			 */
+		}
+
+		/*
+		 * Do we need to zero a tail region?
+		 */
+		if (len < pool->sectors_per_block && pool->pf.zero_new_blocks) {
+			atomic_inc(&m->prepare_actions);
+			ll_zero(tc, m,
+				data_dest * pool->sectors_per_block + len,
+				(data_dest + 1) * pool->sectors_per_block);
 		}
 	}
+
+	complete_mapping_preparation(m); /* drop our ref */
 }
 
 static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block,
@@ -878,15 +928,8 @@ static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block,
 				   struct dm_bio_prison_cell *cell, struct bio *bio)
 {
 	schedule_copy(tc, virt_block, tc->pool_dev,
-		      data_origin, data_dest, cell, bio);
-}
-
-static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block,
-				   dm_block_t data_dest,
-				   struct dm_bio_prison_cell *cell, struct bio *bio)
-{
-	schedule_copy(tc, virt_block, tc->origin_dev,
-		      virt_block, data_dest, cell, bio);
+		      data_origin, data_dest, cell, bio,
+		      tc->pool->sectors_per_block);
 }
 
 static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
@@ -896,8 +939,7 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
 	struct pool *pool = tc->pool;
 	struct dm_thin_new_mapping *m = get_next_mapping(pool);
 
-	m->quiesced = true;
-	m->prepared = false;
+	atomic_set(&m->prepare_actions, 1); /* no need to quiesce */
 	m->tc = tc;
 	m->virt_block = virt_block;
 	m->data_block = data_block;
@@ -919,21 +961,33 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
 		save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
 		inc_all_io_entry(pool, bio);
 		remap_and_issue(tc, bio, data_block);
-	} else {
-		int r;
-		struct dm_io_region to;
 
-		to.bdev = tc->pool_dev->bdev;
-		to.sector = data_block * pool->sectors_per_block;
-		to.count = pool->sectors_per_block;
+	} else
+		ll_zero(tc, m,
+			data_block * pool->sectors_per_block,
+			(data_block + 1) * pool->sectors_per_block);
+}
 
-		r = dm_kcopyd_zero(pool->copier, 1, &to, 0, copy_complete, m);
-		if (r < 0) {
-			mempool_free(m, pool->mapping_pool);
-			DMERR_LIMIT("dm_kcopyd_zero() failed");
-			cell_error(pool, cell);
-		}
-	}
+static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block,
+				   dm_block_t data_dest,
+				   struct dm_bio_prison_cell *cell, struct bio *bio)
+{
+	struct pool *pool = tc->pool;
+	sector_t virt_block_begin = virt_block * pool->sectors_per_block;
+	sector_t virt_block_end = (virt_block + 1) * pool->sectors_per_block;
+
+	if (virt_block_end <= tc->origin_size)
+		schedule_copy(tc, virt_block, tc->origin_dev,
+			      virt_block, data_dest, cell, bio,
+			      pool->sectors_per_block);
+
+	else if (virt_block_begin < tc->origin_size)
+		schedule_copy(tc, virt_block, tc->origin_dev,
+			      virt_block, data_dest, cell, bio,
+			      tc->origin_size - virt_block_begin);
+
+	else
+		schedule_zero(tc, virt_block, data_dest, cell, bio);
 }
 
 /*
@@ -1315,7 +1369,18 @@ static void process_bio(struct thin_c *tc, struct bio *bio)
 			inc_all_io_entry(pool, bio);
 			cell_defer_no_holder(tc, cell);
 
-			remap_to_origin_and_issue(tc, bio);
+			if (bio_end_sector(bio) <= tc->origin_size)
+				remap_to_origin_and_issue(tc, bio);
+
+			else if (bio->bi_iter.bi_sector < tc->origin_size) {
+				zero_fill_bio(bio);
+				bio->bi_iter.bi_size = (tc->origin_size - bio->bi_iter.bi_sector) << SECTOR_SHIFT;
+				remap_to_origin_and_issue(tc, bio);
+
+			} else {
+				zero_fill_bio(bio);
+				bio_endio(bio, 0);
+			}
 		} else
 			provision_block(tc, bio, block, cell);
 		break;
@@ -3112,7 +3177,7 @@ static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
 	 */
 	if (io_opt_sectors < pool->sectors_per_block ||
 	    do_div(io_opt_sectors, pool->sectors_per_block)) {
-		blk_limits_io_min(limits, 0);
+		blk_limits_io_min(limits, pool->sectors_per_block << SECTOR_SHIFT);
 		blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
 	}
 
@@ -3141,7 +3206,7 @@ static struct target_type pool_target = {
 	.name = "thin-pool",
 	.features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
 		    DM_TARGET_IMMUTABLE,
-	.version = {1, 12, 0},
+	.version = {1, 13, 0},
 	.module = THIS_MODULE,
 	.ctr = pool_ctr,
 	.dtr = pool_dtr,
@@ -3361,8 +3426,7 @@ static int thin_endio(struct dm_target *ti, struct bio *bio, int err)
 		spin_lock_irqsave(&pool->lock, flags);
 		list_for_each_entry_safe(m, tmp, &work, list) {
 			list_del(&m->list);
-			m->quiesced = true;
-			__maybe_add_mapping(m);
+			__complete_mapping_preparation(m);
 		}
 		spin_unlock_irqrestore(&pool->lock, flags);
 	}
@@ -3401,6 +3465,16 @@ static void thin_postsuspend(struct dm_target *ti)
 	noflush_work(tc, do_noflush_stop);
 }
 
+static int thin_preresume(struct dm_target *ti)
+{
+	struct thin_c *tc = ti->private;
+
+	if (tc->origin_dev)
+		tc->origin_size = get_dev_size(tc->origin_dev->bdev);
+
+	return 0;
+}
+
 /*
  * <nr mapped sectors> <highest mapped sector>
  */
@@ -3483,12 +3557,13 @@ static int thin_iterate_devices(struct dm_target *ti,
 
 static struct target_type thin_target = {
 	.name = "thin",
-	.version = {1, 12, 0},
+	.version = {1, 13, 0},
 	.module	= THIS_MODULE,
 	.ctr = thin_ctr,
 	.dtr = thin_dtr,
 	.map = thin_map,
 	.end_io = thin_endio,
+	.preresume = thin_preresume,
 	.presuspend = thin_presuspend,
 	.postsuspend = thin_postsuspend,
 	.status = thin_status,
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index ed76126aac54..e81d2152fa68 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -72,7 +72,6 @@ int dm_table_any_busy_target(struct dm_table *t);
 unsigned dm_table_get_type(struct dm_table *t);
 struct target_type *dm_table_get_immutable_target_type(struct dm_table *t);
 bool dm_table_request_based(struct dm_table *t);
-bool dm_table_supports_discards(struct dm_table *t);
 void dm_table_free_md_mempools(struct dm_table *t);
 struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t);
 
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 32fc19c540d4..1294238610df 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5961,7 +5961,7 @@ static int set_bitmap_file(struct mddev *mddev, int fd)
 	int err = 0;
 
 	if (mddev->pers) {
-		if (!mddev->pers->quiesce)
+		if (!mddev->pers->quiesce || !mddev->thread)
 			return -EBUSY;
 		if (mddev->recovery || mddev->sync_thread)
 			return -EBUSY;
@@ -6263,7 +6263,7 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
 		rv = update_raid_disks(mddev, info->raid_disks);
 
 	if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) {
-		if (mddev->pers->quiesce == NULL)
+		if (mddev->pers->quiesce == NULL || mddev->thread == NULL)
 			return -EINVAL;
 		if (mddev->recovery || mddev->sync_thread)
 			return -EBUSY;
@@ -7376,7 +7376,7 @@ void md_do_sync(struct md_thread *thread)
 	struct mddev *mddev2;
 	unsigned int currspeed = 0,
 		 window;
-	sector_t max_sectors,j, io_sectors;
+	sector_t max_sectors,j, io_sectors, recovery_done;
 	unsigned long mark[SYNC_MARKS];
 	unsigned long update_time;
 	sector_t mark_cnt[SYNC_MARKS];
@@ -7652,7 +7652,8 @@ void md_do_sync(struct md_thread *thread)
 		 */
 		cond_resched();
 
-		currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2
+		recovery_done = io_sectors - atomic_read(&mddev->recovery_active);
+		currspeed = ((unsigned long)(recovery_done - mddev->resync_mark_cnt))/2
 			/((jiffies-mddev->resync_mark)/HZ +1) +1;
 
 		if (currspeed > speed_min(mddev)) {
@@ -8592,7 +8593,7 @@ static int __init md_init(void)
 		goto err_mdp;
 	mdp_major = ret;
 
-	blk_register_region(MKDEV(MD_MAJOR, 0), 1UL<<MINORBITS, THIS_MODULE,
+	blk_register_region(MKDEV(MD_MAJOR, 0), 512, THIS_MODULE,
 			    md_probe, NULL, NULL);
 	blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE,
 			    md_probe, NULL, NULL);
@@ -8687,7 +8688,7 @@ static __exit void md_exit(void)
 	struct list_head *tmp;
 	int delay = 1;
 
-	blk_unregister_region(MKDEV(MD_MAJOR,0), 1U << MINORBITS);
+	blk_unregister_region(MKDEV(MD_MAJOR,0), 512);
 	blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS);
 
 	unregister_blkdev(MD_MAJOR,"md");
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 407a99e46f69..cf91f5910c7c 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -685,6 +685,12 @@ static void *raid0_takeover(struct mddev *mddev)
 	 *  raid10 - assuming we have all necessary active disks
 	 *  raid1 - with (N -1) mirror drives faulty
 	 */
+
+	if (mddev->bitmap) {
+		printk(KERN_ERR "md/raid0: %s: cannot takeover array with bitmap\n",
+		       mdname(mddev));
+		return ERR_PTR(-EBUSY);
+	}
 	if (mddev->level == 4)
 		return raid0_takeover_raid45(mddev);
 
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 56e24c072b62..d7690f86fdb9 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1501,12 +1501,12 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
 		mddev->degraded++;
 		set_bit(Faulty, &rdev->flags);
 		spin_unlock_irqrestore(&conf->device_lock, flags);
-		/*
-		 * if recovery is running, make sure it aborts.
-		 */
-		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
 	} else
 		set_bit(Faulty, &rdev->flags);
+	/*
+	 * if recovery is running, make sure it aborts.
+	 */
+	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
 	set_bit(MD_CHANGE_DEVS, &mddev->flags);
 	printk(KERN_ALERT
 	       "md/raid1:%s: Disk failure on %s, disabling device.\n"
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index cb882aae9e20..b08c18871323 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1684,13 +1684,12 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
 		spin_unlock_irqrestore(&conf->device_lock, flags);
 		return;
 	}
-	if (test_and_clear_bit(In_sync, &rdev->flags)) {
+	if (test_and_clear_bit(In_sync, &rdev->flags))
 		mddev->degraded++;
-			/*
-		 * if recovery is running, make sure it aborts.
-		 */
-		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
-	}
+	/*
+	 * If recovery is running, make sure it aborts.
+	 */
+	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
 	set_bit(Blocked, &rdev->flags);
 	set_bit(Faulty, &rdev->flags);
 	set_bit(MD_CHANGE_DEVS, &mddev->flags);
diff --git a/drivers/mmc/card/block.c b/drivers/mmc/card/block.c
index 452782bffebc..ede41f05c392 100644
--- a/drivers/mmc/card/block.c
+++ b/drivers/mmc/card/block.c
@@ -2028,8 +2028,7 @@ static int mmc_blk_issue_rq(struct mmc_queue *mq, struct request *req)
 		/* complete ongoing async transfer before issuing discard */
 		if (card->host->areq)
 			mmc_blk_issue_rw_rq(mq, NULL);
-		if (req->cmd_flags & REQ_SECURE &&
-			!(card->quirks & MMC_QUIRK_SEC_ERASE_TRIM_BROKEN))
+		if (req->cmd_flags & REQ_SECURE)
 			ret = mmc_blk_issue_secdiscard_rq(mq, req);
 		else
 			ret = mmc_blk_issue_discard_rq(mq, req);
@@ -2432,6 +2431,8 @@ static int mmc_blk_probe(struct mmc_card *card)
 	if (!(card->csd.cmdclass & CCC_BLOCK_READ))
 		return -ENODEV;
 
+	mmc_fixup_device(card, blk_fixups);
+
 	md = mmc_blk_alloc(card);
 	if (IS_ERR(md))
 		return PTR_ERR(md);
@@ -2446,7 +2447,6 @@ static int mmc_blk_probe(struct mmc_card *card)
 		goto out;
 
 	mmc_set_drvdata(card, md);
-	mmc_fixup_device(card, blk_fixups);
 
 	if (mmc_add_disk(md))
 		goto out;
diff --git a/drivers/mmc/core/bus.c b/drivers/mmc/core/bus.c
index d2dbf02022bd..8a1f1240e058 100644
--- a/drivers/mmc/core/bus.c
+++ b/drivers/mmc/core/bus.c
@@ -180,7 +180,6 @@ static int mmc_bus_resume(struct device *dev)
 #endif
 
 #ifdef CONFIG_PM_RUNTIME
-
 static int mmc_runtime_suspend(struct device *dev)
 {
 	struct mmc_card *card = mmc_dev_to_card(dev);
@@ -196,17 +195,10 @@ static int mmc_runtime_resume(struct device *dev)
 
 	return host->bus_ops->runtime_resume(host);
 }
-
-static int mmc_runtime_idle(struct device *dev)
-{
-	return 0;
-}
-
 #endif /* !CONFIG_PM_RUNTIME */
 
 static const struct dev_pm_ops mmc_bus_pm_ops = {
-	SET_RUNTIME_PM_OPS(mmc_runtime_suspend, mmc_runtime_resume,
-			mmc_runtime_idle)
+	SET_RUNTIME_PM_OPS(mmc_runtime_suspend, mmc_runtime_resume, NULL)
 	SET_SYSTEM_SLEEP_PM_OPS(mmc_bus_suspend, mmc_bus_resume)
 };
 
diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c
index 7dc0c85fdb60..d03a080fb9cd 100644
--- a/drivers/mmc/core/core.c
+++ b/drivers/mmc/core/core.c
@@ -2102,7 +2102,8 @@ EXPORT_SYMBOL(mmc_can_sanitize);
 
 int mmc_can_secure_erase_trim(struct mmc_card *card)
 {
-	if (card->ext_csd.sec_feature_support & EXT_CSD_SEC_ER_EN)
+	if ((card->ext_csd.sec_feature_support & EXT_CSD_SEC_ER_EN) &&
+	    !(card->quirks & MMC_QUIRK_SEC_ERASE_TRIM_BROKEN))
 		return 1;
 	return 0;
 }
diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c
index 793c6f7ddb04..1eda8dd8c867 100644
--- a/drivers/mmc/core/mmc.c
+++ b/drivers/mmc/core/mmc.c
@@ -324,13 +324,12 @@ static int mmc_read_ext_csd(struct mmc_card *card, u8 *ext_csd)
 		}
 	}
 
+	/*
+	 * The EXT_CSD format is meant to be forward compatible. As long
+	 * as CSD_STRUCTURE does not change, all values for EXT_CSD_REV
+	 * are authorized, see JEDEC JESD84-B50 section B.8.
+	 */
 	card->ext_csd.rev = ext_csd[EXT_CSD_REV];
-	if (card->ext_csd.rev > 7) {
-		pr_err("%s: unrecognised EXT_CSD revision %d\n",
-			mmc_hostname(card->host), card->ext_csd.rev);
-		err = -EINVAL;
-		goto out;
-	}
 
 	card->ext_csd.raw_sectors[0] = ext_csd[EXT_CSD_SEC_CNT + 0];
 	card->ext_csd.raw_sectors[1] = ext_csd[EXT_CSD_SEC_CNT + 1];
diff --git a/drivers/mmc/core/quirks.c b/drivers/mmc/core/quirks.c
index 6c36fccaa1ec..dd1d1e0fe322 100644
--- a/drivers/mmc/core/quirks.c
+++ b/drivers/mmc/core/quirks.c
@@ -91,7 +91,7 @@ void mmc_fixup_device(struct mmc_card *card, const struct mmc_fixup *table)
 		    (f->cis_device == card->cis.device ||
 		     f->cis_device == (u16) SDIO_ANY_ID) &&
 		    rev >= f->rev_start && rev <= f->rev_end) {
-			dev_dbg(&card->dev, "calling %pF\n", f->vendor_fixup);
+			dev_dbg(&card->dev, "calling %pf\n", f->vendor_fixup);
 			f->vendor_fixup(card, f->data);
 		}
 	}
diff --git a/drivers/mmc/core/sd_ops.c b/drivers/mmc/core/sd_ops.c
index 274ef00b4463..48d0c93ba25a 100644
--- a/drivers/mmc/core/sd_ops.c
+++ b/drivers/mmc/core/sd_ops.c
@@ -184,6 +184,9 @@ int mmc_send_app_op_cond(struct mmc_host *host, u32 ocr, u32 *rocr)
 		mmc_delay(10);
 	}
 
+	if (!i)
+		pr_err("%s: card never left busy state\n", mmc_hostname(host));
+
 	if (rocr && !mmc_host_is_spi(host))
 		*rocr = cmd.resp[0];
 
diff --git a/drivers/mmc/host/Kconfig b/drivers/mmc/host/Kconfig
index a5652548230a..451135822464 100644
--- a/drivers/mmc/host/Kconfig
+++ b/drivers/mmc/host/Kconfig
@@ -290,6 +290,18 @@ config MMC_MOXART
 	  be found on some embedded hardware such as UC-7112-LX.
 	  If you have a controller with this interface, say Y here.
 
+config MMC_SDHCI_ST
+	tristate "SDHCI support on STMicroelectronics SoC"
+	depends on ARCH_STI
+	depends on MMC_SDHCI_PLTFM
+	select MMC_SDHCI_IO_ACCESSORS
+	help
+	  This selects the Secure Digital Host Controller Interface in
+	  STMicroelectronics SoCs.
+
+	  If you have a controller with this interface, say Y or M here.
+	  If unsure, say N.
+
 config MMC_OMAP
 	tristate "TI OMAP Multimedia Card Interface support"
 	depends on ARCH_OMAP
@@ -303,6 +315,7 @@ config MMC_OMAP
 
 config MMC_OMAP_HS
 	tristate "TI OMAP High Speed Multimedia Card Interface support"
+	depends on HAS_DMA
 	depends on ARCH_OMAP2PLUS || COMPILE_TEST
 	help
 	  This selects the TI OMAP High Speed Multimedia card Interface.
@@ -343,7 +356,7 @@ config MMC_ATMELMCI
 
 config MMC_SDHCI_MSM
 	tristate "Qualcomm SDHCI Controller Support"
-	depends on ARCH_QCOM
+	depends on ARCH_QCOM || (ARM && COMPILE_TEST)
 	depends on MMC_SDHCI_PLTFM
 	help
 	  This selects the Secure Digital Host Controller Interface (SDHCI)
@@ -440,6 +453,7 @@ config MMC_SPI
 config MMC_S3C
 	tristate "Samsung S3C SD/MMC Card Interface support"
 	depends on ARCH_S3C24XX
+	depends on S3C24XX_DMAC
 	help
 	  This selects a driver for the MCI interface found in
           Samsung's S3C2410, S3C2412, S3C2440, S3C2442 CPUs.
@@ -477,15 +491,6 @@ config MMC_S3C_DMA
 	  working properly and needs to be debugged before this
 	  option is useful.
 
-config MMC_S3C_PIODMA
-	bool "Support for both PIO and DMA"
-	help
-	  Compile both the PIO and DMA transfer routines into the
-	  driver and let the platform select at run-time which one
-	  is best.
-
-	  See notes for the DMA option.
-
 endchoice
 
 config MMC_SDRICOH_CS
@@ -623,7 +628,7 @@ config MMC_DW_PCI
 
 config MMC_SH_MMCIF
 	tristate "SuperH Internal MMCIF support"
-	depends on MMC_BLOCK
+	depends on MMC_BLOCK && HAS_DMA
 	depends on SUPERH || ARCH_SHMOBILE || COMPILE_TEST
 	help
 	  This selects the MMC Host Interface controller (MMCIF).
@@ -697,6 +702,7 @@ config MMC_WMT
 
 config MMC_USDHI6ROL0
 	tristate "Renesas USDHI6ROL0 SD/SDIO Host Controller support"
+	depends on HAS_DMA
 	help
 	  This selects support for the Renesas USDHI6ROL0 SD/SDIO
 	  Host Controller
diff --git a/drivers/mmc/host/Makefile b/drivers/mmc/host/Makefile
index 7f81ddf1dd2c..f211eede8db5 100644
--- a/drivers/mmc/host/Makefile
+++ b/drivers/mmc/host/Makefile
@@ -68,6 +68,7 @@ obj-$(CONFIG_MMC_SDHCI_OF_HLWD)		+= sdhci-of-hlwd.o
 obj-$(CONFIG_MMC_SDHCI_BCM_KONA)	+= sdhci-bcm-kona.o
 obj-$(CONFIG_MMC_SDHCI_BCM2835)		+= sdhci-bcm2835.o
 obj-$(CONFIG_MMC_SDHCI_MSM)		+= sdhci-msm.o
+obj-$(CONFIG_MMC_SDHCI_ST)		+= sdhci-st.o
 
 ifeq ($(CONFIG_CB710_DEBUG),y)
 	CFLAGS-cb710-mmc	+= -DDEBUG
diff --git a/drivers/mmc/host/dw_mmc.c b/drivers/mmc/host/dw_mmc.c
index 1ac227c603b7..8f216edbdf08 100644
--- a/drivers/mmc/host/dw_mmc.c
+++ b/drivers/mmc/host/dw_mmc.c
@@ -111,8 +111,7 @@ static const u8 tuning_blk_pattern_8bit[] = {
 	0xff, 0x77, 0x77, 0xff, 0x77, 0xbb, 0xdd, 0xee,
 };
 
-static inline bool dw_mci_fifo_reset(struct dw_mci *host);
-static inline bool dw_mci_ctrl_all_reset(struct dw_mci *host);
+static bool dw_mci_reset(struct dw_mci *host);
 
 #if defined(CONFIG_DEBUG_FS)
 static int dw_mci_req_show(struct seq_file *s, void *v)
@@ -997,7 +996,8 @@ static int dw_mci_get_ro(struct mmc_host *mmc)
 	int gpio_ro = mmc_gpio_get_ro(mmc);
 
 	/* Use platform get_ro function, else try on board write protect */
-	if (slot->quirks & DW_MCI_SLOT_QUIRK_NO_WRITE_PROTECT)
+	if ((slot->quirks & DW_MCI_SLOT_QUIRK_NO_WRITE_PROTECT) ||
+			(slot->host->quirks & DW_MCI_QUIRK_NO_WRITE_PROTECT))
 		read_only = 0;
 	else if (!IS_ERR_VALUE(gpio_ro))
 		read_only = gpio_ro;
@@ -1235,7 +1235,7 @@ static int dw_mci_data_complete(struct dw_mci *host, struct mmc_data *data)
 		 * After an error, there may be data lingering
 		 * in the FIFO
 		 */
-		dw_mci_fifo_reset(host);
+		dw_mci_reset(host);
 	} else {
 		data->bytes_xfered = data->blocks * data->blksz;
 		data->error = 0;
@@ -1352,7 +1352,7 @@ static void dw_mci_tasklet_func(unsigned long priv)
 
 			/* CMD error in data command */
 			if (mrq->cmd->error && mrq->data)
-				dw_mci_fifo_reset(host);
+				dw_mci_reset(host);
 
 			host->cmd = NULL;
 			host->data = NULL;
@@ -1963,14 +1963,8 @@ static void dw_mci_work_routine_card(struct work_struct *work)
 			}
 
 			/* Power down slot */
-			if (present == 0) {
-				/* Clear down the FIFO */
-				dw_mci_fifo_reset(host);
-#ifdef CONFIG_MMC_DW_IDMAC
-				dw_mci_idmac_reset(host);
-#endif
-
-			}
+			if (present == 0)
+				dw_mci_reset(host);
 
 			spin_unlock_bh(&host->lock);
 
@@ -2021,8 +2015,11 @@ static int dw_mci_of_get_slot_quirks(struct device *dev, u8 slot)
 
 	/* get quirks */
 	for (idx = 0; idx < ARRAY_SIZE(of_slot_quirks); idx++)
-		if (of_get_property(np, of_slot_quirks[idx].quirk, NULL))
+		if (of_get_property(np, of_slot_quirks[idx].quirk, NULL)) {
+			dev_warn(dev, "Slot quirk %s is deprecated\n",
+					of_slot_quirks[idx].quirk);
 			quirks |= of_slot_quirks[idx].id;
+		}
 
 	return quirks;
 }
@@ -2208,8 +2205,11 @@ static bool dw_mci_ctrl_reset(struct dw_mci *host, u32 reset)
 	return false;
 }
 
-static inline bool dw_mci_fifo_reset(struct dw_mci *host)
+static bool dw_mci_reset(struct dw_mci *host)
 {
+	u32 flags = SDMMC_CTRL_RESET | SDMMC_CTRL_FIFO_RESET;
+	bool ret = false;
+
 	/*
 	 * Reseting generates a block interrupt, hence setting
 	 * the scatter-gather pointer to NULL.
@@ -2219,15 +2219,60 @@ static inline bool dw_mci_fifo_reset(struct dw_mci *host)
 		host->sg = NULL;
 	}
 
-	return dw_mci_ctrl_reset(host, SDMMC_CTRL_FIFO_RESET);
-}
+	if (host->use_dma)
+		flags |= SDMMC_CTRL_DMA_RESET;
 
-static inline bool dw_mci_ctrl_all_reset(struct dw_mci *host)
-{
-	return dw_mci_ctrl_reset(host,
-				 SDMMC_CTRL_FIFO_RESET |
-				 SDMMC_CTRL_RESET |
-				 SDMMC_CTRL_DMA_RESET);
+	if (dw_mci_ctrl_reset(host, flags)) {
+		/*
+		 * In all cases we clear the RAWINTS register to clear any
+		 * interrupts.
+		 */
+		mci_writel(host, RINTSTS, 0xFFFFFFFF);
+
+		/* if using dma we wait for dma_req to clear */
+		if (host->use_dma) {
+			unsigned long timeout = jiffies + msecs_to_jiffies(500);
+			u32 status;
+			do {
+				status = mci_readl(host, STATUS);
+				if (!(status & SDMMC_STATUS_DMA_REQ))
+					break;
+				cpu_relax();
+			} while (time_before(jiffies, timeout));
+
+			if (status & SDMMC_STATUS_DMA_REQ) {
+				dev_err(host->dev,
+					"%s: Timeout waiting for dma_req to "
+					"clear during reset\n", __func__);
+				goto ciu_out;
+			}
+
+			/* when using DMA next we reset the fifo again */
+			if (!dw_mci_ctrl_reset(host, SDMMC_CTRL_FIFO_RESET))
+				goto ciu_out;
+		}
+	} else {
+		/* if the controller reset bit did clear, then set clock regs */
+		if (!(mci_readl(host, CTRL) & SDMMC_CTRL_RESET)) {
+			dev_err(host->dev, "%s: fifo/dma reset bits didn't "
+				"clear but ciu was reset, doing clock update\n",
+				__func__);
+			goto ciu_out;
+		}
+	}
+
+#if IS_ENABLED(CONFIG_MMC_DW_IDMAC)
+	/* It is also recommended that we reset and reprogram idmac */
+	dw_mci_idmac_reset(host);
+#endif
+
+	ret = true;
+
+ciu_out:
+	/* After a CTRL reset we need to have CIU set clock registers  */
+	mci_send_cmd(host->cur_slot, SDMMC_CMD_UPD_CLK, 0);
+
+	return ret;
 }
 
 #ifdef CONFIG_OF
@@ -2238,6 +2283,9 @@ static struct dw_mci_of_quirks {
 	{
 		.quirk	= "broken-cd",
 		.id	= DW_MCI_QUIRK_BROKEN_CARD_DETECTION,
+	}, {
+		.quirk	= "disable-wp",
+		.id	= DW_MCI_QUIRK_NO_WRITE_PROTECT,
 	},
 };
 
@@ -2425,7 +2473,7 @@ int dw_mci_probe(struct dw_mci *host)
 	}
 
 	/* Reset all blocks */
-	if (!dw_mci_ctrl_all_reset(host))
+	if (!dw_mci_ctrl_reset(host, SDMMC_CTRL_ALL_RESET_FLAGS))
 		return -ENODEV;
 
 	host->dma_ops = host->pdata->dma_ops;
@@ -2612,7 +2660,7 @@ int dw_mci_resume(struct dw_mci *host)
 		}
 	}
 
-	if (!dw_mci_ctrl_all_reset(host)) {
+	if (!dw_mci_ctrl_reset(host, SDMMC_CTRL_ALL_RESET_FLAGS)) {
 		ret = -ENODEV;
 		return ret;
 	}
diff --git a/drivers/mmc/host/dw_mmc.h b/drivers/mmc/host/dw_mmc.h
index 738fa241d058..08fd956d81f3 100644
--- a/drivers/mmc/host/dw_mmc.h
+++ b/drivers/mmc/host/dw_mmc.h
@@ -129,6 +129,7 @@
 #define SDMMC_CMD_INDX(n)		((n) & 0x1F)
 /* Status register defines */
 #define SDMMC_GET_FCNT(x)		(((x)>>17) & 0x1FFF)
+#define SDMMC_STATUS_DMA_REQ		BIT(31)
 /* FIFOTH register defines */
 #define SDMMC_SET_FIFOTH(m, r, t)	(((m) & 0x7) << 28 | \
 					 ((r) & 0xFFF) << 16 | \
@@ -150,6 +151,10 @@
 /* Card read threshold */
 #define SDMMC_SET_RD_THLD(v, x)		(((v) & 0x1FFF) << 16 | (x))
 
+/* All ctrl reset bits */
+#define SDMMC_CTRL_ALL_RESET_FLAGS \
+	(SDMMC_CTRL_RESET | SDMMC_CTRL_FIFO_RESET | SDMMC_CTRL_DMA_RESET)
+
 /* Register access macros */
 #define mci_readl(dev, reg)			\
 	__raw_readl((dev)->regs + SDMMC_##reg)
diff --git a/drivers/mmc/host/mmci.c b/drivers/mmc/host/mmci.c
index 7ad463e9741c..e4d470704150 100644
--- a/drivers/mmc/host/mmci.c
+++ b/drivers/mmc/host/mmci.c
@@ -52,34 +52,53 @@ static unsigned int fmax = 515633;
  * struct variant_data - MMCI variant-specific quirks
  * @clkreg: default value for MCICLOCK register
  * @clkreg_enable: enable value for MMCICLOCK register
+ * @clkreg_8bit_bus_enable: enable value for 8 bit bus
+ * @clkreg_neg_edge_enable: enable value for inverted data/cmd output
  * @datalength_bits: number of bits in the MMCIDATALENGTH register
  * @fifosize: number of bytes that can be written when MMCI_TXFIFOEMPTY
  *	      is asserted (likewise for RX)
  * @fifohalfsize: number of bytes that can be written when MCI_TXFIFOHALFEMPTY
  *		  is asserted (likewise for RX)
+ * @data_cmd_enable: enable value for data commands.
  * @sdio: variant supports SDIO
  * @st_clkdiv: true if using a ST-specific clock divider algorithm
+ * @datactrl_mask_ddrmode: ddr mode mask in datactrl register.
  * @blksz_datactrl16: true if Block size is at b16..b30 position in datactrl register
+ * @blksz_datactrl4: true if Block size is at b4..b16 position in datactrl
+ *		     register
  * @pwrreg_powerup: power up value for MMCIPOWER register
+ * @f_max: maximum clk frequency supported by the controller.
  * @signal_direction: input/out direction of bus signals can be indicated
  * @pwrreg_clkgate: MMCIPOWER register must be used to gate the clock
  * @busy_detect: true if busy detection on dat0 is supported
  * @pwrreg_nopower: bits in MMCIPOWER don't controls ext. power supply
+ * @explicit_mclk_control: enable explicit mclk control in driver.
+ * @qcom_fifo: enables qcom specific fifo pio read logic.
+ * @reversed_irq_handling: handle data irq before cmd irq.
  */
 struct variant_data {
 	unsigned int		clkreg;
 	unsigned int		clkreg_enable;
+	unsigned int		clkreg_8bit_bus_enable;
+	unsigned int		clkreg_neg_edge_enable;
 	unsigned int		datalength_bits;
 	unsigned int		fifosize;
 	unsigned int		fifohalfsize;
+	unsigned int		data_cmd_enable;
+	unsigned int		datactrl_mask_ddrmode;
 	bool			sdio;
 	bool			st_clkdiv;
 	bool			blksz_datactrl16;
+	bool			blksz_datactrl4;
 	u32			pwrreg_powerup;
+	u32			f_max;
 	bool			signal_direction;
 	bool			pwrreg_clkgate;
 	bool			busy_detect;
 	bool			pwrreg_nopower;
+	bool			explicit_mclk_control;
+	bool			qcom_fifo;
+	bool			reversed_irq_handling;
 };
 
 static struct variant_data variant_arm = {
@@ -87,6 +106,8 @@ static struct variant_data variant_arm = {
 	.fifohalfsize		= 8 * 4,
 	.datalength_bits	= 16,
 	.pwrreg_powerup		= MCI_PWR_UP,
+	.f_max			= 100000000,
+	.reversed_irq_handling	= true,
 };
 
 static struct variant_data variant_arm_extended_fifo = {
@@ -94,6 +115,7 @@ static struct variant_data variant_arm_extended_fifo = {
 	.fifohalfsize		= 64 * 4,
 	.datalength_bits	= 16,
 	.pwrreg_powerup		= MCI_PWR_UP,
+	.f_max			= 100000000,
 };
 
 static struct variant_data variant_arm_extended_fifo_hwfc = {
@@ -102,15 +124,18 @@ static struct variant_data variant_arm_extended_fifo_hwfc = {
 	.clkreg_enable		= MCI_ARM_HWFCEN,
 	.datalength_bits	= 16,
 	.pwrreg_powerup		= MCI_PWR_UP,
+	.f_max			= 100000000,
 };
 
 static struct variant_data variant_u300 = {
 	.fifosize		= 16 * 4,
 	.fifohalfsize		= 8 * 4,
 	.clkreg_enable		= MCI_ST_U300_HWFCEN,
+	.clkreg_8bit_bus_enable = MCI_ST_8BIT_BUS,
 	.datalength_bits	= 16,
 	.sdio			= true,
 	.pwrreg_powerup		= MCI_PWR_ON,
+	.f_max			= 100000000,
 	.signal_direction	= true,
 	.pwrreg_clkgate		= true,
 	.pwrreg_nopower		= true,
@@ -124,6 +149,7 @@ static struct variant_data variant_nomadik = {
 	.sdio			= true,
 	.st_clkdiv		= true,
 	.pwrreg_powerup		= MCI_PWR_ON,
+	.f_max			= 100000000,
 	.signal_direction	= true,
 	.pwrreg_clkgate		= true,
 	.pwrreg_nopower		= true,
@@ -134,10 +160,13 @@ static struct variant_data variant_ux500 = {
 	.fifohalfsize		= 8 * 4,
 	.clkreg			= MCI_CLK_ENABLE,
 	.clkreg_enable		= MCI_ST_UX500_HWFCEN,
+	.clkreg_8bit_bus_enable = MCI_ST_8BIT_BUS,
+	.clkreg_neg_edge_enable	= MCI_ST_UX500_NEG_EDGE,
 	.datalength_bits	= 24,
 	.sdio			= true,
 	.st_clkdiv		= true,
 	.pwrreg_powerup		= MCI_PWR_ON,
+	.f_max			= 100000000,
 	.signal_direction	= true,
 	.pwrreg_clkgate		= true,
 	.busy_detect		= true,
@@ -149,17 +178,38 @@ static struct variant_data variant_ux500v2 = {
 	.fifohalfsize		= 8 * 4,
 	.clkreg			= MCI_CLK_ENABLE,
 	.clkreg_enable		= MCI_ST_UX500_HWFCEN,
+	.clkreg_8bit_bus_enable = MCI_ST_8BIT_BUS,
+	.clkreg_neg_edge_enable	= MCI_ST_UX500_NEG_EDGE,
+	.datactrl_mask_ddrmode	= MCI_ST_DPSM_DDRMODE,
 	.datalength_bits	= 24,
 	.sdio			= true,
 	.st_clkdiv		= true,
 	.blksz_datactrl16	= true,
 	.pwrreg_powerup		= MCI_PWR_ON,
+	.f_max			= 100000000,
 	.signal_direction	= true,
 	.pwrreg_clkgate		= true,
 	.busy_detect		= true,
 	.pwrreg_nopower		= true,
 };
 
+static struct variant_data variant_qcom = {
+	.fifosize		= 16 * 4,
+	.fifohalfsize		= 8 * 4,
+	.clkreg			= MCI_CLK_ENABLE,
+	.clkreg_enable		= MCI_QCOM_CLK_FLOWENA |
+				  MCI_QCOM_CLK_SELECT_IN_FBCLK,
+	.clkreg_8bit_bus_enable = MCI_QCOM_CLK_WIDEBUS_8,
+	.datactrl_mask_ddrmode	= MCI_QCOM_CLK_SELECT_IN_DDR_MODE,
+	.data_cmd_enable	= MCI_QCOM_CSPM_DATCMD,
+	.blksz_datactrl4	= true,
+	.datalength_bits	= 24,
+	.pwrreg_powerup		= MCI_PWR_UP,
+	.f_max			= 208000000,
+	.explicit_mclk_control	= true,
+	.qcom_fifo		= true,
+};
+
 static int mmci_card_busy(struct mmc_host *mmc)
 {
 	struct mmci_host *host = mmc_priv(mmc);
@@ -260,7 +310,9 @@ static void mmci_set_clkreg(struct mmci_host *host, unsigned int desired)
 	host->cclk = 0;
 
 	if (desired) {
-		if (desired >= host->mclk) {
+		if (variant->explicit_mclk_control) {
+			host->cclk = host->mclk;
+		} else if (desired >= host->mclk) {
 			clk = MCI_CLK_BYPASS;
 			if (variant->st_clkdiv)
 				clk |= MCI_ST_UX500_NEG_EDGE;
@@ -299,11 +351,11 @@ static void mmci_set_clkreg(struct mmci_host *host, unsigned int desired)
 	if (host->mmc->ios.bus_width == MMC_BUS_WIDTH_4)
 		clk |= MCI_4BIT_BUS;
 	if (host->mmc->ios.bus_width == MMC_BUS_WIDTH_8)
-		clk |= MCI_ST_8BIT_BUS;
+		clk |= variant->clkreg_8bit_bus_enable;
 
 	if (host->mmc->ios.timing == MMC_TIMING_UHS_DDR50 ||
 	    host->mmc->ios.timing == MMC_TIMING_MMC_DDR52)
-		clk |= MCI_ST_UX500_NEG_EDGE;
+		clk |= variant->clkreg_neg_edge_enable;
 
 	mmci_write_clkreg(host, clk);
 }
@@ -719,7 +771,7 @@ static void mmci_start_data(struct mmci_host *host, struct mmc_data *data)
 	data->bytes_xfered = 0;
 
 	clks = (unsigned long long)data->timeout_ns * host->cclk;
-	do_div(clks, 1000000000UL);
+	do_div(clks, NSEC_PER_SEC);
 
 	timeout = data->timeout_clks + (unsigned int)clks;
 
@@ -732,6 +784,8 @@ static void mmci_start_data(struct mmci_host *host, struct mmc_data *data)
 
 	if (variant->blksz_datactrl16)
 		datactrl = MCI_DPSM_ENABLE | (data->blksz << 16);
+	else if (variant->blksz_datactrl4)
+		datactrl = MCI_DPSM_ENABLE | (data->blksz << 4);
 	else
 		datactrl = MCI_DPSM_ENABLE | blksz_bits << 4;
 
@@ -767,7 +821,7 @@ static void mmci_start_data(struct mmci_host *host, struct mmc_data *data)
 
 	if (host->mmc->ios.timing == MMC_TIMING_UHS_DDR50 ||
 	    host->mmc->ios.timing == MMC_TIMING_MMC_DDR52)
-		datactrl |= MCI_ST_DPSM_DDRMODE;
+		datactrl |= variant->datactrl_mask_ddrmode;
 
 	/*
 	 * Attempt to use DMA operation mode, if this
@@ -812,7 +866,7 @@ mmci_start_command(struct mmci_host *host, struct mmc_command *cmd, u32 c)
 
 	if (readl(base + MMCICOMMAND) & MCI_CPSM_ENABLE) {
 		writel(0, base + MMCICOMMAND);
-		udelay(1);
+		mmci_reg_delay(host);
 	}
 
 	c |= cmd->opcode | MCI_CPSM_ENABLE;
@@ -824,6 +878,9 @@ mmci_start_command(struct mmci_host *host, struct mmc_command *cmd, u32 c)
 	if (/*interrupt*/0)
 		c |= MCI_CPSM_INTERRUPT;
 
+	if (mmc_cmd_type(cmd) == MMC_CMD_ADTC)
+		c |= host->variant->data_cmd_enable;
+
 	host->cmd = cmd;
 
 	writel(cmd->arg, base + MMCIARGUMENT);
@@ -834,6 +891,10 @@ static void
 mmci_data_irq(struct mmci_host *host, struct mmc_data *data,
 	      unsigned int status)
 {
+	/* Make sure we have data to handle */
+	if (!data)
+		return;
+
 	/* First check for errors */
 	if (status & (MCI_DATACRCFAIL|MCI_DATATIMEOUT|MCI_STARTBITERR|
 		      MCI_TXUNDERRUN|MCI_RXOVERRUN)) {
@@ -902,9 +963,17 @@ mmci_cmd_irq(struct mmci_host *host, struct mmc_command *cmd,
 	     unsigned int status)
 {
 	void __iomem *base = host->base;
-	bool sbc = (cmd == host->mrq->sbc);
-	bool busy_resp = host->variant->busy_detect &&
-			(cmd->flags & MMC_RSP_BUSY);
+	bool sbc, busy_resp;
+
+	if (!cmd)
+		return;
+
+	sbc = (cmd == host->mrq->sbc);
+	busy_resp = host->variant->busy_detect && (cmd->flags & MMC_RSP_BUSY);
+
+	if (!((status|host->busy_status) & (MCI_CMDCRCFAIL|MCI_CMDTIMEOUT|
+		MCI_CMDSENT|MCI_CMDRESPEND)))
+		return;
 
 	/* Check if we need to wait for busy completion. */
 	if (host->busy_status && (status & MCI_ST_CARDBUSY))
@@ -957,15 +1026,34 @@ mmci_cmd_irq(struct mmci_host *host, struct mmc_command *cmd,
 	}
 }
 
+static int mmci_get_rx_fifocnt(struct mmci_host *host, u32 status, int remain)
+{
+	return remain - (readl(host->base + MMCIFIFOCNT) << 2);
+}
+
+static int mmci_qcom_get_rx_fifocnt(struct mmci_host *host, u32 status, int r)
+{
+	/*
+	 * on qcom SDCC4 only 8 words are used in each burst so only 8 addresses
+	 * from the fifo range should be used
+	 */
+	if (status & MCI_RXFIFOHALFFULL)
+		return host->variant->fifohalfsize;
+	else if (status & MCI_RXDATAAVLBL)
+		return 4;
+
+	return 0;
+}
+
 static int mmci_pio_read(struct mmci_host *host, char *buffer, unsigned int remain)
 {
 	void __iomem *base = host->base;
 	char *ptr = buffer;
-	u32 status;
+	u32 status = readl(host->base + MMCISTATUS);
 	int host_remain = host->size;
 
 	do {
-		int count = host_remain - (readl(base + MMCIFIFOCNT) << 2);
+		int count = host->get_rx_fifocnt(host, status, host_remain);
 
 		if (count > remain)
 			count = remain;
@@ -1132,9 +1220,6 @@ static irqreturn_t mmci_irq(int irq, void *dev_id)
 	spin_lock(&host->lock);
 
 	do {
-		struct mmc_command *cmd;
-		struct mmc_data *data;
-
 		status = readl(host->base + MMCISTATUS);
 
 		if (host->singleirq) {
@@ -1154,16 +1239,13 @@ static irqreturn_t mmci_irq(int irq, void *dev_id)
 
 		dev_dbg(mmc_dev(host->mmc), "irq0 (data+cmd) %08x\n", status);
 
-		cmd = host->cmd;
-		if ((status|host->busy_status) & (MCI_CMDCRCFAIL|MCI_CMDTIMEOUT|
-			MCI_CMDSENT|MCI_CMDRESPEND) && cmd)
-			mmci_cmd_irq(host, cmd, status);
-
-		data = host->data;
-		if (status & (MCI_DATACRCFAIL|MCI_DATATIMEOUT|MCI_STARTBITERR|
-			      MCI_TXUNDERRUN|MCI_RXOVERRUN|MCI_DATAEND|
-			      MCI_DATABLOCKEND) && data)
-			mmci_data_irq(host, data, status);
+		if (host->variant->reversed_irq_handling) {
+			mmci_data_irq(host, host->data, status);
+			mmci_cmd_irq(host, host->cmd, status);
+		} else {
+			mmci_cmd_irq(host, host->cmd, status);
+			mmci_data_irq(host, host->data, status);
+		}
 
 		/* Don't poll for busy completion in irq context. */
 		if (host->busy_status)
@@ -1296,6 +1378,17 @@ static void mmci_set_ios(struct mmc_host *mmc, struct mmc_ios *ios)
 	if (!ios->clock && variant->pwrreg_clkgate)
 		pwr &= ~MCI_PWR_ON;
 
+	if (host->variant->explicit_mclk_control &&
+	    ios->clock != host->clock_cache) {
+		ret = clk_set_rate(host->clk, ios->clock);
+		if (ret < 0)
+			dev_err(mmc_dev(host->mmc),
+				"Error setting clock rate (%d)\n", ret);
+		else
+			host->mclk = clk_get_rate(host->clk);
+	}
+	host->clock_cache = ios->clock;
+
 	spin_lock_irqsave(&host->lock, flags);
 
 	mmci_set_clkreg(host, ios->clock);
@@ -1443,6 +1536,11 @@ static int mmci_probe(struct amba_device *dev,
 	if (ret)
 		goto host_free;
 
+	if (variant->qcom_fifo)
+		host->get_rx_fifocnt = mmci_qcom_get_rx_fifocnt;
+	else
+		host->get_rx_fifocnt = mmci_get_rx_fifocnt;
+
 	host->plat = plat;
 	host->variant = variant;
 	host->mclk = clk_get_rate(host->clk);
@@ -1451,8 +1549,8 @@ static int mmci_probe(struct amba_device *dev,
 	 * so we try to adjust the clock down to this,
 	 * (if possible).
 	 */
-	if (host->mclk > 100000000) {
-		ret = clk_set_rate(host->clk, 100000000);
+	if (host->mclk > variant->f_max) {
+		ret = clk_set_rate(host->clk, variant->f_max);
 		if (ret < 0)
 			goto clk_disable;
 		host->mclk = clk_get_rate(host->clk);
@@ -1471,9 +1569,12 @@ static int mmci_probe(struct amba_device *dev,
 	 * The ARM and ST versions of the block have slightly different
 	 * clock divider equations which means that the minimum divider
 	 * differs too.
+	 * on Qualcomm like controllers get the nearest minimum clock to 100Khz
 	 */
 	if (variant->st_clkdiv)
 		mmc->f_min = DIV_ROUND_UP(host->mclk, 257);
+	else if (variant->explicit_mclk_control)
+		mmc->f_min = clk_round_rate(host->clk, 100000);
 	else
 		mmc->f_min = DIV_ROUND_UP(host->mclk, 512);
 	/*
@@ -1483,9 +1584,14 @@ static int mmci_probe(struct amba_device *dev,
 	 * the block, of course.
 	 */
 	if (mmc->f_max)
-		mmc->f_max = min(host->mclk, mmc->f_max);
+		mmc->f_max = variant->explicit_mclk_control ?
+				min(variant->f_max, mmc->f_max) :
+				min(host->mclk, mmc->f_max);
 	else
-		mmc->f_max = min(host->mclk, fmax);
+		mmc->f_max = variant->explicit_mclk_control ?
+				fmax : min(host->mclk, fmax);
+
+
 	dev_dbg(mmc_dev(mmc), "clocking block at %u Hz\n", mmc->f_max);
 
 	/* Get regulators and the supported OCR mask */
@@ -1752,6 +1858,12 @@ static struct amba_id mmci_ids[] = {
 		.mask   = 0xf0ffffff,
 		.data	= &variant_ux500v2,
 	},
+	/* Qualcomm variants */
+	{
+		.id     = 0x00051180,
+		.mask	= 0x000fffff,
+		.data	= &variant_qcom,
+	},
 	{ 0, 0 },
 };
 
diff --git a/drivers/mmc/host/mmci.h b/drivers/mmc/host/mmci.h
index 347d942d740b..a1f5e4f49e2a 100644
--- a/drivers/mmc/host/mmci.h
+++ b/drivers/mmc/host/mmci.h
@@ -41,6 +41,15 @@
 /* Modified PL180 on Versatile Express platform */
 #define MCI_ARM_HWFCEN		(1 << 12)
 
+/* Modified on Qualcomm Integrations */
+#define MCI_QCOM_CLK_WIDEBUS_8	(BIT(10) | BIT(11))
+#define MCI_QCOM_CLK_FLOWENA	BIT(12)
+#define MCI_QCOM_CLK_INVERTOUT	BIT(13)
+
+/* select in latch data and command in */
+#define MCI_QCOM_CLK_SELECT_IN_FBCLK	BIT(15)
+#define MCI_QCOM_CLK_SELECT_IN_DDR_MODE	(BIT(14) | BIT(15))
+
 #define MMCIARGUMENT		0x008
 #define MMCICOMMAND		0x00c
 #define MCI_CPSM_RESPONSE	(1 << 6)
@@ -54,6 +63,14 @@
 #define MCI_ST_NIEN		(1 << 13)
 #define MCI_ST_CE_ATACMD	(1 << 14)
 
+/* Modified on Qualcomm Integrations */
+#define MCI_QCOM_CSPM_DATCMD		BIT(12)
+#define MCI_QCOM_CSPM_MCIABORT		BIT(13)
+#define MCI_QCOM_CSPM_CCSENABLE		BIT(14)
+#define MCI_QCOM_CSPM_CCSDISABLE	BIT(15)
+#define MCI_QCOM_CSPM_AUTO_CMD19	BIT(16)
+#define MCI_QCOM_CSPM_AUTO_CMD21	BIT(21)
+
 #define MMCIRESPCMD		0x010
 #define MMCIRESPONSE0		0x014
 #define MMCIRESPONSE1		0x018
@@ -191,6 +208,8 @@ struct mmci_host {
 	spinlock_t		lock;
 
 	unsigned int		mclk;
+	/* cached value of requested clk in set_ios */
+	unsigned int		clock_cache;
 	unsigned int		cclk;
 	u32			pwr_reg;
 	u32			pwr_reg_add;
@@ -210,6 +229,7 @@ struct mmci_host {
 	/* pio stuff */
 	struct sg_mapping_iter	sg_miter;
 	unsigned int		size;
+	int (*get_rx_fifocnt)(struct mmci_host *h, u32 status, int remain);
 
 #ifdef CONFIG_DMA_ENGINE
 	/* DMA stuff */
diff --git a/drivers/mmc/host/moxart-mmc.c b/drivers/mmc/host/moxart-mmc.c
index 74924a04026e..b4b1efbf6c16 100644
--- a/drivers/mmc/host/moxart-mmc.c
+++ b/drivers/mmc/host/moxart-mmc.c
@@ -13,7 +13,6 @@
  * warranty of any kind, whether express or implied.
  */
 
-#include <linux/version.h>
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/platform_device.h>
diff --git a/drivers/mmc/host/mxs-mmc.c b/drivers/mmc/host/mxs-mmc.c
index babfea03ba8a..140885a5a4e7 100644
--- a/drivers/mmc/host/mxs-mmc.c
+++ b/drivers/mmc/host/mxs-mmc.c
@@ -86,7 +86,8 @@ static int mxs_mmc_get_cd(struct mmc_host *mmc)
 	if (ret >= 0)
 		return ret;
 
-	present = !(readl(ssp->base + HW_SSP_STATUS(ssp)) &
+	present = mmc->caps & MMC_CAP_NEEDS_POLL ||
+		!(readl(ssp->base + HW_SSP_STATUS(ssp)) &
 			BM_SSP_STATUS_CARD_DETECT);
 
 	if (mmc->caps2 & MMC_CAP2_CD_ACTIVE_HIGH)
diff --git a/drivers/mmc/host/omap_hsmmc.c b/drivers/mmc/host/omap_hsmmc.c
index 6b7b75585926..965672663ef0 100644
--- a/drivers/mmc/host/omap_hsmmc.c
+++ b/drivers/mmc/host/omap_hsmmc.c
@@ -29,6 +29,7 @@
 #include <linux/timer.h>
 #include <linux/clk.h>
 #include <linux/of.h>
+#include <linux/of_irq.h>
 #include <linux/of_gpio.h>
 #include <linux/of_device.h>
 #include <linux/omap-dmaengine.h>
@@ -36,6 +37,7 @@
 #include <linux/mmc/core.h>
 #include <linux/mmc/mmc.h>
 #include <linux/io.h>
+#include <linux/irq.h>
 #include <linux/gpio.h>
 #include <linux/regulator/consumer.h>
 #include <linux/pinctrl/consumer.h>
@@ -54,6 +56,7 @@
 #define OMAP_HSMMC_RSP54	0x0118
 #define OMAP_HSMMC_RSP76	0x011C
 #define OMAP_HSMMC_DATA		0x0120
+#define OMAP_HSMMC_PSTATE	0x0124
 #define OMAP_HSMMC_HCTL		0x0128
 #define OMAP_HSMMC_SYSCTL	0x012C
 #define OMAP_HSMMC_STAT		0x0130
@@ -91,7 +94,10 @@
 #define BCE			(1 << 1)
 #define FOUR_BIT		(1 << 1)
 #define HSPE			(1 << 2)
+#define IWE			(1 << 24)
 #define DDR			(1 << 19)
+#define CLKEXTFREE		(1 << 16)
+#define CTPL			(1 << 11)
 #define DW8			(1 << 5)
 #define OD			0x1
 #define STAT_CLEAR		0xFFFFFFFF
@@ -101,11 +107,15 @@
 #define SRD			(1 << 26)
 #define SOFTRESET		(1 << 1)
 
+/* PSTATE */
+#define DLEV_DAT(x)		(1 << (20 + (x)))
+
 /* Interrupt masks for IE and ISE register */
 #define CC_EN			(1 << 0)
 #define TC_EN			(1 << 1)
 #define BWR_EN			(1 << 4)
 #define BRR_EN			(1 << 5)
+#define CIRQ_EN			(1 << 8)
 #define ERR_EN			(1 << 15)
 #define CTO_EN			(1 << 16)
 #define CCRC_EN			(1 << 17)
@@ -140,7 +150,6 @@
 #define VDD_3V0			3000000		/* 300000 uV */
 #define VDD_165_195		(ffs(MMC_VDD_165_195) - 1)
 
-#define AUTO_CMD23		(1 << 1)	/* Auto CMD23 support */
 /*
  * One controller can have multiple slots, like on some omap boards using
  * omap.c controller driver. Luckily this is not currently done on any known
@@ -194,6 +203,7 @@ struct omap_hsmmc_host {
 	u32			sysctl;
 	u32			capa;
 	int			irq;
+	int			wake_irq;
 	int			use_dma, dma_ch;
 	struct dma_chan		*tx_chan;
 	struct dma_chan		*rx_chan;
@@ -206,6 +216,9 @@ struct omap_hsmmc_host {
 	int			req_in_progress;
 	unsigned long		clk_rate;
 	unsigned int		flags;
+#define AUTO_CMD23		(1 << 0)        /* Auto CMD23 support */
+#define HSMMC_SDIO_IRQ_ENABLED	(1 << 1)        /* SDIO irq enabled */
+#define HSMMC_WAKE_IRQ_ENABLED	(1 << 2)
 	struct omap_hsmmc_next	next_data;
 	struct	omap_mmc_platform_data	*pdata;
 };
@@ -510,27 +523,40 @@ static void omap_hsmmc_stop_clock(struct omap_hsmmc_host *host)
 static void omap_hsmmc_enable_irq(struct omap_hsmmc_host *host,
 				  struct mmc_command *cmd)
 {
-	unsigned int irq_mask;
+	u32 irq_mask = INT_EN_MASK;
+	unsigned long flags;
 
 	if (host->use_dma)
-		irq_mask = INT_EN_MASK & ~(BRR_EN | BWR_EN);
-	else
-		irq_mask = INT_EN_MASK;
+		irq_mask &= ~(BRR_EN | BWR_EN);
 
 	/* Disable timeout for erases */
 	if (cmd->opcode == MMC_ERASE)
 		irq_mask &= ~DTO_EN;
 
+	spin_lock_irqsave(&host->irq_lock, flags);
 	OMAP_HSMMC_WRITE(host->base, STAT, STAT_CLEAR);
 	OMAP_HSMMC_WRITE(host->base, ISE, irq_mask);
+
+	/* latch pending CIRQ, but don't signal MMC core */
+	if (host->flags & HSMMC_SDIO_IRQ_ENABLED)
+		irq_mask |= CIRQ_EN;
 	OMAP_HSMMC_WRITE(host->base, IE, irq_mask);
+	spin_unlock_irqrestore(&host->irq_lock, flags);
 }
 
 static void omap_hsmmc_disable_irq(struct omap_hsmmc_host *host)
 {
-	OMAP_HSMMC_WRITE(host->base, ISE, 0);
-	OMAP_HSMMC_WRITE(host->base, IE, 0);
+	u32 irq_mask = 0;
+	unsigned long flags;
+
+	spin_lock_irqsave(&host->irq_lock, flags);
+	/* no transfer running but need to keep cirq if enabled */
+	if (host->flags & HSMMC_SDIO_IRQ_ENABLED)
+		irq_mask |= CIRQ_EN;
+	OMAP_HSMMC_WRITE(host->base, ISE, irq_mask);
+	OMAP_HSMMC_WRITE(host->base, IE, irq_mask);
 	OMAP_HSMMC_WRITE(host->base, STAT, STAT_CLEAR);
+	spin_unlock_irqrestore(&host->irq_lock, flags);
 }
 
 /* Calculate divisor for the given clock frequency */
@@ -667,6 +693,9 @@ static int omap_hsmmc_context_restore(struct omap_hsmmc_host *host)
 		capa = VS18;
 	}
 
+	if (host->mmc->caps & MMC_CAP_SDIO_IRQ)
+		hctl |= IWE;
+
 	OMAP_HSMMC_WRITE(host->base, HCTL,
 			OMAP_HSMMC_READ(host->base, HCTL) | hctl);
 
@@ -681,7 +710,9 @@ static int omap_hsmmc_context_restore(struct omap_hsmmc_host *host)
 		&& time_before(jiffies, timeout))
 		;
 
-	omap_hsmmc_disable_irq(host);
+	OMAP_HSMMC_WRITE(host->base, ISE, 0);
+	OMAP_HSMMC_WRITE(host->base, IE, 0);
+	OMAP_HSMMC_WRITE(host->base, STAT, STAT_CLEAR);
 
 	/* Do not initialize card-specific things if the power is off */
 	if (host->power_mode == MMC_POWER_OFF)
@@ -1118,8 +1149,12 @@ static irqreturn_t omap_hsmmc_irq(int irq, void *dev_id)
 	int status;
 
 	status = OMAP_HSMMC_READ(host->base, STAT);
-	while (status & INT_EN_MASK && host->req_in_progress) {
-		omap_hsmmc_do_irq(host, status);
+	while (status & (INT_EN_MASK | CIRQ_EN)) {
+		if (host->req_in_progress)
+			omap_hsmmc_do_irq(host, status);
+
+		if (status & CIRQ_EN)
+			mmc_signal_sdio_irq(host->mmc);
 
 		/* Flush posted write */
 		status = OMAP_HSMMC_READ(host->base, STAT);
@@ -1128,6 +1163,22 @@ static irqreturn_t omap_hsmmc_irq(int irq, void *dev_id)
 	return IRQ_HANDLED;
 }
 
+static irqreturn_t omap_hsmmc_wake_irq(int irq, void *dev_id)
+{
+	struct omap_hsmmc_host *host = dev_id;
+
+	/* cirq is level triggered, disable to avoid infinite loop */
+	spin_lock(&host->irq_lock);
+	if (host->flags & HSMMC_WAKE_IRQ_ENABLED) {
+		disable_irq_nosync(host->wake_irq);
+		host->flags &= ~HSMMC_WAKE_IRQ_ENABLED;
+	}
+	spin_unlock(&host->irq_lock);
+	pm_request_resume(host->dev); /* no use counter */
+
+	return IRQ_HANDLED;
+}
+
 static void set_sd_bus_power(struct omap_hsmmc_host *host)
 {
 	unsigned long i;
@@ -1639,6 +1690,103 @@ static void omap_hsmmc_init_card(struct mmc_host *mmc, struct mmc_card *card)
 		mmc_slot(host).init_card(card);
 }
 
+static void omap_hsmmc_enable_sdio_irq(struct mmc_host *mmc, int enable)
+{
+	struct omap_hsmmc_host *host = mmc_priv(mmc);
+	u32 irq_mask, con;
+	unsigned long flags;
+
+	spin_lock_irqsave(&host->irq_lock, flags);
+
+	con = OMAP_HSMMC_READ(host->base, CON);
+	irq_mask = OMAP_HSMMC_READ(host->base, ISE);
+	if (enable) {
+		host->flags |= HSMMC_SDIO_IRQ_ENABLED;
+		irq_mask |= CIRQ_EN;
+		con |= CTPL | CLKEXTFREE;
+	} else {
+		host->flags &= ~HSMMC_SDIO_IRQ_ENABLED;
+		irq_mask &= ~CIRQ_EN;
+		con &= ~(CTPL | CLKEXTFREE);
+	}
+	OMAP_HSMMC_WRITE(host->base, CON, con);
+	OMAP_HSMMC_WRITE(host->base, IE, irq_mask);
+
+	/*
+	 * if enable, piggy back detection on current request
+	 * but always disable immediately
+	 */
+	if (!host->req_in_progress || !enable)
+		OMAP_HSMMC_WRITE(host->base, ISE, irq_mask);
+
+	/* flush posted write */
+	OMAP_HSMMC_READ(host->base, IE);
+
+	spin_unlock_irqrestore(&host->irq_lock, flags);
+}
+
+static int omap_hsmmc_configure_wake_irq(struct omap_hsmmc_host *host)
+{
+	struct mmc_host *mmc = host->mmc;
+	int ret;
+
+	/*
+	 * For omaps with wake-up path, wakeirq will be irq from pinctrl and
+	 * for other omaps, wakeirq will be from GPIO (dat line remuxed to
+	 * gpio). wakeirq is needed to detect sdio irq in runtime suspend state
+	 * with functional clock disabled.
+	 */
+	if (!host->dev->of_node || !host->wake_irq)
+		return -ENODEV;
+
+	/* Prevent auto-enabling of IRQ */
+	irq_set_status_flags(host->wake_irq, IRQ_NOAUTOEN);
+	ret = devm_request_irq(host->dev, host->wake_irq, omap_hsmmc_wake_irq,
+			       IRQF_TRIGGER_LOW | IRQF_ONESHOT,
+			       mmc_hostname(mmc), host);
+	if (ret) {
+		dev_err(mmc_dev(host->mmc), "Unable to request wake IRQ\n");
+		goto err;
+	}
+
+	/*
+	 * Some omaps don't have wake-up path from deeper idle states
+	 * and need to remux SDIO DAT1 to GPIO for wake-up from idle.
+	 */
+	if (host->pdata->controller_flags & OMAP_HSMMC_SWAKEUP_MISSING) {
+		struct pinctrl *p = devm_pinctrl_get(host->dev);
+		if (!p) {
+			ret = -ENODEV;
+			goto err_free_irq;
+		}
+		if (IS_ERR(pinctrl_lookup_state(p, PINCTRL_STATE_DEFAULT))) {
+			dev_info(host->dev, "missing default pinctrl state\n");
+			devm_pinctrl_put(p);
+			ret = -EINVAL;
+			goto err_free_irq;
+		}
+
+		if (IS_ERR(pinctrl_lookup_state(p, PINCTRL_STATE_IDLE))) {
+			dev_info(host->dev, "missing idle pinctrl state\n");
+			devm_pinctrl_put(p);
+			ret = -EINVAL;
+			goto err_free_irq;
+		}
+		devm_pinctrl_put(p);
+	}
+
+	OMAP_HSMMC_WRITE(host->base, HCTL,
+			 OMAP_HSMMC_READ(host->base, HCTL) | IWE);
+	return 0;
+
+err_free_irq:
+	devm_free_irq(host->dev, host->wake_irq, host);
+err:
+	dev_warn(host->dev, "no SDIO IRQ support, falling back to polling\n");
+	host->wake_irq = 0;
+	return ret;
+}
+
 static void omap_hsmmc_conf_bus_power(struct omap_hsmmc_host *host)
 {
 	u32 hctl, capa, value;
@@ -1691,7 +1839,7 @@ static const struct mmc_host_ops omap_hsmmc_ops = {
 	.get_cd = omap_hsmmc_get_cd,
 	.get_ro = omap_hsmmc_get_ro,
 	.init_card = omap_hsmmc_init_card,
-	/* NYET -- enable_sdio_irq */
+	.enable_sdio_irq = omap_hsmmc_enable_sdio_irq,
 };
 
 #ifdef CONFIG_DEBUG_FS
@@ -1701,13 +1849,23 @@ static int omap_hsmmc_regs_show(struct seq_file *s, void *data)
 	struct mmc_host *mmc = s->private;
 	struct omap_hsmmc_host *host = mmc_priv(mmc);
 
-	seq_printf(s, "mmc%d:\n ctx_loss:\t%d\n\nregs:\n",
-			mmc->index, host->context_loss);
+	seq_printf(s, "mmc%d:\n", mmc->index);
+	seq_printf(s, "sdio irq mode\t%s\n",
+		   (mmc->caps & MMC_CAP_SDIO_IRQ) ? "interrupt" : "polling");
 
-	pm_runtime_get_sync(host->dev);
+	if (mmc->caps & MMC_CAP_SDIO_IRQ) {
+		seq_printf(s, "sdio irq \t%s\n",
+			   (host->flags & HSMMC_SDIO_IRQ_ENABLED) ?  "enabled"
+			   : "disabled");
+	}
+	seq_printf(s, "ctx_loss:\t%d\n", host->context_loss);
 
+	pm_runtime_get_sync(host->dev);
+	seq_puts(s, "\nregs:\n");
 	seq_printf(s, "CON:\t\t0x%08x\n",
 			OMAP_HSMMC_READ(host->base, CON));
+	seq_printf(s, "PSTATE:\t\t0x%08x\n",
+		   OMAP_HSMMC_READ(host->base, PSTATE));
 	seq_printf(s, "HCTL:\t\t0x%08x\n",
 			OMAP_HSMMC_READ(host->base, HCTL));
 	seq_printf(s, "SYSCTL:\t\t0x%08x\n",
@@ -1761,6 +1919,10 @@ static const struct omap_mmc_of_data omap3_pre_es3_mmc_of_data = {
 static const struct omap_mmc_of_data omap4_mmc_of_data = {
 	.reg_offset = 0x100,
 };
+static const struct omap_mmc_of_data am33xx_mmc_of_data = {
+	.reg_offset = 0x100,
+	.controller_flags = OMAP_HSMMC_SWAKEUP_MISSING,
+};
 
 static const struct of_device_id omap_mmc_of_match[] = {
 	{
@@ -1777,6 +1939,10 @@ static const struct of_device_id omap_mmc_of_match[] = {
 		.compatible = "ti,omap4-hsmmc",
 		.data = &omap4_mmc_of_data,
 	},
+	{
+		.compatible = "ti,am33xx-hsmmc",
+		.data = &am33xx_mmc_of_data,
+	},
 	{},
 };
 MODULE_DEVICE_TABLE(of, omap_mmc_of_match);
@@ -1850,7 +2016,6 @@ static int omap_hsmmc_probe(struct platform_device *pdev)
 	const struct of_device_id *match;
 	dma_cap_mask_t mask;
 	unsigned tx_req, rx_req;
-	struct pinctrl *pinctrl;
 	const struct omap_mmc_of_data *data;
 	void __iomem *base;
 
@@ -1913,6 +2078,9 @@ static int omap_hsmmc_probe(struct platform_device *pdev)
 
 	platform_set_drvdata(pdev, host);
 
+	if (pdev->dev.of_node)
+		host->wake_irq = irq_of_parse_and_map(pdev->dev.of_node, 1);
+
 	mmc->ops	= &omap_hsmmc_ops;
 
 	mmc->f_min = OMAP_MMC_MIN_CLOCK;
@@ -2061,10 +2229,17 @@ static int omap_hsmmc_probe(struct platform_device *pdev)
 
 	omap_hsmmc_disable_irq(host);
 
-	pinctrl = devm_pinctrl_get_select_default(&pdev->dev);
-	if (IS_ERR(pinctrl))
-		dev_warn(&pdev->dev,
-			"pins are not configured from the driver\n");
+	/*
+	 * For now, only support SDIO interrupt if we have a separate
+	 * wake-up interrupt configured from device tree. This is because
+	 * the wake-up interrupt is needed for idle state and some
+	 * platforms need special quirks. And we don't want to add new
+	 * legacy mux platform init code callbacks any longer as we
+	 * are moving to DT based booting anyways.
+	 */
+	ret = omap_hsmmc_configure_wake_irq(host);
+	if (!ret)
+		mmc->caps |= MMC_CAP_SDIO_IRQ;
 
 	omap_hsmmc_protect_card(host);
 
@@ -2170,11 +2345,18 @@ static int omap_hsmmc_suspend(struct device *dev)
 	pm_runtime_get_sync(host->dev);
 
 	if (!(host->mmc->pm_flags & MMC_PM_KEEP_POWER)) {
-		omap_hsmmc_disable_irq(host);
+		OMAP_HSMMC_WRITE(host->base, ISE, 0);
+		OMAP_HSMMC_WRITE(host->base, IE, 0);
+		OMAP_HSMMC_WRITE(host->base, STAT, STAT_CLEAR);
 		OMAP_HSMMC_WRITE(host->base, HCTL,
 				OMAP_HSMMC_READ(host->base, HCTL) & ~SDBP);
 	}
 
+	/* do not wake up due to sdio irq */
+	if ((host->mmc->caps & MMC_CAP_SDIO_IRQ) &&
+	    !(host->mmc->pm_flags & MMC_PM_WAKE_SDIO_IRQ))
+		disable_irq(host->wake_irq);
+
 	if (host->dbclk)
 		clk_disable_unprepare(host->dbclk);
 
@@ -2200,6 +2382,10 @@ static int omap_hsmmc_resume(struct device *dev)
 
 	omap_hsmmc_protect_card(host);
 
+	if ((host->mmc->caps & MMC_CAP_SDIO_IRQ) &&
+	    !(host->mmc->pm_flags & MMC_PM_WAKE_SDIO_IRQ))
+		enable_irq(host->wake_irq);
+
 	pm_runtime_mark_last_busy(host->dev);
 	pm_runtime_put_autosuspend(host->dev);
 	return 0;
@@ -2215,22 +2401,77 @@ static int omap_hsmmc_resume(struct device *dev)
 static int omap_hsmmc_runtime_suspend(struct device *dev)
 {
 	struct omap_hsmmc_host *host;
+	unsigned long flags;
+	int ret = 0;
 
 	host = platform_get_drvdata(to_platform_device(dev));
 	omap_hsmmc_context_save(host);
 	dev_dbg(dev, "disabled\n");
 
-	return 0;
+	spin_lock_irqsave(&host->irq_lock, flags);
+	if ((host->mmc->caps & MMC_CAP_SDIO_IRQ) &&
+	    (host->flags & HSMMC_SDIO_IRQ_ENABLED)) {
+		/* disable sdio irq handling to prevent race */
+		OMAP_HSMMC_WRITE(host->base, ISE, 0);
+		OMAP_HSMMC_WRITE(host->base, IE, 0);
+
+		if (!(OMAP_HSMMC_READ(host->base, PSTATE) & DLEV_DAT(1))) {
+			/*
+			 * dat1 line low, pending sdio irq
+			 * race condition: possible irq handler running on
+			 * multi-core, abort
+			 */
+			dev_dbg(dev, "pending sdio irq, abort suspend\n");
+			OMAP_HSMMC_WRITE(host->base, STAT, STAT_CLEAR);
+			OMAP_HSMMC_WRITE(host->base, ISE, CIRQ_EN);
+			OMAP_HSMMC_WRITE(host->base, IE, CIRQ_EN);
+			pm_runtime_mark_last_busy(dev);
+			ret = -EBUSY;
+			goto abort;
+		}
+
+		pinctrl_pm_select_idle_state(dev);
+
+		WARN_ON(host->flags & HSMMC_WAKE_IRQ_ENABLED);
+		enable_irq(host->wake_irq);
+		host->flags |= HSMMC_WAKE_IRQ_ENABLED;
+	} else {
+		pinctrl_pm_select_idle_state(dev);
+	}
+
+abort:
+	spin_unlock_irqrestore(&host->irq_lock, flags);
+	return ret;
 }
 
 static int omap_hsmmc_runtime_resume(struct device *dev)
 {
 	struct omap_hsmmc_host *host;
+	unsigned long flags;
 
 	host = platform_get_drvdata(to_platform_device(dev));
 	omap_hsmmc_context_restore(host);
 	dev_dbg(dev, "enabled\n");
 
+	spin_lock_irqsave(&host->irq_lock, flags);
+	if ((host->mmc->caps & MMC_CAP_SDIO_IRQ) &&
+	    (host->flags & HSMMC_SDIO_IRQ_ENABLED)) {
+		/* sdio irq flag can't change while in runtime suspend */
+		if (host->flags & HSMMC_WAKE_IRQ_ENABLED) {
+			disable_irq_nosync(host->wake_irq);
+			host->flags &= ~HSMMC_WAKE_IRQ_ENABLED;
+		}
+
+		pinctrl_pm_select_default_state(host->dev);
+
+		/* irq lost, if pinmux incorrect */
+		OMAP_HSMMC_WRITE(host->base, STAT, STAT_CLEAR);
+		OMAP_HSMMC_WRITE(host->base, ISE, CIRQ_EN);
+		OMAP_HSMMC_WRITE(host->base, IE, CIRQ_EN);
+	} else {
+		pinctrl_pm_select_default_state(host->dev);
+	}
+	spin_unlock_irqrestore(&host->irq_lock, flags);
 	return 0;
 }
 
diff --git a/drivers/mmc/host/s3cmci.c b/drivers/mmc/host/s3cmci.c
index f23782683a7c..e5516a226362 100644
--- a/drivers/mmc/host/s3cmci.c
+++ b/drivers/mmc/host/s3cmci.c
@@ -12,6 +12,7 @@
  */
 
 #include <linux/module.h>
+#include <linux/dmaengine.h>
 #include <linux/dma-mapping.h>
 #include <linux/clk.h>
 #include <linux/mmc/host.h>
@@ -27,6 +28,7 @@
 #include <mach/dma.h>
 #include <mach/gpio-samsung.h>
 
+#include <linux/platform_data/dma-s3c24xx.h>
 #include <linux/platform_data/mmc-s3cmci.h>
 
 #include "s3cmci.h"
@@ -140,10 +142,6 @@ static const int dbgmap_debug = dbg_err | dbg_debug;
 		dev_dbg(&host->pdev->dev, args);  \
 	} while (0)
 
-static struct s3c2410_dma_client s3cmci_dma_client = {
-	.name		= "s3c-mci",
-};
-
 static void finalize_request(struct s3cmci_host *host);
 static void s3cmci_send_request(struct mmc_host *mmc);
 static void s3cmci_reset(struct s3cmci_host *host);
@@ -256,25 +254,8 @@ static inline bool s3cmci_host_usedma(struct s3cmci_host *host)
 {
 #ifdef CONFIG_MMC_S3C_PIO
 	return false;
-#elif defined(CONFIG_MMC_S3C_DMA)
+#else /* CONFIG_MMC_S3C_DMA */
 	return true;
-#else
-	return host->dodma;
-#endif
-}
-
-/**
- * s3cmci_host_canpio - return true if host has pio code available
- *
- * Return true if the driver has been compiled with the PIO support code
- * available.
- */
-static inline bool s3cmci_host_canpio(void)
-{
-#ifdef CONFIG_MMC_S3C_PIO
-	return true;
-#else
-	return false;
 #endif
 }
 
@@ -841,60 +822,24 @@ static irqreturn_t s3cmci_irq_cd(int irq, void *dev_id)
 	return IRQ_HANDLED;
 }
 
-static void s3cmci_dma_done_callback(struct s3c2410_dma_chan *dma_ch,
-				     void *buf_id, int size,
-				     enum s3c2410_dma_buffresult result)
+static void s3cmci_dma_done_callback(void *arg)
 {
-	struct s3cmci_host *host = buf_id;
+	struct s3cmci_host *host = arg;
 	unsigned long iflags;
-	u32 mci_csta, mci_dsta, mci_fsta, mci_dcnt;
-
-	mci_csta = readl(host->base + S3C2410_SDICMDSTAT);
-	mci_dsta = readl(host->base + S3C2410_SDIDSTA);
-	mci_fsta = readl(host->base + S3C2410_SDIFSTA);
-	mci_dcnt = readl(host->base + S3C2410_SDIDCNT);
 
 	BUG_ON(!host->mrq);
 	BUG_ON(!host->mrq->data);
-	BUG_ON(!host->dmatogo);
 
 	spin_lock_irqsave(&host->complete_lock, iflags);
 
-	if (result != S3C2410_RES_OK) {
-		dbg(host, dbg_fail, "DMA FAILED: csta=0x%08x dsta=0x%08x "
-			"fsta=0x%08x dcnt:0x%08x result:0x%08x toGo:%u\n",
-			mci_csta, mci_dsta, mci_fsta,
-			mci_dcnt, result, host->dmatogo);
-
-		goto fail_request;
-	}
-
-	host->dmatogo--;
-	if (host->dmatogo) {
-		dbg(host, dbg_dma, "DMA DONE  Size:%i DSTA:[%08x] "
-			"DCNT:[%08x] toGo:%u\n",
-			size, mci_dsta, mci_dcnt, host->dmatogo);
-
-		goto out;
-	}
-
-	dbg(host, dbg_dma, "DMA FINISHED Size:%i DSTA:%08x DCNT:%08x\n",
-		size, mci_dsta, mci_dcnt);
+	dbg(host, dbg_dma, "DMA FINISHED\n");
 
 	host->dma_complete = 1;
 	host->complete_what = COMPLETION_FINALIZE;
 
-out:
 	tasklet_schedule(&host->pio_tasklet);
 	spin_unlock_irqrestore(&host->complete_lock, iflags);
-	return;
 
-fail_request:
-	host->mrq->data->error = -EINVAL;
-	host->complete_what = COMPLETION_FINALIZE;
-	clear_imask(host);
-
-	goto out;
 }
 
 static void finalize_request(struct s3cmci_host *host)
@@ -966,7 +911,7 @@ static void finalize_request(struct s3cmci_host *host)
 	 * DMA channel and the fifo to clear out any garbage. */
 	if (mrq->data->error != 0) {
 		if (s3cmci_host_usedma(host))
-			s3c2410_dma_ctrl(host->dma, S3C2410_DMAOP_FLUSH);
+			dmaengine_terminate_all(host->dma);
 
 		if (host->is2440) {
 			/* Clear failure register and reset fifo. */
@@ -992,29 +937,6 @@ request_done:
 	mmc_request_done(host->mmc, mrq);
 }
 
-static void s3cmci_dma_setup(struct s3cmci_host *host,
-			     enum dma_data_direction source)
-{
-	static enum dma_data_direction last_source = -1;
-	static int setup_ok;
-
-	if (last_source == source)
-		return;
-
-	last_source = source;
-
-	s3c2410_dma_devconfig(host->dma, source,
-			      host->mem->start + host->sdidata);
-
-	if (!setup_ok) {
-		s3c2410_dma_config(host->dma, 4);
-		s3c2410_dma_set_buffdone_fn(host->dma,
-					    s3cmci_dma_done_callback);
-		s3c2410_dma_setflags(host->dma, S3C2410_DMAF_AUTOSTART);
-		setup_ok = 1;
-	}
-}
-
 static void s3cmci_send_command(struct s3cmci_host *host,
 					struct mmc_command *cmd)
 {
@@ -1162,43 +1084,45 @@ static int s3cmci_prepare_pio(struct s3cmci_host *host, struct mmc_data *data)
 
 static int s3cmci_prepare_dma(struct s3cmci_host *host, struct mmc_data *data)
 {
-	int dma_len, i;
 	int rw = data->flags & MMC_DATA_WRITE;
+	struct dma_async_tx_descriptor *desc;
+	struct dma_slave_config conf = {
+		.src_addr = host->mem->start + host->sdidata,
+		.dst_addr = host->mem->start + host->sdidata,
+		.src_addr_width = DMA_SLAVE_BUSWIDTH_4_BYTES,
+		.dst_addr_width = DMA_SLAVE_BUSWIDTH_4_BYTES,
+	};
 
 	BUG_ON((data->flags & BOTH_DIR) == BOTH_DIR);
 
-	s3cmci_dma_setup(host, rw ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
-	s3c2410_dma_ctrl(host->dma, S3C2410_DMAOP_FLUSH);
-
-	dma_len = dma_map_sg(mmc_dev(host->mmc), data->sg, data->sg_len,
-			     rw ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
-
-	if (dma_len == 0)
-		return -ENOMEM;
-
-	host->dma_complete = 0;
-	host->dmatogo = dma_len;
-
-	for (i = 0; i < dma_len; i++) {
-		int res;
-
-		dbg(host, dbg_dma, "enqueue %i: %08x@%u\n", i,
-		    sg_dma_address(&data->sg[i]),
-		    sg_dma_len(&data->sg[i]));
+	/* Restore prescaler value */
+	writel(host->prescaler, host->base + S3C2410_SDIPRE);
 
-		res = s3c2410_dma_enqueue(host->dma, host,
-					  sg_dma_address(&data->sg[i]),
-					  sg_dma_len(&data->sg[i]));
+	if (!rw)
+		conf.direction = DMA_DEV_TO_MEM;
+	else
+		conf.direction = DMA_MEM_TO_DEV;
 
-		if (res) {
-			s3c2410_dma_ctrl(host->dma, S3C2410_DMAOP_FLUSH);
-			return -EBUSY;
-		}
-	}
+	dma_map_sg(mmc_dev(host->mmc), data->sg, data->sg_len,
+			     rw ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
 
-	s3c2410_dma_ctrl(host->dma, S3C2410_DMAOP_START);
+	dmaengine_slave_config(host->dma, &conf);
+	desc = dmaengine_prep_slave_sg(host->dma, data->sg, data->sg_len,
+		conf.direction,
+		DMA_CTRL_ACK | DMA_PREP_INTERRUPT);
+	if (!desc)
+		goto unmap_exit;
+	desc->callback = s3cmci_dma_done_callback;
+	desc->callback_param = host;
+	dmaengine_submit(desc);
+	dma_async_issue_pending(host->dma);
 
 	return 0;
+
+unmap_exit:
+	dma_unmap_sg(mmc_dev(host->mmc), data->sg, data->sg_len,
+			     rw ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
+	return -ENOMEM;
 }
 
 static void s3cmci_send_request(struct mmc_host *mmc)
@@ -1676,10 +1600,6 @@ static int s3cmci_probe(struct platform_device *pdev)
 	host->complete_what 	= COMPLETION_NONE;
 	host->pio_active 	= XFER_NONE;
 
-#ifdef CONFIG_MMC_S3C_PIODMA
-	host->dodma		= host->pdata->use_dma;
-#endif
-
 	host->mem = platform_get_resource(pdev, IORESOURCE_MEM, 0);
 	if (!host->mem) {
 		dev_err(&pdev->dev,
@@ -1765,17 +1685,17 @@ static int s3cmci_probe(struct platform_device *pdev)
 	/* depending on the dma state, get a dma channel to use. */
 
 	if (s3cmci_host_usedma(host)) {
-		host->dma = s3c2410_dma_request(DMACH_SDI, &s3cmci_dma_client,
-						host);
-		if (host->dma < 0) {
+		dma_cap_mask_t mask;
+
+		dma_cap_zero(mask);
+		dma_cap_set(DMA_SLAVE, mask);
+
+		host->dma = dma_request_slave_channel_compat(mask,
+			s3c24xx_dma_filter, (void *)DMACH_SDI, &pdev->dev, "rx-tx");
+		if (!host->dma) {
 			dev_err(&pdev->dev, "cannot get DMA channel.\n");
-			if (!s3cmci_host_canpio()) {
-				ret = -EBUSY;
-				goto probe_free_gpio_wp;
-			} else {
-				dev_warn(&pdev->dev, "falling back to PIO.\n");
-				host->dodma = 0;
-			}
+			ret = -EBUSY;
+			goto probe_free_gpio_wp;
 		}
 	}
 
@@ -1787,7 +1707,7 @@ static int s3cmci_probe(struct platform_device *pdev)
 		goto probe_free_dma;
 	}
 
-	ret = clk_enable(host->clk);
+	ret = clk_prepare_enable(host->clk);
 	if (ret) {
 		dev_err(&pdev->dev, "failed to enable clock source.\n");
 		goto clk_free;
@@ -1816,7 +1736,7 @@ static int s3cmci_probe(struct platform_device *pdev)
 	mmc->max_segs		= 128;
 
 	dbg(host, dbg_debug,
-	    "probe: mode:%s mapped mci_base:%p irq:%u irq_cd:%u dma:%u.\n",
+	    "probe: mode:%s mapped mci_base:%p irq:%u irq_cd:%u dma:%p.\n",
 	    (host->is2440?"2440":""),
 	    host->base, host->irq, host->irq_cd, host->dma);
 
@@ -1845,14 +1765,14 @@ static int s3cmci_probe(struct platform_device *pdev)
 	s3cmci_cpufreq_deregister(host);
 
  free_dmabuf:
-	clk_disable(host->clk);
+	clk_disable_unprepare(host->clk);
 
  clk_free:
 	clk_put(host->clk);
 
  probe_free_dma:
 	if (s3cmci_host_usedma(host))
-		s3c2410_dma_free(host->dma, &s3cmci_dma_client);
+		dma_release_channel(host->dma);
 
  probe_free_gpio_wp:
 	if (!host->pdata->no_wprotect)
@@ -1897,7 +1817,7 @@ static void s3cmci_shutdown(struct platform_device *pdev)
 	s3cmci_debugfs_remove(host);
 	s3cmci_cpufreq_deregister(host);
 	mmc_remove_host(mmc);
-	clk_disable(host->clk);
+	clk_disable_unprepare(host->clk);
 }
 
 static int s3cmci_remove(struct platform_device *pdev)
@@ -1914,7 +1834,7 @@ static int s3cmci_remove(struct platform_device *pdev)
 	tasklet_disable(&host->pio_tasklet);
 
 	if (s3cmci_host_usedma(host))
-		s3c2410_dma_free(host->dma, &s3cmci_dma_client);
+		dma_release_channel(host->dma);
 
 	free_irq(host->irq, host);
 
diff --git a/drivers/mmc/host/s3cmci.h b/drivers/mmc/host/s3cmci.h
index c76b53dbeb61..cc2e46cb5c64 100644
--- a/drivers/mmc/host/s3cmci.h
+++ b/drivers/mmc/host/s3cmci.h
@@ -26,7 +26,7 @@ struct s3cmci_host {
 	void __iomem		*base;
 	int			irq;
 	int			irq_cd;
-	int			dma;
+	struct dma_chan		*dma;
 
 	unsigned long		clk_rate;
 	unsigned long		clk_div;
@@ -36,8 +36,6 @@ struct s3cmci_host {
 	int			is2440;
 	unsigned		sdiimsk;
 	unsigned		sdidata;
-	int			dodma;
-	int			dmatogo;
 
 	bool			irq_disabled;
 	bool			irq_enabled;
diff --git a/drivers/mmc/host/sdhci-acpi.c b/drivers/mmc/host/sdhci-acpi.c
index 8ce3c28cb76e..8c5337002c51 100644
--- a/drivers/mmc/host/sdhci-acpi.c
+++ b/drivers/mmc/host/sdhci-acpi.c
@@ -124,9 +124,11 @@ static const struct sdhci_acpi_chip sdhci_acpi_chip_int = {
 
 static const struct sdhci_acpi_slot sdhci_acpi_slot_int_emmc = {
 	.chip    = &sdhci_acpi_chip_int,
-	.caps    = MMC_CAP_8_BIT_DATA | MMC_CAP_NONREMOVABLE | MMC_CAP_HW_RESET,
+	.caps    = MMC_CAP_8_BIT_DATA | MMC_CAP_NONREMOVABLE |
+		   MMC_CAP_HW_RESET | MMC_CAP_1_8V_DDR,
 	.caps2   = MMC_CAP2_HC_ERASE_SZ,
 	.flags   = SDHCI_ACPI_RUNTIME_PM,
+	.quirks2 = SDHCI_QUIRK2_PRESET_VALUE_BROKEN,
 };
 
 static const struct sdhci_acpi_slot sdhci_acpi_slot_int_sdio = {
diff --git a/drivers/mmc/host/sdhci-msm.c b/drivers/mmc/host/sdhci-msm.c
index 40573a58486a..1a6661ed6205 100644
--- a/drivers/mmc/host/sdhci-msm.c
+++ b/drivers/mmc/host/sdhci-msm.c
@@ -16,7 +16,6 @@
 
 #include <linux/module.h>
 #include <linux/of_device.h>
-#include <linux/regulator/consumer.h>
 #include <linux/delay.h>
 #include <linux/mmc/mmc.h>
 #include <linux/slab.h>
diff --git a/drivers/mmc/host/sdhci-pci.c b/drivers/mmc/host/sdhci-pci.c
index 52c42fcc284c..c3a1debc9289 100644
--- a/drivers/mmc/host/sdhci-pci.c
+++ b/drivers/mmc/host/sdhci-pci.c
@@ -103,6 +103,10 @@ static const struct sdhci_pci_fixes sdhci_cafe = {
 			  SDHCI_QUIRK_BROKEN_TIMEOUT_VAL,
 };
 
+static const struct sdhci_pci_fixes sdhci_intel_qrk = {
+	.quirks		= SDHCI_QUIRK_NO_HISPD_BIT,
+};
+
 static int mrst_hc_probe_slot(struct sdhci_pci_slot *slot)
 {
 	slot->host->mmc->caps |= MMC_CAP_8_BIT_DATA;
@@ -264,7 +268,7 @@ static void sdhci_pci_int_hw_reset(struct sdhci_host *host)
 static int byt_emmc_probe_slot(struct sdhci_pci_slot *slot)
 {
 	slot->host->mmc->caps |= MMC_CAP_8_BIT_DATA | MMC_CAP_NONREMOVABLE |
-				 MMC_CAP_HW_RESET;
+				 MMC_CAP_HW_RESET | MMC_CAP_1_8V_DDR;
 	slot->host->mmc->caps2 |= MMC_CAP2_HC_ERASE_SZ;
 	slot->hw_reset = sdhci_pci_int_hw_reset;
 	return 0;
@@ -279,6 +283,7 @@ static int byt_sdio_probe_slot(struct sdhci_pci_slot *slot)
 static const struct sdhci_pci_fixes sdhci_intel_byt_emmc = {
 	.allow_runtime_pm = true,
 	.probe_slot	= byt_emmc_probe_slot,
+	.quirks2	= SDHCI_QUIRK2_PRESET_VALUE_BROKEN,
 };
 
 static const struct sdhci_pci_fixes sdhci_intel_byt_sdio = {
@@ -753,6 +758,14 @@ static const struct pci_device_id pci_ids[] = {
 
 	{
 		.vendor		= PCI_VENDOR_ID_INTEL,
+		.device		= PCI_DEVICE_ID_INTEL_QRK_SD,
+		.subvendor	= PCI_ANY_ID,
+		.subdevice	= PCI_ANY_ID,
+		.driver_data	= (kernel_ulong_t)&sdhci_intel_qrk,
+	},
+
+	{
+		.vendor		= PCI_VENDOR_ID_INTEL,
 		.device		= PCI_DEVICE_ID_INTEL_MRST_SD0,
 		.subvendor	= PCI_ANY_ID,
 		.subdevice	= PCI_ANY_ID,
@@ -1130,18 +1143,13 @@ static int sdhci_pci_suspend(struct device *dev)
 			goto err_pci_suspend;
 	}
 
-	pci_save_state(pdev);
 	if (pm_flags & MMC_PM_KEEP_POWER) {
-		if (pm_flags & MMC_PM_WAKE_SDIO_IRQ) {
-			pci_pme_active(pdev, true);
-			pci_enable_wake(pdev, PCI_D3hot, 1);
-		}
-		pci_set_power_state(pdev, PCI_D3hot);
-	} else {
-		pci_enable_wake(pdev, PCI_D3hot, 0);
-		pci_disable_device(pdev);
-		pci_set_power_state(pdev, PCI_D3hot);
-	}
+		if (pm_flags & MMC_PM_WAKE_SDIO_IRQ)
+			device_init_wakeup(dev, true);
+		else
+			device_init_wakeup(dev, false);
+	} else
+		device_init_wakeup(dev, false);
 
 	return 0;
 
@@ -1162,12 +1170,6 @@ static int sdhci_pci_resume(struct device *dev)
 	if (!chip)
 		return 0;
 
-	pci_set_power_state(pdev, PCI_D0);
-	pci_restore_state(pdev);
-	ret = pci_enable_device(pdev);
-	if (ret)
-		return ret;
-
 	if (chip->fixes && chip->fixes->resume) {
 		ret = chip->fixes->resume(chip);
 		if (ret)
diff --git a/drivers/mmc/host/sdhci-pci.h b/drivers/mmc/host/sdhci-pci.h
index 6d718719659e..c101477ef3be 100644
--- a/drivers/mmc/host/sdhci-pci.h
+++ b/drivers/mmc/host/sdhci-pci.h
@@ -17,6 +17,7 @@
 #define PCI_DEVICE_ID_INTEL_CLV_SDIO2	0x08fb
 #define PCI_DEVICE_ID_INTEL_CLV_EMMC0	0x08e5
 #define PCI_DEVICE_ID_INTEL_CLV_EMMC1	0x08e6
+#define PCI_DEVICE_ID_INTEL_QRK_SD	0x08A7
 
 /*
  * PCI registers
diff --git a/drivers/mmc/host/sdhci-pxav3.c b/drivers/mmc/host/sdhci-pxav3.c
index f4f128947561..6f842fb8e6b8 100644
--- a/drivers/mmc/host/sdhci-pxav3.c
+++ b/drivers/mmc/host/sdhci-pxav3.c
@@ -288,15 +288,13 @@ static int sdhci_pxav3_probe(struct platform_device *pdev)
 	int ret;
 	struct clk *clk;
 
-	pxa = kzalloc(sizeof(struct sdhci_pxa), GFP_KERNEL);
+	pxa = devm_kzalloc(&pdev->dev, sizeof(struct sdhci_pxa), GFP_KERNEL);
 	if (!pxa)
 		return -ENOMEM;
 
 	host = sdhci_pltfm_init(pdev, &sdhci_pxav3_pdata, 0);
-	if (IS_ERR(host)) {
-		kfree(pxa);
+	if (IS_ERR(host))
 		return PTR_ERR(host);
-	}
 
 	if (of_device_is_compatible(np, "marvell,armada-380-sdhci")) {
 		ret = mv_conf_mbus_windows(pdev, mv_mbus_dram_info());
@@ -308,7 +306,7 @@ static int sdhci_pxav3_probe(struct platform_device *pdev)
 	pltfm_host = sdhci_priv(host);
 	pltfm_host->priv = pxa;
 
-	clk = clk_get(dev, NULL);
+	clk = devm_clk_get(dev, NULL);
 	if (IS_ERR(clk)) {
 		dev_err(dev, "failed to get io clock\n");
 		ret = PTR_ERR(clk);
@@ -389,11 +387,9 @@ err_add_host:
 	pm_runtime_put_sync(&pdev->dev);
 	pm_runtime_disable(&pdev->dev);
 	clk_disable_unprepare(clk);
-	clk_put(clk);
 err_clk_get:
 err_mbus_win:
 	sdhci_pltfm_free(pdev);
-	kfree(pxa);
 	return ret;
 }
 
@@ -401,17 +397,14 @@ static int sdhci_pxav3_remove(struct platform_device *pdev)
 {
 	struct sdhci_host *host = platform_get_drvdata(pdev);
 	struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host);
-	struct sdhci_pxa *pxa = pltfm_host->priv;
 
 	pm_runtime_get_sync(&pdev->dev);
 	sdhci_remove_host(host, 1);
 	pm_runtime_disable(&pdev->dev);
 
 	clk_disable_unprepare(pltfm_host->clk);
-	clk_put(pltfm_host->clk);
 
 	sdhci_pltfm_free(pdev);
-	kfree(pxa);
 
 	return 0;
 }
diff --git a/drivers/mmc/host/sdhci-st.c b/drivers/mmc/host/sdhci-st.c
new file mode 100644
index 000000000000..328f348c7243
--- /dev/null
+++ b/drivers/mmc/host/sdhci-st.c
@@ -0,0 +1,176 @@
+/*
+ * Support for SDHCI on STMicroelectronics SoCs
+ *
+ * Copyright (C) 2014 STMicroelectronics Ltd
+ * Author: Giuseppe Cavallaro <peppe.cavallaro@st.com>
+ * Contributors: Peter Griffin <peter.griffin@linaro.org>
+ *
+ * Based on sdhci-cns3xxx.c
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/io.h>
+#include <linux/of.h>
+#include <linux/module.h>
+#include <linux/err.h>
+#include <linux/mmc/host.h>
+
+#include "sdhci-pltfm.h"
+
+static u32 sdhci_st_readl(struct sdhci_host *host, int reg)
+{
+	u32 ret;
+
+	switch (reg) {
+	case SDHCI_CAPABILITIES:
+		ret = readl_relaxed(host->ioaddr + reg);
+		/* Support 3.3V and 1.8V */
+		ret &= ~SDHCI_CAN_VDD_300;
+		break;
+	default:
+		ret = readl_relaxed(host->ioaddr + reg);
+	}
+	return ret;
+}
+
+static const struct sdhci_ops sdhci_st_ops = {
+	.get_max_clock = sdhci_pltfm_clk_get_max_clock,
+	.set_clock = sdhci_set_clock,
+	.set_bus_width = sdhci_set_bus_width,
+	.read_l = sdhci_st_readl,
+	.reset = sdhci_reset,
+};
+
+static const struct sdhci_pltfm_data sdhci_st_pdata = {
+	.ops = &sdhci_st_ops,
+	.quirks = SDHCI_QUIRK_NO_ENDATTR_IN_NOPDESC |
+	    SDHCI_QUIRK_CAP_CLOCK_BASE_BROKEN,
+};
+
+
+static int sdhci_st_probe(struct platform_device *pdev)
+{
+	struct sdhci_host *host;
+	struct sdhci_pltfm_host *pltfm_host;
+	struct clk *clk;
+	int ret = 0;
+	u16 host_version;
+
+	clk =  devm_clk_get(&pdev->dev, "mmc");
+	if (IS_ERR(clk)) {
+		dev_err(&pdev->dev, "Peripheral clk not found\n");
+		return PTR_ERR(clk);
+	}
+
+	host = sdhci_pltfm_init(pdev, &sdhci_st_pdata, 0);
+	if (IS_ERR(host)) {
+		dev_err(&pdev->dev, "Failed sdhci_pltfm_init\n");
+		return PTR_ERR(host);
+	}
+
+	ret = mmc_of_parse(host->mmc);
+
+	if (ret) {
+		dev_err(&pdev->dev, "Failed mmc_of_parse\n");
+		return ret;
+	}
+
+	clk_prepare_enable(clk);
+
+	pltfm_host = sdhci_priv(host);
+	pltfm_host->clk = clk;
+
+	ret = sdhci_add_host(host);
+	if (ret) {
+		dev_err(&pdev->dev, "Failed sdhci_add_host\n");
+		goto err_out;
+	}
+
+	platform_set_drvdata(pdev, host);
+
+	host_version = readw_relaxed((host->ioaddr + SDHCI_HOST_VERSION));
+
+	dev_info(&pdev->dev, "SDHCI ST Initialised: Host Version: 0x%x Vendor Version 0x%x\n",
+		((host_version & SDHCI_SPEC_VER_MASK) >> SDHCI_SPEC_VER_SHIFT),
+		((host_version & SDHCI_VENDOR_VER_MASK) >>
+		SDHCI_VENDOR_VER_SHIFT));
+
+	return 0;
+
+err_out:
+	clk_disable_unprepare(clk);
+	sdhci_pltfm_free(pdev);
+
+	return ret;
+}
+
+static int sdhci_st_remove(struct platform_device *pdev)
+{
+	struct sdhci_host *host = platform_get_drvdata(pdev);
+	struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host);
+
+	clk_disable_unprepare(pltfm_host->clk);
+
+	return sdhci_pltfm_unregister(pdev);
+}
+
+#ifdef CONFIG_PM_SLEEP
+static int sdhci_st_suspend(struct device *dev)
+{
+	struct sdhci_host *host = dev_get_drvdata(dev);
+	struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host);
+	int ret = sdhci_suspend_host(host);
+
+	if (ret)
+		goto out;
+
+	clk_disable_unprepare(pltfm_host->clk);
+out:
+	return ret;
+}
+
+static int sdhci_st_resume(struct device *dev)
+{
+	struct sdhci_host *host = dev_get_drvdata(dev);
+	struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host);
+
+	clk_prepare_enable(pltfm_host->clk);
+
+	return sdhci_resume_host(host);
+}
+#endif
+
+static SIMPLE_DEV_PM_OPS(sdhci_st_pmops, sdhci_st_suspend, sdhci_st_resume);
+
+static const struct of_device_id st_sdhci_match[] = {
+	{ .compatible = "st,sdhci" },
+	{},
+};
+
+MODULE_DEVICE_TABLE(of, st_sdhci_match);
+
+static struct platform_driver sdhci_st_driver = {
+	.probe = sdhci_st_probe,
+	.remove = sdhci_st_remove,
+	.driver = {
+		   .name = "sdhci-st",
+		   .pm = &sdhci_st_pmops,
+		   .of_match_table = of_match_ptr(st_sdhci_match),
+		  },
+};
+
+module_platform_driver(sdhci_st_driver);
+
+MODULE_DESCRIPTION("SDHCI driver for STMicroelectronics SoCs");
+MODULE_AUTHOR("Giuseppe Cavallaro <peppe.cavallaro@st.com>");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS("platform:st-sdhci");
diff --git a/drivers/mmc/host/sdhci-tegra.c b/drivers/mmc/host/sdhci-tegra.c
index d93a063a36f3..33100d10d176 100644
--- a/drivers/mmc/host/sdhci-tegra.c
+++ b/drivers/mmc/host/sdhci-tegra.c
@@ -26,8 +26,6 @@
 #include <linux/mmc/host.h>
 #include <linux/mmc/slot-gpio.h>
 
-#include <asm/gpio.h>
-
 #include "sdhci-pltfm.h"
 
 /* Tegra SDHOST controller vendor register definitions */
diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c
index 47055f3f01b8..37b2a9ae52ef 100644
--- a/drivers/mmc/host/sdhci.c
+++ b/drivers/mmc/host/sdhci.c
@@ -1223,8 +1223,16 @@ EXPORT_SYMBOL_GPL(sdhci_set_clock);
 static void sdhci_set_power(struct sdhci_host *host, unsigned char mode,
 			    unsigned short vdd)
 {
+	struct mmc_host *mmc = host->mmc;
 	u8 pwr = 0;
 
+	if (!IS_ERR(mmc->supply.vmmc)) {
+		spin_unlock_irq(&host->lock);
+		mmc_regulator_set_ocr(mmc, mmc->supply.vmmc, vdd);
+		spin_lock_irq(&host->lock);
+		return;
+	}
+
 	if (mode != MMC_POWER_OFF) {
 		switch (1 << vdd) {
 		case MMC_VDD_165_195:
@@ -1283,12 +1291,6 @@ static void sdhci_set_power(struct sdhci_host *host, unsigned char mode,
 		if (host->quirks & SDHCI_QUIRK_DELAY_AFTER_POWER)
 			mdelay(10);
 	}
-
-	if (host->vmmc) {
-		spin_unlock_irq(&host->lock);
-		mmc_regulator_set_ocr(host->mmc, host->vmmc, vdd);
-		spin_lock_irq(&host->lock);
-	}
 }
 
 /*****************************************************************************\
@@ -1440,13 +1442,15 @@ static void sdhci_do_set_ios(struct sdhci_host *host, struct mmc_ios *ios)
 {
 	unsigned long flags;
 	u8 ctrl;
+	struct mmc_host *mmc = host->mmc;
 
 	spin_lock_irqsave(&host->lock, flags);
 
 	if (host->flags & SDHCI_DEVICE_DEAD) {
 		spin_unlock_irqrestore(&host->lock, flags);
-		if (host->vmmc && ios->power_mode == MMC_POWER_OFF)
-			mmc_regulator_set_ocr(host->mmc, host->vmmc, 0);
+		if (!IS_ERR(mmc->supply.vmmc) &&
+		    ios->power_mode == MMC_POWER_OFF)
+			mmc_regulator_set_ocr(mmc, mmc->supply.vmmc, 0);
 		return;
 	}
 
@@ -1530,7 +1534,6 @@ static void sdhci_do_set_ios(struct sdhci_host *host, struct mmc_ios *ios)
 			host->ops->set_clock(host, host->clock);
 		}
 
-
 		/* Reset SD Clock Enable */
 		clk = sdhci_readw(host, SDHCI_CLOCK_CONTROL);
 		clk &= ~SDHCI_CLOCK_CARD_EN;
@@ -1707,6 +1710,7 @@ static void sdhci_enable_sdio_irq(struct mmc_host *mmc, int enable)
 static int sdhci_do_start_signal_voltage_switch(struct sdhci_host *host,
 						struct mmc_ios *ios)
 {
+	struct mmc_host *mmc = host->mmc;
 	u16 ctrl;
 	int ret;
 
@@ -1725,11 +1729,12 @@ static int sdhci_do_start_signal_voltage_switch(struct sdhci_host *host,
 		ctrl &= ~SDHCI_CTRL_VDD_180;
 		sdhci_writew(host, ctrl, SDHCI_HOST_CONTROL2);
 
-		if (host->vqmmc) {
-			ret = regulator_set_voltage(host->vqmmc, 2700000, 3600000);
+		if (!IS_ERR(mmc->supply.vqmmc)) {
+			ret = regulator_set_voltage(mmc->supply.vqmmc, 2700000,
+						    3600000);
 			if (ret) {
 				pr_warning("%s: Switching to 3.3V signalling voltage "
-						" failed\n", mmc_hostname(host->mmc));
+						" failed\n", mmc_hostname(mmc));
 				return -EIO;
 			}
 		}
@@ -1742,16 +1747,16 @@ static int sdhci_do_start_signal_voltage_switch(struct sdhci_host *host,
 			return 0;
 
 		pr_warning("%s: 3.3V regulator output did not became stable\n",
-				mmc_hostname(host->mmc));
+				mmc_hostname(mmc));
 
 		return -EAGAIN;
 	case MMC_SIGNAL_VOLTAGE_180:
-		if (host->vqmmc) {
-			ret = regulator_set_voltage(host->vqmmc,
+		if (!IS_ERR(mmc->supply.vqmmc)) {
+			ret = regulator_set_voltage(mmc->supply.vqmmc,
 					1700000, 1950000);
 			if (ret) {
 				pr_warning("%s: Switching to 1.8V signalling voltage "
-						" failed\n", mmc_hostname(host->mmc));
+						" failed\n", mmc_hostname(mmc));
 				return -EIO;
 			}
 		}
@@ -1763,24 +1768,22 @@ static int sdhci_do_start_signal_voltage_switch(struct sdhci_host *host,
 		ctrl |= SDHCI_CTRL_VDD_180;
 		sdhci_writew(host, ctrl, SDHCI_HOST_CONTROL2);
 
-		/* Wait for 5ms */
-		usleep_range(5000, 5500);
-
 		/* 1.8V regulator output should be stable within 5 ms */
 		ctrl = sdhci_readw(host, SDHCI_HOST_CONTROL2);
 		if (ctrl & SDHCI_CTRL_VDD_180)
 			return 0;
 
 		pr_warning("%s: 1.8V regulator output did not became stable\n",
-				mmc_hostname(host->mmc));
+				mmc_hostname(mmc));
 
 		return -EAGAIN;
 	case MMC_SIGNAL_VOLTAGE_120:
-		if (host->vqmmc) {
-			ret = regulator_set_voltage(host->vqmmc, 1100000, 1300000);
+		if (!IS_ERR(mmc->supply.vqmmc)) {
+			ret = regulator_set_voltage(mmc->supply.vqmmc, 1100000,
+						    1300000);
 			if (ret) {
 				pr_warning("%s: Switching to 1.2V signalling voltage "
-						" failed\n", mmc_hostname(host->mmc));
+						" failed\n", mmc_hostname(mmc));
 				return -EIO;
 			}
 		}
@@ -2643,7 +2646,6 @@ static void sdhci_runtime_pm_bus_off(struct sdhci_host *host)
 int sdhci_runtime_suspend_host(struct sdhci_host *host)
 {
 	unsigned long flags;
-	int ret = 0;
 
 	/* Disable tuning since we are suspending */
 	if (host->flags & SDHCI_USING_RETUNING_TIMER) {
@@ -2663,14 +2665,14 @@ int sdhci_runtime_suspend_host(struct sdhci_host *host)
 	host->runtime_suspended = true;
 	spin_unlock_irqrestore(&host->lock, flags);
 
-	return ret;
+	return 0;
 }
 EXPORT_SYMBOL_GPL(sdhci_runtime_suspend_host);
 
 int sdhci_runtime_resume_host(struct sdhci_host *host)
 {
 	unsigned long flags;
-	int ret = 0, host_flags = host->flags;
+	int host_flags = host->flags;
 
 	if (host_flags & (SDHCI_USE_SDMA | SDHCI_USE_ADMA)) {
 		if (host->ops->enable_dma)
@@ -2709,7 +2711,7 @@ int sdhci_runtime_resume_host(struct sdhci_host *host)
 
 	spin_unlock_irqrestore(&host->lock, flags);
 
-	return ret;
+	return 0;
 }
 EXPORT_SYMBOL_GPL(sdhci_runtime_resume_host);
 
@@ -2820,12 +2822,12 @@ int sdhci_add_host(struct sdhci_host *host)
 		 * (128) and potentially one alignment transfer for
 		 * each of those entries.
 		 */
-		host->adma_desc = dma_alloc_coherent(mmc_dev(host->mmc),
+		host->adma_desc = dma_alloc_coherent(mmc_dev(mmc),
 						     ADMA_SIZE, &host->adma_addr,
 						     GFP_KERNEL);
 		host->align_buffer = kmalloc(128 * 4, GFP_KERNEL);
 		if (!host->adma_desc || !host->align_buffer) {
-			dma_free_coherent(mmc_dev(host->mmc), ADMA_SIZE,
+			dma_free_coherent(mmc_dev(mmc), ADMA_SIZE,
 					  host->adma_desc, host->adma_addr);
 			kfree(host->align_buffer);
 			pr_warning("%s: Unable to allocate ADMA "
@@ -2838,7 +2840,7 @@ int sdhci_add_host(struct sdhci_host *host)
 			pr_warning("%s: unable to allocate aligned ADMA descriptor\n",
 				   mmc_hostname(mmc));
 			host->flags &= ~SDHCI_USE_ADMA;
-			dma_free_coherent(mmc_dev(host->mmc), ADMA_SIZE,
+			dma_free_coherent(mmc_dev(mmc), ADMA_SIZE,
 					  host->adma_desc, host->adma_addr);
 			kfree(host->align_buffer);
 			host->adma_desc = NULL;
@@ -2853,7 +2855,7 @@ int sdhci_add_host(struct sdhci_host *host)
 	 */
 	if (!(host->flags & (SDHCI_USE_SDMA | SDHCI_USE_ADMA))) {
 		host->dma_mask = DMA_BIT_MASK(64);
-		mmc_dev(host->mmc)->dma_mask = &host->dma_mask;
+		mmc_dev(mmc)->dma_mask = &host->dma_mask;
 	}
 
 	if (host->version >= SDHCI_SPEC_300)
@@ -2959,28 +2961,25 @@ int sdhci_add_host(struct sdhci_host *host)
 		mmc->caps |= MMC_CAP_SD_HIGHSPEED | MMC_CAP_MMC_HIGHSPEED;
 
 	if ((host->quirks & SDHCI_QUIRK_BROKEN_CARD_DETECTION) &&
-	    !(host->mmc->caps & MMC_CAP_NONREMOVABLE))
+	    !(mmc->caps & MMC_CAP_NONREMOVABLE))
 		mmc->caps |= MMC_CAP_NEEDS_POLL;
 
+	/* If there are external regulators, get them */
+	if (mmc_regulator_get_supply(mmc) == -EPROBE_DEFER)
+		return -EPROBE_DEFER;
+
 	/* If vqmmc regulator and no 1.8V signalling, then there's no UHS */
-	host->vqmmc = regulator_get_optional(mmc_dev(mmc), "vqmmc");
-	if (IS_ERR_OR_NULL(host->vqmmc)) {
-		if (PTR_ERR(host->vqmmc) < 0) {
-			pr_info("%s: no vqmmc regulator found\n",
-				mmc_hostname(mmc));
-			host->vqmmc = NULL;
-		}
-	} else {
-		ret = regulator_enable(host->vqmmc);
-		if (!regulator_is_supported_voltage(host->vqmmc, 1700000,
-			1950000))
+	if (!IS_ERR(mmc->supply.vqmmc)) {
+		ret = regulator_enable(mmc->supply.vqmmc);
+		if (!regulator_is_supported_voltage(mmc->supply.vqmmc, 1700000,
+						    1950000))
 			caps[1] &= ~(SDHCI_SUPPORT_SDR104 |
 					SDHCI_SUPPORT_SDR50 |
 					SDHCI_SUPPORT_DDR50);
 		if (ret) {
 			pr_warn("%s: Failed to enable vqmmc regulator: %d\n",
 				mmc_hostname(mmc), ret);
-			host->vqmmc = NULL;
+			mmc->supply.vqmmc = NULL;
 		}
 	}
 
@@ -3041,34 +3040,6 @@ int sdhci_add_host(struct sdhci_host *host)
 
 	ocr_avail = 0;
 
-	host->vmmc = regulator_get_optional(mmc_dev(mmc), "vmmc");
-	if (IS_ERR_OR_NULL(host->vmmc)) {
-		if (PTR_ERR(host->vmmc) < 0) {
-			pr_info("%s: no vmmc regulator found\n",
-				mmc_hostname(mmc));
-			host->vmmc = NULL;
-		}
-	}
-
-#ifdef CONFIG_REGULATOR
-	/*
-	 * Voltage range check makes sense only if regulator reports
-	 * any voltage value.
-	 */
-	if (host->vmmc && regulator_get_voltage(host->vmmc) > 0) {
-		ret = regulator_is_supported_voltage(host->vmmc, 2700000,
-			3600000);
-		if ((ret <= 0) || (!(caps[0] & SDHCI_CAN_VDD_330)))
-			caps[0] &= ~SDHCI_CAN_VDD_330;
-		if ((ret <= 0) || (!(caps[0] & SDHCI_CAN_VDD_300)))
-			caps[0] &= ~SDHCI_CAN_VDD_300;
-		ret = regulator_is_supported_voltage(host->vmmc, 1700000,
-			1950000);
-		if ((ret <= 0) || (!(caps[0] & SDHCI_CAN_VDD_180)))
-			caps[0] &= ~SDHCI_CAN_VDD_180;
-	}
-#endif /* CONFIG_REGULATOR */
-
 	/*
 	 * According to SD Host Controller spec v3.00, if the Host System
 	 * can afford more than 150mA, Host Driver should set XPC to 1. Also
@@ -3077,8 +3048,8 @@ int sdhci_add_host(struct sdhci_host *host)
 	 * value.
 	 */
 	max_current_caps = sdhci_readl(host, SDHCI_MAX_CURRENT);
-	if (!max_current_caps && host->vmmc) {
-		u32 curr = regulator_get_current_limit(host->vmmc);
+	if (!max_current_caps && !IS_ERR(mmc->supply.vmmc)) {
+		u32 curr = regulator_get_current_limit(mmc->supply.vmmc);
 		if (curr > 0) {
 
 			/* convert to SDHCI_MAX_CURRENT format */
@@ -3118,8 +3089,12 @@ int sdhci_add_host(struct sdhci_host *host)
 				   SDHCI_MAX_CURRENT_MULTIPLIER;
 	}
 
+	/* If OCR set by external regulators, use it instead */
+	if (mmc->ocr_avail)
+		ocr_avail = mmc->ocr_avail;
+
 	if (host->ocr_mask)
-		ocr_avail = host->ocr_mask;
+		ocr_avail &= host->ocr_mask;
 
 	mmc->ocr_avail = ocr_avail;
 	mmc->ocr_avail_sdio = ocr_avail;
@@ -3273,6 +3248,7 @@ EXPORT_SYMBOL_GPL(sdhci_add_host);
 
 void sdhci_remove_host(struct sdhci_host *host, int dead)
 {
+	struct mmc_host *mmc = host->mmc;
 	unsigned long flags;
 
 	if (dead) {
@@ -3282,7 +3258,7 @@ void sdhci_remove_host(struct sdhci_host *host, int dead)
 
 		if (host->mrq) {
 			pr_err("%s: Controller removed during "
-				" transfer!\n", mmc_hostname(host->mmc));
+				" transfer!\n", mmc_hostname(mmc));
 
 			host->mrq->cmd->error = -ENOMEDIUM;
 			tasklet_schedule(&host->finish_tasklet);
@@ -3293,7 +3269,7 @@ void sdhci_remove_host(struct sdhci_host *host, int dead)
 
 	sdhci_disable_card_detection(host);
 
-	mmc_remove_host(host->mmc);
+	mmc_remove_host(mmc);
 
 #ifdef SDHCI_USE_LEDS_CLASS
 	led_classdev_unregister(&host->led);
@@ -3310,18 +3286,14 @@ void sdhci_remove_host(struct sdhci_host *host, int dead)
 
 	tasklet_kill(&host->finish_tasklet);
 
-	if (host->vmmc) {
-		regulator_disable(host->vmmc);
-		regulator_put(host->vmmc);
-	}
+	if (!IS_ERR(mmc->supply.vmmc))
+		regulator_disable(mmc->supply.vmmc);
 
-	if (host->vqmmc) {
-		regulator_disable(host->vqmmc);
-		regulator_put(host->vqmmc);
-	}
+	if (!IS_ERR(mmc->supply.vqmmc))
+		regulator_disable(mmc->supply.vqmmc);
 
 	if (host->adma_desc)
-		dma_free_coherent(mmc_dev(host->mmc), ADMA_SIZE,
+		dma_free_coherent(mmc_dev(mmc), ADMA_SIZE,
 				  host->adma_desc, host->adma_addr);
 	kfree(host->align_buffer);
 
diff --git a/drivers/mmc/host/sh_mmcif.c b/drivers/mmc/host/sh_mmcif.c
index 656fbba4c422..d11708c815d7 100644
--- a/drivers/mmc/host/sh_mmcif.c
+++ b/drivers/mmc/host/sh_mmcif.c
@@ -386,7 +386,7 @@ sh_mmcif_request_dma_one(struct sh_mmcif_host *host,
 			 struct sh_mmcif_plat_data *pdata,
 			 enum dma_transfer_direction direction)
 {
-	struct dma_slave_config cfg;
+	struct dma_slave_config cfg = { 0, };
 	struct dma_chan *chan;
 	unsigned int slave_id;
 	struct resource *res;
@@ -417,8 +417,15 @@ sh_mmcif_request_dma_one(struct sh_mmcif_host *host,
 	/* In the OF case the driver will get the slave ID from the DT */
 	cfg.slave_id = slave_id;
 	cfg.direction = direction;
-	cfg.dst_addr = res->start + MMCIF_CE_DATA;
-	cfg.src_addr = 0;
+
+	if (direction == DMA_DEV_TO_MEM) {
+		cfg.src_addr = res->start + MMCIF_CE_DATA;
+		cfg.src_addr_width = DMA_SLAVE_BUSWIDTH_4_BYTES;
+	} else {
+		cfg.dst_addr = res->start + MMCIF_CE_DATA;
+		cfg.dst_addr_width = DMA_SLAVE_BUSWIDTH_4_BYTES;
+	}
+
 	ret = dmaengine_slave_config(chan, &cfg);
 	if (ret < 0) {
 		dma_release_channel(chan);
@@ -1378,26 +1385,19 @@ static int sh_mmcif_probe(struct platform_device *pdev)
 		dev_err(&pdev->dev, "Get irq error\n");
 		return -ENXIO;
 	}
+
 	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-	if (!res) {
-		dev_err(&pdev->dev, "platform_get_resource error.\n");
-		return -ENXIO;
-	}
-	reg = ioremap(res->start, resource_size(res));
-	if (!reg) {
-		dev_err(&pdev->dev, "ioremap error.\n");
-		return -ENOMEM;
-	}
+	reg = devm_ioremap_resource(&pdev->dev, res);
+	if (IS_ERR(reg))
+		return PTR_ERR(reg);
 
 	mmc = mmc_alloc_host(sizeof(struct sh_mmcif_host), &pdev->dev);
-	if (!mmc) {
-		ret = -ENOMEM;
-		goto ealloch;
-	}
+	if (!mmc)
+		return -ENOMEM;
 
 	ret = mmc_of_parse(mmc);
 	if (ret < 0)
-		goto eofparse;
+		goto err_host;
 
 	host		= mmc_priv(mmc);
 	host->mmc	= mmc;
@@ -1427,19 +1427,19 @@ static int sh_mmcif_probe(struct platform_device *pdev)
 	pm_runtime_enable(&pdev->dev);
 	host->power = false;
 
-	host->hclk = clk_get(&pdev->dev, NULL);
+	host->hclk = devm_clk_get(&pdev->dev, NULL);
 	if (IS_ERR(host->hclk)) {
 		ret = PTR_ERR(host->hclk);
 		dev_err(&pdev->dev, "cannot get clock: %d\n", ret);
-		goto eclkget;
+		goto err_pm;
 	}
 	ret = sh_mmcif_clk_update(host);
 	if (ret < 0)
-		goto eclkupdate;
+		goto err_pm;
 
 	ret = pm_runtime_resume(&pdev->dev);
 	if (ret < 0)
-		goto eresume;
+		goto err_clk;
 
 	INIT_DELAYED_WORK(&host->timeout_work, mmcif_timeout_work);
 
@@ -1447,65 +1447,55 @@ static int sh_mmcif_probe(struct platform_device *pdev)
 	sh_mmcif_writel(host->addr, MMCIF_CE_INT_MASK, MASK_ALL);
 
 	name = irq[1] < 0 ? dev_name(&pdev->dev) : "sh_mmc:error";
-	ret = request_threaded_irq(irq[0], sh_mmcif_intr, sh_mmcif_irqt, 0, name, host);
+	ret = devm_request_threaded_irq(&pdev->dev, irq[0], sh_mmcif_intr,
+					sh_mmcif_irqt, 0, name, host);
 	if (ret) {
 		dev_err(&pdev->dev, "request_irq error (%s)\n", name);
-		goto ereqirq0;
+		goto err_clk;
 	}
 	if (irq[1] >= 0) {
-		ret = request_threaded_irq(irq[1], sh_mmcif_intr, sh_mmcif_irqt,
-					   0, "sh_mmc:int", host);
+		ret = devm_request_threaded_irq(&pdev->dev, irq[1],
+						sh_mmcif_intr, sh_mmcif_irqt,
+						0, "sh_mmc:int", host);
 		if (ret) {
 			dev_err(&pdev->dev, "request_irq error (sh_mmc:int)\n");
-			goto ereqirq1;
+			goto err_clk;
 		}
 	}
 
 	if (pd && pd->use_cd_gpio) {
 		ret = mmc_gpio_request_cd(mmc, pd->cd_gpio, 0);
 		if (ret < 0)
-			goto erqcd;
+			goto err_clk;
 	}
 
 	mutex_init(&host->thread_lock);
 
-	clk_disable_unprepare(host->hclk);
 	ret = mmc_add_host(mmc);
 	if (ret < 0)
-		goto emmcaddh;
+		goto err_clk;
 
 	dev_pm_qos_expose_latency_limit(&pdev->dev, 100);
 
-	dev_info(&pdev->dev, "driver version %s\n", DRIVER_VERSION);
-	dev_dbg(&pdev->dev, "chip ver H'%04x\n",
-		sh_mmcif_readl(host->addr, MMCIF_CE_VERSION) & 0x0000ffff);
+	dev_info(&pdev->dev, "Chip version 0x%04x, clock rate %luMHz\n",
+		 sh_mmcif_readl(host->addr, MMCIF_CE_VERSION) & 0xffff,
+		 clk_get_rate(host->hclk) / 1000000UL);
+
+	clk_disable_unprepare(host->hclk);
 	return ret;
 
-emmcaddh:
-erqcd:
-	if (irq[1] >= 0)
-		free_irq(irq[1], host);
-ereqirq1:
-	free_irq(irq[0], host);
-ereqirq0:
-	pm_runtime_suspend(&pdev->dev);
-eresume:
+err_clk:
 	clk_disable_unprepare(host->hclk);
-eclkupdate:
-	clk_put(host->hclk);
-eclkget:
+err_pm:
 	pm_runtime_disable(&pdev->dev);
-eofparse:
+err_host:
 	mmc_free_host(mmc);
-ealloch:
-	iounmap(reg);
 	return ret;
 }
 
 static int sh_mmcif_remove(struct platform_device *pdev)
 {
 	struct sh_mmcif_host *host = platform_get_drvdata(pdev);
-	int irq[2];
 
 	host->dying = true;
 	clk_prepare_enable(host->hclk);
@@ -1523,16 +1513,6 @@ static int sh_mmcif_remove(struct platform_device *pdev)
 	 */
 	cancel_delayed_work_sync(&host->timeout_work);
 
-	if (host->addr)
-		iounmap(host->addr);
-
-	irq[0] = platform_get_irq(pdev, 0);
-	irq[1] = platform_get_irq(pdev, 1);
-
-	free_irq(irq[0], host);
-	if (irq[1] >= 0)
-		free_irq(irq[1], host);
-
 	clk_disable_unprepare(host->hclk);
 	mmc_free_host(host->mmc);
 	pm_runtime_put_sync(&pdev->dev);
diff --git a/drivers/mmc/host/tmio_mmc_dma.c b/drivers/mmc/host/tmio_mmc_dma.c
index 03e7b280cb4c..eb8f1d5c34b1 100644
--- a/drivers/mmc/host/tmio_mmc_dma.c
+++ b/drivers/mmc/host/tmio_mmc_dma.c
@@ -294,6 +294,7 @@ void tmio_mmc_request_dma(struct tmio_mmc_host *host, struct tmio_mmc_data *pdat
 			cfg.slave_id = pdata->dma->slave_id_tx;
 		cfg.direction = DMA_MEM_TO_DEV;
 		cfg.dst_addr = res->start + (CTL_SD_DATA_PORT << host->pdata->bus_shift);
+		cfg.dst_addr_width = DMA_SLAVE_BUSWIDTH_2_BYTES;
 		cfg.src_addr = 0;
 		ret = dmaengine_slave_config(host->chan_tx, &cfg);
 		if (ret < 0)
@@ -312,6 +313,7 @@ void tmio_mmc_request_dma(struct tmio_mmc_host *host, struct tmio_mmc_data *pdat
 			cfg.slave_id = pdata->dma->slave_id_rx;
 		cfg.direction = DMA_DEV_TO_MEM;
 		cfg.src_addr = cfg.dst_addr;
+		cfg.src_addr_width = DMA_SLAVE_BUSWIDTH_2_BYTES;
 		cfg.dst_addr = 0;
 		ret = dmaengine_slave_config(host->chan_rx, &cfg);
 		if (ret < 0)
diff --git a/drivers/mmc/host/wmt-sdmmc.c b/drivers/mmc/host/wmt-sdmmc.c
index 282891a8e451..54181b4f6e9e 100644
--- a/drivers/mmc/host/wmt-sdmmc.c
+++ b/drivers/mmc/host/wmt-sdmmc.c
@@ -72,7 +72,6 @@
 #define BM_SPI_CS			0x20
 #define BM_SD_POWER			0x40
 #define BM_SOFT_RESET			0x80
-#define BM_ONEBIT_MASK			0xFD
 
 /* SDMMC_BLKLEN bit fields */
 #define BLKL_CRCERR_ABORT		0x0800
@@ -120,6 +119,8 @@
 #define STS2_DATARSP_BUSY		0x20
 #define STS2_DIS_FORCECLK		0x80
 
+/* SDMMC_EXTCTRL bit fields */
+#define EXT_EIGHTBIT			0x04
 
 /* MMC/SD DMA Controller Registers */
 #define SDDMA_GCR			0x100
@@ -672,7 +673,7 @@ static void wmt_mci_request(struct mmc_host *mmc, struct mmc_request *req)
 static void wmt_mci_set_ios(struct mmc_host *mmc, struct mmc_ios *ios)
 {
 	struct wmt_mci_priv *priv;
-	u32 reg_tmp;
+	u32 busmode, extctrl;
 
 	priv = mmc_priv(mmc);
 
@@ -687,28 +688,26 @@ static void wmt_mci_set_ios(struct mmc_host *mmc, struct mmc_ios *ios)
 	if (ios->clock != 0)
 		clk_set_rate(priv->clk_sdmmc, ios->clock);
 
+	busmode = readb(priv->sdmmc_base + SDMMC_BUSMODE);
+	extctrl = readb(priv->sdmmc_base + SDMMC_EXTCTRL);
+
+	busmode &= ~(BM_EIGHTBIT_MODE | BM_FOURBIT_MODE);
+	extctrl &= ~EXT_EIGHTBIT;
+
 	switch (ios->bus_width) {
 	case MMC_BUS_WIDTH_8:
-		reg_tmp = readb(priv->sdmmc_base + SDMMC_EXTCTRL);
-		writeb(reg_tmp | 0x04, priv->sdmmc_base + SDMMC_EXTCTRL);
+		busmode |= BM_EIGHTBIT_MODE;
+		extctrl |= EXT_EIGHTBIT;
 		break;
 	case MMC_BUS_WIDTH_4:
-		reg_tmp = readb(priv->sdmmc_base + SDMMC_BUSMODE);
-		writeb(reg_tmp | BM_FOURBIT_MODE, priv->sdmmc_base +
-		       SDMMC_BUSMODE);
-
-		reg_tmp = readb(priv->sdmmc_base + SDMMC_EXTCTRL);
-		writeb(reg_tmp & 0xFB, priv->sdmmc_base + SDMMC_EXTCTRL);
+		busmode |= BM_FOURBIT_MODE;
 		break;
 	case MMC_BUS_WIDTH_1:
-		reg_tmp = readb(priv->sdmmc_base + SDMMC_BUSMODE);
-		writeb(reg_tmp & BM_ONEBIT_MASK, priv->sdmmc_base +
-		       SDMMC_BUSMODE);
-
-		reg_tmp = readb(priv->sdmmc_base + SDMMC_EXTCTRL);
-		writeb(reg_tmp & 0xFB, priv->sdmmc_base + SDMMC_EXTCTRL);
 		break;
 	}
+
+	writeb(busmode, priv->sdmmc_base + SDMMC_BUSMODE);
+	writeb(extctrl, priv->sdmmc_base + SDMMC_EXTCTRL);
 }
 
 static int wmt_mci_get_ro(struct mmc_host *mmc)
@@ -830,7 +829,7 @@ static int wmt_mci_probe(struct platform_device *pdev)
 		goto fail3;
 	}
 
-	ret = request_irq(dma_irq, wmt_mci_dma_isr, 32, "sdmmc", priv);
+	ret = request_irq(dma_irq, wmt_mci_dma_isr, 0, "sdmmc", priv);
 	if (ret) {
 		dev_err(&pdev->dev, "Register DMA IRQ fail\n");
 		goto fail4;
diff --git a/drivers/mtd/ubi/block.c b/drivers/mtd/ubi/block.c
index 8457df7ec5af..33c64955d4d7 100644
--- a/drivers/mtd/ubi/block.c
+++ b/drivers/mtd/ubi/block.c
@@ -378,9 +378,11 @@ int ubiblock_create(struct ubi_volume_info *vi)
 {
 	struct ubiblock *dev;
 	struct gendisk *gd;
-	int disk_capacity;
+	u64 disk_capacity = ((u64)vi->size * vi->usable_leb_size) >> 9;
 	int ret;
 
+	if ((sector_t)disk_capacity != disk_capacity)
+		return -EFBIG;
 	/* Check that the volume isn't already handled */
 	mutex_lock(&devices_mutex);
 	if (find_dev_nolock(vi->ubi_num, vi->vol_id)) {
@@ -412,7 +414,6 @@ int ubiblock_create(struct ubi_volume_info *vi)
 	gd->first_minor = dev->ubi_num * UBI_MAX_VOLUMES + dev->vol_id;
 	gd->private_data = dev;
 	sprintf(gd->disk_name, "ubiblock%d_%d", dev->ubi_num, dev->vol_id);
-	disk_capacity = (vi->size * vi->usable_leb_size) >> 9;
 	set_capacity(gd, disk_capacity);
 	dev->gd = gd;
 
@@ -498,11 +499,16 @@ int ubiblock_remove(struct ubi_volume_info *vi)
 	return 0;
 }
 
-static void ubiblock_resize(struct ubi_volume_info *vi)
+static int ubiblock_resize(struct ubi_volume_info *vi)
 {
 	struct ubiblock *dev;
-	int disk_capacity;
+	u64 disk_capacity = ((u64)vi->size * vi->usable_leb_size) >> 9;
 
+	if ((sector_t)disk_capacity != disk_capacity) {
+		ubi_warn("%s: the volume is too big, cannot resize (%d LEBs)",
+			 dev->gd->disk_name, vi->size);
+		return -EFBIG;
+	}
 	/*
 	 * Need to lock the device list until we stop using the device,
 	 * otherwise the device struct might get released in
@@ -512,15 +518,15 @@ static void ubiblock_resize(struct ubi_volume_info *vi)
 	dev = find_dev_nolock(vi->ubi_num, vi->vol_id);
 	if (!dev) {
 		mutex_unlock(&devices_mutex);
-		return;
+		return -ENODEV;
 	}
 
 	mutex_lock(&dev->dev_mutex);
-	disk_capacity = (vi->size * vi->usable_leb_size) >> 9;
 	set_capacity(dev->gd, disk_capacity);
 	ubi_msg("%s resized to %d LEBs", dev->gd->disk_name, vi->size);
 	mutex_unlock(&dev->dev_mutex);
 	mutex_unlock(&devices_mutex);
+	return 0;
 }
 
 static int ubiblock_notify(struct notifier_block *nb,
diff --git a/drivers/mtd/ubi/vtbl.c b/drivers/mtd/ubi/vtbl.c
index d77b1c1d7c72..07cac5f9ffb8 100644
--- a/drivers/mtd/ubi/vtbl.c
+++ b/drivers/mtd/ubi/vtbl.c
@@ -591,7 +591,7 @@ static int init_volumes(struct ubi_device *ubi,
 
 		/* Static volumes only */
 		av = ubi_find_av(ai, i);
-		if (!av) {
+		if (!av || !av->leb_count) {
 			/*
 			 * No eraseblocks belonging to this volume found. We
 			 * don't actually know whether this static volume is
diff --git a/drivers/mtd/ubi/wl.c b/drivers/mtd/ubi/wl.c
index 0f3425dac910..20f491713145 100644
--- a/drivers/mtd/ubi/wl.c
+++ b/drivers/mtd/ubi/wl.c
@@ -1718,12 +1718,12 @@ int ubi_wl_flush(struct ubi_device *ubi, int vol_id, int lnum)
 	       vol_id, lnum, ubi->works_count);
 
 	while (found) {
-		struct ubi_work *wrk;
+		struct ubi_work *wrk, *tmp;
 		found = 0;
 
 		down_read(&ubi->work_sem);
 		spin_lock(&ubi->wl_lock);
-		list_for_each_entry(wrk, &ubi->works, list) {
+		list_for_each_entry_safe(wrk, tmp, &ubi->works, list) {
 			if ((vol_id == UBI_ALL || wrk->vol_id == vol_id) &&
 			    (lnum == UBI_ALL || wrk->lnum == lnum)) {
 				list_del(&wrk->list);
diff --git a/drivers/net/arcnet/com20020_cs.c b/drivers/net/arcnet/com20020_cs.c
index 326a612a2730..1a790a20210d 100644
--- a/drivers/net/arcnet/com20020_cs.c
+++ b/drivers/net/arcnet/com20020_cs.c
@@ -112,20 +112,20 @@ static void com20020_detach(struct pcmcia_device *p_dev);
 
 /*====================================================================*/
 
-typedef struct com20020_dev_t {
+struct com20020_dev {
     struct net_device       *dev;
-} com20020_dev_t;
+};
 
 static int com20020_probe(struct pcmcia_device *p_dev)
 {
-    com20020_dev_t *info;
+    struct com20020_dev *info;
     struct net_device *dev;
     struct arcnet_local *lp;
 
     dev_dbg(&p_dev->dev, "com20020_attach()\n");
 
     /* Create new network device */
-    info = kzalloc(sizeof(struct com20020_dev_t), GFP_KERNEL);
+    info = kzalloc(sizeof(*info), GFP_KERNEL);
     if (!info)
 	goto fail_alloc_info;
 
@@ -160,7 +160,7 @@ fail_alloc_info:
 
 static void com20020_detach(struct pcmcia_device *link)
 {
-    struct com20020_dev_t *info = link->priv;
+    struct com20020_dev *info = link->priv;
     struct net_device *dev = info->dev;
 
     dev_dbg(&link->dev, "detach...\n");
@@ -199,7 +199,7 @@ static void com20020_detach(struct pcmcia_device *link)
 static int com20020_config(struct pcmcia_device *link)
 {
     struct arcnet_local *lp;
-    com20020_dev_t *info;
+    struct com20020_dev *info;
     struct net_device *dev;
     int i, ret;
     int ioaddr;
@@ -291,7 +291,7 @@ static void com20020_release(struct pcmcia_device *link)
 
 static int com20020_suspend(struct pcmcia_device *link)
 {
-	com20020_dev_t *info = link->priv;
+	struct com20020_dev *info = link->priv;
 	struct net_device *dev = info->dev;
 
 	if (link->open)
@@ -302,7 +302,7 @@ static int com20020_suspend(struct pcmcia_device *link)
 
 static int com20020_resume(struct pcmcia_device *link)
 {
-	com20020_dev_t *info = link->priv;
+	struct com20020_dev *info = link->priv;
 	struct net_device *dev = info->dev;
 
 	if (link->open) {
diff --git a/drivers/net/ethernet/8390/Kconfig b/drivers/net/ethernet/8390/Kconfig
index 0988811f4e40..2d89bd00de61 100644
--- a/drivers/net/ethernet/8390/Kconfig
+++ b/drivers/net/ethernet/8390/Kconfig
@@ -91,7 +91,8 @@ config MCF8390
 
 config NE2000
 	tristate "NE2000/NE1000 support"
-	depends on (ISA || (Q40 && m) || M32R || MACH_TX49XX)
+	depends on (ISA || (Q40 && m) || M32R || MACH_TX49XX || \
+		    ATARI_ETHERNEC)
 	select CRC32
 	---help---
 	  If you have a network (Ethernet) card of this type, say Y and read
diff --git a/drivers/net/ethernet/8390/axnet_cs.c b/drivers/net/ethernet/8390/axnet_cs.c
index 73c57a4a7b9e..7769c05543f1 100644
--- a/drivers/net/ethernet/8390/axnet_cs.c
+++ b/drivers/net/ethernet/8390/axnet_cs.c
@@ -108,7 +108,7 @@ static u32 axnet_msg_enable;
 
 /*====================================================================*/
 
-typedef struct axnet_dev_t {
+struct axnet_dev {
 	struct pcmcia_device	*p_dev;
 	caddr_t	base;
 	struct timer_list	watchdog;
@@ -118,9 +118,9 @@ typedef struct axnet_dev_t {
 	int	phy_id;
 	int	flags;
 	int	active_low;
-} axnet_dev_t;
+};
 
-static inline axnet_dev_t *PRIV(struct net_device *dev)
+static inline struct axnet_dev *PRIV(struct net_device *dev)
 {
 	void *p = (char *)netdev_priv(dev) + sizeof(struct ei_device);
 	return p;
@@ -141,13 +141,13 @@ static const struct net_device_ops axnet_netdev_ops = {
 
 static int axnet_probe(struct pcmcia_device *link)
 {
-    axnet_dev_t *info;
+    struct axnet_dev *info;
     struct net_device *dev;
     struct ei_device *ei_local;
 
     dev_dbg(&link->dev, "axnet_attach()\n");
 
-    dev = alloc_etherdev(sizeof(struct ei_device) + sizeof(axnet_dev_t));
+    dev = alloc_etherdev(sizeof(struct ei_device) + sizeof(struct axnet_dev));
     if (!dev)
 	return -ENOMEM;
 
@@ -274,7 +274,7 @@ static int axnet_configcheck(struct pcmcia_device *p_dev, void *priv_data)
 static int axnet_config(struct pcmcia_device *link)
 {
     struct net_device *dev = link->priv;
-    axnet_dev_t *info = PRIV(dev);
+    struct axnet_dev *info = PRIV(dev);
     int i, j, j2, ret;
 
     dev_dbg(&link->dev, "axnet_config(0x%p)\n", link);
@@ -389,7 +389,7 @@ static int axnet_suspend(struct pcmcia_device *link)
 static int axnet_resume(struct pcmcia_device *link)
 {
 	struct net_device *dev = link->priv;
-	axnet_dev_t *info = PRIV(dev);
+	struct axnet_dev *info = PRIV(dev);
 
 	if (link->open) {
 		if (info->active_low == 1)
@@ -467,7 +467,7 @@ static void mdio_write(unsigned int addr, int phy_id, int loc, int value)
 static int axnet_open(struct net_device *dev)
 {
     int ret;
-    axnet_dev_t *info = PRIV(dev);
+    struct axnet_dev *info = PRIV(dev);
     struct pcmcia_device *link = info->p_dev;
     unsigned int nic_base = dev->base_addr;
     
@@ -497,7 +497,7 @@ static int axnet_open(struct net_device *dev)
 
 static int axnet_close(struct net_device *dev)
 {
-    axnet_dev_t *info = PRIV(dev);
+    struct axnet_dev *info = PRIV(dev);
     struct pcmcia_device *link = info->p_dev;
 
     dev_dbg(&link->dev, "axnet_close('%s')\n", dev->name);
@@ -554,7 +554,7 @@ static irqreturn_t ei_irq_wrapper(int irq, void *dev_id)
 static void ei_watchdog(u_long arg)
 {
     struct net_device *dev = (struct net_device *)(arg);
-    axnet_dev_t *info = PRIV(dev);
+    struct axnet_dev *info = PRIV(dev);
     unsigned int nic_base = dev->base_addr;
     unsigned int mii_addr = nic_base + AXNET_MII_EEP;
     u_short link;
@@ -610,7 +610,7 @@ reschedule:
 
 static int axnet_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 {
-    axnet_dev_t *info = PRIV(dev);
+    struct axnet_dev *info = PRIV(dev);
     struct mii_ioctl_data *data = if_mii(rq);
     unsigned int mii_addr = dev->base_addr + AXNET_MII_EEP;
     switch (cmd) {
@@ -1452,7 +1452,7 @@ static void ei_receive(struct net_device *dev)
 
 static void ei_rx_overrun(struct net_device *dev)
 {
-	axnet_dev_t *info = PRIV(dev);
+	struct axnet_dev *info = PRIV(dev);
 	long e8390_base = dev->base_addr;
 	unsigned char was_txing, must_resend = 0;
 	struct ei_device *ei_local = netdev_priv(dev);
@@ -1624,7 +1624,7 @@ static void set_multicast_list(struct net_device *dev)
 
 static void AX88190_init(struct net_device *dev, int startp)
 {
-	axnet_dev_t *info = PRIV(dev);
+	struct axnet_dev *info = PRIV(dev);
 	long e8390_base = dev->base_addr;
 	struct ei_device *ei_local = netdev_priv(dev);
 	int i;
diff --git a/drivers/net/ethernet/8390/ne.c b/drivers/net/ethernet/8390/ne.c
index 58eaa8f34942..de566fb6e0f7 100644
--- a/drivers/net/ethernet/8390/ne.c
+++ b/drivers/net/ethernet/8390/ne.c
@@ -169,6 +169,8 @@ bad_clone_list[] __initdata = {
 #elif defined(CONFIG_PLAT_OAKS32R)  || \
    defined(CONFIG_MACH_TX49XX)
 #  define DCR_VAL 0x48		/* 8-bit mode */
+#elif defined(CONFIG_ATARI)	/* 8-bit mode on Atari, normal on Q40 */
+#  define DCR_VAL (MACH_IS_ATARI ? 0x48 : 0x49)
 #else
 #  define DCR_VAL 0x49
 #endif
diff --git a/drivers/net/ethernet/8390/pcnet_cs.c b/drivers/net/ethernet/8390/pcnet_cs.c
index ca3c2b921cf6..9fb7b9d4fd6c 100644
--- a/drivers/net/ethernet/8390/pcnet_cs.c
+++ b/drivers/net/ethernet/8390/pcnet_cs.c
@@ -111,11 +111,11 @@ static void pcnet_detach(struct pcmcia_device *p_dev);
 
 /*====================================================================*/
 
-typedef struct hw_info_t {
+struct hw_info {
     u_int	offset;
     u_char	a0, a1, a2;
     u_int	flags;
-} hw_info_t;
+};
 
 #define DELAY_OUTPUT	0x01
 #define HAS_MISC_REG	0x02
@@ -132,7 +132,7 @@ typedef struct hw_info_t {
 #define MII_PHYID_REG1		0x02
 #define MII_PHYID_REG2		0x03
 
-static hw_info_t hw_info[] = {
+static struct hw_info hw_info[] = {
     { /* Accton EN2212 */ 0x0ff0, 0x00, 0x00, 0xe8, DELAY_OUTPUT },
     { /* Allied Telesis LA-PCM */ 0x0ff0, 0x00, 0x00, 0xf4, 0 },
     { /* APEX MultiCard */ 0x03f4, 0x00, 0x20, 0xe5, 0 },
@@ -196,11 +196,11 @@ static hw_info_t hw_info[] = {
 
 #define NR_INFO		ARRAY_SIZE(hw_info)
 
-static hw_info_t default_info = { 0, 0, 0, 0, 0 };
-static hw_info_t dl10019_info = { 0, 0, 0, 0, IS_DL10019|HAS_MII };
-static hw_info_t dl10022_info = { 0, 0, 0, 0, IS_DL10022|HAS_MII };
+static struct hw_info default_info = { 0, 0, 0, 0, 0 };
+static struct hw_info dl10019_info = { 0, 0, 0, 0, IS_DL10019|HAS_MII };
+static struct hw_info dl10022_info = { 0, 0, 0, 0, IS_DL10022|HAS_MII };
 
-typedef struct pcnet_dev_t {
+struct pcnet_dev {
 	struct pcmcia_device	*p_dev;
     u_int		flags;
     void		__iomem *base;
@@ -210,12 +210,12 @@ typedef struct pcnet_dev_t {
     u_char		eth_phy, pna_phy;
     u_short		link_status;
     u_long		mii_reset;
-} pcnet_dev_t;
+};
 
-static inline pcnet_dev_t *PRIV(struct net_device *dev)
+static inline struct pcnet_dev *PRIV(struct net_device *dev)
 {
 	char *p = netdev_priv(dev);
-	return (pcnet_dev_t *)(p + sizeof(struct ei_device));
+	return (struct pcnet_dev *)(p + sizeof(struct ei_device));
 }
 
 static const struct net_device_ops pcnet_netdev_ops = {
@@ -237,13 +237,13 @@ static const struct net_device_ops pcnet_netdev_ops = {
 
 static int pcnet_probe(struct pcmcia_device *link)
 {
-    pcnet_dev_t *info;
+    struct pcnet_dev *info;
     struct net_device *dev;
 
     dev_dbg(&link->dev, "pcnet_attach()\n");
 
     /* Create new ethernet device */
-    dev = __alloc_ei_netdev(sizeof(pcnet_dev_t));
+    dev = __alloc_ei_netdev(sizeof(struct pcnet_dev));
     if (!dev) return -ENOMEM;
     info = PRIV(dev);
     info->p_dev = link;
@@ -276,7 +276,7 @@ static void pcnet_detach(struct pcmcia_device *link)
 
 ======================================================================*/
 
-static hw_info_t *get_hwinfo(struct pcmcia_device *link)
+static struct hw_info *get_hwinfo(struct pcmcia_device *link)
 {
     struct net_device *dev = link->priv;
     u_char __iomem *base, *virt;
@@ -317,7 +317,7 @@ static hw_info_t *get_hwinfo(struct pcmcia_device *link)
 
 ======================================================================*/
 
-static hw_info_t *get_prom(struct pcmcia_device *link)
+static struct hw_info *get_prom(struct pcmcia_device *link)
 {
     struct net_device *dev = link->priv;
     unsigned int ioaddr = dev->base_addr;
@@ -371,7 +371,7 @@ static hw_info_t *get_prom(struct pcmcia_device *link)
 
 ======================================================================*/
 
-static hw_info_t *get_dl10019(struct pcmcia_device *link)
+static struct hw_info *get_dl10019(struct pcmcia_device *link)
 {
     struct net_device *dev = link->priv;
     int i;
@@ -393,7 +393,7 @@ static hw_info_t *get_dl10019(struct pcmcia_device *link)
 
 ======================================================================*/
 
-static hw_info_t *get_ax88190(struct pcmcia_device *link)
+static struct hw_info *get_ax88190(struct pcmcia_device *link)
 {
     struct net_device *dev = link->priv;
     unsigned int ioaddr = dev->base_addr;
@@ -424,7 +424,7 @@ static hw_info_t *get_ax88190(struct pcmcia_device *link)
 
 ======================================================================*/
 
-static hw_info_t *get_hwired(struct pcmcia_device *link)
+static struct hw_info *get_hwired(struct pcmcia_device *link)
 {
     struct net_device *dev = link->priv;
     int i;
@@ -489,12 +489,12 @@ static int pcnet_confcheck(struct pcmcia_device *p_dev, void *priv_data)
 	return try_io_port(p_dev);
 }
 
-static hw_info_t *pcnet_try_config(struct pcmcia_device *link,
-				   int *has_shmem, int try)
+static struct hw_info *pcnet_try_config(struct pcmcia_device *link,
+					int *has_shmem, int try)
 {
 	struct net_device *dev = link->priv;
-	hw_info_t *local_hw_info;
-	pcnet_dev_t *info = PRIV(dev);
+	struct hw_info *local_hw_info;
+	struct pcnet_dev *info = PRIV(dev);
 	int priv = try;
 	int ret;
 
@@ -553,10 +553,10 @@ static hw_info_t *pcnet_try_config(struct pcmcia_device *link,
 static int pcnet_config(struct pcmcia_device *link)
 {
     struct net_device *dev = link->priv;
-    pcnet_dev_t *info = PRIV(dev);
+    struct pcnet_dev *info = PRIV(dev);
     int start_pg, stop_pg, cm_offset;
     int has_shmem = 0;
-    hw_info_t *local_hw_info;
+    struct hw_info *local_hw_info;
     struct ei_device *ei_local;
 
     dev_dbg(&link->dev, "pcnet_config\n");
@@ -639,7 +639,7 @@ failed:
 
 static void pcnet_release(struct pcmcia_device *link)
 {
-	pcnet_dev_t *info = PRIV(link->priv);
+	struct pcnet_dev *info = PRIV(link->priv);
 
 	dev_dbg(&link->dev, "pcnet_release\n");
 
@@ -836,7 +836,7 @@ static void write_asic(unsigned int ioaddr, int location, short asic_data)
 static void set_misc_reg(struct net_device *dev)
 {
     unsigned int nic_base = dev->base_addr;
-    pcnet_dev_t *info = PRIV(dev);
+    struct pcnet_dev *info = PRIV(dev);
     u_char tmp;
 
     if (info->flags & HAS_MISC_REG) {
@@ -873,7 +873,7 @@ static void set_misc_reg(struct net_device *dev)
 
 static void mii_phy_probe(struct net_device *dev)
 {
-    pcnet_dev_t *info = PRIV(dev);
+    struct pcnet_dev *info = PRIV(dev);
     unsigned int mii_addr = dev->base_addr + DLINK_GPIO;
     int i;
     u_int tmp, phyid;
@@ -898,7 +898,7 @@ static void mii_phy_probe(struct net_device *dev)
 static int pcnet_open(struct net_device *dev)
 {
     int ret;
-    pcnet_dev_t *info = PRIV(dev);
+    struct pcnet_dev *info = PRIV(dev);
     struct pcmcia_device *link = info->p_dev;
     unsigned int nic_base = dev->base_addr;
 
@@ -931,7 +931,7 @@ static int pcnet_open(struct net_device *dev)
 
 static int pcnet_close(struct net_device *dev)
 {
-    pcnet_dev_t *info = PRIV(dev);
+    struct pcnet_dev *info = PRIV(dev);
     struct pcmcia_device *link = info->p_dev;
 
     dev_dbg(&link->dev, "pcnet_close('%s')\n", dev->name);
@@ -982,7 +982,7 @@ static void pcnet_reset_8390(struct net_device *dev)
 
 static int set_config(struct net_device *dev, struct ifmap *map)
 {
-    pcnet_dev_t *info = PRIV(dev);
+    struct pcnet_dev *info = PRIV(dev);
     if ((map->port != (u_char)(-1)) && (map->port != dev->if_port)) {
 	if (!(info->flags & HAS_MISC_REG))
 	    return -EOPNOTSUPP;
@@ -1000,7 +1000,7 @@ static int set_config(struct net_device *dev, struct ifmap *map)
 static irqreturn_t ei_irq_wrapper(int irq, void *dev_id)
 {
     struct net_device *dev = dev_id;
-    pcnet_dev_t *info;
+    struct pcnet_dev *info;
     irqreturn_t ret = ei_interrupt(irq, dev_id);
 
     if (ret == IRQ_HANDLED) {
@@ -1013,7 +1013,7 @@ static irqreturn_t ei_irq_wrapper(int irq, void *dev_id)
 static void ei_watchdog(u_long arg)
 {
     struct net_device *dev = (struct net_device *)arg;
-    pcnet_dev_t *info = PRIV(dev);
+    struct pcnet_dev *info = PRIV(dev);
     unsigned int nic_base = dev->base_addr;
     unsigned int mii_addr = nic_base + DLINK_GPIO;
     u_short link;
@@ -1101,7 +1101,7 @@ reschedule:
 
 static int ei_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 {
-    pcnet_dev_t *info = PRIV(dev);
+    struct pcnet_dev *info = PRIV(dev);
     struct mii_ioctl_data *data = if_mii(rq);
     unsigned int mii_addr = dev->base_addr + DLINK_GPIO;
 
@@ -1214,7 +1214,7 @@ static void dma_block_output(struct net_device *dev, int count,
 			     const u_char *buf, const int start_page)
 {
     unsigned int nic_base = dev->base_addr;
-    pcnet_dev_t *info = PRIV(dev);
+    struct pcnet_dev *info = PRIV(dev);
 #ifdef PCMCIA_DEBUG
     int retries = 0;
     struct ei_device *ei_local = netdev_priv(dev);
@@ -1403,7 +1403,7 @@ static int setup_shmem_window(struct pcmcia_device *link, int start_pg,
 			      int stop_pg, int cm_offset)
 {
     struct net_device *dev = link->priv;
-    pcnet_dev_t *info = PRIV(dev);
+    struct pcnet_dev *info = PRIV(dev);
     int i, window_size, offset, ret;
 
     window_size = (stop_pg - start_pg) << 8;
diff --git a/drivers/net/ethernet/Kconfig b/drivers/net/ethernet/Kconfig
index edb718661850..dc7406c81c45 100644
--- a/drivers/net/ethernet/Kconfig
+++ b/drivers/net/ethernet/Kconfig
@@ -24,6 +24,7 @@ source "drivers/net/ethernet/allwinner/Kconfig"
 source "drivers/net/ethernet/alteon/Kconfig"
 source "drivers/net/ethernet/altera/Kconfig"
 source "drivers/net/ethernet/amd/Kconfig"
+source "drivers/net/ethernet/apm/Kconfig"
 source "drivers/net/ethernet/apple/Kconfig"
 source "drivers/net/ethernet/arc/Kconfig"
 source "drivers/net/ethernet/atheros/Kconfig"
diff --git a/drivers/net/ethernet/Makefile b/drivers/net/ethernet/Makefile
index 58de3339ab3c..224a01877149 100644
--- a/drivers/net/ethernet/Makefile
+++ b/drivers/net/ethernet/Makefile
@@ -10,6 +10,7 @@ obj-$(CONFIG_NET_VENDOR_ALLWINNER) += allwinner/
 obj-$(CONFIG_NET_VENDOR_ALTEON) += alteon/
 obj-$(CONFIG_ALTERA_TSE) += altera/
 obj-$(CONFIG_NET_VENDOR_AMD) += amd/
+obj-$(CONFIG_NET_XGENE) += apm/
 obj-$(CONFIG_NET_VENDOR_APPLE) += apple/
 obj-$(CONFIG_NET_VENDOR_ARC) += arc/
 obj-$(CONFIG_NET_VENDOR_ATHEROS) += atheros/
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
index 1f5487f4888c..dc84f7193c2d 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
@@ -117,7 +117,6 @@
 #include <linux/spinlock.h>
 #include <linux/tcp.h>
 #include <linux/if_vlan.h>
-#include <linux/phy.h>
 #include <net/busy_poll.h>
 #include <linux/clk.h>
 #include <linux/if_ether.h>
diff --git a/drivers/net/ethernet/apm/Kconfig b/drivers/net/ethernet/apm/Kconfig
new file mode 100644
index 000000000000..ec63d706d464
--- /dev/null
+++ b/drivers/net/ethernet/apm/Kconfig
@@ -0,0 +1 @@
+source "drivers/net/ethernet/apm/xgene/Kconfig"
diff --git a/drivers/net/ethernet/apm/Makefile b/drivers/net/ethernet/apm/Makefile
new file mode 100644
index 000000000000..65ce32ad1b2c
--- /dev/null
+++ b/drivers/net/ethernet/apm/Makefile
@@ -0,0 +1,5 @@
+#
+# Makefile for APM X-GENE Ethernet driver.
+#
+
+obj-$(CONFIG_NET_XGENE) += xgene/
diff --git a/drivers/net/ethernet/apm/xgene/Kconfig b/drivers/net/ethernet/apm/xgene/Kconfig
new file mode 100644
index 000000000000..616dff6d3f5f
--- /dev/null
+++ b/drivers/net/ethernet/apm/xgene/Kconfig
@@ -0,0 +1,9 @@
+config NET_XGENE
+	tristate "APM X-Gene SoC Ethernet Driver"
+	select PHYLIB
+	help
+	  This is the Ethernet driver for the on-chip ethernet interface on the
+	  APM X-Gene SoC.
+
+	  To compile this driver as a module, choose M here. This module will
+	  be called xgene_enet.
diff --git a/drivers/net/ethernet/apm/xgene/Makefile b/drivers/net/ethernet/apm/xgene/Makefile
new file mode 100644
index 000000000000..c643e8a0a0dc
--- /dev/null
+++ b/drivers/net/ethernet/apm/xgene/Makefile
@@ -0,0 +1,6 @@
+#
+# Makefile for APM X-Gene Ethernet Driver.
+#
+
+xgene-enet-objs := xgene_enet_hw.o xgene_enet_main.o xgene_enet_ethtool.o
+obj-$(CONFIG_NET_XGENE) += xgene-enet.o
diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_ethtool.c b/drivers/net/ethernet/apm/xgene/xgene_enet_ethtool.c
new file mode 100644
index 000000000000..63f2aa54a594
--- /dev/null
+++ b/drivers/net/ethernet/apm/xgene/xgene_enet_ethtool.c
@@ -0,0 +1,125 @@
+/* Applied Micro X-Gene SoC Ethernet Driver
+ *
+ * Copyright (c) 2014, Applied Micro Circuits Corporation
+ * Authors: Iyappan Subramanian <isubramanian@apm.com>
+ *
+ * This program is free software; you can redistribute  it and/or modify it
+ * under  the terms of  the GNU General  Public License as published by the
+ * Free Software Foundation;  either version 2 of the  License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/ethtool.h>
+#include "xgene_enet_main.h"
+
+struct xgene_gstrings_stats {
+	char name[ETH_GSTRING_LEN];
+	int offset;
+};
+
+#define XGENE_STAT(m) { #m, offsetof(struct xgene_enet_pdata, stats.m) }
+
+static const struct xgene_gstrings_stats gstrings_stats[] = {
+	XGENE_STAT(rx_packets),
+	XGENE_STAT(tx_packets),
+	XGENE_STAT(rx_bytes),
+	XGENE_STAT(tx_bytes),
+	XGENE_STAT(rx_errors),
+	XGENE_STAT(tx_errors),
+	XGENE_STAT(rx_length_errors),
+	XGENE_STAT(rx_crc_errors),
+	XGENE_STAT(rx_frame_errors),
+	XGENE_STAT(rx_fifo_errors)
+};
+
+#define XGENE_STATS_LEN		ARRAY_SIZE(gstrings_stats)
+
+static void xgene_get_drvinfo(struct net_device *ndev,
+			      struct ethtool_drvinfo *info)
+{
+	struct xgene_enet_pdata *pdata = netdev_priv(ndev);
+	struct platform_device *pdev = pdata->pdev;
+
+	strcpy(info->driver, "xgene_enet");
+	strcpy(info->version, XGENE_DRV_VERSION);
+	snprintf(info->fw_version, ETHTOOL_FWVERS_LEN, "N/A");
+	sprintf(info->bus_info, "%s", pdev->name);
+}
+
+static int xgene_get_settings(struct net_device *ndev, struct ethtool_cmd *cmd)
+{
+	struct xgene_enet_pdata *pdata = netdev_priv(ndev);
+	struct phy_device *phydev = pdata->phy_dev;
+
+	if (phydev == NULL)
+		return -ENODEV;
+
+	return phy_ethtool_gset(phydev, cmd);
+}
+
+static int xgene_set_settings(struct net_device *ndev, struct ethtool_cmd *cmd)
+{
+	struct xgene_enet_pdata *pdata = netdev_priv(ndev);
+	struct phy_device *phydev = pdata->phy_dev;
+
+	if (phydev == NULL)
+		return -ENODEV;
+
+	return phy_ethtool_sset(phydev, cmd);
+}
+
+static void xgene_get_strings(struct net_device *ndev, u32 stringset, u8 *data)
+{
+	int i;
+	u8 *p = data;
+
+	if (stringset != ETH_SS_STATS)
+		return;
+
+	for (i = 0; i < XGENE_STATS_LEN; i++) {
+		memcpy(p, gstrings_stats[i].name, ETH_GSTRING_LEN);
+		p += ETH_GSTRING_LEN;
+	}
+}
+
+static int xgene_get_sset_count(struct net_device *ndev, int sset)
+{
+	if (sset != ETH_SS_STATS)
+		return -EINVAL;
+
+	return XGENE_STATS_LEN;
+}
+
+static void xgene_get_ethtool_stats(struct net_device *ndev,
+				    struct ethtool_stats *dummy,
+				    u64 *data)
+{
+	void *pdata = netdev_priv(ndev);
+	int i;
+
+	for (i = 0; i < XGENE_STATS_LEN; i++)
+		*data++ = *(u64 *)(pdata + gstrings_stats[i].offset);
+}
+
+static const struct ethtool_ops xgene_ethtool_ops = {
+	.get_drvinfo = xgene_get_drvinfo,
+	.get_settings = xgene_get_settings,
+	.set_settings = xgene_set_settings,
+	.get_link = ethtool_op_get_link,
+	.get_strings = xgene_get_strings,
+	.get_sset_count = xgene_get_sset_count,
+	.get_ethtool_stats = xgene_get_ethtool_stats
+};
+
+void xgene_enet_set_ethtool_ops(struct net_device *ndev)
+{
+	ndev->ethtool_ops = &xgene_ethtool_ops;
+}
diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c b/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c
new file mode 100644
index 000000000000..812d8d65159b
--- /dev/null
+++ b/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c
@@ -0,0 +1,728 @@
+/* Applied Micro X-Gene SoC Ethernet Driver
+ *
+ * Copyright (c) 2014, Applied Micro Circuits Corporation
+ * Authors: Iyappan Subramanian <isubramanian@apm.com>
+ *	    Ravi Patel <rapatel@apm.com>
+ *	    Keyur Chudgar <kchudgar@apm.com>
+ *
+ * This program is free software; you can redistribute  it and/or modify it
+ * under  the terms of  the GNU General  Public License as published by the
+ * Free Software Foundation;  either version 2 of the  License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "xgene_enet_main.h"
+#include "xgene_enet_hw.h"
+
+static void xgene_enet_ring_init(struct xgene_enet_desc_ring *ring)
+{
+	u32 *ring_cfg = ring->state;
+	u64 addr = ring->dma;
+	enum xgene_enet_ring_cfgsize cfgsize = ring->cfgsize;
+
+	ring_cfg[4] |= (1 << SELTHRSH_POS) &
+			CREATE_MASK(SELTHRSH_POS, SELTHRSH_LEN);
+	ring_cfg[3] |= ACCEPTLERR;
+	ring_cfg[2] |= QCOHERENT;
+
+	addr >>= 8;
+	ring_cfg[2] |= (addr << RINGADDRL_POS) &
+			CREATE_MASK_ULL(RINGADDRL_POS, RINGADDRL_LEN);
+	addr >>= RINGADDRL_LEN;
+	ring_cfg[3] |= addr & CREATE_MASK_ULL(RINGADDRH_POS, RINGADDRH_LEN);
+	ring_cfg[3] |= ((u32)cfgsize << RINGSIZE_POS) &
+			CREATE_MASK(RINGSIZE_POS, RINGSIZE_LEN);
+}
+
+static void xgene_enet_ring_set_type(struct xgene_enet_desc_ring *ring)
+{
+	u32 *ring_cfg = ring->state;
+	bool is_bufpool;
+	u32 val;
+
+	is_bufpool = xgene_enet_is_bufpool(ring->id);
+	val = (is_bufpool) ? RING_BUFPOOL : RING_REGULAR;
+	ring_cfg[4] |= (val << RINGTYPE_POS) &
+			CREATE_MASK(RINGTYPE_POS, RINGTYPE_LEN);
+
+	if (is_bufpool) {
+		ring_cfg[3] |= (BUFPOOL_MODE << RINGMODE_POS) &
+				CREATE_MASK(RINGMODE_POS, RINGMODE_LEN);
+	}
+}
+
+static void xgene_enet_ring_set_recombbuf(struct xgene_enet_desc_ring *ring)
+{
+	u32 *ring_cfg = ring->state;
+
+	ring_cfg[3] |= RECOMBBUF;
+	ring_cfg[3] |= (0xf << RECOMTIMEOUTL_POS) &
+			CREATE_MASK(RECOMTIMEOUTL_POS, RECOMTIMEOUTL_LEN);
+	ring_cfg[4] |= 0x7 & CREATE_MASK(RECOMTIMEOUTH_POS, RECOMTIMEOUTH_LEN);
+}
+
+static void xgene_enet_ring_wr32(struct xgene_enet_desc_ring *ring,
+				 u32 offset, u32 data)
+{
+	struct xgene_enet_pdata *pdata = netdev_priv(ring->ndev);
+
+	iowrite32(data, pdata->ring_csr_addr + offset);
+}
+
+static void xgene_enet_ring_rd32(struct xgene_enet_desc_ring *ring,
+				 u32 offset, u32 *data)
+{
+	struct xgene_enet_pdata *pdata = netdev_priv(ring->ndev);
+
+	*data = ioread32(pdata->ring_csr_addr + offset);
+}
+
+static void xgene_enet_write_ring_state(struct xgene_enet_desc_ring *ring)
+{
+	int i;
+
+	xgene_enet_ring_wr32(ring, CSR_RING_CONFIG, ring->num);
+	for (i = 0; i < NUM_RING_CONFIG; i++) {
+		xgene_enet_ring_wr32(ring, CSR_RING_WR_BASE + (i * 4),
+				     ring->state[i]);
+	}
+}
+
+static void xgene_enet_clr_ring_state(struct xgene_enet_desc_ring *ring)
+{
+	memset(ring->state, 0, sizeof(u32) * NUM_RING_CONFIG);
+	xgene_enet_write_ring_state(ring);
+}
+
+static void xgene_enet_set_ring_state(struct xgene_enet_desc_ring *ring)
+{
+	xgene_enet_ring_set_type(ring);
+
+	if (xgene_enet_ring_owner(ring->id) == RING_OWNER_ETH0)
+		xgene_enet_ring_set_recombbuf(ring);
+
+	xgene_enet_ring_init(ring);
+	xgene_enet_write_ring_state(ring);
+}
+
+static void xgene_enet_set_ring_id(struct xgene_enet_desc_ring *ring)
+{
+	u32 ring_id_val, ring_id_buf;
+	bool is_bufpool;
+
+	is_bufpool = xgene_enet_is_bufpool(ring->id);
+
+	ring_id_val = ring->id & GENMASK(9, 0);
+	ring_id_val |= OVERWRITE;
+
+	ring_id_buf = (ring->num << 9) & GENMASK(18, 9);
+	ring_id_buf |= PREFETCH_BUF_EN;
+	if (is_bufpool)
+		ring_id_buf |= IS_BUFFER_POOL;
+
+	xgene_enet_ring_wr32(ring, CSR_RING_ID, ring_id_val);
+	xgene_enet_ring_wr32(ring, CSR_RING_ID_BUF, ring_id_buf);
+}
+
+static void xgene_enet_clr_desc_ring_id(struct xgene_enet_desc_ring *ring)
+{
+	u32 ring_id;
+
+	ring_id = ring->id | OVERWRITE;
+	xgene_enet_ring_wr32(ring, CSR_RING_ID, ring_id);
+	xgene_enet_ring_wr32(ring, CSR_RING_ID_BUF, 0);
+}
+
+struct xgene_enet_desc_ring *xgene_enet_setup_ring(
+					struct xgene_enet_desc_ring *ring)
+{
+	u32 size = ring->size;
+	u32 i, data;
+	bool is_bufpool;
+
+	xgene_enet_clr_ring_state(ring);
+	xgene_enet_set_ring_state(ring);
+	xgene_enet_set_ring_id(ring);
+
+	ring->slots = xgene_enet_get_numslots(ring->id, size);
+
+	is_bufpool = xgene_enet_is_bufpool(ring->id);
+	if (is_bufpool || xgene_enet_ring_owner(ring->id) != RING_OWNER_CPU)
+		return ring;
+
+	for (i = 0; i < ring->slots; i++)
+		xgene_enet_mark_desc_slot_empty(&ring->raw_desc[i]);
+
+	xgene_enet_ring_rd32(ring, CSR_RING_NE_INT_MODE, &data);
+	data |= BIT(31 - xgene_enet_ring_bufnum(ring->id));
+	xgene_enet_ring_wr32(ring, CSR_RING_NE_INT_MODE, data);
+
+	return ring;
+}
+
+void xgene_enet_clear_ring(struct xgene_enet_desc_ring *ring)
+{
+	u32 data;
+	bool is_bufpool;
+
+	is_bufpool = xgene_enet_is_bufpool(ring->id);
+	if (is_bufpool || xgene_enet_ring_owner(ring->id) != RING_OWNER_CPU)
+		goto out;
+
+	xgene_enet_ring_rd32(ring, CSR_RING_NE_INT_MODE, &data);
+	data &= ~BIT(31 - xgene_enet_ring_bufnum(ring->id));
+	xgene_enet_ring_wr32(ring, CSR_RING_NE_INT_MODE, data);
+
+out:
+	xgene_enet_clr_desc_ring_id(ring);
+	xgene_enet_clr_ring_state(ring);
+}
+
+void xgene_enet_parse_error(struct xgene_enet_desc_ring *ring,
+			    struct xgene_enet_pdata *pdata,
+			    enum xgene_enet_err_code status)
+{
+	struct rtnl_link_stats64 *stats = &pdata->stats;
+
+	switch (status) {
+	case INGRESS_CRC:
+		stats->rx_crc_errors++;
+		break;
+	case INGRESS_CHECKSUM:
+	case INGRESS_CHECKSUM_COMPUTE:
+		stats->rx_errors++;
+		break;
+	case INGRESS_TRUNC_FRAME:
+		stats->rx_frame_errors++;
+		break;
+	case INGRESS_PKT_LEN:
+		stats->rx_length_errors++;
+		break;
+	case INGRESS_PKT_UNDER:
+		stats->rx_frame_errors++;
+		break;
+	case INGRESS_FIFO_OVERRUN:
+		stats->rx_fifo_errors++;
+		break;
+	default:
+		break;
+	}
+}
+
+static void xgene_enet_wr_csr(struct xgene_enet_pdata *pdata,
+			      u32 offset, u32 val)
+{
+	void __iomem *addr = pdata->eth_csr_addr + offset;
+
+	iowrite32(val, addr);
+}
+
+static void xgene_enet_wr_ring_if(struct xgene_enet_pdata *pdata,
+				  u32 offset, u32 val)
+{
+	void __iomem *addr = pdata->eth_ring_if_addr + offset;
+
+	iowrite32(val, addr);
+}
+
+static void xgene_enet_wr_diag_csr(struct xgene_enet_pdata *pdata,
+				   u32 offset, u32 val)
+{
+	void __iomem *addr = pdata->eth_diag_csr_addr + offset;
+
+	iowrite32(val, addr);
+}
+
+static void xgene_enet_wr_mcx_csr(struct xgene_enet_pdata *pdata,
+				  u32 offset, u32 val)
+{
+	void __iomem *addr = pdata->mcx_mac_csr_addr + offset;
+
+	iowrite32(val, addr);
+}
+
+static bool xgene_enet_wr_indirect(void __iomem *addr, void __iomem *wr,
+				   void __iomem *cmd, void __iomem *cmd_done,
+				   u32 wr_addr, u32 wr_data)
+{
+	u32 done;
+	u8 wait = 10;
+
+	iowrite32(wr_addr, addr);
+	iowrite32(wr_data, wr);
+	iowrite32(XGENE_ENET_WR_CMD, cmd);
+
+	/* wait for write command to complete */
+	while (!(done = ioread32(cmd_done)) && wait--)
+		udelay(1);
+
+	if (!done)
+		return false;
+
+	iowrite32(0, cmd);
+
+	return true;
+}
+
+static void xgene_enet_wr_mcx_mac(struct xgene_enet_pdata *pdata,
+				  u32 wr_addr, u32 wr_data)
+{
+	void __iomem *addr, *wr, *cmd, *cmd_done;
+
+	addr = pdata->mcx_mac_addr + MAC_ADDR_REG_OFFSET;
+	wr = pdata->mcx_mac_addr + MAC_WRITE_REG_OFFSET;
+	cmd = pdata->mcx_mac_addr + MAC_COMMAND_REG_OFFSET;
+	cmd_done = pdata->mcx_mac_addr + MAC_COMMAND_DONE_REG_OFFSET;
+
+	if (!xgene_enet_wr_indirect(addr, wr, cmd, cmd_done, wr_addr, wr_data))
+		netdev_err(pdata->ndev, "MCX mac write failed, addr: %04x\n",
+			   wr_addr);
+}
+
+static void xgene_enet_rd_csr(struct xgene_enet_pdata *pdata,
+			      u32 offset, u32 *val)
+{
+	void __iomem *addr = pdata->eth_csr_addr + offset;
+
+	*val = ioread32(addr);
+}
+
+static void xgene_enet_rd_diag_csr(struct xgene_enet_pdata *pdata,
+				   u32 offset, u32 *val)
+{
+	void __iomem *addr = pdata->eth_diag_csr_addr + offset;
+
+	*val = ioread32(addr);
+}
+
+static void xgene_enet_rd_mcx_csr(struct xgene_enet_pdata *pdata,
+				  u32 offset, u32 *val)
+{
+	void __iomem *addr = pdata->mcx_mac_csr_addr + offset;
+
+	*val = ioread32(addr);
+}
+
+static bool xgene_enet_rd_indirect(void __iomem *addr, void __iomem *rd,
+				   void __iomem *cmd, void __iomem *cmd_done,
+				   u32 rd_addr, u32 *rd_data)
+{
+	u32 done;
+	u8 wait = 10;
+
+	iowrite32(rd_addr, addr);
+	iowrite32(XGENE_ENET_RD_CMD, cmd);
+
+	/* wait for read command to complete */
+	while (!(done = ioread32(cmd_done)) && wait--)
+		udelay(1);
+
+	if (!done)
+		return false;
+
+	*rd_data = ioread32(rd);
+	iowrite32(0, cmd);
+
+	return true;
+}
+
+static void xgene_enet_rd_mcx_mac(struct xgene_enet_pdata *pdata,
+				  u32 rd_addr, u32 *rd_data)
+{
+	void __iomem *addr, *rd, *cmd, *cmd_done;
+
+	addr = pdata->mcx_mac_addr + MAC_ADDR_REG_OFFSET;
+	rd = pdata->mcx_mac_addr + MAC_READ_REG_OFFSET;
+	cmd = pdata->mcx_mac_addr + MAC_COMMAND_REG_OFFSET;
+	cmd_done = pdata->mcx_mac_addr + MAC_COMMAND_DONE_REG_OFFSET;
+
+	if (!xgene_enet_rd_indirect(addr, rd, cmd, cmd_done, rd_addr, rd_data))
+		netdev_err(pdata->ndev, "MCX mac read failed, addr: %04x\n",
+			   rd_addr);
+}
+
+static int xgene_mii_phy_write(struct xgene_enet_pdata *pdata, int phy_id,
+			       u32 reg, u16 data)
+{
+	u32 addr = 0, wr_data = 0;
+	u32 done;
+	u8 wait = 10;
+
+	PHY_ADDR_SET(&addr, phy_id);
+	REG_ADDR_SET(&addr, reg);
+	xgene_enet_wr_mcx_mac(pdata, MII_MGMT_ADDRESS_ADDR, addr);
+
+	PHY_CONTROL_SET(&wr_data, data);
+	xgene_enet_wr_mcx_mac(pdata, MII_MGMT_CONTROL_ADDR, wr_data);
+	do {
+		usleep_range(5, 10);
+		xgene_enet_rd_mcx_mac(pdata, MII_MGMT_INDICATORS_ADDR, &done);
+	} while ((done & BUSY_MASK) && wait--);
+
+	if (done & BUSY_MASK) {
+		netdev_err(pdata->ndev, "MII_MGMT write failed\n");
+		return -EBUSY;
+	}
+
+	return 0;
+}
+
+static int xgene_mii_phy_read(struct xgene_enet_pdata *pdata,
+			      u8 phy_id, u32 reg)
+{
+	u32 addr = 0;
+	u32 data, done;
+	u8 wait = 10;
+
+	PHY_ADDR_SET(&addr, phy_id);
+	REG_ADDR_SET(&addr, reg);
+	xgene_enet_wr_mcx_mac(pdata, MII_MGMT_ADDRESS_ADDR, addr);
+	xgene_enet_wr_mcx_mac(pdata, MII_MGMT_COMMAND_ADDR, READ_CYCLE_MASK);
+	do {
+		usleep_range(5, 10);
+		xgene_enet_rd_mcx_mac(pdata, MII_MGMT_INDICATORS_ADDR, &done);
+	} while ((done & BUSY_MASK) && wait--);
+
+	if (done & BUSY_MASK) {
+		netdev_err(pdata->ndev, "MII_MGMT read failed\n");
+		return -EBUSY;
+	}
+
+	xgene_enet_rd_mcx_mac(pdata, MII_MGMT_STATUS_ADDR, &data);
+	xgene_enet_wr_mcx_mac(pdata, MII_MGMT_COMMAND_ADDR, 0);
+
+	return data;
+}
+
+void xgene_gmac_set_mac_addr(struct xgene_enet_pdata *pdata)
+{
+	u32 addr0, addr1;
+	u8 *dev_addr = pdata->ndev->dev_addr;
+
+	addr0 = (dev_addr[3] << 24) | (dev_addr[2] << 16) |
+		(dev_addr[1] << 8) | dev_addr[0];
+	addr1 = (dev_addr[5] << 24) | (dev_addr[4] << 16);
+	addr1 |= pdata->phy_addr & 0xFFFF;
+
+	xgene_enet_wr_mcx_mac(pdata, STATION_ADDR0_ADDR, addr0);
+	xgene_enet_wr_mcx_mac(pdata, STATION_ADDR1_ADDR, addr1);
+}
+
+static int xgene_enet_ecc_init(struct xgene_enet_pdata *pdata)
+{
+	struct net_device *ndev = pdata->ndev;
+	u32 data;
+	u8 wait = 10;
+
+	xgene_enet_wr_diag_csr(pdata, ENET_CFG_MEM_RAM_SHUTDOWN_ADDR, 0x0);
+	do {
+		usleep_range(100, 110);
+		xgene_enet_rd_diag_csr(pdata, ENET_BLOCK_MEM_RDY_ADDR, &data);
+	} while ((data != 0xffffffff) && wait--);
+
+	if (data != 0xffffffff) {
+		netdev_err(ndev, "Failed to release memory from shutdown\n");
+		return -ENODEV;
+	}
+
+	return 0;
+}
+
+void xgene_gmac_reset(struct xgene_enet_pdata *pdata)
+{
+	xgene_enet_wr_mcx_mac(pdata, MAC_CONFIG_1_ADDR, SOFT_RESET1);
+	xgene_enet_wr_mcx_mac(pdata, MAC_CONFIG_1_ADDR, 0);
+}
+
+void xgene_gmac_init(struct xgene_enet_pdata *pdata, int speed)
+{
+	u32 value, mc2;
+	u32 intf_ctl, rgmii;
+	u32 icm0, icm2;
+
+	xgene_gmac_reset(pdata);
+
+	xgene_enet_rd_mcx_csr(pdata, ICM_CONFIG0_REG_0_ADDR, &icm0);
+	xgene_enet_rd_mcx_csr(pdata, ICM_CONFIG2_REG_0_ADDR, &icm2);
+	xgene_enet_rd_mcx_mac(pdata, MAC_CONFIG_2_ADDR, &mc2);
+	xgene_enet_rd_mcx_mac(pdata, INTERFACE_CONTROL_ADDR, &intf_ctl);
+	xgene_enet_rd_csr(pdata, RGMII_REG_0_ADDR, &rgmii);
+
+	switch (speed) {
+	case SPEED_10:
+		ENET_INTERFACE_MODE2_SET(&mc2, 1);
+		CFG_MACMODE_SET(&icm0, 0);
+		CFG_WAITASYNCRD_SET(&icm2, 500);
+		rgmii &= ~CFG_SPEED_1250;
+		break;
+	case SPEED_100:
+		ENET_INTERFACE_MODE2_SET(&mc2, 1);
+		intf_ctl |= ENET_LHD_MODE;
+		CFG_MACMODE_SET(&icm0, 1);
+		CFG_WAITASYNCRD_SET(&icm2, 80);
+		rgmii &= ~CFG_SPEED_1250;
+		break;
+	default:
+		ENET_INTERFACE_MODE2_SET(&mc2, 2);
+		intf_ctl |= ENET_GHD_MODE;
+		CFG_TXCLK_MUXSEL0_SET(&rgmii, 4);
+		xgene_enet_rd_csr(pdata, DEBUG_REG_ADDR, &value);
+		value |= CFG_BYPASS_UNISEC_TX | CFG_BYPASS_UNISEC_RX;
+		xgene_enet_wr_csr(pdata, DEBUG_REG_ADDR, value);
+		break;
+	}
+
+	mc2 |= FULL_DUPLEX2;
+	xgene_enet_wr_mcx_mac(pdata, MAC_CONFIG_2_ADDR, mc2);
+	xgene_enet_wr_mcx_mac(pdata, INTERFACE_CONTROL_ADDR, intf_ctl);
+
+	xgene_gmac_set_mac_addr(pdata);
+
+	/* Adjust MDC clock frequency */
+	xgene_enet_rd_mcx_mac(pdata, MII_MGMT_CONFIG_ADDR, &value);
+	MGMT_CLOCK_SEL_SET(&value, 7);
+	xgene_enet_wr_mcx_mac(pdata, MII_MGMT_CONFIG_ADDR, value);
+
+	/* Enable drop if bufpool not available */
+	xgene_enet_rd_csr(pdata, RSIF_CONFIG_REG_ADDR, &value);
+	value |= CFG_RSIF_FPBUFF_TIMEOUT_EN;
+	xgene_enet_wr_csr(pdata, RSIF_CONFIG_REG_ADDR, value);
+
+	/* Rtype should be copied from FP */
+	xgene_enet_wr_csr(pdata, RSIF_RAM_DBG_REG0_ADDR, 0);
+	xgene_enet_wr_csr(pdata, RGMII_REG_0_ADDR, rgmii);
+
+	/* Rx-Tx traffic resume */
+	xgene_enet_wr_csr(pdata, CFG_LINK_AGGR_RESUME_0_ADDR, TX_PORT0);
+
+	xgene_enet_wr_mcx_csr(pdata, ICM_CONFIG0_REG_0_ADDR, icm0);
+	xgene_enet_wr_mcx_csr(pdata, ICM_CONFIG2_REG_0_ADDR, icm2);
+
+	xgene_enet_rd_mcx_csr(pdata, RX_DV_GATE_REG_0_ADDR, &value);
+	value &= ~TX_DV_GATE_EN0;
+	value &= ~RX_DV_GATE_EN0;
+	value |= RESUME_RX0;
+	xgene_enet_wr_mcx_csr(pdata, RX_DV_GATE_REG_0_ADDR, value);
+
+	xgene_enet_wr_csr(pdata, CFG_BYPASS_ADDR, RESUME_TX);
+}
+
+static void xgene_enet_config_ring_if_assoc(struct xgene_enet_pdata *pdata)
+{
+	u32 val = 0xffffffff;
+
+	xgene_enet_wr_ring_if(pdata, ENET_CFGSSQMIWQASSOC_ADDR, val);
+	xgene_enet_wr_ring_if(pdata, ENET_CFGSSQMIFPQASSOC_ADDR, val);
+	xgene_enet_wr_ring_if(pdata, ENET_CFGSSQMIQMLITEWQASSOC_ADDR, val);
+	xgene_enet_wr_ring_if(pdata, ENET_CFGSSQMIQMLITEFPQASSOC_ADDR, val);
+}
+
+void xgene_enet_cle_bypass(struct xgene_enet_pdata *pdata,
+			   u32 dst_ring_num, u16 bufpool_id)
+{
+	u32 cb;
+	u32 fpsel;
+
+	fpsel = xgene_enet_ring_bufnum(bufpool_id) - 0x20;
+
+	xgene_enet_rd_csr(pdata, CLE_BYPASS_REG0_0_ADDR, &cb);
+	cb |= CFG_CLE_BYPASS_EN0;
+	CFG_CLE_IP_PROTOCOL0_SET(&cb, 3);
+	xgene_enet_wr_csr(pdata, CLE_BYPASS_REG0_0_ADDR, cb);
+
+	xgene_enet_rd_csr(pdata, CLE_BYPASS_REG1_0_ADDR, &cb);
+	CFG_CLE_DSTQID0_SET(&cb, dst_ring_num);
+	CFG_CLE_FPSEL0_SET(&cb, fpsel);
+	xgene_enet_wr_csr(pdata, CLE_BYPASS_REG1_0_ADDR, cb);
+}
+
+void xgene_gmac_rx_enable(struct xgene_enet_pdata *pdata)
+{
+	u32 data;
+
+	xgene_enet_rd_mcx_mac(pdata, MAC_CONFIG_1_ADDR, &data);
+	xgene_enet_wr_mcx_mac(pdata, MAC_CONFIG_1_ADDR, data | RX_EN);
+}
+
+void xgene_gmac_tx_enable(struct xgene_enet_pdata *pdata)
+{
+	u32 data;
+
+	xgene_enet_rd_mcx_mac(pdata, MAC_CONFIG_1_ADDR, &data);
+	xgene_enet_wr_mcx_mac(pdata, MAC_CONFIG_1_ADDR, data | TX_EN);
+}
+
+void xgene_gmac_rx_disable(struct xgene_enet_pdata *pdata)
+{
+	u32 data;
+
+	xgene_enet_rd_mcx_mac(pdata, MAC_CONFIG_1_ADDR, &data);
+	xgene_enet_wr_mcx_mac(pdata, MAC_CONFIG_1_ADDR, data & ~RX_EN);
+}
+
+void xgene_gmac_tx_disable(struct xgene_enet_pdata *pdata)
+{
+	u32 data;
+
+	xgene_enet_rd_mcx_mac(pdata, MAC_CONFIG_1_ADDR, &data);
+	xgene_enet_wr_mcx_mac(pdata, MAC_CONFIG_1_ADDR, data & ~TX_EN);
+}
+
+void xgene_enet_reset(struct xgene_enet_pdata *pdata)
+{
+	u32 val;
+
+	clk_prepare_enable(pdata->clk);
+	clk_disable_unprepare(pdata->clk);
+	clk_prepare_enable(pdata->clk);
+	xgene_enet_ecc_init(pdata);
+	xgene_enet_config_ring_if_assoc(pdata);
+
+	/* Enable auto-incr for scanning */
+	xgene_enet_rd_mcx_mac(pdata, MII_MGMT_CONFIG_ADDR, &val);
+	val |= SCAN_AUTO_INCR;
+	MGMT_CLOCK_SEL_SET(&val, 1);
+	xgene_enet_wr_mcx_mac(pdata, MII_MGMT_CONFIG_ADDR, val);
+}
+
+void xgene_gport_shutdown(struct xgene_enet_pdata *pdata)
+{
+	clk_disable_unprepare(pdata->clk);
+}
+
+static int xgene_enet_mdio_read(struct mii_bus *bus, int mii_id, int regnum)
+{
+	struct xgene_enet_pdata *pdata = bus->priv;
+	u32 val;
+
+	val = xgene_mii_phy_read(pdata, mii_id, regnum);
+	netdev_dbg(pdata->ndev, "mdio_rd: bus=%d reg=%d val=%x\n",
+		   mii_id, regnum, val);
+
+	return val;
+}
+
+static int xgene_enet_mdio_write(struct mii_bus *bus, int mii_id, int regnum,
+				 u16 val)
+{
+	struct xgene_enet_pdata *pdata = bus->priv;
+
+	netdev_dbg(pdata->ndev, "mdio_wr: bus=%d reg=%d val=%x\n",
+		   mii_id, regnum, val);
+	return xgene_mii_phy_write(pdata, mii_id, regnum, val);
+}
+
+static void xgene_enet_adjust_link(struct net_device *ndev)
+{
+	struct xgene_enet_pdata *pdata = netdev_priv(ndev);
+	struct phy_device *phydev = pdata->phy_dev;
+
+	if (phydev->link) {
+		if (pdata->phy_speed != phydev->speed) {
+			xgene_gmac_init(pdata, phydev->speed);
+			xgene_gmac_rx_enable(pdata);
+			xgene_gmac_tx_enable(pdata);
+			pdata->phy_speed = phydev->speed;
+			phy_print_status(phydev);
+		}
+	} else {
+		xgene_gmac_rx_disable(pdata);
+		xgene_gmac_tx_disable(pdata);
+		pdata->phy_speed = SPEED_UNKNOWN;
+		phy_print_status(phydev);
+	}
+}
+
+static int xgene_enet_phy_connect(struct net_device *ndev)
+{
+	struct xgene_enet_pdata *pdata = netdev_priv(ndev);
+	struct device_node *phy_np;
+	struct phy_device *phy_dev;
+	struct device *dev = &pdata->pdev->dev;
+
+	phy_np = of_parse_phandle(dev->of_node, "phy-handle", 0);
+	if (!phy_np) {
+		netdev_dbg(ndev, "No phy-handle found\n");
+		return -ENODEV;
+	}
+
+	phy_dev = of_phy_connect(ndev, phy_np, &xgene_enet_adjust_link,
+				 0, pdata->phy_mode);
+	if (!phy_dev) {
+		netdev_err(ndev, "Could not connect to PHY\n");
+		return  -ENODEV;
+	}
+
+	pdata->phy_speed = SPEED_UNKNOWN;
+	phy_dev->supported &= ~SUPPORTED_10baseT_Half &
+			      ~SUPPORTED_100baseT_Half &
+			      ~SUPPORTED_1000baseT_Half;
+	phy_dev->advertising = phy_dev->supported;
+	pdata->phy_dev = phy_dev;
+
+	return 0;
+}
+
+int xgene_enet_mdio_config(struct xgene_enet_pdata *pdata)
+{
+	struct net_device *ndev = pdata->ndev;
+	struct device *dev = &pdata->pdev->dev;
+	struct device_node *child_np;
+	struct device_node *mdio_np = NULL;
+	struct mii_bus *mdio_bus;
+	int ret;
+
+	for_each_child_of_node(dev->of_node, child_np) {
+		if (of_device_is_compatible(child_np, "apm,xgene-mdio")) {
+			mdio_np = child_np;
+			break;
+		}
+	}
+
+	if (!mdio_np) {
+		netdev_dbg(ndev, "No mdio node in the dts\n");
+		return -ENXIO;
+	}
+
+	mdio_bus = mdiobus_alloc();
+	if (!mdio_bus)
+		return -ENOMEM;
+
+	mdio_bus->name = "APM X-Gene MDIO bus";
+	mdio_bus->read = xgene_enet_mdio_read;
+	mdio_bus->write = xgene_enet_mdio_write;
+	snprintf(mdio_bus->id, MII_BUS_ID_SIZE, "%s-%s", "xgene-mii",
+		 ndev->name);
+
+	mdio_bus->priv = pdata;
+	mdio_bus->parent = &ndev->dev;
+
+	ret = of_mdiobus_register(mdio_bus, mdio_np);
+	if (ret) {
+		netdev_err(ndev, "Failed to register MDIO bus\n");
+		mdiobus_free(mdio_bus);
+		return ret;
+	}
+	pdata->mdio_bus = mdio_bus;
+
+	ret = xgene_enet_phy_connect(ndev);
+	if (ret)
+		xgene_enet_mdio_remove(pdata);
+
+	return ret;
+}
+
+void xgene_enet_mdio_remove(struct xgene_enet_pdata *pdata)
+{
+	mdiobus_unregister(pdata->mdio_bus);
+	mdiobus_free(pdata->mdio_bus);
+	pdata->mdio_bus = NULL;
+}
diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_hw.h b/drivers/net/ethernet/apm/xgene/xgene_enet_hw.h
new file mode 100644
index 000000000000..371e7a5b2507
--- /dev/null
+++ b/drivers/net/ethernet/apm/xgene/xgene_enet_hw.h
@@ -0,0 +1,337 @@
+/* Applied Micro X-Gene SoC Ethernet Driver
+ *
+ * Copyright (c) 2014, Applied Micro Circuits Corporation
+ * Authors: Iyappan Subramanian <isubramanian@apm.com>
+ *	    Ravi Patel <rapatel@apm.com>
+ *	    Keyur Chudgar <kchudgar@apm.com>
+ *
+ * This program is free software; you can redistribute  it and/or modify it
+ * under  the terms of  the GNU General  Public License as published by the
+ * Free Software Foundation;  either version 2 of the  License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef __XGENE_ENET_HW_H__
+#define __XGENE_ENET_HW_H__
+
+#include "xgene_enet_main.h"
+
+struct xgene_enet_pdata;
+struct xgene_enet_stats;
+
+/* clears and then set bits */
+static inline void xgene_set_bits(u32 *dst, u32 val, u32 start, u32 len)
+{
+	u32 end = start + len - 1;
+	u32 mask = GENMASK(end, start);
+
+	*dst &= ~mask;
+	*dst |= (val << start) & mask;
+}
+
+static inline u32 xgene_get_bits(u32 val, u32 start, u32 end)
+{
+	return (val & GENMASK(end, start)) >> start;
+}
+
+#define CSR_RING_ID		0x0008
+#define OVERWRITE		BIT(31)
+#define IS_BUFFER_POOL		BIT(20)
+#define PREFETCH_BUF_EN		BIT(21)
+#define CSR_RING_ID_BUF		0x000c
+#define CSR_RING_NE_INT_MODE	0x017c
+#define CSR_RING_CONFIG		0x006c
+#define CSR_RING_WR_BASE	0x0070
+#define NUM_RING_CONFIG		5
+#define BUFPOOL_MODE		3
+#define RM3			3
+#define INC_DEC_CMD_ADDR	0x002c
+#define UDP_HDR_SIZE		2
+#define BUF_LEN_CODE_2K		0x5000
+
+#define CREATE_MASK(pos, len)		GENMASK((pos)+(len)-1, (pos))
+#define CREATE_MASK_ULL(pos, len)	GENMASK_ULL((pos)+(len)-1, (pos))
+
+/* Empty slot soft signature */
+#define EMPTY_SLOT_INDEX	1
+#define EMPTY_SLOT		~0ULL
+
+#define WORK_DESC_SIZE		32
+#define BUFPOOL_DESC_SIZE	16
+
+#define RING_OWNER_MASK		GENMASK(9, 6)
+#define RING_BUFNUM_MASK	GENMASK(5, 0)
+
+#define SELTHRSH_POS		3
+#define SELTHRSH_LEN		3
+#define RINGADDRL_POS		5
+#define RINGADDRL_LEN		27
+#define RINGADDRH_POS		0
+#define RINGADDRH_LEN		6
+#define RINGSIZE_POS		23
+#define RINGSIZE_LEN		3
+#define RINGTYPE_POS		19
+#define RINGTYPE_LEN		2
+#define RINGMODE_POS		20
+#define RINGMODE_LEN		3
+#define RECOMTIMEOUTL_POS	28
+#define RECOMTIMEOUTL_LEN	3
+#define RECOMTIMEOUTH_POS	0
+#define RECOMTIMEOUTH_LEN	2
+#define NUMMSGSINQ_POS		1
+#define NUMMSGSINQ_LEN		16
+#define ACCEPTLERR		BIT(19)
+#define QCOHERENT		BIT(4)
+#define RECOMBBUF		BIT(27)
+
+#define BLOCK_ETH_CSR_OFFSET		0x2000
+#define BLOCK_ETH_RING_IF_OFFSET	0x9000
+#define BLOCK_ETH_CLKRST_CSR_OFFSET	0xC000
+#define BLOCK_ETH_DIAG_CSR_OFFSET	0xD000
+
+#define BLOCK_ETH_MAC_OFFSET		0x0000
+#define BLOCK_ETH_STATS_OFFSET		0x0014
+#define BLOCK_ETH_MAC_CSR_OFFSET	0x2800
+
+#define MAC_ADDR_REG_OFFSET		0x00
+#define MAC_COMMAND_REG_OFFSET		0x04
+#define MAC_WRITE_REG_OFFSET		0x08
+#define MAC_READ_REG_OFFSET		0x0c
+#define MAC_COMMAND_DONE_REG_OFFSET	0x10
+
+#define STAT_ADDR_REG_OFFSET		0x00
+#define STAT_COMMAND_REG_OFFSET		0x04
+#define STAT_WRITE_REG_OFFSET		0x08
+#define STAT_READ_REG_OFFSET		0x0c
+#define STAT_COMMAND_DONE_REG_OFFSET	0x10
+
+#define MII_MGMT_CONFIG_ADDR		0x20
+#define MII_MGMT_COMMAND_ADDR		0x24
+#define MII_MGMT_ADDRESS_ADDR		0x28
+#define MII_MGMT_CONTROL_ADDR		0x2c
+#define MII_MGMT_STATUS_ADDR		0x30
+#define MII_MGMT_INDICATORS_ADDR	0x34
+
+#define BUSY_MASK			BIT(0)
+#define READ_CYCLE_MASK			BIT(0)
+#define PHY_CONTROL_SET(dst, val)	xgene_set_bits(dst, val, 0, 16)
+
+#define ENET_SPARE_CFG_REG_ADDR		0x0750
+#define RSIF_CONFIG_REG_ADDR		0x0010
+#define RSIF_RAM_DBG_REG0_ADDR		0x0048
+#define RGMII_REG_0_ADDR		0x07e0
+#define CFG_LINK_AGGR_RESUME_0_ADDR	0x07c8
+#define DEBUG_REG_ADDR			0x0700
+#define CFG_BYPASS_ADDR			0x0294
+#define CLE_BYPASS_REG0_0_ADDR		0x0490
+#define CLE_BYPASS_REG1_0_ADDR		0x0494
+#define CFG_RSIF_FPBUFF_TIMEOUT_EN	BIT(31)
+#define RESUME_TX			BIT(0)
+#define CFG_SPEED_1250			BIT(24)
+#define TX_PORT0			BIT(0)
+#define CFG_BYPASS_UNISEC_TX		BIT(2)
+#define CFG_BYPASS_UNISEC_RX		BIT(1)
+#define CFG_CLE_BYPASS_EN0		BIT(31)
+#define CFG_TXCLK_MUXSEL0_SET(dst, val)	xgene_set_bits(dst, val, 29, 3)
+
+#define CFG_CLE_IP_PROTOCOL0_SET(dst, val)	xgene_set_bits(dst, val, 16, 2)
+#define CFG_CLE_DSTQID0_SET(dst, val)		xgene_set_bits(dst, val, 0, 12)
+#define CFG_CLE_FPSEL0_SET(dst, val)		xgene_set_bits(dst, val, 16, 4)
+#define CFG_MACMODE_SET(dst, val)		xgene_set_bits(dst, val, 18, 2)
+#define CFG_WAITASYNCRD_SET(dst, val)		xgene_set_bits(dst, val, 0, 16)
+#define ICM_CONFIG0_REG_0_ADDR		0x0400
+#define ICM_CONFIG2_REG_0_ADDR		0x0410
+#define RX_DV_GATE_REG_0_ADDR		0x05fc
+#define TX_DV_GATE_EN0			BIT(2)
+#define RX_DV_GATE_EN0			BIT(1)
+#define RESUME_RX0			BIT(0)
+#define ENET_CFGSSQMIWQASSOC_ADDR		0xe0
+#define ENET_CFGSSQMIFPQASSOC_ADDR		0xdc
+#define ENET_CFGSSQMIQMLITEFPQASSOC_ADDR	0xf0
+#define ENET_CFGSSQMIQMLITEWQASSOC_ADDR		0xf4
+#define ENET_CFG_MEM_RAM_SHUTDOWN_ADDR		0x70
+#define ENET_BLOCK_MEM_RDY_ADDR			0x74
+#define MAC_CONFIG_1_ADDR			0x00
+#define MAC_CONFIG_2_ADDR			0x04
+#define MAX_FRAME_LEN_ADDR			0x10
+#define INTERFACE_CONTROL_ADDR			0x38
+#define STATION_ADDR0_ADDR			0x40
+#define STATION_ADDR1_ADDR			0x44
+#define PHY_ADDR_SET(dst, val)			xgene_set_bits(dst, val, 8, 5)
+#define REG_ADDR_SET(dst, val)			xgene_set_bits(dst, val, 0, 5)
+#define ENET_INTERFACE_MODE2_SET(dst, val)	xgene_set_bits(dst, val, 8, 2)
+#define MGMT_CLOCK_SEL_SET(dst, val)		xgene_set_bits(dst, val, 0, 3)
+#define SOFT_RESET1			BIT(31)
+#define TX_EN				BIT(0)
+#define RX_EN				BIT(2)
+#define ENET_LHD_MODE			BIT(25)
+#define ENET_GHD_MODE			BIT(26)
+#define FULL_DUPLEX2			BIT(0)
+#define SCAN_AUTO_INCR			BIT(5)
+#define TBYT_ADDR			0x38
+#define TPKT_ADDR			0x39
+#define TDRP_ADDR			0x45
+#define TFCS_ADDR			0x47
+#define TUND_ADDR			0x4a
+
+#define TSO_IPPROTO_TCP			1
+#define	FULL_DUPLEX			2
+
+#define USERINFO_POS			0
+#define USERINFO_LEN			32
+#define FPQNUM_POS			32
+#define FPQNUM_LEN			12
+#define LERR_POS			60
+#define LERR_LEN			3
+#define STASH_POS			52
+#define STASH_LEN			2
+#define BUFDATALEN_POS			48
+#define BUFDATALEN_LEN			12
+#define DATAADDR_POS			0
+#define DATAADDR_LEN			42
+#define COHERENT_POS			63
+#define HENQNUM_POS			48
+#define HENQNUM_LEN			12
+#define TYPESEL_POS			44
+#define TYPESEL_LEN			4
+#define ETHHDR_POS			12
+#define ETHHDR_LEN			8
+#define IC_POS				35	/* Insert CRC */
+#define TCPHDR_POS			0
+#define TCPHDR_LEN			6
+#define IPHDR_POS			6
+#define IPHDR_LEN			6
+#define EC_POS				22	/* Enable checksum */
+#define EC_LEN				1
+#define IS_POS				24	/* IP protocol select */
+#define IS_LEN				1
+#define TYPE_ETH_WORK_MESSAGE_POS	44
+
+struct xgene_enet_raw_desc {
+	__le64 m0;
+	__le64 m1;
+	__le64 m2;
+	__le64 m3;
+};
+
+struct xgene_enet_raw_desc16 {
+	__le64 m0;
+	__le64 m1;
+};
+
+static inline void xgene_enet_mark_desc_slot_empty(void *desc_slot_ptr)
+{
+	__le64 *desc_slot = desc_slot_ptr;
+
+	desc_slot[EMPTY_SLOT_INDEX] = cpu_to_le64(EMPTY_SLOT);
+}
+
+static inline bool xgene_enet_is_desc_slot_empty(void *desc_slot_ptr)
+{
+	__le64 *desc_slot = desc_slot_ptr;
+
+	return (desc_slot[EMPTY_SLOT_INDEX] == cpu_to_le64(EMPTY_SLOT));
+}
+
+enum xgene_enet_ring_cfgsize {
+	RING_CFGSIZE_512B,
+	RING_CFGSIZE_2KB,
+	RING_CFGSIZE_16KB,
+	RING_CFGSIZE_64KB,
+	RING_CFGSIZE_512KB,
+	RING_CFGSIZE_INVALID
+};
+
+enum xgene_enet_ring_type {
+	RING_DISABLED,
+	RING_REGULAR,
+	RING_BUFPOOL
+};
+
+enum xgene_ring_owner {
+	RING_OWNER_ETH0,
+	RING_OWNER_CPU = 15,
+	RING_OWNER_INVALID
+};
+
+enum xgene_enet_ring_bufnum {
+	RING_BUFNUM_REGULAR = 0x0,
+	RING_BUFNUM_BUFPOOL = 0x20,
+	RING_BUFNUM_INVALID
+};
+
+enum xgene_enet_cmd {
+	XGENE_ENET_WR_CMD = BIT(31),
+	XGENE_ENET_RD_CMD = BIT(30)
+};
+
+enum xgene_enet_err_code {
+	HBF_READ_DATA = 3,
+	HBF_LL_READ = 4,
+	BAD_WORK_MSG = 6,
+	BUFPOOL_TIMEOUT = 15,
+	INGRESS_CRC = 16,
+	INGRESS_CHECKSUM = 17,
+	INGRESS_TRUNC_FRAME = 18,
+	INGRESS_PKT_LEN = 19,
+	INGRESS_PKT_UNDER = 20,
+	INGRESS_FIFO_OVERRUN = 21,
+	INGRESS_CHECKSUM_COMPUTE = 26,
+	ERR_CODE_INVALID
+};
+
+static inline enum xgene_ring_owner xgene_enet_ring_owner(u16 id)
+{
+	return (id & RING_OWNER_MASK) >> 6;
+}
+
+static inline u8 xgene_enet_ring_bufnum(u16 id)
+{
+	return id & RING_BUFNUM_MASK;
+}
+
+static inline bool xgene_enet_is_bufpool(u16 id)
+{
+	return ((id & RING_BUFNUM_MASK) >= 0x20) ? true : false;
+}
+
+static inline u16 xgene_enet_get_numslots(u16 id, u32 size)
+{
+	bool is_bufpool = xgene_enet_is_bufpool(id);
+
+	return (is_bufpool) ? size / BUFPOOL_DESC_SIZE :
+		      size / WORK_DESC_SIZE;
+}
+
+struct xgene_enet_desc_ring *xgene_enet_setup_ring(
+		struct xgene_enet_desc_ring *ring);
+void xgene_enet_clear_ring(struct xgene_enet_desc_ring *ring);
+void xgene_enet_parse_error(struct xgene_enet_desc_ring *ring,
+			    struct xgene_enet_pdata *pdata,
+			    enum xgene_enet_err_code status);
+
+void xgene_enet_reset(struct xgene_enet_pdata *priv);
+void xgene_gmac_reset(struct xgene_enet_pdata *priv);
+void xgene_gmac_init(struct xgene_enet_pdata *priv, int speed);
+void xgene_gmac_tx_enable(struct xgene_enet_pdata *priv);
+void xgene_gmac_rx_enable(struct xgene_enet_pdata *priv);
+void xgene_gmac_tx_disable(struct xgene_enet_pdata *priv);
+void xgene_gmac_rx_disable(struct xgene_enet_pdata *priv);
+void xgene_gmac_set_mac_addr(struct xgene_enet_pdata *pdata);
+void xgene_enet_cle_bypass(struct xgene_enet_pdata *pdata,
+			   u32 dst_ring_num, u16 bufpool_id);
+void xgene_gport_shutdown(struct xgene_enet_pdata *priv);
+void xgene_gmac_get_tx_stats(struct xgene_enet_pdata *pdata);
+
+int xgene_enet_mdio_config(struct xgene_enet_pdata *pdata);
+void xgene_enet_mdio_remove(struct xgene_enet_pdata *pdata);
+
+#endif /* __XGENE_ENET_HW_H__ */
diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_main.c b/drivers/net/ethernet/apm/xgene/xgene_enet_main.c
new file mode 100644
index 000000000000..af7c40ac1455
--- /dev/null
+++ b/drivers/net/ethernet/apm/xgene/xgene_enet_main.c
@@ -0,0 +1,951 @@
+/* Applied Micro X-Gene SoC Ethernet Driver
+ *
+ * Copyright (c) 2014, Applied Micro Circuits Corporation
+ * Authors: Iyappan Subramanian <isubramanian@apm.com>
+ *	    Ravi Patel <rapatel@apm.com>
+ *	    Keyur Chudgar <kchudgar@apm.com>
+ *
+ * This program is free software; you can redistribute  it and/or modify it
+ * under  the terms of  the GNU General  Public License as published by the
+ * Free Software Foundation;  either version 2 of the  License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "xgene_enet_main.h"
+#include "xgene_enet_hw.h"
+
+static void xgene_enet_init_bufpool(struct xgene_enet_desc_ring *buf_pool)
+{
+	struct xgene_enet_raw_desc16 *raw_desc;
+	int i;
+
+	for (i = 0; i < buf_pool->slots; i++) {
+		raw_desc = &buf_pool->raw_desc16[i];
+
+		/* Hardware expects descriptor in little endian format */
+		raw_desc->m0 = cpu_to_le64(i |
+				SET_VAL(FPQNUM, buf_pool->dst_ring_num) |
+				SET_VAL(STASH, 3));
+	}
+}
+
+static int xgene_enet_refill_bufpool(struct xgene_enet_desc_ring *buf_pool,
+				     u32 nbuf)
+{
+	struct sk_buff *skb;
+	struct xgene_enet_raw_desc16 *raw_desc;
+	struct net_device *ndev;
+	struct device *dev;
+	dma_addr_t dma_addr;
+	u32 tail = buf_pool->tail;
+	u32 slots = buf_pool->slots - 1;
+	u16 bufdatalen, len;
+	int i;
+
+	ndev = buf_pool->ndev;
+	dev = ndev_to_dev(buf_pool->ndev);
+	bufdatalen = BUF_LEN_CODE_2K | (SKB_BUFFER_SIZE & GENMASK(11, 0));
+	len = XGENE_ENET_MAX_MTU;
+
+	for (i = 0; i < nbuf; i++) {
+		raw_desc = &buf_pool->raw_desc16[tail];
+
+		skb = netdev_alloc_skb_ip_align(ndev, len);
+		if (unlikely(!skb))
+			return -ENOMEM;
+		buf_pool->rx_skb[tail] = skb;
+
+		dma_addr = dma_map_single(dev, skb->data, len, DMA_FROM_DEVICE);
+		if (dma_mapping_error(dev, dma_addr)) {
+			netdev_err(ndev, "DMA mapping error\n");
+			dev_kfree_skb_any(skb);
+			return -EINVAL;
+		}
+
+		raw_desc->m1 = cpu_to_le64(SET_VAL(DATAADDR, dma_addr) |
+					   SET_VAL(BUFDATALEN, bufdatalen) |
+					   SET_BIT(COHERENT));
+		tail = (tail + 1) & slots;
+	}
+
+	iowrite32(nbuf, buf_pool->cmd);
+	buf_pool->tail = tail;
+
+	return 0;
+}
+
+static u16 xgene_enet_dst_ring_num(struct xgene_enet_desc_ring *ring)
+{
+	struct xgene_enet_pdata *pdata = netdev_priv(ring->ndev);
+
+	return ((u16)pdata->rm << 10) | ring->num;
+}
+
+static u8 xgene_enet_hdr_len(const void *data)
+{
+	const struct ethhdr *eth = data;
+
+	return (eth->h_proto == htons(ETH_P_8021Q)) ? VLAN_ETH_HLEN : ETH_HLEN;
+}
+
+static u32 xgene_enet_ring_len(struct xgene_enet_desc_ring *ring)
+{
+	u32 __iomem *cmd_base = ring->cmd_base;
+	u32 ring_state, num_msgs;
+
+	ring_state = ioread32(&cmd_base[1]);
+	num_msgs = ring_state & CREATE_MASK(NUMMSGSINQ_POS, NUMMSGSINQ_LEN);
+
+	return num_msgs >> NUMMSGSINQ_POS;
+}
+
+static void xgene_enet_delete_bufpool(struct xgene_enet_desc_ring *buf_pool)
+{
+	struct xgene_enet_raw_desc16 *raw_desc;
+	u32 slots = buf_pool->slots - 1;
+	u32 tail = buf_pool->tail;
+	u32 userinfo;
+	int i, len;
+
+	len = xgene_enet_ring_len(buf_pool);
+	for (i = 0; i < len; i++) {
+		tail = (tail - 1) & slots;
+		raw_desc = &buf_pool->raw_desc16[tail];
+
+		/* Hardware stores descriptor in little endian format */
+		userinfo = GET_VAL(USERINFO, le64_to_cpu(raw_desc->m0));
+		dev_kfree_skb_any(buf_pool->rx_skb[userinfo]);
+	}
+
+	iowrite32(-len, buf_pool->cmd);
+	buf_pool->tail = tail;
+}
+
+static irqreturn_t xgene_enet_rx_irq(const int irq, void *data)
+{
+	struct xgene_enet_desc_ring *rx_ring = data;
+
+	if (napi_schedule_prep(&rx_ring->napi)) {
+		disable_irq_nosync(irq);
+		__napi_schedule(&rx_ring->napi);
+	}
+
+	return IRQ_HANDLED;
+}
+
+static int xgene_enet_tx_completion(struct xgene_enet_desc_ring *cp_ring,
+				    struct xgene_enet_raw_desc *raw_desc)
+{
+	struct sk_buff *skb;
+	struct device *dev;
+	u16 skb_index;
+	u8 status;
+	int ret = 0;
+
+	skb_index = GET_VAL(USERINFO, le64_to_cpu(raw_desc->m0));
+	skb = cp_ring->cp_skb[skb_index];
+
+	dev = ndev_to_dev(cp_ring->ndev);
+	dma_unmap_single(dev, GET_VAL(DATAADDR, le64_to_cpu(raw_desc->m1)),
+			 GET_VAL(BUFDATALEN, le64_to_cpu(raw_desc->m1)),
+			 DMA_TO_DEVICE);
+
+	/* Checking for error */
+	status = GET_VAL(LERR, le64_to_cpu(raw_desc->m0));
+	if (unlikely(status > 2)) {
+		xgene_enet_parse_error(cp_ring, netdev_priv(cp_ring->ndev),
+				       status);
+		ret = -EIO;
+	}
+
+	if (likely(skb)) {
+		dev_kfree_skb_any(skb);
+	} else {
+		netdev_err(cp_ring->ndev, "completion skb is NULL\n");
+		ret = -EIO;
+	}
+
+	return ret;
+}
+
+static u64 xgene_enet_work_msg(struct sk_buff *skb)
+{
+	struct iphdr *iph;
+	u8 l3hlen, l4hlen = 0;
+	u8 csum_enable = 0;
+	u8 proto = 0;
+	u8 ethhdr;
+	u64 hopinfo;
+
+	if (unlikely(skb->protocol != htons(ETH_P_IP)) &&
+	    unlikely(skb->protocol != htons(ETH_P_8021Q)))
+		goto out;
+
+	if (unlikely(!(skb->dev->features & NETIF_F_IP_CSUM)))
+		goto out;
+
+	iph = ip_hdr(skb);
+	if (unlikely(ip_is_fragment(iph)))
+		goto out;
+
+	if (likely(iph->protocol == IPPROTO_TCP)) {
+		l4hlen = tcp_hdrlen(skb) >> 2;
+		csum_enable = 1;
+		proto = TSO_IPPROTO_TCP;
+	} else if (iph->protocol == IPPROTO_UDP) {
+		l4hlen = UDP_HDR_SIZE;
+		csum_enable = 1;
+	}
+out:
+	l3hlen = ip_hdrlen(skb) >> 2;
+	ethhdr = xgene_enet_hdr_len(skb->data);
+	hopinfo = SET_VAL(TCPHDR, l4hlen) |
+		  SET_VAL(IPHDR, l3hlen) |
+		  SET_VAL(ETHHDR, ethhdr) |
+		  SET_VAL(EC, csum_enable) |
+		  SET_VAL(IS, proto) |
+		  SET_BIT(IC) |
+		  SET_BIT(TYPE_ETH_WORK_MESSAGE);
+
+	return hopinfo;
+}
+
+static int xgene_enet_setup_tx_desc(struct xgene_enet_desc_ring *tx_ring,
+				    struct sk_buff *skb)
+{
+	struct device *dev = ndev_to_dev(tx_ring->ndev);
+	struct xgene_enet_raw_desc *raw_desc;
+	dma_addr_t dma_addr;
+	u16 tail = tx_ring->tail;
+	u64 hopinfo;
+
+	raw_desc = &tx_ring->raw_desc[tail];
+	memset(raw_desc, 0, sizeof(struct xgene_enet_raw_desc));
+
+	dma_addr = dma_map_single(dev, skb->data, skb->len, DMA_TO_DEVICE);
+	if (dma_mapping_error(dev, dma_addr)) {
+		netdev_err(tx_ring->ndev, "DMA mapping error\n");
+		return -EINVAL;
+	}
+
+	/* Hardware expects descriptor in little endian format */
+	raw_desc->m0 = cpu_to_le64(tail);
+	raw_desc->m1 = cpu_to_le64(SET_VAL(DATAADDR, dma_addr) |
+				   SET_VAL(BUFDATALEN, skb->len) |
+				   SET_BIT(COHERENT));
+	hopinfo = xgene_enet_work_msg(skb);
+	raw_desc->m3 = cpu_to_le64(SET_VAL(HENQNUM, tx_ring->dst_ring_num) |
+				   hopinfo);
+	tx_ring->cp_ring->cp_skb[tail] = skb;
+
+	return 0;
+}
+
+static netdev_tx_t xgene_enet_start_xmit(struct sk_buff *skb,
+					 struct net_device *ndev)
+{
+	struct xgene_enet_pdata *pdata = netdev_priv(ndev);
+	struct xgene_enet_desc_ring *tx_ring = pdata->tx_ring;
+	struct xgene_enet_desc_ring *cp_ring = tx_ring->cp_ring;
+	u32 tx_level, cq_level;
+
+	tx_level = xgene_enet_ring_len(tx_ring);
+	cq_level = xgene_enet_ring_len(cp_ring);
+	if (unlikely(tx_level > pdata->tx_qcnt_hi ||
+		     cq_level > pdata->cp_qcnt_hi)) {
+		netif_stop_queue(ndev);
+		return NETDEV_TX_BUSY;
+	}
+
+	if (xgene_enet_setup_tx_desc(tx_ring, skb)) {
+		dev_kfree_skb_any(skb);
+		return NETDEV_TX_OK;
+	}
+
+	iowrite32(1, tx_ring->cmd);
+	skb_tx_timestamp(skb);
+	tx_ring->tail = (tx_ring->tail + 1) & (tx_ring->slots - 1);
+
+	pdata->stats.tx_packets++;
+	pdata->stats.tx_bytes += skb->len;
+
+	return NETDEV_TX_OK;
+}
+
+static void xgene_enet_skip_csum(struct sk_buff *skb)
+{
+	struct iphdr *iph = ip_hdr(skb);
+
+	if (!ip_is_fragment(iph) ||
+	    (iph->protocol != IPPROTO_TCP && iph->protocol != IPPROTO_UDP)) {
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
+	}
+}
+
+static int xgene_enet_rx_frame(struct xgene_enet_desc_ring *rx_ring,
+			       struct xgene_enet_raw_desc *raw_desc)
+{
+	struct net_device *ndev;
+	struct xgene_enet_pdata *pdata;
+	struct device *dev;
+	struct xgene_enet_desc_ring *buf_pool;
+	u32 datalen, skb_index;
+	struct sk_buff *skb;
+	u8 status;
+	int ret = 0;
+
+	ndev = rx_ring->ndev;
+	pdata = netdev_priv(ndev);
+	dev = ndev_to_dev(rx_ring->ndev);
+	buf_pool = rx_ring->buf_pool;
+
+	dma_unmap_single(dev, GET_VAL(DATAADDR, le64_to_cpu(raw_desc->m1)),
+			 XGENE_ENET_MAX_MTU, DMA_FROM_DEVICE);
+	skb_index = GET_VAL(USERINFO, le64_to_cpu(raw_desc->m0));
+	skb = buf_pool->rx_skb[skb_index];
+
+	/* checking for error */
+	status = GET_VAL(LERR, le64_to_cpu(raw_desc->m0));
+	if (unlikely(status > 2)) {
+		dev_kfree_skb_any(skb);
+		xgene_enet_parse_error(rx_ring, netdev_priv(rx_ring->ndev),
+				       status);
+		pdata->stats.rx_dropped++;
+		ret = -EIO;
+		goto out;
+	}
+
+	/* strip off CRC as HW isn't doing this */
+	datalen = GET_VAL(BUFDATALEN, le64_to_cpu(raw_desc->m1));
+	datalen -= 4;
+	prefetch(skb->data - NET_IP_ALIGN);
+	skb_put(skb, datalen);
+
+	skb_checksum_none_assert(skb);
+	skb->protocol = eth_type_trans(skb, ndev);
+	if (likely((ndev->features & NETIF_F_IP_CSUM) &&
+		   skb->protocol == htons(ETH_P_IP))) {
+		xgene_enet_skip_csum(skb);
+	}
+
+	pdata->stats.rx_packets++;
+	pdata->stats.rx_bytes += datalen;
+	napi_gro_receive(&rx_ring->napi, skb);
+out:
+	if (--rx_ring->nbufpool == 0) {
+		ret = xgene_enet_refill_bufpool(buf_pool, NUM_BUFPOOL);
+		rx_ring->nbufpool = NUM_BUFPOOL;
+	}
+
+	return ret;
+}
+
+static bool is_rx_desc(struct xgene_enet_raw_desc *raw_desc)
+{
+	return GET_VAL(FPQNUM, le64_to_cpu(raw_desc->m0)) ? true : false;
+}
+
+static int xgene_enet_process_ring(struct xgene_enet_desc_ring *ring,
+				   int budget)
+{
+	struct xgene_enet_pdata *pdata = netdev_priv(ring->ndev);
+	struct xgene_enet_raw_desc *raw_desc;
+	u16 head = ring->head;
+	u16 slots = ring->slots - 1;
+	int ret, count = 0;
+
+	do {
+		raw_desc = &ring->raw_desc[head];
+		if (unlikely(xgene_enet_is_desc_slot_empty(raw_desc)))
+			break;
+
+		if (is_rx_desc(raw_desc))
+			ret = xgene_enet_rx_frame(ring, raw_desc);
+		else
+			ret = xgene_enet_tx_completion(ring, raw_desc);
+		xgene_enet_mark_desc_slot_empty(raw_desc);
+
+		head = (head + 1) & slots;
+		count++;
+
+		if (ret)
+			break;
+	} while (--budget);
+
+	if (likely(count)) {
+		iowrite32(-count, ring->cmd);
+		ring->head = head;
+
+		if (netif_queue_stopped(ring->ndev)) {
+			if (xgene_enet_ring_len(ring) < pdata->cp_qcnt_low)
+				netif_wake_queue(ring->ndev);
+		}
+	}
+
+	return budget;
+}
+
+static int xgene_enet_napi(struct napi_struct *napi, const int budget)
+{
+	struct xgene_enet_desc_ring *ring;
+	int processed;
+
+	ring = container_of(napi, struct xgene_enet_desc_ring, napi);
+	processed = xgene_enet_process_ring(ring, budget);
+
+	if (processed != budget) {
+		napi_complete(napi);
+		enable_irq(ring->irq);
+	}
+
+	return processed;
+}
+
+static void xgene_enet_timeout(struct net_device *ndev)
+{
+	struct xgene_enet_pdata *pdata = netdev_priv(ndev);
+
+	xgene_gmac_reset(pdata);
+}
+
+static int xgene_enet_register_irq(struct net_device *ndev)
+{
+	struct xgene_enet_pdata *pdata = netdev_priv(ndev);
+	struct device *dev = ndev_to_dev(ndev);
+	int ret;
+
+	ret = devm_request_irq(dev, pdata->rx_ring->irq, xgene_enet_rx_irq,
+			       IRQF_SHARED, ndev->name, pdata->rx_ring);
+	if (ret) {
+		netdev_err(ndev, "rx%d interrupt request failed\n",
+			   pdata->rx_ring->irq);
+	}
+
+	return ret;
+}
+
+static void xgene_enet_free_irq(struct net_device *ndev)
+{
+	struct xgene_enet_pdata *pdata;
+	struct device *dev;
+
+	pdata = netdev_priv(ndev);
+	dev = ndev_to_dev(ndev);
+	devm_free_irq(dev, pdata->rx_ring->irq, pdata->rx_ring);
+}
+
+static int xgene_enet_open(struct net_device *ndev)
+{
+	struct xgene_enet_pdata *pdata = netdev_priv(ndev);
+	int ret;
+
+	xgene_gmac_tx_enable(pdata);
+	xgene_gmac_rx_enable(pdata);
+
+	ret = xgene_enet_register_irq(ndev);
+	if (ret)
+		return ret;
+	napi_enable(&pdata->rx_ring->napi);
+
+	if (pdata->phy_dev)
+		phy_start(pdata->phy_dev);
+
+	netif_start_queue(ndev);
+
+	return ret;
+}
+
+static int xgene_enet_close(struct net_device *ndev)
+{
+	struct xgene_enet_pdata *pdata = netdev_priv(ndev);
+
+	netif_stop_queue(ndev);
+
+	if (pdata->phy_dev)
+		phy_stop(pdata->phy_dev);
+
+	napi_disable(&pdata->rx_ring->napi);
+	xgene_enet_free_irq(ndev);
+	xgene_enet_process_ring(pdata->rx_ring, -1);
+
+	xgene_gmac_tx_disable(pdata);
+	xgene_gmac_rx_disable(pdata);
+
+	return 0;
+}
+
+static void xgene_enet_delete_ring(struct xgene_enet_desc_ring *ring)
+{
+	struct xgene_enet_pdata *pdata;
+	struct device *dev;
+
+	pdata = netdev_priv(ring->ndev);
+	dev = ndev_to_dev(ring->ndev);
+
+	xgene_enet_clear_ring(ring);
+	dma_free_coherent(dev, ring->size, ring->desc_addr, ring->dma);
+}
+
+static void xgene_enet_delete_desc_rings(struct xgene_enet_pdata *pdata)
+{
+	struct xgene_enet_desc_ring *buf_pool;
+
+	if (pdata->tx_ring) {
+		xgene_enet_delete_ring(pdata->tx_ring);
+		pdata->tx_ring = NULL;
+	}
+
+	if (pdata->rx_ring) {
+		buf_pool = pdata->rx_ring->buf_pool;
+		xgene_enet_delete_bufpool(buf_pool);
+		xgene_enet_delete_ring(buf_pool);
+		xgene_enet_delete_ring(pdata->rx_ring);
+		pdata->rx_ring = NULL;
+	}
+}
+
+static int xgene_enet_get_ring_size(struct device *dev,
+				    enum xgene_enet_ring_cfgsize cfgsize)
+{
+	int size = -EINVAL;
+
+	switch (cfgsize) {
+	case RING_CFGSIZE_512B:
+		size = 0x200;
+		break;
+	case RING_CFGSIZE_2KB:
+		size = 0x800;
+		break;
+	case RING_CFGSIZE_16KB:
+		size = 0x4000;
+		break;
+	case RING_CFGSIZE_64KB:
+		size = 0x10000;
+		break;
+	case RING_CFGSIZE_512KB:
+		size = 0x80000;
+		break;
+	default:
+		dev_err(dev, "Unsupported cfg ring size %d\n", cfgsize);
+		break;
+	}
+
+	return size;
+}
+
+static void xgene_enet_free_desc_ring(struct xgene_enet_desc_ring *ring)
+{
+	struct device *dev;
+
+	if (!ring)
+		return;
+
+	dev = ndev_to_dev(ring->ndev);
+
+	if (ring->desc_addr) {
+		xgene_enet_clear_ring(ring);
+		dma_free_coherent(dev, ring->size, ring->desc_addr, ring->dma);
+	}
+	devm_kfree(dev, ring);
+}
+
+static void xgene_enet_free_desc_rings(struct xgene_enet_pdata *pdata)
+{
+	struct device *dev = &pdata->pdev->dev;
+	struct xgene_enet_desc_ring *ring;
+
+	ring = pdata->tx_ring;
+	if (ring && ring->cp_ring && ring->cp_ring->cp_skb)
+		devm_kfree(dev, ring->cp_ring->cp_skb);
+	xgene_enet_free_desc_ring(ring);
+
+	ring = pdata->rx_ring;
+	if (ring && ring->buf_pool && ring->buf_pool->rx_skb)
+		devm_kfree(dev, ring->buf_pool->rx_skb);
+	xgene_enet_free_desc_ring(ring->buf_pool);
+	xgene_enet_free_desc_ring(ring);
+}
+
+static struct xgene_enet_desc_ring *xgene_enet_create_desc_ring(
+			struct net_device *ndev, u32 ring_num,
+			enum xgene_enet_ring_cfgsize cfgsize, u32 ring_id)
+{
+	struct xgene_enet_desc_ring *ring;
+	struct xgene_enet_pdata *pdata = netdev_priv(ndev);
+	struct device *dev = ndev_to_dev(ndev);
+	u32 size;
+
+	ring = devm_kzalloc(dev, sizeof(struct xgene_enet_desc_ring),
+			    GFP_KERNEL);
+	if (!ring)
+		return NULL;
+
+	ring->ndev = ndev;
+	ring->num = ring_num;
+	ring->cfgsize = cfgsize;
+	ring->id = ring_id;
+
+	size = xgene_enet_get_ring_size(dev, cfgsize);
+	ring->desc_addr = dma_zalloc_coherent(dev, size, &ring->dma,
+					      GFP_KERNEL);
+	if (!ring->desc_addr) {
+		devm_kfree(dev, ring);
+		return NULL;
+	}
+	ring->size = size;
+
+	ring->cmd_base = pdata->ring_cmd_addr + (ring->num << 6);
+	ring->cmd = ring->cmd_base + INC_DEC_CMD_ADDR;
+	pdata->rm = RM3;
+	ring = xgene_enet_setup_ring(ring);
+	netdev_dbg(ndev, "ring info: num=%d  size=%d  id=%d  slots=%d\n",
+		   ring->num, ring->size, ring->id, ring->slots);
+
+	return ring;
+}
+
+static u16 xgene_enet_get_ring_id(enum xgene_ring_owner owner, u8 bufnum)
+{
+	return (owner << 6) | (bufnum & GENMASK(5, 0));
+}
+
+static int xgene_enet_create_desc_rings(struct net_device *ndev)
+{
+	struct xgene_enet_pdata *pdata = netdev_priv(ndev);
+	struct device *dev = ndev_to_dev(ndev);
+	struct xgene_enet_desc_ring *rx_ring, *tx_ring, *cp_ring;
+	struct xgene_enet_desc_ring *buf_pool = NULL;
+	u8 cpu_bufnum = 0, eth_bufnum = 0;
+	u8 bp_bufnum = 0x20;
+	u16 ring_id, ring_num = 0;
+	int ret;
+
+	/* allocate rx descriptor ring */
+	ring_id = xgene_enet_get_ring_id(RING_OWNER_CPU, cpu_bufnum++);
+	rx_ring = xgene_enet_create_desc_ring(ndev, ring_num++,
+					      RING_CFGSIZE_16KB, ring_id);
+	if (!rx_ring) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	/* allocate buffer pool for receiving packets */
+	ring_id = xgene_enet_get_ring_id(RING_OWNER_ETH0, bp_bufnum++);
+	buf_pool = xgene_enet_create_desc_ring(ndev, ring_num++,
+					       RING_CFGSIZE_2KB, ring_id);
+	if (!buf_pool) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	rx_ring->nbufpool = NUM_BUFPOOL;
+	rx_ring->buf_pool = buf_pool;
+	rx_ring->irq = pdata->rx_irq;
+	buf_pool->rx_skb = devm_kcalloc(dev, buf_pool->slots,
+					sizeof(struct sk_buff *), GFP_KERNEL);
+	if (!buf_pool->rx_skb) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	buf_pool->dst_ring_num = xgene_enet_dst_ring_num(buf_pool);
+	rx_ring->buf_pool = buf_pool;
+	pdata->rx_ring = rx_ring;
+
+	/* allocate tx descriptor ring */
+	ring_id = xgene_enet_get_ring_id(RING_OWNER_ETH0, eth_bufnum++);
+	tx_ring = xgene_enet_create_desc_ring(ndev, ring_num++,
+					      RING_CFGSIZE_16KB, ring_id);
+	if (!tx_ring) {
+		ret = -ENOMEM;
+		goto err;
+	}
+	pdata->tx_ring = tx_ring;
+
+	cp_ring = pdata->rx_ring;
+	cp_ring->cp_skb = devm_kcalloc(dev, tx_ring->slots,
+				       sizeof(struct sk_buff *), GFP_KERNEL);
+	if (!cp_ring->cp_skb) {
+		ret = -ENOMEM;
+		goto err;
+	}
+	pdata->tx_ring->cp_ring = cp_ring;
+	pdata->tx_ring->dst_ring_num = xgene_enet_dst_ring_num(cp_ring);
+
+	pdata->tx_qcnt_hi = pdata->tx_ring->slots / 2;
+	pdata->cp_qcnt_hi = pdata->rx_ring->slots / 2;
+	pdata->cp_qcnt_low = pdata->cp_qcnt_hi / 2;
+
+	return 0;
+
+err:
+	xgene_enet_free_desc_rings(pdata);
+	return ret;
+}
+
+static struct rtnl_link_stats64 *xgene_enet_get_stats64(
+			struct net_device *ndev,
+			struct rtnl_link_stats64 *storage)
+{
+	struct xgene_enet_pdata *pdata = netdev_priv(ndev);
+	struct rtnl_link_stats64 *stats = &pdata->stats;
+
+	stats->rx_errors += stats->rx_length_errors +
+			    stats->rx_crc_errors +
+			    stats->rx_frame_errors +
+			    stats->rx_fifo_errors;
+	memcpy(storage, &pdata->stats, sizeof(struct rtnl_link_stats64));
+
+	return storage;
+}
+
+static int xgene_enet_set_mac_address(struct net_device *ndev, void *addr)
+{
+	struct xgene_enet_pdata *pdata = netdev_priv(ndev);
+	int ret;
+
+	ret = eth_mac_addr(ndev, addr);
+	if (ret)
+		return ret;
+	xgene_gmac_set_mac_addr(pdata);
+
+	return ret;
+}
+
+static const struct net_device_ops xgene_ndev_ops = {
+	.ndo_open = xgene_enet_open,
+	.ndo_stop = xgene_enet_close,
+	.ndo_start_xmit = xgene_enet_start_xmit,
+	.ndo_tx_timeout = xgene_enet_timeout,
+	.ndo_get_stats64 = xgene_enet_get_stats64,
+	.ndo_change_mtu = eth_change_mtu,
+	.ndo_set_mac_address = xgene_enet_set_mac_address,
+};
+
+static int xgene_enet_get_resources(struct xgene_enet_pdata *pdata)
+{
+	struct platform_device *pdev;
+	struct net_device *ndev;
+	struct device *dev;
+	struct resource *res;
+	void __iomem *base_addr;
+	const char *mac;
+	int ret;
+
+	pdev = pdata->pdev;
+	dev = &pdev->dev;
+	ndev = pdata->ndev;
+
+	res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "enet_csr");
+	if (!res) {
+		dev_err(dev, "Resource enet_csr not defined\n");
+		return -ENODEV;
+	}
+	pdata->base_addr = devm_ioremap_resource(dev, res);
+	if (IS_ERR(pdata->base_addr)) {
+		dev_err(dev, "Unable to retrieve ENET Port CSR region\n");
+		return PTR_ERR(pdata->base_addr);
+	}
+
+	res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "ring_csr");
+	if (!res) {
+		dev_err(dev, "Resource ring_csr not defined\n");
+		return -ENODEV;
+	}
+	pdata->ring_csr_addr = devm_ioremap_resource(dev, res);
+	if (IS_ERR(pdata->ring_csr_addr)) {
+		dev_err(dev, "Unable to retrieve ENET Ring CSR region\n");
+		return PTR_ERR(pdata->ring_csr_addr);
+	}
+
+	res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "ring_cmd");
+	if (!res) {
+		dev_err(dev, "Resource ring_cmd not defined\n");
+		return -ENODEV;
+	}
+	pdata->ring_cmd_addr = devm_ioremap_resource(dev, res);
+	if (IS_ERR(pdata->ring_cmd_addr)) {
+		dev_err(dev, "Unable to retrieve ENET Ring command region\n");
+		return PTR_ERR(pdata->ring_cmd_addr);
+	}
+
+	ret = platform_get_irq(pdev, 0);
+	if (ret <= 0) {
+		dev_err(dev, "Unable to get ENET Rx IRQ\n");
+		ret = ret ? : -ENXIO;
+		return ret;
+	}
+	pdata->rx_irq = ret;
+
+	mac = of_get_mac_address(dev->of_node);
+	if (mac)
+		memcpy(ndev->dev_addr, mac, ndev->addr_len);
+	else
+		eth_hw_addr_random(ndev);
+	memcpy(ndev->perm_addr, ndev->dev_addr, ndev->addr_len);
+
+	pdata->phy_mode = of_get_phy_mode(pdev->dev.of_node);
+	if (pdata->phy_mode < 0) {
+		dev_err(dev, "Incorrect phy-connection-type in DTS\n");
+		return -EINVAL;
+	}
+
+	pdata->clk = devm_clk_get(&pdev->dev, NULL);
+	ret = IS_ERR(pdata->clk);
+	if (IS_ERR(pdata->clk)) {
+		dev_err(&pdev->dev, "can't get clock\n");
+		ret = PTR_ERR(pdata->clk);
+		return ret;
+	}
+
+	base_addr = pdata->base_addr;
+	pdata->eth_csr_addr = base_addr + BLOCK_ETH_CSR_OFFSET;
+	pdata->eth_ring_if_addr = base_addr + BLOCK_ETH_RING_IF_OFFSET;
+	pdata->eth_diag_csr_addr = base_addr + BLOCK_ETH_DIAG_CSR_OFFSET;
+	pdata->mcx_mac_addr = base_addr + BLOCK_ETH_MAC_OFFSET;
+	pdata->mcx_stats_addr = base_addr + BLOCK_ETH_STATS_OFFSET;
+	pdata->mcx_mac_csr_addr = base_addr + BLOCK_ETH_MAC_CSR_OFFSET;
+	pdata->rx_buff_cnt = NUM_PKT_BUF;
+
+	return ret;
+}
+
+static int xgene_enet_init_hw(struct xgene_enet_pdata *pdata)
+{
+	struct net_device *ndev = pdata->ndev;
+	struct xgene_enet_desc_ring *buf_pool;
+	u16 dst_ring_num;
+	int ret;
+
+	xgene_gmac_tx_disable(pdata);
+	xgene_gmac_rx_disable(pdata);
+
+	ret = xgene_enet_create_desc_rings(ndev);
+	if (ret) {
+		netdev_err(ndev, "Error in ring configuration\n");
+		return ret;
+	}
+
+	/* setup buffer pool */
+	buf_pool = pdata->rx_ring->buf_pool;
+	xgene_enet_init_bufpool(buf_pool);
+	ret = xgene_enet_refill_bufpool(buf_pool, pdata->rx_buff_cnt);
+	if (ret) {
+		xgene_enet_delete_desc_rings(pdata);
+		return ret;
+	}
+
+	dst_ring_num = xgene_enet_dst_ring_num(pdata->rx_ring);
+	xgene_enet_cle_bypass(pdata, dst_ring_num, buf_pool->id);
+
+	return ret;
+}
+
+static int xgene_enet_probe(struct platform_device *pdev)
+{
+	struct net_device *ndev;
+	struct xgene_enet_pdata *pdata;
+	struct device *dev = &pdev->dev;
+	struct napi_struct *napi;
+	int ret;
+
+	ndev = alloc_etherdev(sizeof(struct xgene_enet_pdata));
+	if (!ndev)
+		return -ENOMEM;
+
+	pdata = netdev_priv(ndev);
+
+	pdata->pdev = pdev;
+	pdata->ndev = ndev;
+	SET_NETDEV_DEV(ndev, dev);
+	platform_set_drvdata(pdev, pdata);
+	ndev->netdev_ops = &xgene_ndev_ops;
+	xgene_enet_set_ethtool_ops(ndev);
+	ndev->features |= NETIF_F_IP_CSUM |
+			  NETIF_F_GSO |
+			  NETIF_F_GRO;
+
+	ret = xgene_enet_get_resources(pdata);
+	if (ret)
+		goto err;
+
+	xgene_enet_reset(pdata);
+	xgene_gmac_init(pdata, SPEED_1000);
+
+	ret = register_netdev(ndev);
+	if (ret) {
+		netdev_err(ndev, "Failed to register netdev\n");
+		goto err;
+	}
+
+	ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64));
+	if (ret) {
+		netdev_err(ndev, "No usable DMA configuration\n");
+		goto err;
+	}
+
+	ret = xgene_enet_init_hw(pdata);
+	if (ret)
+		goto err;
+
+	napi = &pdata->rx_ring->napi;
+	netif_napi_add(ndev, napi, xgene_enet_napi, NAPI_POLL_WEIGHT);
+	ret = xgene_enet_mdio_config(pdata);
+
+	return ret;
+err:
+	free_netdev(ndev);
+	return ret;
+}
+
+static int xgene_enet_remove(struct platform_device *pdev)
+{
+	struct xgene_enet_pdata *pdata;
+	struct net_device *ndev;
+
+	pdata = platform_get_drvdata(pdev);
+	ndev = pdata->ndev;
+
+	xgene_gmac_rx_disable(pdata);
+	xgene_gmac_tx_disable(pdata);
+
+	netif_napi_del(&pdata->rx_ring->napi);
+	xgene_enet_mdio_remove(pdata);
+	xgene_enet_delete_desc_rings(pdata);
+	unregister_netdev(ndev);
+	xgene_gport_shutdown(pdata);
+	free_netdev(ndev);
+
+	return 0;
+}
+
+static struct of_device_id xgene_enet_match[] = {
+	{.compatible = "apm,xgene-enet",},
+	{},
+};
+
+MODULE_DEVICE_TABLE(of, xgene_enet_match);
+
+static struct platform_driver xgene_enet_driver = {
+	.driver = {
+		   .name = "xgene-enet",
+		   .of_match_table = xgene_enet_match,
+	},
+	.probe = xgene_enet_probe,
+	.remove = xgene_enet_remove,
+};
+
+module_platform_driver(xgene_enet_driver);
+
+MODULE_DESCRIPTION("APM X-Gene SoC Ethernet driver");
+MODULE_VERSION(XGENE_DRV_VERSION);
+MODULE_AUTHOR("Keyur Chudgar <kchudgar@apm.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_main.h b/drivers/net/ethernet/apm/xgene/xgene_enet_main.h
new file mode 100644
index 000000000000..0815866986b0
--- /dev/null
+++ b/drivers/net/ethernet/apm/xgene/xgene_enet_main.h
@@ -0,0 +1,135 @@
+/* Applied Micro X-Gene SoC Ethernet Driver
+ *
+ * Copyright (c) 2014, Applied Micro Circuits Corporation
+ * Authors: Iyappan Subramanian <isubramanian@apm.com>
+ *	    Ravi Patel <rapatel@apm.com>
+ *	    Keyur Chudgar <kchudgar@apm.com>
+ *
+ * This program is free software; you can redistribute  it and/or modify it
+ * under  the terms of  the GNU General  Public License as published by the
+ * Free Software Foundation;  either version 2 of the  License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef __XGENE_ENET_MAIN_H__
+#define __XGENE_ENET_MAIN_H__
+
+#include <linux/clk.h>
+#include <linux/of_platform.h>
+#include <linux/of_net.h>
+#include <linux/of_mdio.h>
+#include <linux/module.h>
+#include <net/ip.h>
+#include <linux/prefetch.h>
+#include <linux/if_vlan.h>
+#include <linux/phy.h>
+#include "xgene_enet_hw.h"
+
+#define XGENE_DRV_VERSION	"v1.0"
+#define XGENE_ENET_MAX_MTU	1536
+#define SKB_BUFFER_SIZE		(XGENE_ENET_MAX_MTU - NET_IP_ALIGN)
+#define NUM_PKT_BUF	64
+#define NUM_BUFPOOL	32
+
+/* software context of a descriptor ring */
+struct xgene_enet_desc_ring {
+	struct net_device *ndev;
+	u16 id;
+	u16 num;
+	u16 head;
+	u16 tail;
+	u16 slots;
+	u16 irq;
+	u32 size;
+	u32 state[NUM_RING_CONFIG];
+	void __iomem *cmd_base;
+	void __iomem *cmd;
+	dma_addr_t dma;
+	u16 dst_ring_num;
+	u8 nbufpool;
+	struct sk_buff *(*rx_skb);
+	struct sk_buff *(*cp_skb);
+	enum xgene_enet_ring_cfgsize cfgsize;
+	struct xgene_enet_desc_ring *cp_ring;
+	struct xgene_enet_desc_ring *buf_pool;
+	struct napi_struct napi;
+	union {
+		void *desc_addr;
+		struct xgene_enet_raw_desc *raw_desc;
+		struct xgene_enet_raw_desc16 *raw_desc16;
+	};
+};
+
+/* ethernet private data */
+struct xgene_enet_pdata {
+	struct net_device *ndev;
+	struct mii_bus *mdio_bus;
+	struct phy_device *phy_dev;
+	int phy_speed;
+	struct clk *clk;
+	struct platform_device *pdev;
+	struct xgene_enet_desc_ring *tx_ring;
+	struct xgene_enet_desc_ring *rx_ring;
+	char *dev_name;
+	u32 rx_buff_cnt;
+	u32 tx_qcnt_hi;
+	u32 cp_qcnt_hi;
+	u32 cp_qcnt_low;
+	u32 rx_irq;
+	void __iomem *eth_csr_addr;
+	void __iomem *eth_ring_if_addr;
+	void __iomem *eth_diag_csr_addr;
+	void __iomem *mcx_mac_addr;
+	void __iomem *mcx_stats_addr;
+	void __iomem *mcx_mac_csr_addr;
+	void __iomem *base_addr;
+	void __iomem *ring_csr_addr;
+	void __iomem *ring_cmd_addr;
+	u32 phy_addr;
+	int phy_mode;
+	u32 speed;
+	u16 rm;
+	struct rtnl_link_stats64 stats;
+};
+
+/* Set the specified value into a bit-field defined by its starting position
+ * and length within a single u64.
+ */
+static inline u64 xgene_enet_set_field_value(int pos, int len, u64 val)
+{
+	return (val & ((1ULL << len) - 1)) << pos;
+}
+
+#define SET_VAL(field, val) \
+		xgene_enet_set_field_value(field ## _POS, field ## _LEN, val)
+
+#define SET_BIT(field) \
+		xgene_enet_set_field_value(field ## _POS, 1, 1)
+
+/* Get the value from a bit-field defined by its starting position
+ * and length within the specified u64.
+ */
+static inline u64 xgene_enet_get_field_value(int pos, int len, u64 src)
+{
+	return (src >> pos) & ((1ULL << len) - 1);
+}
+
+#define GET_VAL(field, src) \
+		xgene_enet_get_field_value(field ## _POS, field ## _LEN, src)
+
+static inline struct device *ndev_to_dev(struct net_device *ndev)
+{
+	return ndev->dev.parent;
+}
+
+void xgene_enet_set_ethtool_ops(struct net_device *netdev);
+
+#endif /* __XGENE_ENET_MAIN_H__ */
diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
index ce455aed5a2f..3f9d4de8173c 100644
--- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c
+++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
@@ -739,7 +739,6 @@ static void bcmgenet_power_down(struct bcmgenet_priv *priv,
 
 	case GENET_POWER_PASSIVE:
 		/* Power down LED */
-		bcmgenet_mii_reset(priv->dev);
 		if (priv->hw_params->flags & GENET_HAS_EXT) {
 			reg = bcmgenet_ext_readl(priv, EXT_EXT_PWR_MGMT);
 			reg |= (EXT_PWR_DOWN_PHY |
@@ -779,7 +778,9 @@ static void bcmgenet_power_up(struct bcmgenet_priv *priv,
 	}
 
 	bcmgenet_ext_writel(priv, reg, EXT_EXT_PWR_MGMT);
-	bcmgenet_mii_reset(priv->dev);
+
+	if (mode == GENET_POWER_PASSIVE)
+		bcmgenet_mii_reset(priv->dev);
 }
 
 /* ioctl handle special commands that are not present in ethtool. */
@@ -1961,7 +1962,8 @@ static void bcmgenet_set_hw_addr(struct bcmgenet_priv *priv,
 static int bcmgenet_wol_resume(struct bcmgenet_priv *priv)
 {
 	/* From WOL-enabled suspend, switch to regular clock */
-	clk_disable_unprepare(priv->clk_wol);
+	if (priv->wolopts)
+		clk_disable_unprepare(priv->clk_wol);
 
 	phy_init_hw(priv->phydev);
 	/* Speed settings must be restored */
@@ -2164,6 +2166,10 @@ static void bcmgenet_netif_stop(struct net_device *dev)
 	 * disabled no new work will be scheduled.
 	 */
 	cancel_work_sync(&priv->bcmgenet_irq_work);
+
+	priv->old_pause = -1;
+	priv->old_link = -1;
+	priv->old_duplex = -1;
 }
 
 static int bcmgenet_close(struct net_device *dev)
@@ -2533,6 +2539,13 @@ static int bcmgenet_probe(struct platform_device *pdev)
 	priv->pdev = pdev;
 	priv->version = (enum bcmgenet_version)of_id->data;
 
+	priv->clk = devm_clk_get(&priv->pdev->dev, "enet");
+	if (IS_ERR(priv->clk))
+		dev_warn(&priv->pdev->dev, "failed to get enet clock\n");
+
+	if (!IS_ERR(priv->clk))
+		clk_prepare_enable(priv->clk);
+
 	bcmgenet_set_hw_params(priv);
 
 	/* Mii wait queue */
@@ -2541,17 +2554,10 @@ static int bcmgenet_probe(struct platform_device *pdev)
 	priv->rx_buf_len = RX_BUF_LENGTH;
 	INIT_WORK(&priv->bcmgenet_irq_work, bcmgenet_irq_task);
 
-	priv->clk = devm_clk_get(&priv->pdev->dev, "enet");
-	if (IS_ERR(priv->clk))
-		dev_warn(&priv->pdev->dev, "failed to get enet clock\n");
-
 	priv->clk_wol = devm_clk_get(&priv->pdev->dev, "enet-wol");
 	if (IS_ERR(priv->clk_wol))
 		dev_warn(&priv->pdev->dev, "failed to get enet-wol clock\n");
 
-	if (!IS_ERR(priv->clk))
-		clk_prepare_enable(priv->clk);
-
 	err = reset_umac(priv);
 	if (err)
 		goto err_clk_disable;
@@ -2611,6 +2617,8 @@ static int bcmgenet_suspend(struct device *d)
 
 	bcmgenet_netif_stop(dev);
 
+	phy_suspend(priv->phydev);
+
 	netif_device_detach(dev);
 
 	/* Disable MAC receive */
@@ -2661,9 +2669,7 @@ static int bcmgenet_resume(struct device *d)
 	if (ret)
 		goto out_clk_disable;
 
-	if (priv->wolopts)
-		ret = bcmgenet_wol_resume(priv);
-
+	ret = bcmgenet_wol_resume(priv);
 	if (ret)
 		goto out_clk_disable;
 
@@ -2678,6 +2684,9 @@ static int bcmgenet_resume(struct device *d)
 		bcmgenet_ext_writel(priv, reg, EXT_EXT_PWR_MGMT);
 	}
 
+	if (priv->wolopts)
+		bcmgenet_power_up(priv, GENET_POWER_WOL_MAGIC);
+
 	/* Disable RX/TX DMA and flush TX queues */
 	dma_ctrl = bcmgenet_dma_disable(priv);
 
@@ -2693,6 +2702,8 @@ static int bcmgenet_resume(struct device *d)
 
 	netif_device_attach(dev);
 
+	phy_resume(priv->phydev);
+
 	bcmgenet_netif_start(dev);
 
 	return 0;
diff --git a/drivers/net/ethernet/broadcom/genet/bcmmii.c b/drivers/net/ethernet/broadcom/genet/bcmmii.c
index 18961613d385..c88f7ae99636 100644
--- a/drivers/net/ethernet/broadcom/genet/bcmmii.c
+++ b/drivers/net/ethernet/broadcom/genet/bcmmii.c
@@ -129,7 +129,10 @@ static void bcmgenet_mii_setup(struct net_device *dev)
 			cmd_bits |= CMD_RX_PAUSE_IGNORE | CMD_TX_PAUSE_IGNORE;
 	}
 
-	if (status_changed) {
+	if (!status_changed)
+		return;
+
+	if (phydev->link) {
 		reg = bcmgenet_umac_readl(priv, UMAC_CMD);
 		reg &= ~((CMD_SPEED_MASK << CMD_SPEED_SHIFT) |
 			       CMD_HD_EN |
@@ -137,8 +140,9 @@ static void bcmgenet_mii_setup(struct net_device *dev)
 		reg |= cmd_bits;
 		bcmgenet_umac_writel(priv, reg, UMAC_CMD);
 
-		phy_print_status(phydev);
 	}
+
+	phy_print_status(phydev);
 }
 
 void bcmgenet_mii_reset(struct net_device *dev)
@@ -303,12 +307,12 @@ static int bcmgenet_mii_probe(struct net_device *dev)
 	/* In the case of a fixed PHY, the DT node associated
 	 * to the PHY is the Ethernet MAC DT node.
 	 */
-	if (of_phy_is_fixed_link(dn)) {
+	if (!priv->phy_dn && of_phy_is_fixed_link(dn)) {
 		ret = of_phy_register_fixed_link(dn);
 		if (ret)
 			return ret;
 
-		priv->phy_dn = dn;
+		priv->phy_dn = of_node_get(dn);
 	}
 
 	phydev = of_phy_connect(dev, priv->phy_dn, bcmgenet_mii_setup, 0,
@@ -444,6 +448,7 @@ int bcmgenet_mii_init(struct net_device *dev)
 	return 0;
 
 out:
+	of_node_put(priv->phy_dn);
 	mdiobus_unregister(priv->mii_bus);
 out_free:
 	kfree(priv->mii_bus->irq);
@@ -455,6 +460,7 @@ void bcmgenet_mii_exit(struct net_device *dev)
 {
 	struct bcmgenet_priv *priv = netdev_priv(dev);
 
+	of_node_put(priv->phy_dn);
 	mdiobus_unregister(priv->mii_bus);
 	kfree(priv->mii_bus->irq);
 	mdiobus_free(priv->mii_bus);
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
index c9b922cc3e67..d57282172ea5 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
@@ -50,13 +50,13 @@
 #include "cxgb4_uld.h"
 
 #define T4FW_VERSION_MAJOR 0x01
-#define T4FW_VERSION_MINOR 0x09
-#define T4FW_VERSION_MICRO 0x17
+#define T4FW_VERSION_MINOR 0x0B
+#define T4FW_VERSION_MICRO 0x1B
 #define T4FW_VERSION_BUILD 0x00
 
 #define T5FW_VERSION_MAJOR 0x01
-#define T5FW_VERSION_MINOR 0x09
-#define T5FW_VERSION_MICRO 0x17
+#define T5FW_VERSION_MINOR 0x0B
+#define T5FW_VERSION_MICRO 0x1B
 #define T5FW_VERSION_BUILD 0x00
 
 #define CH_WARN(adap, fmt, ...) dev_warn(adap->pdev_dev, fmt, ## __VA_ARGS__)
@@ -522,6 +522,9 @@ struct sge_txq {
 struct sge_eth_txq {                /* state for an SGE Ethernet Tx queue */
 	struct sge_txq q;
 	struct netdev_queue *txq;   /* associated netdev TX queue */
+#ifdef CONFIG_CHELSIO_T4_DCB
+	u8 dcb_prio;		    /* DCB Priority bound to queue */
+#endif
 	unsigned long tso;          /* # of TSO requests */
 	unsigned long tx_cso;       /* # of Tx checksum offloads */
 	unsigned long vlan_ins;     /* # of Tx VLAN insertions */
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_dcb.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_dcb.c
index 0d3a9df5be36..8edf0f5bd679 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_dcb.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_dcb.c
@@ -20,6 +20,17 @@
 
 #include "cxgb4.h"
 
+/* DCBx version control
+ */
+char *dcb_ver_array[] = {
+	"Unknown",
+	"DCBx-CIN",
+	"DCBx-CEE 1.01",
+	"DCBx-IEEE",
+	"", "", "",
+	"Auto Negotiated"
+};
+
 /* Initialize a port's Data Center Bridging state.  Typically used after a
  * Link Down event.
  */
@@ -27,25 +38,45 @@ void cxgb4_dcb_state_init(struct net_device *dev)
 {
 	struct port_info *pi = netdev2pinfo(dev);
 	struct port_dcb_info *dcb = &pi->dcb;
+	int version_temp = dcb->dcb_version;
 
 	memset(dcb, 0, sizeof(struct port_dcb_info));
 	dcb->state = CXGB4_DCB_STATE_START;
+	if (version_temp)
+		dcb->dcb_version = version_temp;
+
+	netdev_dbg(dev, "%s: Initializing DCB state for port[%d]\n",
+		    __func__, pi->port_id);
+}
+
+void cxgb4_dcb_version_init(struct net_device *dev)
+{
+	struct port_info *pi = netdev2pinfo(dev);
+	struct port_dcb_info *dcb = &pi->dcb;
+
+	/* Any writes here are only done on kernels that exlicitly need
+	 * a specific version, say < 2.6.38 which only support CEE
+	 */
+	dcb->dcb_version = FW_PORT_DCB_VER_AUTO;
 }
 
 /* Finite State machine for Data Center Bridging.
  */
 void cxgb4_dcb_state_fsm(struct net_device *dev,
-			 enum cxgb4_dcb_state_input input)
+			 enum cxgb4_dcb_state_input transition_to)
 {
 	struct port_info *pi = netdev2pinfo(dev);
 	struct port_dcb_info *dcb = &pi->dcb;
 	struct adapter *adap = pi->adapter;
+	enum cxgb4_dcb_state current_state = dcb->state;
 
-	switch (input) {
-	case CXGB4_DCB_INPUT_FW_DISABLED: {
-		/* Firmware tells us it's not doing DCB */
-		switch (dcb->state) {
-		case CXGB4_DCB_STATE_START: {
+	netdev_dbg(dev, "%s: State change from %d to %d for %s\n",
+		    __func__, dcb->state, transition_to, dev->name);
+
+	switch (current_state) {
+	case CXGB4_DCB_STATE_START: {
+		switch (transition_to) {
+		case CXGB4_DCB_INPUT_FW_DISABLED: {
 			/* we're going to use Host DCB */
 			dcb->state = CXGB4_DCB_STATE_HOST;
 			dcb->supported = CXGB4_DCBX_HOST_SUPPORT;
@@ -53,48 +84,62 @@ void cxgb4_dcb_state_fsm(struct net_device *dev,
 			break;
 		}
 
-		case CXGB4_DCB_STATE_HOST: {
-			/* we're alreaady in Host DCB mode */
+		case CXGB4_DCB_INPUT_FW_ENABLED: {
+			/* we're going to use Firmware DCB */
+			dcb->state = CXGB4_DCB_STATE_FW_INCOMPLETE;
+			dcb->supported = CXGB4_DCBX_FW_SUPPORT;
+			break;
+		}
+
+		case CXGB4_DCB_INPUT_FW_INCOMPLETE: {
+			/* expected transition */
+			break;
+		}
+
+		case CXGB4_DCB_INPUT_FW_ALLSYNCED: {
+			dcb->state = CXGB4_DCB_STATE_FW_ALLSYNCED;
 			break;
 		}
 
 		default:
-			goto bad_state_transition;
+			goto bad_state_input;
 		}
 		break;
 	}
 
-	case CXGB4_DCB_INPUT_FW_ENABLED: {
-		/* Firmware tells us that it is doing DCB */
-		switch (dcb->state) {
-		case CXGB4_DCB_STATE_START: {
-			/* we're going to use Firmware DCB */
-			dcb->state = CXGB4_DCB_STATE_FW_INCOMPLETE;
-			dcb->supported = CXGB4_DCBX_FW_SUPPORT;
+	case CXGB4_DCB_STATE_FW_INCOMPLETE: {
+		switch (transition_to) {
+		case CXGB4_DCB_INPUT_FW_ENABLED: {
+			/* we're alreaady in firmware DCB mode */
 			break;
 		}
 
-		case CXGB4_DCB_STATE_FW_INCOMPLETE:
-		case CXGB4_DCB_STATE_FW_ALLSYNCED: {
-			/* we're alreaady in firmware DCB mode */
+		case CXGB4_DCB_INPUT_FW_INCOMPLETE: {
+			/* we're already incomplete */
+			break;
+		}
+
+		case CXGB4_DCB_INPUT_FW_ALLSYNCED: {
+			dcb->state = CXGB4_DCB_STATE_FW_ALLSYNCED;
+			dcb->enabled = 1;
+			linkwatch_fire_event(dev);
 			break;
 		}
 
 		default:
-			goto bad_state_transition;
+			goto bad_state_input;
 		}
 		break;
 	}
 
-	case CXGB4_DCB_INPUT_FW_INCOMPLETE: {
-		/* Firmware tells us that its DCB state is incomplete */
-		switch (dcb->state) {
-		case CXGB4_DCB_STATE_FW_INCOMPLETE: {
-			/* we're already incomplete */
+	case CXGB4_DCB_STATE_FW_ALLSYNCED: {
+		switch (transition_to) {
+		case CXGB4_DCB_INPUT_FW_ENABLED: {
+			/* we're alreaady in firmware DCB mode */
 			break;
 		}
 
-		case CXGB4_DCB_STATE_FW_ALLSYNCED: {
+		case CXGB4_DCB_INPUT_FW_INCOMPLETE: {
 			/* We were successfully running with firmware DCB but
 			 * now it's telling us that it's in an "incomplete
 			 * state.  We need to reset back to a ground state
@@ -107,46 +152,48 @@ void cxgb4_dcb_state_fsm(struct net_device *dev,
 			break;
 		}
 
-		default:
-			goto bad_state_transition;
-		}
-		break;
-	}
-
-	case CXGB4_DCB_INPUT_FW_ALLSYNCED: {
-		/* Firmware tells us that its DCB state is complete */
-		switch (dcb->state) {
-		case CXGB4_DCB_STATE_FW_INCOMPLETE: {
-			dcb->state = CXGB4_DCB_STATE_FW_ALLSYNCED;
+		case CXGB4_DCB_INPUT_FW_ALLSYNCED: {
+			/* we're already all sync'ed
+			 * this is only applicable for IEEE or
+			 * when another VI already completed negotiaton
+			 */
 			dcb->enabled = 1;
 			linkwatch_fire_event(dev);
 			break;
 		}
 
-		case CXGB4_DCB_STATE_FW_ALLSYNCED: {
-			/* we're already all sync'ed */
+		default:
+			goto bad_state_input;
+		}
+		break;
+	}
+
+	case CXGB4_DCB_STATE_HOST: {
+		switch (transition_to) {
+		case CXGB4_DCB_INPUT_FW_DISABLED: {
+			/* we're alreaady in Host DCB mode */
 			break;
 		}
 
 		default:
-			goto bad_state_transition;
+			goto bad_state_input;
 		}
 		break;
 	}
 
 	default:
-		goto  bad_state_input;
+		goto bad_state_transition;
 	}
 	return;
 
 bad_state_input:
 	dev_err(adap->pdev_dev, "cxgb4_dcb_state_fsm: illegal input symbol %d\n",
-		input);
+		transition_to);
 	return;
 
 bad_state_transition:
 	dev_err(adap->pdev_dev, "cxgb4_dcb_state_fsm: bad state transition, state = %d, input = %d\n",
-		dcb->state, input);
+		current_state, transition_to);
 }
 
 /* Handle a DCB/DCBX update message from the firmware.
@@ -160,6 +207,7 @@ void cxgb4_dcb_handle_fw_update(struct adapter *adap,
 	struct port_info *pi = netdev_priv(dev);
 	struct port_dcb_info *dcb = &pi->dcb;
 	int dcb_type = pcmd->u.dcb.pgid.type;
+	int dcb_running_version;
 
 	/* Handle Firmware DCB Control messages separately since they drive
 	 * our state machine.
@@ -171,6 +219,25 @@ void cxgb4_dcb_handle_fw_update(struct adapter *adap,
 			 ? CXGB4_DCB_STATE_FW_ALLSYNCED
 			 : CXGB4_DCB_STATE_FW_INCOMPLETE);
 
+		if (dcb->dcb_version != FW_PORT_DCB_VER_UNKNOWN) {
+			dcb_running_version = FW_PORT_CMD_DCB_VERSION_GET(
+				be16_to_cpu(
+				pcmd->u.dcb.control.dcb_version_to_app_state));
+			if (dcb_running_version == FW_PORT_DCB_VER_CEE1D01 ||
+			    dcb_running_version == FW_PORT_DCB_VER_IEEE) {
+				dcb->dcb_version = dcb_running_version;
+				dev_warn(adap->pdev_dev, "Interface %s is running %s\n",
+					 dev->name,
+					 dcb_ver_array[dcb->dcb_version]);
+			} else {
+				dev_warn(adap->pdev_dev,
+					 "Something screwed up, requested firmware for %s, but firmware returned %s instead\n",
+					 dcb_ver_array[dcb->dcb_version],
+					 dcb_ver_array[dcb_running_version]);
+				dcb->dcb_version = FW_PORT_DCB_VER_UNKNOWN;
+			}
+		}
+
 		cxgb4_dcb_state_fsm(dev, input);
 		return;
 	}
@@ -199,7 +266,11 @@ void cxgb4_dcb_handle_fw_update(struct adapter *adap,
 		dcb->pg_num_tcs_supported = fwdcb->pgrate.num_tcs_supported;
 		memcpy(dcb->pgrate, &fwdcb->pgrate.pgrate,
 		       sizeof(dcb->pgrate));
+		memcpy(dcb->tsa, &fwdcb->pgrate.tsa,
+		       sizeof(dcb->tsa));
 		dcb->msgs |= CXGB4_DCB_FW_PGRATE;
+		if (dcb->msgs & CXGB4_DCB_FW_PGID)
+			IEEE_FAUX_SYNC(dev, dcb);
 		break;
 
 	case FW_PORT_DCB_TYPE_PRIORATE:
@@ -212,6 +283,7 @@ void cxgb4_dcb_handle_fw_update(struct adapter *adap,
 		dcb->pfcen = fwdcb->pfc.pfcen;
 		dcb->pfc_num_tcs_supported = fwdcb->pfc.max_pfc_tcs;
 		dcb->msgs |= CXGB4_DCB_FW_PFC;
+		IEEE_FAUX_SYNC(dev, dcb);
 		break;
 
 	case FW_PORT_DCB_TYPE_APP_ID: {
@@ -220,13 +292,25 @@ void cxgb4_dcb_handle_fw_update(struct adapter *adap,
 		struct app_priority *ap = &dcb->app_priority[idx];
 
 		struct dcb_app app = {
-			.selector = fwap->sel_field,
 			.protocol = be16_to_cpu(fwap->protocolid),
-			.priority = fwap->user_prio_map,
 		};
 		int err;
 
-		err = dcb_setapp(dev, &app);
+		/* Convert from firmware format to relevant format
+		 * when using app selector
+		 */
+		if (dcb->dcb_version == FW_PORT_DCB_VER_IEEE) {
+			app.selector = (fwap->sel_field + 1);
+			app.priority = ffs(fwap->user_prio_map) - 1;
+			err = dcb_ieee_setapp(dev, &app);
+			IEEE_FAUX_SYNC(dev, dcb);
+		} else {
+			/* Default is CEE */
+			app.selector = !!(fwap->sel_field);
+			app.priority = fwap->user_prio_map;
+			err = dcb_setapp(dev, &app);
+		}
+
 		if (err)
 			dev_err(adap->pdev_dev,
 				"Failed DCB Set Application Priority: sel=%d, prot=%d, prio=%d, err=%d\n",
@@ -408,9 +492,10 @@ static void cxgb4_getpgbwgcfg(struct net_device *dev, int pgid, u8 *bw_per,
 	if (err != FW_PORT_DCB_CFG_SUCCESS) {
 		dev_err(adap->pdev_dev, "DCB read PGRATE failed with %d\n",
 			-err);
-	} else {
-		*bw_per = pcmd.u.dcb.pgrate.pgrate[pgid];
+		return;
 	}
+
+	*bw_per = pcmd.u.dcb.pgrate.pgrate[pgid];
 }
 
 static void cxgb4_getpgbwgcfg_tx(struct net_device *dev, int pgid, u8 *bw_per)
@@ -637,7 +722,8 @@ static int __cxgb4_getapp(struct net_device *dev, u8 app_idtype, u16 app_id,
 			return err;
 		}
 		if (be16_to_cpu(pcmd.u.dcb.app_priority.protocolid) == app_id)
-			return pcmd.u.dcb.app_priority.user_prio_map;
+			if (pcmd.u.dcb.app_priority.sel_field == app_idtype)
+				return pcmd.u.dcb.app_priority.user_prio_map;
 
 		/* exhausted app list */
 		if (!pcmd.u.dcb.app_priority.protocolid)
@@ -657,8 +743,8 @@ static int cxgb4_getapp(struct net_device *dev, u8 app_idtype, u16 app_id)
 
 /* Write a new Application User Priority Map for the specified Application ID
  */
-static int cxgb4_setapp(struct net_device *dev, u8 app_idtype, u16 app_id,
-			u8 app_prio)
+static int __cxgb4_setapp(struct net_device *dev, u8 app_idtype, u16 app_id,
+			  u8 app_prio)
 {
 	struct fw_port_cmd pcmd;
 	struct port_info *pi = netdev2pinfo(dev);
@@ -673,10 +759,6 @@ static int cxgb4_setapp(struct net_device *dev, u8 app_idtype, u16 app_id,
 	if (!netif_carrier_ok(dev))
 		return -ENOLINK;
 
-	if (app_idtype != DCB_APP_IDTYPE_ETHTYPE &&
-	    app_idtype != DCB_APP_IDTYPE_PORTNUM)
-		return -EINVAL;
-
 	for (i = 0; i < CXGB4_MAX_DCBX_APP_SUPPORTED; i++) {
 		INIT_PORT_DCB_READ_LOCAL_CMD(pcmd, pi->port_id);
 		pcmd.u.dcb.app_priority.type = FW_PORT_DCB_TYPE_APP_ID;
@@ -725,6 +807,30 @@ static int cxgb4_setapp(struct net_device *dev, u8 app_idtype, u16 app_id,
 	return 0;
 }
 
+/* Priority for CEE inside dcb_app is bitmask, with 0 being an invalid value */
+static int cxgb4_setapp(struct net_device *dev, u8 app_idtype, u16 app_id,
+			u8 app_prio)
+{
+	int ret;
+	struct dcb_app app = {
+		.selector = app_idtype,
+		.protocol = app_id,
+		.priority = app_prio,
+	};
+
+	if (app_idtype != DCB_APP_IDTYPE_ETHTYPE &&
+	    app_idtype != DCB_APP_IDTYPE_PORTNUM)
+		return -EINVAL;
+
+	/* Convert app_idtype to a format that firmware understands */
+	ret = __cxgb4_setapp(dev, app_idtype == DCB_APP_IDTYPE_ETHTYPE ?
+			      app_idtype : 3, app_id, app_prio);
+	if (ret)
+		return ret;
+
+	return dcb_setapp(dev, &app);
+}
+
 /* Return whether IEEE Data Center Bridging has been negotiated.
  */
 static inline int cxgb4_ieee_negotiation_complete(struct net_device *dev)
@@ -738,6 +844,7 @@ static inline int cxgb4_ieee_negotiation_complete(struct net_device *dev)
 
 /* Fill in the Application User Priority Map associated with the
  * specified Application.
+ * Priority for IEEE dcb_app is an integer, with 0 being a valid value
  */
 static int cxgb4_ieee_getapp(struct net_device *dev, struct dcb_app *app)
 {
@@ -748,28 +855,39 @@ static int cxgb4_ieee_getapp(struct net_device *dev, struct dcb_app *app)
 	if (!(app->selector && app->protocol))
 		return -EINVAL;
 
-	prio = dcb_getapp(dev, app);
-	if (prio == 0) {
-		/* If app doesn't exist in dcb_app table, try firmware
-		 * directly.
-		 */
-		prio = __cxgb4_getapp(dev, app->selector, app->protocol, 0);
-	}
+	/* Try querying firmware first, use firmware format */
+	prio = __cxgb4_getapp(dev, app->selector - 1, app->protocol, 0);
+
+	if (prio < 0)
+		prio = dcb_ieee_getapp_mask(dev, app);
 
-	app->priority = prio;
+	app->priority = ffs(prio) - 1;
 	return 0;
 }
 
-/* Write a new Application User Priority Map for the specified App id. */
+/* Write a new Application User Priority Map for the specified Application ID.
+ * Priority for IEEE dcb_app is an integer, with 0 being a valid value
+ */
 static int cxgb4_ieee_setapp(struct net_device *dev, struct dcb_app *app)
 {
+	int ret;
+
 	if (!cxgb4_ieee_negotiation_complete(dev))
 		return -EINVAL;
-	if (!(app->selector && app->protocol && app->priority))
+	if (!(app->selector && app->protocol))
+		return -EINVAL;
+
+	if (!(app->selector > IEEE_8021QAZ_APP_SEL_ETHERTYPE  &&
+	      app->selector < IEEE_8021QAZ_APP_SEL_ANY))
 		return -EINVAL;
 
-	cxgb4_setapp(dev, app->selector, app->protocol, app->priority);
-	return dcb_setapp(dev, app);
+	/* change selector to a format that firmware understands */
+	ret = __cxgb4_setapp(dev, app->selector - 1, app->protocol,
+			     (1 << app->priority));
+	if (ret)
+		return ret;
+
+	return dcb_ieee_setapp(dev, app);
 }
 
 /* Return our DCBX parameters.
@@ -794,8 +912,9 @@ static u8 cxgb4_setdcbx(struct net_device *dev, u8 dcb_request)
 	    != dcb_request)
 		return 1;
 
-	/* Can't set DCBX capabilities if DCBX isn't enabled. */
-	if (!pi->dcb.state)
+	/* Can't enable DCB if we haven't successfully negotiated it.
+	 */
+	if (pi->dcb.state != CXGB4_DCB_STATE_FW_ALLSYNCED)
 		return 1;
 
 	/* There's currently no mechanism to allow for the firmware DCBX
@@ -874,7 +993,8 @@ static int cxgb4_getpeerapp_tbl(struct net_device *dev, struct dcb_app *table)
 		table[i].selector = pcmd.u.dcb.app_priority.sel_field;
 		table[i].protocol =
 			be16_to_cpu(pcmd.u.dcb.app_priority.protocolid);
-		table[i].priority = pcmd.u.dcb.app_priority.user_prio_map;
+		table[i].priority =
+			ffs(pcmd.u.dcb.app_priority.user_prio_map) - 1;
 	}
 	return err;
 }
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_dcb.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_dcb.h
index 1ec1d834e257..2a6aa88984f4 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_dcb.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_dcb.h
@@ -63,6 +63,13 @@
 #define INIT_PORT_DCB_WRITE_CMD(__pcmd, __port) \
 	INIT_PORT_DCB_CMD(__pcmd, __port, EXEC, FW_PORT_ACTION_L2_DCB_CFG)
 
+#define IEEE_FAUX_SYNC(__dev, __dcb) \
+	do { \
+		if ((__dcb)->dcb_version == FW_PORT_DCB_VER_IEEE) \
+			cxgb4_dcb_state_fsm((__dev), \
+					    CXGB4_DCB_STATE_FW_ALLSYNCED); \
+	} while (0)
+
 /* States we can be in for a port's Data Center Bridging.
  */
 enum cxgb4_dcb_state {
@@ -108,11 +115,13 @@ struct port_dcb_info {
 	 * Native Endian format).
 	 */
 	u32	pgid;			/* Priority Group[0..7] */
+	u8	dcb_version;		/* Running DCBx version */
 	u8	pfcen;			/* Priority Flow Control[0..7] */
 	u8	pg_num_tcs_supported;	/* max PG Traffic Classes */
 	u8	pfc_num_tcs_supported;	/* max PFC Traffic Classes */
 	u8	pgrate[8];		/* Priority Group Rate[0..7] */
 	u8	priorate[8];		/* Priority Rate[0..7] */
+	u8	tsa[8];			/* TSA Algorithm[0..7] */
 	struct app_priority { /* Application Information */
 		u8	user_prio_map;	/* Priority Map bitfield */
 		u8	sel_field;	/* Protocol ID interpretation */
@@ -121,6 +130,7 @@ struct port_dcb_info {
 };
 
 void cxgb4_dcb_state_init(struct net_device *);
+void cxgb4_dcb_version_init(struct net_device *);
 void cxgb4_dcb_state_fsm(struct net_device *, enum cxgb4_dcb_state_input);
 void cxgb4_dcb_handle_fw_update(struct adapter *, const struct fw_port_cmd *);
 void cxgb4_dcb_set_caps(struct adapter *, const struct fw_port_cmd *);
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index 1a162d21d8ac..a62d3f468c52 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -522,6 +522,8 @@ static void dcb_tx_queue_prio_enable(struct net_device *dev, int enable)
 			dev_err(adap->pdev_dev,
 				"Can't %s DCB Priority on port %d, TX Queue %d: err=%d\n",
 				enable ? "set" : "unset", pi->port_id, i, -err);
+		else
+			txq->dcb_prio = value;
 	}
 }
 #endif /* CONFIG_CHELSIO_T4_DCB */
diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h b/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h
index ff709e3b3e7e..0549170d7e2e 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h
@@ -1629,6 +1629,14 @@ enum fw_port_l2cfg_ctlbf {
 	FW_PORT_L2_CTLBF_TXIPG	= 0x20
 };
 
+enum fw_port_dcb_versions {
+	FW_PORT_DCB_VER_UNKNOWN,
+	FW_PORT_DCB_VER_CEE1D0,
+	FW_PORT_DCB_VER_CEE1D01,
+	FW_PORT_DCB_VER_IEEE,
+	FW_PORT_DCB_VER_AUTO = 7
+};
+
 enum fw_port_dcb_cfg {
 	FW_PORT_DCB_CFG_PG	= 0x01,
 	FW_PORT_DCB_CFG_PFC	= 0x02,
@@ -1709,6 +1717,7 @@ struct fw_port_cmd {
 				__u8   r10_lo[5];
 				__u8   num_tcs_supported;
 				__u8   pgrate[8];
+				__u8   tsa[8];
 			} pgrate;
 			struct fw_port_dcb_priorate {
 				__u8   type;
@@ -1735,7 +1744,7 @@ struct fw_port_cmd {
 			struct fw_port_dcb_control {
 				__u8   type;
 				__u8   all_syncd_pkd;
-				__be16 pfc_state_to_app_state;
+				__be16 dcb_version_to_app_state;
 				__be32 r11;
 				__be64 r12;
 			} control;
@@ -1778,6 +1787,7 @@ struct fw_port_cmd {
 #define FW_PORT_CMD_DCBXDIS (1U << 7)
 #define FW_PORT_CMD_APPLY (1U <<  7)
 #define FW_PORT_CMD_ALL_SYNCD (1U << 7)
+#define FW_PORT_CMD_DCB_VERSION_GET(x) (((x) >> 8) & 0xf)
 
 #define FW_PORT_CMD_PPPEN(x) ((x) << 31)
 #define FW_PORT_CMD_TPSRC(x) ((x) << 28)
diff --git a/drivers/net/ethernet/davicom/dm9000.c b/drivers/net/ethernet/davicom/dm9000.c
index 23084fb2090e..9b33057a9477 100644
--- a/drivers/net/ethernet/davicom/dm9000.c
+++ b/drivers/net/ethernet/davicom/dm9000.c
@@ -93,7 +93,7 @@ enum dm9000_type {
 };
 
 /* Structure/enum declaration ------------------------------- */
-typedef struct board_info {
+struct board_info {
 
 	void __iomem	*io_addr;	/* Register I/O base address */
 	void __iomem	*io_data;	/* Data I/O address */
@@ -141,7 +141,7 @@ typedef struct board_info {
 	u32		wake_state;
 
 	int		ip_summed;
-} board_info_t;
+};
 
 /* debug code */
 
@@ -151,7 +151,7 @@ typedef struct board_info {
 	}						\
 } while (0)
 
-static inline board_info_t *to_dm9000_board(struct net_device *dev)
+static inline struct board_info *to_dm9000_board(struct net_device *dev)
 {
 	return netdev_priv(dev);
 }
@@ -162,7 +162,7 @@ static inline board_info_t *to_dm9000_board(struct net_device *dev)
  *   Read a byte from I/O port
  */
 static u8
-ior(board_info_t *db, int reg)
+ior(struct board_info *db, int reg)
 {
 	writeb(reg, db->io_addr);
 	return readb(db->io_data);
@@ -173,14 +173,14 @@ ior(board_info_t *db, int reg)
  */
 
 static void
-iow(board_info_t *db, int reg, int value)
+iow(struct board_info *db, int reg, int value)
 {
 	writeb(reg, db->io_addr);
 	writeb(value, db->io_data);
 }
 
 static void
-dm9000_reset(board_info_t *db)
+dm9000_reset(struct board_info *db)
 {
 	dev_dbg(db->dev, "resetting device\n");
 
@@ -272,7 +272,7 @@ static void dm9000_dumpblk_32bit(void __iomem *reg, int count)
  * Sleep, either by using msleep() or if we are suspending, then
  * use mdelay() to sleep.
  */
-static void dm9000_msleep(board_info_t *db, unsigned int ms)
+static void dm9000_msleep(struct board_info *db, unsigned int ms)
 {
 	if (db->in_suspend || db->in_timeout)
 		mdelay(ms);
@@ -284,7 +284,7 @@ static void dm9000_msleep(board_info_t *db, unsigned int ms)
 static int
 dm9000_phy_read(struct net_device *dev, int phy_reg_unused, int reg)
 {
-	board_info_t *db = netdev_priv(dev);
+	struct board_info *db = netdev_priv(dev);
 	unsigned long flags;
 	unsigned int reg_save;
 	int ret;
@@ -330,7 +330,7 @@ static void
 dm9000_phy_write(struct net_device *dev,
 		 int phyaddr_unused, int reg, int value)
 {
-	board_info_t *db = netdev_priv(dev);
+	struct board_info *db = netdev_priv(dev);
 	unsigned long flags;
 	unsigned long reg_save;
 
@@ -408,7 +408,7 @@ static void dm9000_set_io(struct board_info *db, int byte_width)
 	}
 }
 
-static void dm9000_schedule_poll(board_info_t *db)
+static void dm9000_schedule_poll(struct board_info *db)
 {
 	if (db->type == TYPE_DM9000E)
 		schedule_delayed_work(&db->phy_poll, HZ * 2);
@@ -416,7 +416,7 @@ static void dm9000_schedule_poll(board_info_t *db)
 
 static int dm9000_ioctl(struct net_device *dev, struct ifreq *req, int cmd)
 {
-	board_info_t *dm = to_dm9000_board(dev);
+	struct board_info *dm = to_dm9000_board(dev);
 
 	if (!netif_running(dev))
 		return -EINVAL;
@@ -425,7 +425,7 @@ static int dm9000_ioctl(struct net_device *dev, struct ifreq *req, int cmd)
 }
 
 static unsigned int
-dm9000_read_locked(board_info_t *db, int reg)
+dm9000_read_locked(struct board_info *db, int reg)
 {
 	unsigned long flags;
 	unsigned int ret;
@@ -437,7 +437,7 @@ dm9000_read_locked(board_info_t *db, int reg)
 	return ret;
 }
 
-static int dm9000_wait_eeprom(board_info_t *db)
+static int dm9000_wait_eeprom(struct board_info *db)
 {
 	unsigned int status;
 	int timeout = 8;	/* wait max 8msec */
@@ -474,7 +474,7 @@ static int dm9000_wait_eeprom(board_info_t *db)
  *  Read a word data from EEPROM
  */
 static void
-dm9000_read_eeprom(board_info_t *db, int offset, u8 *to)
+dm9000_read_eeprom(struct board_info *db, int offset, u8 *to)
 {
 	unsigned long flags;
 
@@ -514,7 +514,7 @@ dm9000_read_eeprom(board_info_t *db, int offset, u8 *to)
  * Write a word data to SROM
  */
 static void
-dm9000_write_eeprom(board_info_t *db, int offset, u8 *data)
+dm9000_write_eeprom(struct board_info *db, int offset, u8 *data)
 {
 	unsigned long flags;
 
@@ -546,7 +546,7 @@ dm9000_write_eeprom(board_info_t *db, int offset, u8 *data)
 static void dm9000_get_drvinfo(struct net_device *dev,
 			       struct ethtool_drvinfo *info)
 {
-	board_info_t *dm = to_dm9000_board(dev);
+	struct board_info *dm = to_dm9000_board(dev);
 
 	strlcpy(info->driver, CARDNAME, sizeof(info->driver));
 	strlcpy(info->version, DRV_VERSION, sizeof(info->version));
@@ -556,21 +556,21 @@ static void dm9000_get_drvinfo(struct net_device *dev,
 
 static u32 dm9000_get_msglevel(struct net_device *dev)
 {
-	board_info_t *dm = to_dm9000_board(dev);
+	struct board_info *dm = to_dm9000_board(dev);
 
 	return dm->msg_enable;
 }
 
 static void dm9000_set_msglevel(struct net_device *dev, u32 value)
 {
-	board_info_t *dm = to_dm9000_board(dev);
+	struct board_info *dm = to_dm9000_board(dev);
 
 	dm->msg_enable = value;
 }
 
 static int dm9000_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
 {
-	board_info_t *dm = to_dm9000_board(dev);
+	struct board_info *dm = to_dm9000_board(dev);
 
 	mii_ethtool_gset(&dm->mii, cmd);
 	return 0;
@@ -578,21 +578,21 @@ static int dm9000_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
 
 static int dm9000_set_settings(struct net_device *dev, struct ethtool_cmd *cmd)
 {
-	board_info_t *dm = to_dm9000_board(dev);
+	struct board_info *dm = to_dm9000_board(dev);
 
 	return mii_ethtool_sset(&dm->mii, cmd);
 }
 
 static int dm9000_nway_reset(struct net_device *dev)
 {
-	board_info_t *dm = to_dm9000_board(dev);
+	struct board_info *dm = to_dm9000_board(dev);
 	return mii_nway_restart(&dm->mii);
 }
 
 static int dm9000_set_features(struct net_device *dev,
 	netdev_features_t features)
 {
-	board_info_t *dm = to_dm9000_board(dev);
+	struct board_info *dm = to_dm9000_board(dev);
 	netdev_features_t changed = dev->features ^ features;
 	unsigned long flags;
 
@@ -608,7 +608,7 @@ static int dm9000_set_features(struct net_device *dev,
 
 static u32 dm9000_get_link(struct net_device *dev)
 {
-	board_info_t *dm = to_dm9000_board(dev);
+	struct board_info *dm = to_dm9000_board(dev);
 	u32 ret;
 
 	if (dm->flags & DM9000_PLATF_EXT_PHY)
@@ -629,7 +629,7 @@ static int dm9000_get_eeprom_len(struct net_device *dev)
 static int dm9000_get_eeprom(struct net_device *dev,
 			     struct ethtool_eeprom *ee, u8 *data)
 {
-	board_info_t *dm = to_dm9000_board(dev);
+	struct board_info *dm = to_dm9000_board(dev);
 	int offset = ee->offset;
 	int len = ee->len;
 	int i;
@@ -653,7 +653,7 @@ static int dm9000_get_eeprom(struct net_device *dev,
 static int dm9000_set_eeprom(struct net_device *dev,
 			     struct ethtool_eeprom *ee, u8 *data)
 {
-	board_info_t *dm = to_dm9000_board(dev);
+	struct board_info *dm = to_dm9000_board(dev);
 	int offset = ee->offset;
 	int len = ee->len;
 	int done;
@@ -691,7 +691,7 @@ static int dm9000_set_eeprom(struct net_device *dev,
 
 static void dm9000_get_wol(struct net_device *dev, struct ethtool_wolinfo *w)
 {
-	board_info_t *dm = to_dm9000_board(dev);
+	struct board_info *dm = to_dm9000_board(dev);
 
 	memset(w, 0, sizeof(struct ethtool_wolinfo));
 
@@ -702,7 +702,7 @@ static void dm9000_get_wol(struct net_device *dev, struct ethtool_wolinfo *w)
 
 static int dm9000_set_wol(struct net_device *dev, struct ethtool_wolinfo *w)
 {
-	board_info_t *dm = to_dm9000_board(dev);
+	struct board_info *dm = to_dm9000_board(dev);
 	unsigned long flags;
 	u32 opts = w->wolopts;
 	u32 wcr = 0;
@@ -752,7 +752,7 @@ static const struct ethtool_ops dm9000_ethtool_ops = {
 	.set_eeprom		= dm9000_set_eeprom,
 };
 
-static void dm9000_show_carrier(board_info_t *db,
+static void dm9000_show_carrier(struct board_info *db,
 				unsigned carrier, unsigned nsr)
 {
 	int lpa;
@@ -775,7 +775,7 @@ static void
 dm9000_poll_work(struct work_struct *w)
 {
 	struct delayed_work *dw = to_delayed_work(w);
-	board_info_t *db = container_of(dw, board_info_t, phy_poll);
+	struct board_info *db = container_of(dw, struct board_info, phy_poll);
 	struct net_device *ndev = db->ndev;
 
 	if (db->flags & DM9000_PLATF_SIMPLE_PHY &&
@@ -843,7 +843,7 @@ static unsigned char dm9000_type_to_char(enum dm9000_type type)
 static void
 dm9000_hash_table_unlocked(struct net_device *dev)
 {
-	board_info_t *db = netdev_priv(dev);
+	struct board_info *db = netdev_priv(dev);
 	struct netdev_hw_addr *ha;
 	int i, oft;
 	u32 hash_val;
@@ -879,7 +879,7 @@ dm9000_hash_table_unlocked(struct net_device *dev)
 static void
 dm9000_hash_table(struct net_device *dev)
 {
-	board_info_t *db = netdev_priv(dev);
+	struct board_info *db = netdev_priv(dev);
 	unsigned long flags;
 
 	spin_lock_irqsave(&db->lock, flags);
@@ -888,13 +888,13 @@ dm9000_hash_table(struct net_device *dev)
 }
 
 static void
-dm9000_mask_interrupts(board_info_t *db)
+dm9000_mask_interrupts(struct board_info *db)
 {
 	iow(db, DM9000_IMR, IMR_PAR);
 }
 
 static void
-dm9000_unmask_interrupts(board_info_t *db)
+dm9000_unmask_interrupts(struct board_info *db)
 {
 	iow(db, DM9000_IMR, db->imr_all);
 }
@@ -905,7 +905,7 @@ dm9000_unmask_interrupts(board_info_t *db)
 static void
 dm9000_init_dm9000(struct net_device *dev)
 {
-	board_info_t *db = netdev_priv(dev);
+	struct board_info *db = netdev_priv(dev);
 	unsigned int imr;
 	unsigned int ncr;
 
@@ -970,7 +970,7 @@ dm9000_init_dm9000(struct net_device *dev)
 /* Our watchdog timed out. Called by the networking layer */
 static void dm9000_timeout(struct net_device *dev)
 {
-	board_info_t *db = netdev_priv(dev);
+	struct board_info *db = netdev_priv(dev);
 	u8 reg_save;
 	unsigned long flags;
 
@@ -996,7 +996,7 @@ static void dm9000_send_packet(struct net_device *dev,
 			       int ip_summed,
 			       u16 pkt_len)
 {
-	board_info_t *dm = to_dm9000_board(dev);
+	struct board_info *dm = to_dm9000_board(dev);
 
 	/* The DM9000 is not smart enough to leave fragmented packets alone. */
 	if (dm->ip_summed != ip_summed) {
@@ -1023,7 +1023,7 @@ static int
 dm9000_start_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	unsigned long flags;
-	board_info_t *db = netdev_priv(dev);
+	struct board_info *db = netdev_priv(dev);
 
 	dm9000_dbg(db, 3, "%s:\n", __func__);
 
@@ -1062,7 +1062,7 @@ dm9000_start_xmit(struct sk_buff *skb, struct net_device *dev)
  * receive the packet to upper layer, free the transmitted packet
  */
 
-static void dm9000_tx_done(struct net_device *dev, board_info_t *db)
+static void dm9000_tx_done(struct net_device *dev, struct board_info *db)
 {
 	int tx_status = ior(db, DM9000_NSR);	/* Got TX status */
 
@@ -1094,7 +1094,7 @@ struct dm9000_rxhdr {
 static void
 dm9000_rx(struct net_device *dev)
 {
-	board_info_t *db = netdev_priv(dev);
+	struct board_info *db = netdev_priv(dev);
 	struct dm9000_rxhdr rxhdr;
 	struct sk_buff *skb;
 	u8 rxbyte, *rdptr;
@@ -1196,7 +1196,7 @@ dm9000_rx(struct net_device *dev)
 static irqreturn_t dm9000_interrupt(int irq, void *dev_id)
 {
 	struct net_device *dev = dev_id;
-	board_info_t *db = netdev_priv(dev);
+	struct board_info *db = netdev_priv(dev);
 	int int_status;
 	unsigned long flags;
 	u8 reg_save;
@@ -1246,7 +1246,7 @@ static irqreturn_t dm9000_interrupt(int irq, void *dev_id)
 static irqreturn_t dm9000_wol_interrupt(int irq, void *dev_id)
 {
 	struct net_device *dev = dev_id;
-	board_info_t *db = netdev_priv(dev);
+	struct board_info *db = netdev_priv(dev);
 	unsigned long flags;
 	unsigned nsr, wcr;
 
@@ -1296,7 +1296,7 @@ static void dm9000_poll_controller(struct net_device *dev)
 static int
 dm9000_open(struct net_device *dev)
 {
-	board_info_t *db = netdev_priv(dev);
+	struct board_info *db = netdev_priv(dev);
 	unsigned long irqflags = db->irq_res->flags & IRQF_TRIGGER_MASK;
 
 	if (netif_msg_ifup(db))
@@ -1342,7 +1342,7 @@ dm9000_open(struct net_device *dev)
 static void
 dm9000_shutdown(struct net_device *dev)
 {
-	board_info_t *db = netdev_priv(dev);
+	struct board_info *db = netdev_priv(dev);
 
 	/* RESET device */
 	dm9000_phy_write(dev, 0, MII_BMCR, BMCR_RESET);	/* PHY RESET */
@@ -1358,7 +1358,7 @@ dm9000_shutdown(struct net_device *dev)
 static int
 dm9000_stop(struct net_device *ndev)
 {
-	board_info_t *db = netdev_priv(ndev);
+	struct board_info *db = netdev_priv(ndev);
 
 	if (netif_msg_ifdown(db))
 		dev_dbg(db->dev, "shutting down %s\n", ndev->name);
@@ -1681,7 +1681,7 @@ dm9000_drv_suspend(struct device *dev)
 {
 	struct platform_device *pdev = to_platform_device(dev);
 	struct net_device *ndev = platform_get_drvdata(pdev);
-	board_info_t *db;
+	struct board_info *db;
 
 	if (ndev) {
 		db = netdev_priv(ndev);
@@ -1704,7 +1704,7 @@ dm9000_drv_resume(struct device *dev)
 {
 	struct platform_device *pdev = to_platform_device(dev);
 	struct net_device *ndev = platform_get_drvdata(pdev);
-	board_info_t *db = netdev_priv(ndev);
+	struct board_info *db = netdev_priv(ndev);
 
 	if (ndev) {
 		if (netif_running(ndev)) {
diff --git a/drivers/net/ethernet/freescale/fec.h b/drivers/net/ethernet/freescale/fec.h
index bd53caf1c1eb..9f7fa644a397 100644
--- a/drivers/net/ethernet/freescale/fec.h
+++ b/drivers/net/ethernet/freescale/fec.h
@@ -310,6 +310,7 @@ struct fec_enet_private {
 	int	mii_timeout;
 	uint	phy_speed;
 	phy_interface_t	phy_interface;
+	struct device_node *phy_node;
 	int	link;
 	int	full_duplex;
 	int	speed;
diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c
index 66fe1f672499..4f87dffcb9b2 100644
--- a/drivers/net/ethernet/freescale/fec_main.c
+++ b/drivers/net/ethernet/freescale/fec_main.c
@@ -52,6 +52,7 @@
 #include <linux/of.h>
 #include <linux/of_device.h>
 #include <linux/of_gpio.h>
+#include <linux/of_mdio.h>
 #include <linux/of_net.h>
 #include <linux/regulator/consumer.h>
 #include <linux/if_vlan.h>
@@ -1648,29 +1649,37 @@ static int fec_enet_mii_probe(struct net_device *ndev)
 
 	fep->phy_dev = NULL;
 
-	/* check for attached phy */
-	for (phy_id = 0; (phy_id < PHY_MAX_ADDR); phy_id++) {
-		if ((fep->mii_bus->phy_mask & (1 << phy_id)))
-			continue;
-		if (fep->mii_bus->phy_map[phy_id] == NULL)
-			continue;
-		if (fep->mii_bus->phy_map[phy_id]->phy_id == 0)
-			continue;
-		if (dev_id--)
-			continue;
-		strncpy(mdio_bus_id, fep->mii_bus->id, MII_BUS_ID_SIZE);
-		break;
-	}
+	if (fep->phy_node) {
+		phy_dev = of_phy_connect(ndev, fep->phy_node,
+					 &fec_enet_adjust_link, 0,
+					 fep->phy_interface);
+	} else {
+		/* check for attached phy */
+		for (phy_id = 0; (phy_id < PHY_MAX_ADDR); phy_id++) {
+			if ((fep->mii_bus->phy_mask & (1 << phy_id)))
+				continue;
+			if (fep->mii_bus->phy_map[phy_id] == NULL)
+				continue;
+			if (fep->mii_bus->phy_map[phy_id]->phy_id == 0)
+				continue;
+			if (dev_id--)
+				continue;
+			strncpy(mdio_bus_id, fep->mii_bus->id, MII_BUS_ID_SIZE);
+			break;
+		}
 
-	if (phy_id >= PHY_MAX_ADDR) {
-		netdev_info(ndev, "no PHY, assuming direct connection to switch\n");
-		strncpy(mdio_bus_id, "fixed-0", MII_BUS_ID_SIZE);
-		phy_id = 0;
+		if (phy_id >= PHY_MAX_ADDR) {
+			netdev_info(ndev, "no PHY, assuming direct connection to switch\n");
+			strncpy(mdio_bus_id, "fixed-0", MII_BUS_ID_SIZE);
+			phy_id = 0;
+		}
+
+		snprintf(phy_name, sizeof(phy_name),
+			 PHY_ID_FMT, mdio_bus_id, phy_id);
+		phy_dev = phy_connect(ndev, phy_name, &fec_enet_adjust_link,
+				      fep->phy_interface);
 	}
 
-	snprintf(phy_name, sizeof(phy_name), PHY_ID_FMT, mdio_bus_id, phy_id);
-	phy_dev = phy_connect(ndev, phy_name, &fec_enet_adjust_link,
-			      fep->phy_interface);
 	if (IS_ERR(phy_dev)) {
 		netdev_err(ndev, "could not attach to PHY\n");
 		return PTR_ERR(phy_dev);
@@ -1707,6 +1716,7 @@ static int fec_enet_mii_init(struct platform_device *pdev)
 	struct fec_enet_private *fep = netdev_priv(ndev);
 	const struct platform_device_id *id_entry =
 				platform_get_device_id(fep->pdev);
+	struct device_node *node;
 	int err = -ENXIO, i;
 
 	/*
@@ -1774,7 +1784,15 @@ static int fec_enet_mii_init(struct platform_device *pdev)
 	for (i = 0; i < PHY_MAX_ADDR; i++)
 		fep->mii_bus->irq[i] = PHY_POLL;
 
-	if (mdiobus_register(fep->mii_bus))
+	node = of_get_child_by_name(pdev->dev.of_node, "mdio");
+	if (node) {
+		err = of_mdiobus_register(fep->mii_bus, node);
+		of_node_put(node);
+	} else {
+		err = mdiobus_register(fep->mii_bus);
+	}
+
+	if (err)
 		goto err_out_free_mdio_irq;
 
 	mii_cnt++;
@@ -2527,6 +2545,7 @@ fec_probe(struct platform_device *pdev)
 	struct resource *r;
 	const struct of_device_id *of_id;
 	static int dev_id;
+	struct device_node *np = pdev->dev.of_node, *phy_node;
 
 	of_id = of_match_device(fec_dt_ids, &pdev->dev);
 	if (of_id)
@@ -2566,6 +2585,18 @@ fec_probe(struct platform_device *pdev)
 
 	platform_set_drvdata(pdev, ndev);
 
+	phy_node = of_parse_phandle(np, "phy-handle", 0);
+	if (!phy_node && of_phy_is_fixed_link(np)) {
+		ret = of_phy_register_fixed_link(np);
+		if (ret < 0) {
+			dev_err(&pdev->dev,
+				"broken fixed-link specification\n");
+			goto failed_phy;
+		}
+		phy_node = of_node_get(np);
+	}
+	fep->phy_node = phy_node;
+
 	ret = of_get_phy_mode(pdev->dev.of_node);
 	if (ret < 0) {
 		pdata = dev_get_platdata(&pdev->dev);
@@ -2670,6 +2701,8 @@ failed_init:
 failed_regulator:
 	fec_enet_clk_enable(ndev, false);
 failed_clk:
+failed_phy:
+	of_node_put(phy_node);
 failed_ioremap:
 	free_netdev(ndev);
 
@@ -2691,6 +2724,7 @@ fec_drv_remove(struct platform_device *pdev)
 	if (fep->ptp_clock)
 		ptp_clock_unregister(fep->ptp_clock);
 	fec_enet_clk_enable(ndev, false);
+	of_node_put(fep->phy_node);
 	free_netdev(ndev);
 
 	return 0;
diff --git a/drivers/net/ethernet/freescale/fec_mpc52xx.c b/drivers/net/ethernet/freescale/fec_mpc52xx.c
index 9947765e90c5..ff55fbb20a75 100644
--- a/drivers/net/ethernet/freescale/fec_mpc52xx.c
+++ b/drivers/net/ethernet/freescale/fec_mpc52xx.c
@@ -1015,8 +1015,7 @@ mpc52xx_fec_remove(struct platform_device *op)
 
 	unregister_netdev(ndev);
 
-	if (priv->phy_node)
-		of_node_put(priv->phy_node);
+	of_node_put(priv->phy_node);
 	priv->phy_node = NULL;
 
 	irq_dispose_mapping(ndev->irq);
diff --git a/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c b/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c
index cfaf17b70f3f..748fd24d3d9e 100644
--- a/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c
+++ b/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c
@@ -1033,7 +1033,7 @@ static int fs_enet_probe(struct platform_device *ofdev)
 		/* In the case of a fixed PHY, the DT node associated
 		 * to the PHY is the Ethernet MAC DT node.
 		 */
-		fpi->phy_node = ofdev->dev.of_node;
+		fpi->phy_node = of_node_get(ofdev->dev.of_node);
 	}
 
 	if (of_device_is_compatible(ofdev->dev.of_node, "fsl,mpc5125-fec")) {
diff --git a/drivers/net/ethernet/freescale/gianfar.c b/drivers/net/ethernet/freescale/gianfar.c
index a6cf40e62f3a..fb29d049f4e1 100644
--- a/drivers/net/ethernet/freescale/gianfar.c
+++ b/drivers/net/ethernet/freescale/gianfar.c
@@ -892,12 +892,12 @@ static int gfar_of_init(struct platform_device *ofdev, struct net_device **pdev)
 	/* In the case of a fixed PHY, the DT node associated
 	 * to the PHY is the Ethernet MAC DT node.
 	 */
-	if (of_phy_is_fixed_link(np)) {
+	if (!priv->phy_node && of_phy_is_fixed_link(np)) {
 		err = of_phy_register_fixed_link(np);
 		if (err)
 			goto err_grp_init;
 
-		priv->phy_node = np;
+		priv->phy_node = of_node_get(np);
 	}
 
 	/* Find the TBI PHY.  If it's not there, we don't support SGMII */
@@ -1435,10 +1435,8 @@ register_fail:
 	unmap_group_regs(priv);
 	gfar_free_rx_queues(priv);
 	gfar_free_tx_queues(priv);
-	if (priv->phy_node)
-		of_node_put(priv->phy_node);
-	if (priv->tbi_node)
-		of_node_put(priv->tbi_node);
+	of_node_put(priv->phy_node);
+	of_node_put(priv->tbi_node);
 	free_gfar_dev(priv);
 	return err;
 }
@@ -1447,10 +1445,8 @@ static int gfar_remove(struct platform_device *ofdev)
 {
 	struct gfar_private *priv = platform_get_drvdata(ofdev);
 
-	if (priv->phy_node)
-		of_node_put(priv->phy_node);
-	if (priv->tbi_node)
-		of_node_put(priv->tbi_node);
+	of_node_put(priv->phy_node);
+	of_node_put(priv->tbi_node);
 
 	unregister_netdev(priv->ndev);
 	unmap_group_regs(priv);
diff --git a/drivers/net/ethernet/freescale/ucc_geth.c b/drivers/net/ethernet/freescale/ucc_geth.c
index 8ceaf7a2660c..3cf0478b3728 100644
--- a/drivers/net/ethernet/freescale/ucc_geth.c
+++ b/drivers/net/ethernet/freescale/ucc_geth.c
@@ -3785,16 +3785,15 @@ static int ucc_geth_probe(struct platform_device* ofdev)
 	ug_info->uf_info.irq = irq_of_parse_and_map(np, 0);
 
 	ug_info->phy_node = of_parse_phandle(np, "phy-handle", 0);
-	if (!ug_info->phy_node) {
-		/* In the case of a fixed PHY, the DT node associated
+	if (!ug_info->phy_node && of_phy_is_fixed_link(np)) {
+		/*
+		 * In the case of a fixed PHY, the DT node associated
 		 * to the PHY is the Ethernet MAC DT node.
 		 */
-		if (of_phy_is_fixed_link(np)) {
-			err = of_phy_register_fixed_link(np);
-			if (err)
-				return err;
-		}
-		ug_info->phy_node = np;
+		err = of_phy_register_fixed_link(np);
+		if (err)
+			return err;
+		ug_info->phy_node = of_node_get(np);
 	}
 
 	/* Find the TBI PHY node.  If it's not there, we don't support SGMII */
@@ -3862,8 +3861,11 @@ static int ucc_geth_probe(struct platform_device* ofdev)
 	/* Create an ethernet device instance */
 	dev = alloc_etherdev(sizeof(*ugeth));
 
-	if (dev == NULL)
+	if (dev == NULL) {
+		of_node_put(ug_info->tbi_node);
+		of_node_put(ug_info->phy_node);
 		return -ENOMEM;
+	}
 
 	ugeth = netdev_priv(dev);
 	spin_lock_init(&ugeth->lock);
@@ -3897,6 +3899,8 @@ static int ucc_geth_probe(struct platform_device* ofdev)
 			pr_err("%s: Cannot register net device, aborting\n",
 			       dev->name);
 		free_netdev(dev);
+		of_node_put(ug_info->tbi_node);
+		of_node_put(ug_info->phy_node);
 		return err;
 	}
 
@@ -3920,6 +3924,8 @@ static int ucc_geth_remove(struct platform_device* ofdev)
 	unregister_netdev(dev);
 	free_netdev(dev);
 	ucc_geth_memclean(ugeth);
+	of_node_put(ugeth->ug_info->tbi_node);
+	of_node_put(ugeth->ug_info->phy_node);
 
 	return 0;
 }
diff --git a/drivers/net/ethernet/fujitsu/fmvj18x_cs.c b/drivers/net/ethernet/fujitsu/fmvj18x_cs.c
index cfe7a7431730..a7139f588ad2 100644
--- a/drivers/net/ethernet/fujitsu/fmvj18x_cs.c
+++ b/drivers/net/ethernet/fujitsu/fmvj18x_cs.c
@@ -99,23 +99,23 @@ static const struct ethtool_ops netdev_ethtool_ops;
 /*
     card type
  */
-typedef enum { MBH10302, MBH10304, TDK, CONTEC, LA501, UNGERMANN, 
+enum cardtype { MBH10302, MBH10304, TDK, CONTEC, LA501, UNGERMANN,
 	       XXX10304, NEC, KME
-} cardtype_t;
+};
 
 /*
     driver specific data structure
 */
-typedef struct local_info_t {
+struct local_info {
 	struct pcmcia_device	*p_dev;
     long open_time;
     uint tx_started:1;
     uint tx_queue;
     u_short tx_queue_len;
-    cardtype_t cardtype;
+    enum cardtype cardtype;
     u_short sent;
     u_char __iomem *base;
-} local_info_t;
+};
 
 #define MC_FILTERBREAK 64
 
@@ -232,13 +232,13 @@ static const struct net_device_ops fjn_netdev_ops = {
 
 static int fmvj18x_probe(struct pcmcia_device *link)
 {
-    local_info_t *lp;
+    struct local_info *lp;
     struct net_device *dev;
 
     dev_dbg(&link->dev, "fmvj18x_attach()\n");
 
     /* Make up a FMVJ18x specific data structure */
-    dev = alloc_etherdev(sizeof(local_info_t));
+    dev = alloc_etherdev(sizeof(struct local_info));
     if (!dev)
 	return -ENOMEM;
     lp = netdev_priv(dev);
@@ -327,10 +327,10 @@ static int fmvj18x_ioprobe(struct pcmcia_device *p_dev, void *priv_data)
 static int fmvj18x_config(struct pcmcia_device *link)
 {
     struct net_device *dev = link->priv;
-    local_info_t *lp = netdev_priv(dev);
+    struct local_info *lp = netdev_priv(dev);
     int i, ret;
     unsigned int ioaddr;
-    cardtype_t cardtype;
+    enum cardtype cardtype;
     char *card_name = "unknown";
     u8 *buf;
     size_t len;
@@ -584,7 +584,7 @@ static int fmvj18x_setup_mfc(struct pcmcia_device *link)
     int i;
     struct net_device *dev = link->priv;
     unsigned int ioaddr;
-    local_info_t *lp = netdev_priv(dev);
+    struct local_info *lp = netdev_priv(dev);
 
     /* Allocate a small memory window */
     link->resource[3]->flags = WIN_DATA_WIDTH_8|WIN_MEMORY_TYPE_AM|WIN_ENABLE;
@@ -626,7 +626,7 @@ static void fmvj18x_release(struct pcmcia_device *link)
 {
 
     struct net_device *dev = link->priv;
-    local_info_t *lp = netdev_priv(dev);
+    struct local_info *lp = netdev_priv(dev);
     u_char __iomem *tmp;
 
     dev_dbg(&link->dev, "fmvj18x_release\n");
@@ -711,7 +711,7 @@ module_pcmcia_driver(fmvj18x_cs_driver);
 static irqreturn_t fjn_interrupt(int dummy, void *dev_id)
 {
     struct net_device *dev = dev_id;
-    local_info_t *lp = netdev_priv(dev);
+    struct local_info *lp = netdev_priv(dev);
     unsigned int ioaddr;
     unsigned short tx_stat, rx_stat;
 
@@ -772,7 +772,7 @@ static irqreturn_t fjn_interrupt(int dummy, void *dev_id)
 
 static void fjn_tx_timeout(struct net_device *dev)
 {
-    struct local_info_t *lp = netdev_priv(dev);
+    struct local_info *lp = netdev_priv(dev);
     unsigned int ioaddr = dev->base_addr;
 
     netdev_notice(dev, "transmit timed out with status %04x, %s?\n",
@@ -802,7 +802,7 @@ static void fjn_tx_timeout(struct net_device *dev)
 static netdev_tx_t fjn_start_xmit(struct sk_buff *skb,
 					struct net_device *dev)
 {
-    struct local_info_t *lp = netdev_priv(dev);
+    struct local_info *lp = netdev_priv(dev);
     unsigned int ioaddr = dev->base_addr;
     short length = skb->len;
     
@@ -874,7 +874,7 @@ static netdev_tx_t fjn_start_xmit(struct sk_buff *skb,
 
 static void fjn_reset(struct net_device *dev)
 {
-    struct local_info_t *lp = netdev_priv(dev);
+    struct local_info *lp = netdev_priv(dev);
     unsigned int ioaddr = dev->base_addr;
     int i;
 
@@ -1058,7 +1058,7 @@ static int fjn_config(struct net_device *dev, struct ifmap *map){
 
 static int fjn_open(struct net_device *dev)
 {
-    struct local_info_t *lp = netdev_priv(dev);
+    struct local_info *lp = netdev_priv(dev);
     struct pcmcia_device *link = lp->p_dev;
 
     pr_debug("fjn_open('%s').\n", dev->name);
@@ -1083,7 +1083,7 @@ static int fjn_open(struct net_device *dev)
 
 static int fjn_close(struct net_device *dev)
 {
-    struct local_info_t *lp = netdev_priv(dev);
+    struct local_info *lp = netdev_priv(dev);
     struct pcmcia_device *link = lp->p_dev;
     unsigned int ioaddr = dev->base_addr;
 
diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c
index dadd9a5f6323..c9f1d1b7ef37 100644
--- a/drivers/net/ethernet/marvell/mvneta.c
+++ b/drivers/net/ethernet/marvell/mvneta.c
@@ -2969,14 +2969,14 @@ static int mvneta_probe(struct platform_device *pdev)
 		/* In the case of a fixed PHY, the DT node associated
 		 * to the PHY is the Ethernet MAC DT node.
 		 */
-		phy_node = dn;
+		phy_node = of_node_get(dn);
 	}
 
 	phy_mode = of_get_phy_mode(dn);
 	if (phy_mode < 0) {
 		dev_err(&pdev->dev, "incorrect phy-mode\n");
 		err = -EINVAL;
-		goto err_free_irq;
+		goto err_put_phy_node;
 	}
 
 	dev->tx_queue_len = MVNETA_MAX_TXD;
@@ -2992,7 +2992,7 @@ static int mvneta_probe(struct platform_device *pdev)
 	pp->clk = devm_clk_get(&pdev->dev, NULL);
 	if (IS_ERR(pp->clk)) {
 		err = PTR_ERR(pp->clk);
-		goto err_free_irq;
+		goto err_put_phy_node;
 	}
 
 	clk_prepare_enable(pp->clk);
@@ -3071,6 +3071,8 @@ err_free_stats:
 	free_percpu(pp->stats);
 err_clk:
 	clk_disable_unprepare(pp->clk);
+err_put_phy_node:
+	of_node_put(phy_node);
 err_free_irq:
 	irq_dispose_mapping(dev->irq);
 err_free_netdev:
@@ -3088,6 +3090,7 @@ static int mvneta_remove(struct platform_device *pdev)
 	clk_disable_unprepare(pp->clk);
 	free_percpu(pp->stats);
 	irq_dispose_mapping(dev->irq);
+	of_node_put(pp->phy_node);
 	free_netdev(dev);
 
 	return 0;
diff --git a/drivers/net/ethernet/myricom/myri10ge/myri10ge.c b/drivers/net/ethernet/myricom/myri10ge/myri10ge.c
index f3d5d79f1cd1..69c26f04d8ce 100644
--- a/drivers/net/ethernet/myricom/myri10ge/myri10ge.c
+++ b/drivers/net/ethernet/myricom/myri10ge/myri10ge.c
@@ -574,6 +574,7 @@ myri10ge_validate_firmware(struct myri10ge_priv *mgp,
 
 	/* save firmware version for ethtool */
 	strncpy(mgp->fw_version, hdr->version, sizeof(mgp->fw_version));
+	mgp->fw_version[sizeof(mgp->fw_version) - 1] = '\0';
 
 	sscanf(mgp->fw_version, "%d.%d.%d", &mgp->fw_ver_major,
 	       &mgp->fw_ver_minor, &mgp->fw_ver_tiny);
diff --git a/drivers/net/ethernet/qlogic/qlcnic/Makefile b/drivers/net/ethernet/qlogic/qlcnic/Makefile
index a848d2979722..3c2c2c7c1559 100644
--- a/drivers/net/ethernet/qlogic/qlcnic/Makefile
+++ b/drivers/net/ethernet/qlogic/qlcnic/Makefile
@@ -8,7 +8,7 @@ qlcnic-y := qlcnic_hw.o qlcnic_main.o qlcnic_init.o \
 	qlcnic_ethtool.o qlcnic_ctx.o qlcnic_io.o \
 	qlcnic_sysfs.o qlcnic_minidump.o qlcnic_83xx_hw.o \
 	qlcnic_83xx_init.o qlcnic_83xx_vnic.o \
-	qlcnic_minidump.o qlcnic_sriov_common.o
+	qlcnic_sriov_common.o
 
 qlcnic-$(CONFIG_QLCNIC_SRIOV) += qlcnic_sriov_pf.o
 
diff --git a/drivers/net/ethernet/smsc/smsc911x.h b/drivers/net/ethernet/smsc/smsc911x.h
index 23953957fed8..54d648920a1b 100644
--- a/drivers/net/ethernet/smsc/smsc911x.h
+++ b/drivers/net/ethernet/smsc/smsc911x.h
@@ -51,7 +51,7 @@
 
 #ifdef CONFIG_DEBUG_SPINLOCK
 #define SMSC_ASSERT_MAC_LOCK(pdata) \
-		WARN_ON(!spin_is_locked(&pdata->mac_lock))
+		WARN_ON_SMP(!spin_is_locked(&pdata->mac_lock))
 #else
 #define SMSC_ASSERT_MAC_LOCK(pdata) do {} while (0)
 #endif				/* CONFIG_DEBUG_SPINLOCK */
diff --git a/drivers/net/ethernet/ti/cpmac.c b/drivers/net/ethernet/ti/cpmac.c
index 3809f4ec2820..f9bcf7aa88ca 100644
--- a/drivers/net/ethernet/ti/cpmac.c
+++ b/drivers/net/ethernet/ti/cpmac.c
@@ -1130,6 +1130,7 @@ static int cpmac_probe(struct platform_device *pdev)
 		strncpy(mdio_bus_id, "fixed-0", MII_BUS_ID_SIZE); /* fixed phys bus */
 		phy_id = pdev->id;
 	}
+	mdio_bus_id[sizeof(mdio_bus_id) - 1] = '\0';
 
 	dev = alloc_etherdev_mq(sizeof(*priv), CPMAC_QUEUES);
 	if (!dev)
diff --git a/drivers/net/ethernet/xilinx/ll_temac_main.c b/drivers/net/ethernet/xilinx/ll_temac_main.c
index 8a6e5c2d6f95..36f4459520c3 100644
--- a/drivers/net/ethernet/xilinx/ll_temac_main.c
+++ b/drivers/net/ethernet/xilinx/ll_temac_main.c
@@ -1148,8 +1148,7 @@ static int temac_of_remove(struct platform_device *op)
 	temac_mdio_teardown(lp);
 	unregister_netdev(ndev);
 	sysfs_remove_group(&lp->dev->kobj, &temac_attr_group);
-	if (lp->phy_node)
-		of_node_put(lp->phy_node);
+	of_node_put(lp->phy_node);
 	lp->phy_node = NULL;
 	iounmap(lp->regs);
 	if (lp->sdma_regs)
diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
index 7b0a73556264..30e8608ff050 100644
--- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
+++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
@@ -1630,8 +1630,7 @@ static int axienet_of_remove(struct platform_device *op)
 	axienet_mdio_teardown(lp);
 	unregister_netdev(ndev);
 
-	if (lp->phy_node)
-		of_node_put(lp->phy_node);
+	of_node_put(lp->phy_node);
 	lp->phy_node = NULL;
 
 	iounmap(lp->regs);
diff --git a/drivers/net/ethernet/xircom/xirc2ps_cs.c b/drivers/net/ethernet/xircom/xirc2ps_cs.c
index 7c81ffb861e8..d56f8693202b 100644
--- a/drivers/net/ethernet/xircom/xirc2ps_cs.c
+++ b/drivers/net/ethernet/xircom/xirc2ps_cs.c
@@ -266,7 +266,7 @@ static void xirc2ps_detach(struct pcmcia_device *p_dev);
 
 static irqreturn_t xirc2ps_interrupt(int irq, void *dev_id);
 
-typedef struct local_info_t {
+struct local_info {
 	struct net_device	*dev;
 	struct pcmcia_device	*p_dev;
 
@@ -281,7 +281,7 @@ typedef struct local_info_t {
     unsigned last_ptr_value; /* last packets transmitted value */
     const char *manf_str;
     struct work_struct tx_timeout_task;
-} local_info_t;
+};
 
 /****************
  * Some more prototypes
@@ -475,12 +475,12 @@ static int
 xirc2ps_probe(struct pcmcia_device *link)
 {
     struct net_device *dev;
-    local_info_t *local;
+    struct local_info *local;
 
     dev_dbg(&link->dev, "attach()\n");
 
     /* Allocate the device structure */
-    dev = alloc_etherdev(sizeof(local_info_t));
+    dev = alloc_etherdev(sizeof(struct local_info));
     if (!dev)
 	    return -ENOMEM;
     local = netdev_priv(dev);
@@ -536,7 +536,7 @@ static int
 set_card_type(struct pcmcia_device *link)
 {
     struct net_device *dev = link->priv;
-    local_info_t *local = netdev_priv(dev);
+    struct local_info *local = netdev_priv(dev);
     u8 *buf;
     unsigned int cisrev, mediaid, prodid;
     size_t len;
@@ -690,7 +690,7 @@ static int
 xirc2ps_config(struct pcmcia_device * link)
 {
     struct net_device *dev = link->priv;
-    local_info_t *local = netdev_priv(dev);
+    struct local_info *local = netdev_priv(dev);
     unsigned int ioaddr;
     int err;
     u8 *buf;
@@ -931,7 +931,7 @@ xirc2ps_release(struct pcmcia_device *link)
 
 	if (link->resource[2]->end) {
 		struct net_device *dev = link->priv;
-		local_info_t *local = netdev_priv(dev);
+		struct local_info *local = netdev_priv(dev);
 		if (local->dingo)
 			iounmap(local->dingo_ccr - 0x0800);
 	}
@@ -975,7 +975,7 @@ static irqreturn_t
 xirc2ps_interrupt(int irq, void *dev_id)
 {
     struct net_device *dev = (struct net_device *)dev_id;
-    local_info_t *lp = netdev_priv(dev);
+    struct local_info *lp = netdev_priv(dev);
     unsigned int ioaddr;
     u_char saved_page;
     unsigned bytes_rcvd;
@@ -1194,8 +1194,8 @@ xirc2ps_interrupt(int irq, void *dev_id)
 static void
 xirc2ps_tx_timeout_task(struct work_struct *work)
 {
-	local_info_t *local =
-		container_of(work, local_info_t, tx_timeout_task);
+	struct local_info *local =
+		container_of(work, struct local_info, tx_timeout_task);
 	struct net_device *dev = local->dev;
     /* reset the card */
     do_reset(dev,1);
@@ -1206,7 +1206,7 @@ xirc2ps_tx_timeout_task(struct work_struct *work)
 static void
 xirc_tx_timeout(struct net_device *dev)
 {
-    local_info_t *lp = netdev_priv(dev);
+    struct local_info *lp = netdev_priv(dev);
     dev->stats.tx_errors++;
     netdev_notice(dev, "transmit timed out\n");
     schedule_work(&lp->tx_timeout_task);
@@ -1215,7 +1215,7 @@ xirc_tx_timeout(struct net_device *dev)
 static netdev_tx_t
 do_start_xmit(struct sk_buff *skb, struct net_device *dev)
 {
-    local_info_t *lp = netdev_priv(dev);
+    struct local_info *lp = netdev_priv(dev);
     unsigned int ioaddr = dev->base_addr;
     int okay;
     unsigned freespace;
@@ -1300,7 +1300,7 @@ static void set_address(struct set_address_info *sa_info, char *addr)
 static void set_addresses(struct net_device *dev)
 {
 	unsigned int ioaddr = dev->base_addr;
-	local_info_t *lp = netdev_priv(dev);
+	struct local_info *lp = netdev_priv(dev);
 	struct netdev_hw_addr *ha;
 	struct set_address_info sa_info;
 	int i;
@@ -1362,7 +1362,7 @@ set_multicast_list(struct net_device *dev)
 static int
 do_config(struct net_device *dev, struct ifmap *map)
 {
-    local_info_t *local = netdev_priv(dev);
+    struct local_info *local = netdev_priv(dev);
 
     pr_debug("do_config(%p)\n", dev);
     if (map->port != 255 && map->port != dev->if_port) {
@@ -1387,7 +1387,7 @@ do_config(struct net_device *dev, struct ifmap *map)
 static int
 do_open(struct net_device *dev)
 {
-    local_info_t *lp = netdev_priv(dev);
+    struct local_info *lp = netdev_priv(dev);
     struct pcmcia_device *link = lp->p_dev;
 
     dev_dbg(&link->dev, "do_open(%p)\n", dev);
@@ -1421,7 +1421,7 @@ static const struct ethtool_ops netdev_ethtool_ops = {
 static int
 do_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 {
-    local_info_t *local = netdev_priv(dev);
+    struct local_info *local = netdev_priv(dev);
     unsigned int ioaddr = dev->base_addr;
     struct mii_ioctl_data *data = if_mii(rq);
 
@@ -1453,7 +1453,7 @@ do_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 static void
 hardreset(struct net_device *dev)
 {
-    local_info_t *local = netdev_priv(dev);
+    struct local_info *local = netdev_priv(dev);
     unsigned int ioaddr = dev->base_addr;
 
     SelectPage(4);
@@ -1470,7 +1470,7 @@ hardreset(struct net_device *dev)
 static void
 do_reset(struct net_device *dev, int full)
 {
-    local_info_t *local = netdev_priv(dev);
+    struct local_info *local = netdev_priv(dev);
     unsigned int ioaddr = dev->base_addr;
     unsigned value;
 
@@ -1631,7 +1631,7 @@ do_reset(struct net_device *dev, int full)
 static int
 init_mii(struct net_device *dev)
 {
-    local_info_t *local = netdev_priv(dev);
+    struct local_info *local = netdev_priv(dev);
     unsigned int ioaddr = dev->base_addr;
     unsigned control, status, linkpartner;
     int i;
@@ -1715,7 +1715,7 @@ static int
 do_stop(struct net_device *dev)
 {
     unsigned int ioaddr = dev->base_addr;
-    local_info_t *lp = netdev_priv(dev);
+    struct local_info *lp = netdev_priv(dev);
     struct pcmcia_device *link = lp->p_dev;
 
     dev_dbg(&link->dev, "do_stop(%p)\n", dev);
diff --git a/drivers/net/wan/hdlc_fr.c b/drivers/net/wan/hdlc_fr.c
index 7cc64eac0fa3..e5c7e6165a4b 100644
--- a/drivers/net/wan/hdlc_fr.c
+++ b/drivers/net/wan/hdlc_fr.c
@@ -90,7 +90,7 @@
 #define LMI_ANSI_LENGTH		  14
 
 
-typedef struct {
+struct fr_hdr {
 #if defined(__LITTLE_ENDIAN_BITFIELD)
 	unsigned ea1:	1;
 	unsigned cr:	1;
@@ -112,14 +112,14 @@ typedef struct {
 	unsigned de:	1;
 	unsigned ea2:	1;
 #endif
-}__packed fr_hdr;
+} __packed;
 
 
-typedef struct pvc_device_struct {
+struct pvc_device {
 	struct net_device *frad;
 	struct net_device *main;
 	struct net_device *ether;	/* bridged Ethernet interface	*/
-	struct pvc_device_struct *next;	/* Sorted in ascending DLCI order */
+	struct pvc_device *next;	/* Sorted in ascending DLCI order */
 	int dlci;
 	int open_count;
 
@@ -132,11 +132,11 @@ typedef struct pvc_device_struct {
 		unsigned int becn: 1;
 		unsigned int bandwidth;	/* Cisco LMI reporting only */
 	}state;
-}pvc_device;
+};
 
 struct frad_state {
 	fr_proto settings;
-	pvc_device *first_pvc;
+	struct pvc_device *first_pvc;
 	int dce_pvc_count;
 
 	struct timer_list timer;
@@ -174,9 +174,9 @@ static inline struct frad_state* state(hdlc_device *hdlc)
 }
 
 
-static inline pvc_device* find_pvc(hdlc_device *hdlc, u16 dlci)
+static inline struct pvc_device *find_pvc(hdlc_device *hdlc, u16 dlci)
 {
-	pvc_device *pvc = state(hdlc)->first_pvc;
+	struct pvc_device *pvc = state(hdlc)->first_pvc;
 
 	while (pvc) {
 		if (pvc->dlci == dlci)
@@ -190,10 +190,10 @@ static inline pvc_device* find_pvc(hdlc_device *hdlc, u16 dlci)
 }
 
 
-static pvc_device* add_pvc(struct net_device *dev, u16 dlci)
+static struct pvc_device *add_pvc(struct net_device *dev, u16 dlci)
 {
 	hdlc_device *hdlc = dev_to_hdlc(dev);
-	pvc_device *pvc, **pvc_p = &state(hdlc)->first_pvc;
+	struct pvc_device *pvc, **pvc_p = &state(hdlc)->first_pvc;
 
 	while (*pvc_p) {
 		if ((*pvc_p)->dlci == dlci)
@@ -203,7 +203,7 @@ static pvc_device* add_pvc(struct net_device *dev, u16 dlci)
 		pvc_p = &(*pvc_p)->next;
 	}
 
-	pvc = kzalloc(sizeof(pvc_device), GFP_ATOMIC);
+	pvc = kzalloc(sizeof(*pvc), GFP_ATOMIC);
 #ifdef DEBUG_PVC
 	printk(KERN_DEBUG "add_pvc: allocated pvc %p, frad %p\n", pvc, dev);
 #endif
@@ -218,13 +218,13 @@ static pvc_device* add_pvc(struct net_device *dev, u16 dlci)
 }
 
 
-static inline int pvc_is_used(pvc_device *pvc)
+static inline int pvc_is_used(struct pvc_device *pvc)
 {
 	return pvc->main || pvc->ether;
 }
 
 
-static inline void pvc_carrier(int on, pvc_device *pvc)
+static inline void pvc_carrier(int on, struct pvc_device *pvc)
 {
 	if (on) {
 		if (pvc->main)
@@ -246,11 +246,11 @@ static inline void pvc_carrier(int on, pvc_device *pvc)
 
 static inline void delete_unused_pvcs(hdlc_device *hdlc)
 {
-	pvc_device **pvc_p = &state(hdlc)->first_pvc;
+	struct pvc_device **pvc_p = &state(hdlc)->first_pvc;
 
 	while (*pvc_p) {
 		if (!pvc_is_used(*pvc_p)) {
-			pvc_device *pvc = *pvc_p;
+			struct pvc_device *pvc = *pvc_p;
 #ifdef DEBUG_PVC
 			printk(KERN_DEBUG "freeing unused pvc: %p\n", pvc);
 #endif
@@ -263,7 +263,8 @@ static inline void delete_unused_pvcs(hdlc_device *hdlc)
 }
 
 
-static inline struct net_device** get_dev_p(pvc_device *pvc, int type)
+static inline struct net_device **get_dev_p(struct pvc_device *pvc,
+					    int type)
 {
 	if (type == ARPHRD_ETHER)
 		return &pvc->ether;
@@ -342,7 +343,7 @@ static int fr_hard_header(struct sk_buff **skb_p, u16 dlci)
 
 static int pvc_open(struct net_device *dev)
 {
-	pvc_device *pvc = dev->ml_priv;
+	struct pvc_device *pvc = dev->ml_priv;
 
 	if ((pvc->frad->flags & IFF_UP) == 0)
 		return -EIO;  /* Frad must be UP in order to activate PVC */
@@ -362,7 +363,7 @@ static int pvc_open(struct net_device *dev)
 
 static int pvc_close(struct net_device *dev)
 {
-	pvc_device *pvc = dev->ml_priv;
+	struct pvc_device *pvc = dev->ml_priv;
 
 	if (--pvc->open_count == 0) {
 		hdlc_device *hdlc = dev_to_hdlc(pvc->frad);
@@ -381,7 +382,7 @@ static int pvc_close(struct net_device *dev)
 
 static int pvc_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 {
-	pvc_device *pvc = dev->ml_priv;
+	struct pvc_device *pvc = dev->ml_priv;
 	fr_proto_pvc_info info;
 
 	if (ifr->ifr_settings.type == IF_GET_PROTO) {
@@ -409,7 +410,7 @@ static int pvc_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 
 static netdev_tx_t pvc_xmit(struct sk_buff *skb, struct net_device *dev)
 {
-	pvc_device *pvc = dev->ml_priv;
+	struct pvc_device *pvc = dev->ml_priv;
 
 	if (pvc->state.active) {
 		if (dev->type == ARPHRD_ETHER) {
@@ -444,7 +445,7 @@ static netdev_tx_t pvc_xmit(struct sk_buff *skb, struct net_device *dev)
 	return NETDEV_TX_OK;
 }
 
-static inline void fr_log_dlci_active(pvc_device *pvc)
+static inline void fr_log_dlci_active(struct pvc_device *pvc)
 {
 	netdev_info(pvc->frad, "DLCI %d [%s%s%s]%s %s\n",
 		    pvc->dlci,
@@ -469,7 +470,7 @@ static void fr_lmi_send(struct net_device *dev, int fullrep)
 {
 	hdlc_device *hdlc = dev_to_hdlc(dev);
 	struct sk_buff *skb;
-	pvc_device *pvc = state(hdlc)->first_pvc;
+	struct pvc_device *pvc = state(hdlc)->first_pvc;
 	int lmi = state(hdlc)->settings.lmi;
 	int dce = state(hdlc)->settings.dce;
 	int len = lmi == LMI_ANSI ? LMI_ANSI_LENGTH : LMI_CCITT_CISCO_LENGTH;
@@ -566,7 +567,7 @@ static void fr_lmi_send(struct net_device *dev, int fullrep)
 static void fr_set_link_state(int reliable, struct net_device *dev)
 {
 	hdlc_device *hdlc = dev_to_hdlc(dev);
-	pvc_device *pvc = state(hdlc)->first_pvc;
+	struct pvc_device *pvc = state(hdlc)->first_pvc;
 
 	state(hdlc)->reliable = reliable;
 	if (reliable) {
@@ -652,7 +653,7 @@ static void fr_timer(unsigned long arg)
 static int fr_lmi_recv(struct net_device *dev, struct sk_buff *skb)
 {
 	hdlc_device *hdlc = dev_to_hdlc(dev);
-	pvc_device *pvc;
+	struct pvc_device *pvc;
 	u8 rxseq, txseq;
 	int lmi = state(hdlc)->settings.lmi;
 	int dce = state(hdlc)->settings.dce;
@@ -869,10 +870,10 @@ static int fr_rx(struct sk_buff *skb)
 {
 	struct net_device *frad = skb->dev;
 	hdlc_device *hdlc = dev_to_hdlc(frad);
-	fr_hdr *fh = (fr_hdr*)skb->data;
+	struct fr_hdr *fh = (struct fr_hdr *)skb->data;
 	u8 *data = skb->data;
 	u16 dlci;
-	pvc_device *pvc;
+	struct pvc_device *pvc;
 	struct net_device *dev = NULL;
 
 	if (skb->len <= 4 || fh->ea1 || data[2] != FR_UI)
@@ -1028,7 +1029,7 @@ static void fr_stop(struct net_device *dev)
 static void fr_close(struct net_device *dev)
 {
 	hdlc_device *hdlc = dev_to_hdlc(dev);
-	pvc_device *pvc = state(hdlc)->first_pvc;
+	struct pvc_device *pvc = state(hdlc)->first_pvc;
 
 	while (pvc) {		/* Shutdown all PVCs for this FRAD */
 		if (pvc->main)
@@ -1060,7 +1061,7 @@ static const struct net_device_ops pvc_ops = {
 static int fr_add_pvc(struct net_device *frad, unsigned int dlci, int type)
 {
 	hdlc_device *hdlc = dev_to_hdlc(frad);
-	pvc_device *pvc;
+	struct pvc_device *pvc;
 	struct net_device *dev;
 	int used;
 
@@ -1117,7 +1118,7 @@ static int fr_add_pvc(struct net_device *frad, unsigned int dlci, int type)
 
 static int fr_del_pvc(hdlc_device *hdlc, unsigned int dlci, int type)
 {
-	pvc_device *pvc;
+	struct pvc_device *pvc;
 	struct net_device *dev;
 
 	if ((pvc = find_pvc(hdlc, dlci)) == NULL)
@@ -1145,13 +1146,13 @@ static int fr_del_pvc(hdlc_device *hdlc, unsigned int dlci, int type)
 static void fr_destroy(struct net_device *frad)
 {
 	hdlc_device *hdlc = dev_to_hdlc(frad);
-	pvc_device *pvc = state(hdlc)->first_pvc;
+	struct pvc_device *pvc = state(hdlc)->first_pvc;
 	state(hdlc)->first_pvc = NULL; /* All PVCs destroyed */
 	state(hdlc)->dce_pvc_count = 0;
 	state(hdlc)->dce_changed = 1;
 
 	while (pvc) {
-		pvc_device *next = pvc->next;
+		struct pvc_device *next = pvc->next;
 		/* destructors will free_netdev() main and ether */
 		if (pvc->main)
 			unregister_netdevice(pvc->main);
diff --git a/drivers/net/wan/wanxl.c b/drivers/net/wan/wanxl.c
index f76aa9081585..1287c3eb33c2 100644
--- a/drivers/net/wan/wanxl.c
+++ b/drivers/net/wan/wanxl.c
@@ -54,24 +54,24 @@ static const char* version = "wanXL serial card driver version: 0.48";
 #define MBX2_MEMSZ_MASK 0xFFFF0000 /* PUTS Memory Size Register mask */
 
 
-typedef struct {
+struct port {
 	struct net_device *dev;
-	struct card_t *card;
+	struct card *card;
 	spinlock_t lock;	/* for wanxl_xmit */
         int node;		/* physical port #0 - 3 */
 	unsigned int clock_type;
 	int tx_in, tx_out;
 	struct sk_buff *tx_skbs[TX_BUFFERS];
-}port_t;
+};
 
 
-typedef struct {
+struct card_status {
 	desc_t rx_descs[RX_QUEUE_LENGTH];
 	port_status_t port_status[4];
-}card_status_t;
+};
 
 
-typedef struct card_t {
+struct card {
 	int n_ports;		/* 1, 2 or 4 ports */
 	u8 irq;
 
@@ -79,20 +79,20 @@ typedef struct card_t {
 	struct pci_dev *pdev;	/* for pci_name(pdev) */
 	int rx_in;
 	struct sk_buff *rx_skbs[RX_QUEUE_LENGTH];
-	card_status_t *status;	/* shared between host and card */
+	struct card_status *status;	/* shared between host and card */
 	dma_addr_t status_address;
-	port_t ports[0];	/* 1 - 4 port_t structures follow */
-}card_t;
+	struct port ports[0];	/* 1 - 4 port structures follow */
+};
 
 
 
-static inline port_t* dev_to_port(struct net_device *dev)
+static inline struct port *dev_to_port(struct net_device *dev)
 {
-        return (port_t *)dev_to_hdlc(dev)->priv;
+	return (struct port *)dev_to_hdlc(dev)->priv;
 }
 
 
-static inline port_status_t* get_status(port_t *port)
+static inline port_status_t *get_status(struct port *port)
 {
 	return &port->card->status->port_status[port->node];
 }
@@ -115,7 +115,7 @@ static inline dma_addr_t pci_map_single_debug(struct pci_dev *pdev, void *ptr,
 
 
 /* Cable and/or personality module change interrupt service */
-static inline void wanxl_cable_intr(port_t *port)
+static inline void wanxl_cable_intr(struct port *port)
 {
 	u32 value = get_status(port)->cable;
 	int valid = 1;
@@ -160,7 +160,7 @@ static inline void wanxl_cable_intr(port_t *port)
 
 
 /* Transmit complete interrupt service */
-static inline void wanxl_tx_intr(port_t *port)
+static inline void wanxl_tx_intr(struct port *port)
 {
 	struct net_device *dev = port->dev;
 	while (1) {
@@ -193,7 +193,7 @@ static inline void wanxl_tx_intr(port_t *port)
 
 
 /* Receive complete interrupt service */
-static inline void wanxl_rx_intr(card_t *card)
+static inline void wanxl_rx_intr(struct card *card)
 {
 	desc_t *desc;
 	while (desc = &card->status->rx_descs[card->rx_in],
@@ -203,7 +203,7 @@ static inline void wanxl_rx_intr(card_t *card)
 				pci_name(card->pdev));
 		else {
 			struct sk_buff *skb = card->rx_skbs[card->rx_in];
-			port_t *port = &card->ports[desc->stat &
+			struct port *port = &card->ports[desc->stat &
 						    PACKET_PORT_MASK];
 			struct net_device *dev = port->dev;
 
@@ -245,7 +245,7 @@ static inline void wanxl_rx_intr(card_t *card)
 
 static irqreturn_t wanxl_intr(int irq, void* dev_id)
 {
-        card_t *card = dev_id;
+	struct card *card = dev_id;
         int i;
         u32 stat;
         int handled = 0;
@@ -272,7 +272,7 @@ static irqreturn_t wanxl_intr(int irq, void* dev_id)
 
 static netdev_tx_t wanxl_xmit(struct sk_buff *skb, struct net_device *dev)
 {
-        port_t *port = dev_to_port(dev);
+	struct port *port = dev_to_port(dev);
 	desc_t *desc;
 
         spin_lock(&port->lock);
@@ -319,7 +319,7 @@ static netdev_tx_t wanxl_xmit(struct sk_buff *skb, struct net_device *dev)
 static int wanxl_attach(struct net_device *dev, unsigned short encoding,
 			unsigned short parity)
 {
-	port_t *port = dev_to_port(dev);
+	struct port *port = dev_to_port(dev);
 
 	if (encoding != ENCODING_NRZ &&
 	    encoding != ENCODING_NRZI)
@@ -343,7 +343,7 @@ static int wanxl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 {
 	const size_t size = sizeof(sync_serial_settings);
 	sync_serial_settings line;
-	port_t *port = dev_to_port(dev);
+	struct port *port = dev_to_port(dev);
 
 	if (cmd != SIOCWANDEV)
 		return hdlc_ioctl(dev, ifr, cmd);
@@ -393,7 +393,7 @@ static int wanxl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 
 static int wanxl_open(struct net_device *dev)
 {
-	port_t *port = dev_to_port(dev);
+	struct port *port = dev_to_port(dev);
 	u8 __iomem *dbr = port->card->plx + PLX_DOORBELL_TO_CARD;
 	unsigned long timeout;
 	int i;
@@ -429,7 +429,7 @@ static int wanxl_open(struct net_device *dev)
 
 static int wanxl_close(struct net_device *dev)
 {
-	port_t *port = dev_to_port(dev);
+	struct port *port = dev_to_port(dev);
 	unsigned long timeout;
 	int i;
 
@@ -467,7 +467,7 @@ static int wanxl_close(struct net_device *dev)
 
 static struct net_device_stats *wanxl_get_stats(struct net_device *dev)
 {
-	port_t *port = dev_to_port(dev);
+	struct port *port = dev_to_port(dev);
 
 	dev->stats.rx_over_errors = get_status(port)->rx_overruns;
 	dev->stats.rx_frame_errors = get_status(port)->rx_frame_errors;
@@ -478,7 +478,7 @@ static struct net_device_stats *wanxl_get_stats(struct net_device *dev)
 
 
 
-static int wanxl_puts_command(card_t *card, u32 cmd)
+static int wanxl_puts_command(struct card *card, u32 cmd)
 {
 	unsigned long timeout = jiffies + 5 * HZ;
 
@@ -495,7 +495,7 @@ static int wanxl_puts_command(card_t *card, u32 cmd)
 
 
 
-static void wanxl_reset(card_t *card)
+static void wanxl_reset(struct card *card)
 {
 	u32 old_value = readl(card->plx + PLX_CONTROL) & ~PLX_CTL_RESET;
 
@@ -511,7 +511,7 @@ static void wanxl_reset(card_t *card)
 
 static void wanxl_pci_remove_one(struct pci_dev *pdev)
 {
-	card_t *card = pci_get_drvdata(pdev);
+	struct card *card = pci_get_drvdata(pdev);
 	int i;
 
 	for (i = 0; i < card->n_ports; i++) {
@@ -537,7 +537,7 @@ static void wanxl_pci_remove_one(struct pci_dev *pdev)
 		iounmap(card->plx);
 
 	if (card->status)
-		pci_free_consistent(pdev, sizeof(card_status_t),
+		pci_free_consistent(pdev, sizeof(struct card_status),
 				    card->status, card->status_address);
 
 	pci_release_regions(pdev);
@@ -560,7 +560,7 @@ static const struct net_device_ops wanxl_ops = {
 static int wanxl_pci_init_one(struct pci_dev *pdev,
 			      const struct pci_device_id *ent)
 {
-	card_t *card;
+	struct card *card;
 	u32 ramsize, stat;
 	unsigned long timeout;
 	u32 plx_phy;		/* PLX PCI base address */
@@ -601,7 +601,7 @@ static int wanxl_pci_init_one(struct pci_dev *pdev,
 	default: ports = 4;
 	}
 
-	alloc_size = sizeof(card_t) + ports * sizeof(port_t);
+	alloc_size = sizeof(struct card) + ports * sizeof(struct port);
 	card = kzalloc(alloc_size, GFP_KERNEL);
 	if (card == NULL) {
 		pci_release_regions(pdev);
@@ -612,7 +612,8 @@ static int wanxl_pci_init_one(struct pci_dev *pdev,
 	pci_set_drvdata(pdev, card);
 	card->pdev = pdev;
 
-	card->status = pci_alloc_consistent(pdev, sizeof(card_status_t),
+	card->status = pci_alloc_consistent(pdev,
+					    sizeof(struct card_status),
 					    &card->status_address);
 	if (card->status == NULL) {
 		wanxl_pci_remove_one(pdev);
@@ -766,7 +767,7 @@ static int wanxl_pci_init_one(struct pci_dev *pdev,
 
 	for (i = 0; i < ports; i++) {
 		hdlc_device *hdlc;
-		port_t *port = &card->ports[i];
+		struct port *port = &card->ports[i];
 		struct net_device *dev = alloc_hdlcdev(port);
 		if (!dev) {
 			pr_err("%s: unable to allocate memory\n",
diff --git a/drivers/net/wireless/airo_cs.c b/drivers/net/wireless/airo_cs.c
index 7e9ede6c5798..d9ed22b4cc6b 100644
--- a/drivers/net/wireless/airo_cs.c
+++ b/drivers/net/wireless/airo_cs.c
@@ -56,18 +56,18 @@ static void airo_release(struct pcmcia_device *link);
 
 static void airo_detach(struct pcmcia_device *p_dev);
 
-typedef struct local_info_t {
+struct local_info {
 	struct net_device *eth_dev;
-} local_info_t;
+};
 
 static int airo_probe(struct pcmcia_device *p_dev)
 {
-	local_info_t *local;
+	struct local_info *local;
 
 	dev_dbg(&p_dev->dev, "airo_attach()\n");
 
 	/* Allocate space for private device-specific data */
-	local = kzalloc(sizeof(local_info_t), GFP_KERNEL);
+	local = kzalloc(sizeof(*local), GFP_KERNEL);
 	if (!local)
 		return -ENOMEM;
 
@@ -82,10 +82,11 @@ static void airo_detach(struct pcmcia_device *link)
 
 	airo_release(link);
 
-	if (((local_info_t *)link->priv)->eth_dev) {
-		stop_airo_card(((local_info_t *)link->priv)->eth_dev, 0);
+	if (((struct local_info *)link->priv)->eth_dev) {
+		stop_airo_card(((struct local_info *)link->priv)->eth_dev,
+			       0);
 	}
-	((local_info_t *)link->priv)->eth_dev = NULL;
+	((struct local_info *)link->priv)->eth_dev = NULL;
 
 	kfree(link->priv);
 } /* airo_detach */
@@ -101,7 +102,7 @@ static int airo_cs_config_check(struct pcmcia_device *p_dev, void *priv_data)
 
 static int airo_config(struct pcmcia_device *link)
 {
-	local_info_t *dev;
+	struct local_info *dev;
 	int ret;
 
 	dev = link->priv;
@@ -121,10 +122,10 @@ static int airo_config(struct pcmcia_device *link)
 	ret = pcmcia_enable_device(link);
 	if (ret)
 		goto failed;
-	((local_info_t *)link->priv)->eth_dev =
+	((struct local_info *)link->priv)->eth_dev =
 		init_airo_card(link->irq,
 			       link->resource[0]->start, 1, &link->dev);
-	if (!((local_info_t *)link->priv)->eth_dev)
+	if (!((struct local_info *)link->priv)->eth_dev)
 		goto failed;
 
 	return 0;
@@ -142,7 +143,7 @@ static void airo_release(struct pcmcia_device *link)
 
 static int airo_suspend(struct pcmcia_device *link)
 {
-	local_info_t *local = link->priv;
+	struct local_info *local = link->priv;
 
 	netif_device_detach(local->eth_dev);
 
@@ -151,7 +152,7 @@ static int airo_suspend(struct pcmcia_device *link)
 
 static int airo_resume(struct pcmcia_device *link)
 {
-	local_info_t *local = link->priv;
+	struct local_info *local = link->priv;
 
 	if (link->open) {
 		reset_airo_card(local->eth_dev);
diff --git a/drivers/net/wireless/atmel.c b/drivers/net/wireless/atmel.c
index 1fe41af81a59..9183f1cf89a7 100644
--- a/drivers/net/wireless/atmel.c
+++ b/drivers/net/wireless/atmel.c
@@ -2598,11 +2598,11 @@ static const iw_handler atmel_private_handler[] =
 	NULL,				/* SIOCIWFIRSTPRIV */
 };
 
-typedef struct atmel_priv_ioctl {
+struct atmel_priv_ioctl {
 	char id[32];
 	unsigned char __user *data;
 	unsigned short len;
-} atmel_priv_ioctl;
+};
 
 #define ATMELFWL	SIOCIWFIRSTPRIV
 #define ATMELIDIFC	ATMELFWL + 1
@@ -2615,7 +2615,7 @@ static const struct iw_priv_args atmel_private_args[] = {
 		.cmd = ATMELFWL,
 		.set_args = IW_PRIV_TYPE_BYTE
 				| IW_PRIV_SIZE_FIXED
-				| sizeof (atmel_priv_ioctl),
+				| sizeof(struct atmel_priv_ioctl),
 		.get_args = IW_PRIV_TYPE_NONE,
 		.name = "atmelfwl"
 	}, {
@@ -2645,7 +2645,7 @@ static int atmel_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 {
 	int i, rc = 0;
 	struct atmel_private *priv = netdev_priv(dev);
-	atmel_priv_ioctl com;
+	struct atmel_priv_ioctl com;
 	struct iwreq *wrq = (struct iwreq *) rq;
 	unsigned char *new_firmware;
 	char domain[REGDOMAINSZ + 1];
diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
index 48a55cda979b..bfd10cb9c8de 100644
--- a/drivers/net/xen-netback/interface.c
+++ b/drivers/net/xen-netback/interface.c
@@ -78,12 +78,8 @@ int xenvif_poll(struct napi_struct *napi, int budget)
 	/* This vif is rogue, we pretend we've there is nothing to do
 	 * for this vif to deschedule it from NAPI. But this interface
 	 * will be turned off in thread context later.
-	 * Also, if a guest doesn't post enough slots to receive data on one of
-	 * its queues, the carrier goes down and NAPI is descheduled here so
-	 * the guest can't send more packets until it's ready to receive.
 	 */
-	if (unlikely(queue->vif->disabled ||
-		     !netif_carrier_ok(queue->vif->dev))) {
+	if (unlikely(queue->vif->disabled)) {
 		napi_complete(napi);
 		return 0;
 	}
diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index aa2093325be1..4734472aa620 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -2025,9 +2025,15 @@ int xenvif_kthread_guest_rx(void *data)
 		 * context so we defer it here, if this thread is
 		 * associated with queue 0.
 		 */
-		if (unlikely(queue->vif->disabled && queue->id == 0))
+		if (unlikely(queue->vif->disabled && queue->id == 0)) {
 			xenvif_carrier_off(queue->vif);
-		else if (unlikely(test_and_clear_bit(QUEUE_STATUS_RX_PURGE_EVENT,
+		} else if (unlikely(queue->vif->disabled)) {
+			/* kthread_stop() would be called upon this thread soon,
+			 * be a bit proactive
+			 */
+			skb_queue_purge(&queue->rx_queue);
+			queue->rx_last_skb_slots = 0;
+		} else if (unlikely(test_and_clear_bit(QUEUE_STATUS_RX_PURGE_EVENT,
 						     &queue->status))) {
 			xenvif_rx_purge_event(queue);
 		} else if (!netif_carrier_ok(queue->vif->dev)) {
diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
index 28204bc4f369..ca82f545ec2c 100644
--- a/drivers/net/xen-netfront.c
+++ b/drivers/net/xen-netfront.c
@@ -628,9 +628,10 @@ static int xennet_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	slots = DIV_ROUND_UP(offset + len, PAGE_SIZE) +
 		xennet_count_skb_frag_slots(skb);
 	if (unlikely(slots > MAX_SKB_FRAGS + 1)) {
-		net_alert_ratelimited(
-			"xennet: skb rides the rocket: %d slots\n", slots);
-		goto drop;
+		net_dbg_ratelimited("xennet: skb rides the rocket: %d slots, %d bytes\n",
+				    slots, skb->len);
+		if (skb_linearize(skb))
+			goto drop;
 	}
 
 	spin_lock_irqsave(&queue->tx_lock, flags);
diff --git a/drivers/thermal/Kconfig b/drivers/thermal/Kconfig
index f9a13867cb70..693208eb9047 100644
--- a/drivers/thermal/Kconfig
+++ b/drivers/thermal/Kconfig
@@ -151,7 +151,7 @@ config KIRKWOOD_THERMAL
 
 config DOVE_THERMAL
 	tristate "Temperature sensor on Marvell Dove SoCs"
-	depends on ARCH_DOVE
+	depends on ARCH_DOVE || MACH_DOVE
 	depends on OF
 	help
 	  Support for the Dove thermal sensor driver in the Linux thermal
@@ -243,4 +243,9 @@ depends on ARCH_EXYNOS
 source "drivers/thermal/samsung/Kconfig"
 endmenu
 
+menu "STMicroelectronics thermal drivers"
+depends on ARCH_STI && OF
+source "drivers/thermal/st/Kconfig"
+endmenu
+
 endif
diff --git a/drivers/thermal/Makefile b/drivers/thermal/Makefile
index de0636a57a64..31e232f84b6b 100644
--- a/drivers/thermal/Makefile
+++ b/drivers/thermal/Makefile
@@ -32,3 +32,4 @@ obj-$(CONFIG_X86_PKG_TEMP_THERMAL)	+= x86_pkg_temp_thermal.o
 obj-$(CONFIG_INTEL_SOC_DTS_THERMAL)	+= intel_soc_dts_thermal.o
 obj-$(CONFIG_TI_SOC_THERMAL)	+= ti-soc-thermal/
 obj-$(CONFIG_ACPI_INT3403_THERMAL)	+= int3403_thermal.o
+obj-$(CONFIG_ST_THERMAL)	+= st/
diff --git a/drivers/thermal/cpu_cooling.c b/drivers/thermal/cpu_cooling.c
index 84a75f89bf74..1ab0018271c5 100644
--- a/drivers/thermal/cpu_cooling.c
+++ b/drivers/thermal/cpu_cooling.c
@@ -305,7 +305,7 @@ static int cpufreq_apply_cooling(struct cpufreq_cooling_device *cpufreq_device,
  * @event: value showing cpufreq event for which this function invoked.
  * @data: callback-specific data
  *
- * Callback to highjack the notification on cpufreq policy transition.
+ * Callback to hijack the notification on cpufreq policy transition.
  * Every time there is a change in policy, we will intercept and
  * update the cpufreq policy with thermal constraints.
  *
diff --git a/drivers/thermal/int3403_thermal.c b/drivers/thermal/int3403_thermal.c
index e93f0253f6ed..17554eeb3953 100644
--- a/drivers/thermal/int3403_thermal.c
+++ b/drivers/thermal/int3403_thermal.c
@@ -33,6 +33,10 @@
 struct int3403_sensor {
 	struct thermal_zone_device *tzone;
 	unsigned long *thresholds;
+	unsigned long	crit_temp;
+	int		crit_trip_id;
+	unsigned long	psv_temp;
+	int		psv_trip_id;
 };
 
 static int sys_get_curr_temp(struct thermal_zone_device *tzone,
@@ -79,12 +83,18 @@ static int sys_get_trip_temp(struct thermal_zone_device *tzone,
 	struct acpi_device *device = tzone->devdata;
 	struct int3403_sensor *obj = acpi_driver_data(device);
 
-	/*
-	 * get_trip_temp is a mandatory callback but
-	 * PATx method doesn't return any value, so return
-	 * cached value, which was last set from user space.
-	 */
-	*temp = obj->thresholds[trip];
+	if (trip == obj->crit_trip_id)
+		*temp = obj->crit_temp;
+	else if (trip == obj->psv_trip_id)
+		*temp = obj->psv_temp;
+	else {
+		/*
+		 * get_trip_temp is a mandatory callback but
+		 * PATx method doesn't return any value, so return
+		 * cached value, which was last set from user space.
+		 */
+		*temp = obj->thresholds[trip];
+	}
 
 	return 0;
 }
@@ -92,8 +102,14 @@ static int sys_get_trip_temp(struct thermal_zone_device *tzone,
 static int sys_get_trip_type(struct thermal_zone_device *thermal,
 		int trip, enum thermal_trip_type *type)
 {
+	struct acpi_device *device = thermal->devdata;
+	struct int3403_sensor *obj = acpi_driver_data(device);
+
 	/* Mandatory callback, may not mean much here */
-	*type = THERMAL_TRIP_PASSIVE;
+	if (trip == obj->crit_trip_id)
+		*type = THERMAL_TRIP_CRITICAL;
+	else
+		*type = THERMAL_TRIP_PASSIVE;
 
 	return 0;
 }
@@ -155,6 +171,34 @@ static void acpi_thermal_notify(struct acpi_device *device, u32 event)
 	}
 }
 
+static int sys_get_trip_crt(struct acpi_device *device, unsigned long *temp)
+{
+	unsigned long long crt;
+	acpi_status status;
+
+	status = acpi_evaluate_integer(device->handle, "_CRT", NULL, &crt);
+	if (ACPI_FAILURE(status))
+		return -EIO;
+
+	*temp = DECI_KELVIN_TO_MILLI_CELSIUS(crt, KELVIN_OFFSET);
+
+	return 0;
+}
+
+static int sys_get_trip_psv(struct acpi_device *device, unsigned long *temp)
+{
+	unsigned long long psv;
+	acpi_status status;
+
+	status = acpi_evaluate_integer(device->handle, "_PSV", NULL, &psv);
+	if (ACPI_FAILURE(status))
+		return -EIO;
+
+	*temp = DECI_KELVIN_TO_MILLI_CELSIUS(psv, KELVIN_OFFSET);
+
+	return 0;
+}
+
 static int acpi_int3403_add(struct acpi_device *device)
 {
 	int result = 0;
@@ -194,6 +238,15 @@ static int acpi_int3403_add(struct acpi_device *device)
 			return -ENOMEM;
 		trip_mask = BIT(trip_cnt) - 1;
 	}
+
+	obj->psv_trip_id = -1;
+	if (!sys_get_trip_psv(device, &obj->psv_temp))
+		obj->psv_trip_id = trip_cnt++;
+
+	obj->crit_trip_id = -1;
+	if (!sys_get_trip_crt(device, &obj->crit_temp))
+		obj->crit_trip_id = trip_cnt++;
+
 	obj->tzone = thermal_zone_device_register(acpi_device_bid(device),
 				trip_cnt, trip_mask, device, &tzone_ops,
 				NULL, 0, 0);
diff --git a/drivers/thermal/samsung/exynos_tmu.c b/drivers/thermal/samsung/exynos_tmu.c
index d7ca9f49c9cb..acbff14da3a4 100644
--- a/drivers/thermal/samsung/exynos_tmu.c
+++ b/drivers/thermal/samsung/exynos_tmu.c
@@ -505,6 +505,10 @@ static irqreturn_t exynos_tmu_irq(int irq, void *id)
 
 static const struct of_device_id exynos_tmu_match[] = {
 	{
+		.compatible = "samsung,exynos3250-tmu",
+		.data = (void *)EXYNOS3250_TMU_DRV_DATA,
+	},
+	{
 		.compatible = "samsung,exynos4210-tmu",
 		.data = (void *)EXYNOS4210_TMU_DRV_DATA,
 	},
@@ -677,7 +681,8 @@ static int exynos_tmu_probe(struct platform_device *pdev)
 		goto err_clk_sec;
 	}
 
-	if (pdata->type == SOC_ARCH_EXYNOS4210 ||
+	if (pdata->type == SOC_ARCH_EXYNOS3250 ||
+	    pdata->type == SOC_ARCH_EXYNOS4210 ||
 	    pdata->type == SOC_ARCH_EXYNOS4412 ||
 	    pdata->type == SOC_ARCH_EXYNOS5250 ||
 	    pdata->type == SOC_ARCH_EXYNOS5260 ||
@@ -759,10 +764,10 @@ static int exynos_tmu_remove(struct platform_device *pdev)
 {
 	struct exynos_tmu_data *data = platform_get_drvdata(pdev);
 
-	exynos_tmu_control(pdev, false);
-
 	exynos_unregister_thermal(data->reg_conf);
 
+	exynos_tmu_control(pdev, false);
+
 	clk_unprepare(data->clk);
 	if (!IS_ERR(data->clk_sec))
 		clk_unprepare(data->clk_sec);
diff --git a/drivers/thermal/samsung/exynos_tmu.h b/drivers/thermal/samsung/exynos_tmu.h
index edd08cf76729..1b4a6444ea61 100644
--- a/drivers/thermal/samsung/exynos_tmu.h
+++ b/drivers/thermal/samsung/exynos_tmu.h
@@ -40,7 +40,8 @@ enum calibration_mode {
 };
 
 enum soc_type {
-	SOC_ARCH_EXYNOS4210 = 1,
+	SOC_ARCH_EXYNOS3250 = 1,
+	SOC_ARCH_EXYNOS4210,
 	SOC_ARCH_EXYNOS4412,
 	SOC_ARCH_EXYNOS5250,
 	SOC_ARCH_EXYNOS5260,
diff --git a/drivers/thermal/samsung/exynos_tmu_data.c b/drivers/thermal/samsung/exynos_tmu_data.c
index c1d81dcd7819..aa8e0dee2055 100644
--- a/drivers/thermal/samsung/exynos_tmu_data.c
+++ b/drivers/thermal/samsung/exynos_tmu_data.c
@@ -90,6 +90,95 @@ struct exynos_tmu_init_data const exynos4210_default_tmu_data = {
 };
 #endif
 
+#if defined(CONFIG_SOC_EXYNOS3250)
+static const struct exynos_tmu_registers exynos3250_tmu_registers = {
+	.triminfo_data = EXYNOS_TMU_REG_TRIMINFO,
+	.triminfo_25_shift = EXYNOS_TRIMINFO_25_SHIFT,
+	.triminfo_85_shift = EXYNOS_TRIMINFO_85_SHIFT,
+	.tmu_ctrl = EXYNOS_TMU_REG_CONTROL,
+	.test_mux_addr_shift = EXYNOS4412_MUX_ADDR_SHIFT,
+	.buf_vref_sel_shift = EXYNOS_TMU_REF_VOLTAGE_SHIFT,
+	.buf_vref_sel_mask = EXYNOS_TMU_REF_VOLTAGE_MASK,
+	.therm_trip_mode_shift = EXYNOS_TMU_TRIP_MODE_SHIFT,
+	.therm_trip_mode_mask = EXYNOS_TMU_TRIP_MODE_MASK,
+	.therm_trip_en_shift = EXYNOS_TMU_THERM_TRIP_EN_SHIFT,
+	.buf_slope_sel_shift = EXYNOS_TMU_BUF_SLOPE_SEL_SHIFT,
+	.buf_slope_sel_mask = EXYNOS_TMU_BUF_SLOPE_SEL_MASK,
+	.core_en_shift = EXYNOS_TMU_CORE_EN_SHIFT,
+	.tmu_status = EXYNOS_TMU_REG_STATUS,
+	.tmu_cur_temp = EXYNOS_TMU_REG_CURRENT_TEMP,
+	.threshold_th0 = EXYNOS_THD_TEMP_RISE,
+	.threshold_th1 = EXYNOS_THD_TEMP_FALL,
+	.tmu_inten = EXYNOS_TMU_REG_INTEN,
+	.inten_rise0_shift = EXYNOS_TMU_INTEN_RISE0_SHIFT,
+	.inten_rise1_shift = EXYNOS_TMU_INTEN_RISE1_SHIFT,
+	.inten_rise2_shift = EXYNOS_TMU_INTEN_RISE2_SHIFT,
+	.inten_fall0_shift = EXYNOS_TMU_INTEN_FALL0_SHIFT,
+	.tmu_intstat = EXYNOS_TMU_REG_INTSTAT,
+	.tmu_intclear = EXYNOS_TMU_REG_INTCLEAR,
+	.intclr_fall_shift = EXYNOS_TMU_CLEAR_FALL_INT_SHIFT,
+	.intclr_rise_shift = EXYNOS_TMU_RISE_INT_SHIFT,
+	.intclr_rise_mask = EXYNOS_TMU_RISE_INT_MASK,
+	.intclr_fall_mask = EXYNOS_TMU_FALL_INT_MASK,
+	.emul_con = EXYNOS_EMUL_CON,
+	.emul_temp_shift = EXYNOS_EMUL_DATA_SHIFT,
+	.emul_time_shift = EXYNOS_EMUL_TIME_SHIFT,
+	.emul_time_mask = EXYNOS_EMUL_TIME_MASK,
+};
+
+#define EXYNOS3250_TMU_DATA \
+	.threshold_falling = 10, \
+	.trigger_levels[0] = 70, \
+	.trigger_levels[1] = 95, \
+	.trigger_levels[2] = 110, \
+	.trigger_levels[3] = 120, \
+	.trigger_enable[0] = true, \
+	.trigger_enable[1] = true, \
+	.trigger_enable[2] = true, \
+	.trigger_enable[3] = false, \
+	.trigger_type[0] = THROTTLE_ACTIVE, \
+	.trigger_type[1] = THROTTLE_ACTIVE, \
+	.trigger_type[2] = SW_TRIP, \
+	.trigger_type[3] = HW_TRIP, \
+	.max_trigger_level = 4, \
+	.gain = 8, \
+	.reference_voltage = 16, \
+	.noise_cancel_mode = 4, \
+	.cal_type = TYPE_TWO_POINT_TRIMMING, \
+	.efuse_value = 55, \
+	.min_efuse_value = 40, \
+	.max_efuse_value = 100, \
+	.first_point_trim = 25, \
+	.second_point_trim = 85, \
+	.default_temp_offset = 50, \
+	.freq_tab[0] = { \
+		.freq_clip_max = 800 * 1000, \
+		.temp_level = 70, \
+	}, \
+	.freq_tab[1] = { \
+		.freq_clip_max = 400 * 1000, \
+		.temp_level = 95, \
+	}, \
+	.freq_tab_count = 2, \
+	.registers = &exynos3250_tmu_registers, \
+	.features = (TMU_SUPPORT_EMULATION | \
+			TMU_SUPPORT_FALLING_TRIP | TMU_SUPPORT_READY_STATUS | \
+			TMU_SUPPORT_EMUL_TIME)
+#endif
+
+#if defined(CONFIG_SOC_EXYNOS3250)
+struct exynos_tmu_init_data const exynos3250_default_tmu_data = {
+	.tmu_data = {
+		{
+			EXYNOS3250_TMU_DATA,
+			.type = SOC_ARCH_EXYNOS3250,
+			.test_mux = EXYNOS4412_MUX_ADDR_VALUE,
+		},
+	},
+	.tmu_count = 1,
+};
+#endif
+
 #if defined(CONFIG_SOC_EXYNOS4412) || defined(CONFIG_SOC_EXYNOS5250)
 static const struct exynos_tmu_registers exynos4412_tmu_registers = {
 	.triminfo_data = EXYNOS_TMU_REG_TRIMINFO,
diff --git a/drivers/thermal/samsung/exynos_tmu_data.h b/drivers/thermal/samsung/exynos_tmu_data.h
index d268981b65e5..f0979e598491 100644
--- a/drivers/thermal/samsung/exynos_tmu_data.h
+++ b/drivers/thermal/samsung/exynos_tmu_data.h
@@ -148,6 +148,13 @@
 #define EXYNOS5440_TMU_TH_RISE4_SHIFT		24
 #define EXYNOS5440_EFUSE_SWAP_OFFSET		8
 
+#if defined(CONFIG_SOC_EXYNOS3250)
+extern struct exynos_tmu_init_data const exynos3250_default_tmu_data;
+#define EXYNOS3250_TMU_DRV_DATA (&exynos3250_default_tmu_data)
+#else
+#define EXYNOS3250_TMU_DRV_DATA (NULL)
+#endif
+
 #if defined(CONFIG_CPU_EXYNOS4210)
 extern struct exynos_tmu_init_data const exynos4210_default_tmu_data;
 #define EXYNOS4210_TMU_DRV_DATA (&exynos4210_default_tmu_data)
diff --git a/drivers/thermal/st/Kconfig b/drivers/thermal/st/Kconfig
new file mode 100644
index 000000000000..490fdbe22eea
--- /dev/null
+++ b/drivers/thermal/st/Kconfig
@@ -0,0 +1,12 @@
+config ST_THERMAL
+       tristate "Thermal sensors on STMicroelectronics STi series of SoCs"
+       help
+         Support for thermal sensors on STMicroelectronics STi series of SoCs.
+
+config ST_THERMAL_SYSCFG
+	select ST_THERMAL
+	tristate "STi series syscfg register access based thermal sensors"
+
+config ST_THERMAL_MEMMAP
+	select ST_THERMAL
+	tristate "STi series memory mapped access based thermal sensors"
diff --git a/drivers/thermal/st/Makefile b/drivers/thermal/st/Makefile
new file mode 100644
index 000000000000..b38878977bd8
--- /dev/null
+++ b/drivers/thermal/st/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_ST_THERMAL)		:= st_thermal.o
+obj-$(CONFIG_ST_THERMAL_SYSCFG)		+= st_thermal_syscfg.o
+obj-$(CONFIG_ST_THERMAL_MEMMAP)		+= st_thermal_memmap.o
diff --git a/drivers/thermal/st/st_thermal.c b/drivers/thermal/st/st_thermal.c
new file mode 100644
index 000000000000..90163b384660
--- /dev/null
+++ b/drivers/thermal/st/st_thermal.c
@@ -0,0 +1,313 @@
+/*
+ * ST Thermal Sensor Driver core routines
+ * Author: Ajit Pal Singh <ajitpal.singh@st.com>
+ *
+ * Copyright (C) 2003-2014 STMicroelectronics (R&D) Limited
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+#include <linux/clk.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/of_device.h>
+
+#include "st_thermal.h"
+
+/* The Thermal Framework expects millidegrees */
+#define mcelsius(temp)			((temp) * 1000)
+
+/*
+ * Function to allocate regfields which are common
+ * between syscfg and memory mapped based sensors
+ */
+int st_thermal_alloc_regfields(struct st_thermal_sensor *sensor)
+{
+	struct device *dev = sensor->dev;
+	struct regmap *regmap = sensor->regmap;
+	const struct reg_field *reg_fields = sensor->cdata->reg_fields;
+
+	sensor->dcorrect = devm_regmap_field_alloc(dev, regmap,
+						   reg_fields[DCORRECT]);
+
+	sensor->overflow = devm_regmap_field_alloc(dev, regmap,
+						   reg_fields[OVERFLOW]);
+
+	sensor->temp_data = devm_regmap_field_alloc(dev, regmap,
+						    reg_fields[DATA]);
+
+	if (IS_ERR(sensor->dcorrect) ||
+	    IS_ERR(sensor->overflow) ||
+	    IS_ERR(sensor->temp_data)) {
+		dev_err(dev, "failed to allocate common regfields\n");
+		return -EINVAL;
+	}
+
+	return sensor->ops->alloc_regfields(sensor);
+}
+
+static int st_thermal_sensor_on(struct st_thermal_sensor *sensor)
+{
+	int ret;
+	struct device *dev = sensor->dev;
+
+	ret = clk_prepare_enable(sensor->clk);
+	if (ret) {
+		dev_err(dev, "failed to enable clk\n");
+		return ret;
+	}
+
+	ret = sensor->ops->power_ctrl(sensor, POWER_ON);
+	if (ret) {
+		dev_err(dev, "failed to power on sensor\n");
+		clk_disable_unprepare(sensor->clk);
+	}
+
+	return ret;
+}
+
+static int st_thermal_sensor_off(struct st_thermal_sensor *sensor)
+{
+	int ret;
+
+	ret = sensor->ops->power_ctrl(sensor, POWER_OFF);
+	if (ret)
+		return ret;
+
+	clk_disable_unprepare(sensor->clk);
+
+	return 0;
+}
+
+static int st_thermal_calibration(struct st_thermal_sensor *sensor)
+{
+	int ret;
+	unsigned int val;
+	struct device *dev = sensor->dev;
+
+	/* Check if sensor calibration data is already written */
+	ret = regmap_field_read(sensor->dcorrect, &val);
+	if (ret) {
+		dev_err(dev, "failed to read calibration data\n");
+		return ret;
+	}
+
+	if (!val) {
+		/*
+		 * Sensor calibration value not set by bootloader,
+		 * default calibration data to be used
+		 */
+		ret = regmap_field_write(sensor->dcorrect,
+					 sensor->cdata->calibration_val);
+		if (ret)
+			dev_err(dev, "failed to set calibration data\n");
+	}
+
+	return ret;
+}
+
+/* Callback to get temperature from HW*/
+static int st_thermal_get_temp(struct thermal_zone_device *th,
+		unsigned long *temperature)
+{
+	struct st_thermal_sensor *sensor = th->devdata;
+	struct device *dev = sensor->dev;
+	unsigned int temp;
+	unsigned int overflow;
+	int ret;
+
+	ret = regmap_field_read(sensor->overflow, &overflow);
+	if (ret)
+		return ret;
+	if (overflow)
+		return -EIO;
+
+	ret = regmap_field_read(sensor->temp_data, &temp);
+	if (ret)
+		return ret;
+
+	temp += sensor->cdata->temp_adjust_val;
+	temp = mcelsius(temp);
+
+	dev_dbg(dev, "temperature: %d\n", temp);
+
+	*temperature = temp;
+
+	return 0;
+}
+
+static int st_thermal_get_trip_type(struct thermal_zone_device *th,
+				int trip, enum thermal_trip_type *type)
+{
+	struct st_thermal_sensor *sensor = th->devdata;
+	struct device *dev = sensor->dev;
+
+	switch (trip) {
+	case 0:
+		*type = THERMAL_TRIP_CRITICAL;
+		break;
+	default:
+		dev_err(dev, "invalid trip point\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int st_thermal_get_trip_temp(struct thermal_zone_device *th,
+				    int trip, unsigned long *temp)
+{
+	struct st_thermal_sensor *sensor = th->devdata;
+	struct device *dev = sensor->dev;
+
+	switch (trip) {
+	case 0:
+		*temp = mcelsius(sensor->cdata->crit_temp);
+		break;
+	default:
+		dev_err(dev, "Invalid trip point\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static struct thermal_zone_device_ops st_tz_ops = {
+	.get_temp	= st_thermal_get_temp,
+	.get_trip_type	= st_thermal_get_trip_type,
+	.get_trip_temp	= st_thermal_get_trip_temp,
+};
+
+int st_thermal_register(struct platform_device *pdev,
+			const struct of_device_id *st_thermal_of_match)
+{
+	struct st_thermal_sensor *sensor;
+	struct device *dev = &pdev->dev;
+	struct device_node *np = dev->of_node;
+	const struct of_device_id *match;
+
+	int polling_delay;
+	int ret;
+
+	if (!np) {
+		dev_err(dev, "device tree node not found\n");
+		return -EINVAL;
+	}
+
+	sensor = devm_kzalloc(dev, sizeof(*sensor), GFP_KERNEL);
+	if (!sensor)
+		return -ENOMEM;
+
+	sensor->dev = dev;
+
+	match = of_match_device(st_thermal_of_match, dev);
+	if (!(match && match->data))
+		return -EINVAL;
+
+	sensor->cdata = match->data;
+	if (!sensor->cdata->ops)
+		return -EINVAL;
+
+	sensor->ops = sensor->cdata->ops;
+
+	ret = sensor->ops->regmap_init(sensor);
+	if (ret)
+		return ret;
+
+	ret = st_thermal_alloc_regfields(sensor);
+	if (ret)
+		return ret;
+
+	sensor->clk = devm_clk_get(dev, "thermal");
+	if (IS_ERR(sensor->clk)) {
+		dev_err(dev, "failed to fetch clock\n");
+		return PTR_ERR(sensor->clk);
+	}
+
+	if (sensor->ops->register_enable_irq) {
+		ret = sensor->ops->register_enable_irq(sensor);
+		if (ret)
+			return ret;
+	}
+
+	ret = st_thermal_sensor_on(sensor);
+	if (ret)
+		return ret;
+
+	ret = st_thermal_calibration(sensor);
+	if (ret)
+		goto sensor_off;
+
+	polling_delay = sensor->ops->register_enable_irq ? 0 : 1000;
+
+	sensor->thermal_dev =
+		thermal_zone_device_register(dev_name(dev), 1, 0, sensor,
+					     &st_tz_ops, NULL, 0, polling_delay);
+	if (IS_ERR(sensor->thermal_dev)) {
+		dev_err(dev, "failed to register thermal zone device\n");
+		ret = PTR_ERR(sensor->thermal_dev);
+		goto sensor_off;
+	}
+
+	platform_set_drvdata(pdev, sensor);
+
+	return 0;
+
+sensor_off:
+	st_thermal_sensor_off(sensor);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(st_thermal_register);
+
+int st_thermal_unregister(struct platform_device *pdev)
+{
+	struct st_thermal_sensor *sensor = platform_get_drvdata(pdev);
+
+	st_thermal_sensor_off(sensor);
+	thermal_zone_device_unregister(sensor->thermal_dev);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(st_thermal_unregister);
+
+static int st_thermal_suspend(struct device *dev)
+{
+	struct platform_device *pdev = to_platform_device(dev);
+	struct st_thermal_sensor *sensor = platform_get_drvdata(pdev);
+
+	return st_thermal_sensor_off(sensor);
+}
+
+static int st_thermal_resume(struct device *dev)
+{
+	int ret;
+	struct platform_device *pdev = to_platform_device(dev);
+	struct st_thermal_sensor *sensor = platform_get_drvdata(pdev);
+
+	ret = st_thermal_sensor_on(sensor);
+	if (ret)
+		return ret;
+
+	ret = st_thermal_calibration(sensor);
+	if (ret)
+		return ret;
+
+	if (sensor->ops->enable_irq) {
+		ret = sensor->ops->enable_irq(sensor);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+SIMPLE_DEV_PM_OPS(st_thermal_pm_ops, st_thermal_suspend, st_thermal_resume);
+EXPORT_SYMBOL_GPL(st_thermal_pm_ops);
+
+MODULE_AUTHOR("STMicroelectronics (R&D) Limited <ajitpal.singh@st.com>");
+MODULE_DESCRIPTION("STMicroelectronics STi SoC Thermal Sensor Driver");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/thermal/st/st_thermal.h b/drivers/thermal/st/st_thermal.h
new file mode 100644
index 000000000000..fecafbe10fa7
--- /dev/null
+++ b/drivers/thermal/st/st_thermal.h
@@ -0,0 +1,104 @@
+/*
+ * ST Thermal Sensor Driver for STi series of SoCs
+ * Author: Ajit Pal Singh <ajitpal.singh@st.com>
+ *
+ * Copyright (C) 2003-2014 STMicroelectronics (R&D) Limited
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#ifndef __STI_THERMAL_SYSCFG_H
+#define __STI_THERMAL_SYSCFG_H
+
+#include <linux/interrupt.h>
+#include <linux/platform_device.h>
+#include <linux/regmap.h>
+#include <linux/thermal.h>
+
+enum st_thermal_regfield_ids {
+	INT_THRESH_HI = 0, /* Top two regfield IDs are mutually exclusive */
+	TEMP_PWR = 0,
+	DCORRECT,
+	OVERFLOW,
+	DATA,
+	INT_ENABLE,
+
+	MAX_REGFIELDS
+};
+
+/* Thermal sensor power states */
+enum st_thermal_power_state {
+	POWER_OFF = 0,
+	POWER_ON
+};
+
+struct st_thermal_sensor;
+
+/**
+ * Description of private thermal sensor ops.
+ *
+ * @power_ctrl:		Function for powering on/off a sensor. Clock to the
+ *			sensor is also controlled from this function.
+ * @alloc_regfields: 	Allocate regmap register fields, specific to a sensor.
+ * @do_memmap_regmap: 	Memory map the thermal register space and init regmap
+ *			instance or find regmap instance.
+ * @register_irq: 	Register an interrupt handler for a sensor.
+ */
+struct st_thermal_sensor_ops {
+	int (*power_ctrl)(struct st_thermal_sensor *, enum st_thermal_power_state);
+	int (*alloc_regfields)(struct st_thermal_sensor *);
+	int (*regmap_init)(struct st_thermal_sensor *);
+	int (*register_enable_irq)(struct st_thermal_sensor *);
+	int (*enable_irq)(struct st_thermal_sensor *);
+};
+
+/**
+ * Description of thermal driver compatible data.
+ *
+ * @reg_fields:		Pointer to the regfields array for a sensor.
+ * @sys_compat:		Pointer to the syscon node compatible string.
+ * @ops: 		Pointer to private thermal ops for a sensor.
+ * @calibration_val: 	Default calibration value to be written to the DCORRECT
+ *			register field for a sensor.
+ * @temp_adjust_val: 	Value to be added/subtracted from the data read from
+ *			the sensor. If value needs to be added please provide a
+ *			positive value and if it is to be subtracted please
+ * 			provide a negative value.
+ * @crit_temp: 		The temperature beyond which the SoC should be shutdown
+ * 			to prevent damage.
+ */
+struct st_thermal_compat_data {
+	char *sys_compat;
+	const struct reg_field *reg_fields;
+	const struct st_thermal_sensor_ops *ops;
+	unsigned int calibration_val;
+	int temp_adjust_val;
+	int crit_temp;
+};
+
+struct st_thermal_sensor {
+	struct device *dev;
+	struct thermal_zone_device *thermal_dev;
+	const struct st_thermal_sensor_ops *ops;
+	const struct st_thermal_compat_data *cdata;
+	struct clk *clk;
+	struct regmap *regmap;
+	struct regmap_field *pwr;
+	struct regmap_field *dcorrect;
+	struct regmap_field *overflow;
+	struct regmap_field *temp_data;
+	struct regmap_field *int_thresh_hi;
+	struct regmap_field *int_enable;
+	int irq;
+	void __iomem *mmio_base;
+};
+
+extern int st_thermal_register(struct platform_device *pdev,
+			       const struct of_device_id *st_thermal_of_match);
+extern int st_thermal_unregister(struct platform_device *pdev);
+extern const struct dev_pm_ops st_thermal_pm_ops;
+
+#endif /* __STI_RESET_SYSCFG_H */
diff --git a/drivers/thermal/st/st_thermal_memmap.c b/drivers/thermal/st/st_thermal_memmap.c
new file mode 100644
index 000000000000..39896ce2ee00
--- /dev/null
+++ b/drivers/thermal/st/st_thermal_memmap.c
@@ -0,0 +1,209 @@
+/*
+ * ST Thermal Sensor Driver for memory mapped sensors.
+ * Author: Ajit Pal Singh <ajitpal.singh@st.com>
+ *
+ * Copyright (C) 2003-2014 STMicroelectronics (R&D) Limited
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/of.h>
+#include <linux/module.h>
+
+#include "st_thermal.h"
+
+#define STIH416_MPE_CONF			0x0
+#define STIH416_MPE_STATUS			0x4
+#define STIH416_MPE_INT_THRESH			0x8
+#define STIH416_MPE_INT_EN			0xC
+
+/* Power control bits for the memory mapped thermal sensor */
+#define THERMAL_PDN				BIT(4)
+#define THERMAL_SRSTN				BIT(10)
+
+static const struct reg_field st_mmap_thermal_regfields[MAX_REGFIELDS] = {
+	/*
+	 * According to the STIH416 MPE temp sensor data sheet -
+	 * the PDN (Power Down Bit) and SRSTN (Soft Reset Bit) need to be
+	 * written simultaneously for powering on and off the temperature
+	 * sensor. regmap_update_bits() will be used to update the register.
+	 */
+	[INT_THRESH_HI]	= REG_FIELD(STIH416_MPE_INT_THRESH, 	0,  7),
+	[DCORRECT]	= REG_FIELD(STIH416_MPE_CONF,		5,  9),
+	[OVERFLOW]	= REG_FIELD(STIH416_MPE_STATUS,		9,  9),
+	[DATA]		= REG_FIELD(STIH416_MPE_STATUS,		11, 18),
+	[INT_ENABLE]	= REG_FIELD(STIH416_MPE_INT_EN,		0,  0),
+};
+
+static irqreturn_t st_mmap_thermal_trip_handler(int irq, void *sdata)
+{
+	struct st_thermal_sensor *sensor = sdata;
+
+	thermal_zone_device_update(sensor->thermal_dev);
+
+	return IRQ_HANDLED;
+}
+
+/* Private ops for the Memory Mapped based thermal sensors */
+static int st_mmap_power_ctrl(struct st_thermal_sensor *sensor,
+			      enum st_thermal_power_state power_state)
+{
+	const unsigned int mask = (THERMAL_PDN | THERMAL_SRSTN);
+	const unsigned int val = power_state ? mask : 0;
+
+	return regmap_update_bits(sensor->regmap, STIH416_MPE_CONF, mask, val);
+}
+
+static int st_mmap_alloc_regfields(struct st_thermal_sensor *sensor)
+{
+	struct device *dev = sensor->dev;
+	struct regmap *regmap = sensor->regmap;
+	const struct reg_field *reg_fields = sensor->cdata->reg_fields;
+
+	sensor->int_thresh_hi = devm_regmap_field_alloc(dev, regmap,
+						reg_fields[INT_THRESH_HI]);
+	sensor->int_enable = devm_regmap_field_alloc(dev, regmap,
+						reg_fields[INT_ENABLE]);
+
+	if (IS_ERR(sensor->int_thresh_hi) || IS_ERR(sensor->int_enable)) {
+		dev_err(dev, "failed to alloc mmap regfields\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int st_mmap_enable_irq(struct st_thermal_sensor *sensor)
+{
+	int ret;
+
+	/* Set upper critical threshold */
+	ret = regmap_field_write(sensor->int_thresh_hi,
+				 sensor->cdata->crit_temp -
+				 sensor->cdata->temp_adjust_val);
+	if (ret)
+		return ret;
+
+	return regmap_field_write(sensor->int_enable, 1);
+}
+
+static int st_mmap_register_enable_irq(struct st_thermal_sensor *sensor)
+{
+	struct device *dev = sensor->dev;
+	struct platform_device *pdev = to_platform_device(dev);
+	int ret;
+
+	sensor->irq = platform_get_irq(pdev, 0);
+	if (sensor->irq < 0) {
+		dev_err(dev, "failed to register IRQ\n");
+		return sensor->irq;
+	}
+
+	ret = devm_request_threaded_irq(dev, sensor->irq,
+					NULL, st_mmap_thermal_trip_handler,
+					IRQF_TRIGGER_RISING | IRQF_ONESHOT,
+					dev->driver->name, sensor);
+	if (ret) {
+		dev_err(dev, "failed to register IRQ %d\n", sensor->irq);
+		return ret;
+	}
+
+	return st_mmap_enable_irq(sensor);
+}
+
+static const struct regmap_config st_416mpe_regmap_config = {
+	.reg_bits = 32,
+	.val_bits = 32,
+	.reg_stride = 4,
+};
+
+static int st_mmap_regmap_init(struct st_thermal_sensor *sensor)
+{
+	struct device *dev = sensor->dev;
+	struct platform_device *pdev = to_platform_device(dev);
+	struct resource *res;
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (!res) {
+		dev_err(dev, "no memory resources defined\n");
+		return -ENODEV;
+	}
+
+	sensor->mmio_base = devm_ioremap_resource(dev, res);
+	if (IS_ERR(sensor->mmio_base)) {
+		dev_err(dev, "failed to remap IO\n");
+		return PTR_ERR(sensor->mmio_base);
+	}
+
+	sensor->regmap = devm_regmap_init_mmio(dev, sensor->mmio_base,
+				&st_416mpe_regmap_config);
+	if (IS_ERR(sensor->regmap)) {
+		dev_err(dev, "failed to initialise regmap\n");
+		return PTR_ERR(sensor->regmap);
+	}
+
+	return 0;
+}
+
+static const struct st_thermal_sensor_ops st_mmap_sensor_ops = {
+	.power_ctrl		= st_mmap_power_ctrl,
+	.alloc_regfields	= st_mmap_alloc_regfields,
+	.regmap_init		= st_mmap_regmap_init,
+	.register_enable_irq	= st_mmap_register_enable_irq,
+	.enable_irq		= st_mmap_enable_irq,
+};
+
+/* Compatible device data stih416 mpe thermal sensor */
+const struct st_thermal_compat_data st_416mpe_cdata = {
+	.reg_fields		= st_mmap_thermal_regfields,
+	.ops			= &st_mmap_sensor_ops,
+	.calibration_val	= 14,
+	.temp_adjust_val	= -95,
+	.crit_temp		= 120,
+};
+
+/* Compatible device data stih407 thermal sensor */
+const struct st_thermal_compat_data st_407_cdata = {
+	.reg_fields		= st_mmap_thermal_regfields,
+	.ops			= &st_mmap_sensor_ops,
+	.calibration_val	= 16,
+	.temp_adjust_val	= -95,
+	.crit_temp		= 120,
+};
+
+static struct of_device_id st_mmap_thermal_of_match[] = {
+	{ .compatible = "st,stih416-mpe-thermal", .data = &st_416mpe_cdata },
+	{ .compatible = "st,stih407-thermal",     .data = &st_407_cdata },
+	{ /* sentinel */ }
+};
+MODULE_DEVICE_TABLE(of, st_mmap_thermal_of_match);
+
+int st_mmap_probe(struct platform_device *pdev)
+{
+	return st_thermal_register(pdev,  st_mmap_thermal_of_match);
+}
+
+int st_mmap_remove(struct platform_device *pdev)
+{
+	return st_thermal_unregister(pdev);
+}
+
+static struct platform_driver st_mmap_thermal_driver = {
+	.driver = {
+		.name	= "st_thermal_mmap",
+		.owner  = THIS_MODULE,
+		.pm     = &st_thermal_pm_ops,
+		.of_match_table = st_mmap_thermal_of_match,
+	},
+	.probe		= st_mmap_probe,
+	.remove		= st_mmap_remove,
+};
+
+module_platform_driver(st_mmap_thermal_driver);
+
+MODULE_AUTHOR("STMicroelectronics (R&D) Limited <ajitpal.singh@st.com>");
+MODULE_DESCRIPTION("STMicroelectronics STi SoC Thermal Sensor Driver");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/thermal/st/st_thermal_syscfg.c b/drivers/thermal/st/st_thermal_syscfg.c
new file mode 100644
index 000000000000..888b58e64090
--- /dev/null
+++ b/drivers/thermal/st/st_thermal_syscfg.c
@@ -0,0 +1,179 @@
+/*
+ * ST Thermal Sensor Driver for syscfg based sensors.
+ * Author: Ajit Pal Singh <ajitpal.singh@st.com>
+ *
+ * Copyright (C) 2003-2014 STMicroelectronics (R&D) Limited
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/of.h>
+#include <linux/module.h>
+#include <linux/mfd/syscon.h>
+
+#include "st_thermal.h"
+
+/* STiH415 */
+#define STIH415_SYSCFG_FRONT(num)		((num - 100) * 4)
+#define STIH415_SAS_THSENS_CONF			STIH415_SYSCFG_FRONT(178)
+#define STIH415_SAS_THSENS_STATUS		STIH415_SYSCFG_FRONT(198)
+#define STIH415_SYSCFG_MPE(num)			((num - 600) * 4)
+#define STIH415_MPE_THSENS_CONF			STIH415_SYSCFG_MPE(607)
+#define STIH415_MPE_THSENS_STATUS		STIH415_SYSCFG_MPE(667)
+
+/* STiH416 */
+#define STIH416_SYSCFG_FRONT(num)		((num - 1000) * 4)
+#define STIH416_SAS_THSENS_CONF			STIH416_SYSCFG_FRONT(1552)
+#define STIH416_SAS_THSENS_STATUS1		STIH416_SYSCFG_FRONT(1554)
+#define STIH416_SAS_THSENS_STATUS2		STIH416_SYSCFG_FRONT(1594)
+
+/* STiD127 */
+#define STID127_SYSCFG_CPU(num)			((num - 700) * 4)
+#define STID127_THSENS_CONF			STID127_SYSCFG_CPU(743)
+#define STID127_THSENS_STATUS			STID127_SYSCFG_CPU(767)
+
+static const struct reg_field st_415sas_regfields[MAX_REGFIELDS] = {
+	[TEMP_PWR] = REG_FIELD(STIH415_SAS_THSENS_CONF,   9,  9),
+	[DCORRECT] = REG_FIELD(STIH415_SAS_THSENS_CONF,   4,  8),
+	[OVERFLOW] = REG_FIELD(STIH415_SAS_THSENS_STATUS, 8,  8),
+	[DATA] 	   = REG_FIELD(STIH415_SAS_THSENS_STATUS, 10, 16),
+};
+
+static const struct reg_field st_415mpe_regfields[MAX_REGFIELDS] = {
+	[TEMP_PWR] = REG_FIELD(STIH415_MPE_THSENS_CONF,   8,  8),
+	[DCORRECT] = REG_FIELD(STIH415_MPE_THSENS_CONF,   3,  7),
+	[OVERFLOW] = REG_FIELD(STIH415_MPE_THSENS_STATUS, 9,  9),
+	[DATA]     = REG_FIELD(STIH415_MPE_THSENS_STATUS, 11, 18),
+};
+
+static const struct reg_field st_416sas_regfields[MAX_REGFIELDS] = {
+	[TEMP_PWR] = REG_FIELD(STIH416_SAS_THSENS_CONF,    9,  9),
+	[DCORRECT] = REG_FIELD(STIH416_SAS_THSENS_CONF,    4,  8),
+	[OVERFLOW] = REG_FIELD(STIH416_SAS_THSENS_STATUS1, 8,  8),
+	[DATA]     = REG_FIELD(STIH416_SAS_THSENS_STATUS2, 10, 16),
+};
+
+static const struct reg_field st_127_regfields[MAX_REGFIELDS] = {
+	[TEMP_PWR] = REG_FIELD(STID127_THSENS_CONF,   7,  7),
+	[DCORRECT] = REG_FIELD(STID127_THSENS_CONF,   2,  6),
+	[OVERFLOW] = REG_FIELD(STID127_THSENS_STATUS, 9,  9),
+	[DATA]     = REG_FIELD(STID127_THSENS_STATUS, 11, 18),
+};
+
+/* Private OPs for System Configuration Register based thermal sensors */
+static int st_syscfg_power_ctrl(struct st_thermal_sensor *sensor,
+				enum st_thermal_power_state power_state)
+{
+	return regmap_field_write(sensor->pwr, power_state);
+}
+
+static int st_syscfg_alloc_regfields(struct st_thermal_sensor *sensor)
+{
+	struct device *dev = sensor->dev;
+
+	sensor->pwr = devm_regmap_field_alloc(dev, sensor->regmap,
+					sensor->cdata->reg_fields[TEMP_PWR]);
+
+	if (IS_ERR(sensor->pwr)) {
+		dev_err(dev, "failed to alloc syscfg regfields\n");
+		return PTR_ERR(sensor->pwr);
+	}
+
+	return 0;
+}
+
+static int st_syscfg_regmap_init(struct st_thermal_sensor *sensor)
+{
+	sensor->regmap =
+		syscon_regmap_lookup_by_compatible(sensor->cdata->sys_compat);
+	if (IS_ERR(sensor->regmap)) {
+		dev_err(sensor->dev, "failed to find syscfg regmap\n");
+		return PTR_ERR(sensor->regmap);
+	}
+
+	return 0;
+}
+
+static const struct st_thermal_sensor_ops st_syscfg_sensor_ops = {
+	.power_ctrl		= st_syscfg_power_ctrl,
+	.alloc_regfields	= st_syscfg_alloc_regfields,
+	.regmap_init		= st_syscfg_regmap_init,
+};
+
+/* Compatible device data for stih415 sas thermal sensor */
+const struct st_thermal_compat_data st_415sas_cdata = {
+	.sys_compat		= "st,stih415-front-syscfg",
+	.reg_fields		= st_415sas_regfields,
+	.ops			= &st_syscfg_sensor_ops,
+	.calibration_val	= 16,
+	.temp_adjust_val	= 20,
+	.crit_temp		= 120,
+};
+
+/* Compatible device data for stih415 mpe thermal sensor */
+const struct st_thermal_compat_data st_415mpe_cdata = {
+	.sys_compat		= "st,stih415-system-syscfg",
+	.reg_fields		= st_415mpe_regfields,
+	.ops			= &st_syscfg_sensor_ops,
+	.calibration_val	= 16,
+	.temp_adjust_val	= -103,
+	.crit_temp		= 120,
+};
+
+/* Compatible device data for stih416 sas thermal sensor */
+const struct st_thermal_compat_data st_416sas_cdata = {
+	.sys_compat		= "st,stih416-front-syscfg",
+	.reg_fields		= st_416sas_regfields,
+	.ops			= &st_syscfg_sensor_ops,
+	.calibration_val	= 16,
+	.temp_adjust_val	= 20,
+	.crit_temp		= 120,
+};
+
+/* Compatible device data for stid127 thermal sensor */
+const struct st_thermal_compat_data st_127_cdata = {
+	.sys_compat		= "st,stid127-cpu-syscfg",
+	.reg_fields		= st_127_regfields,
+	.ops			= &st_syscfg_sensor_ops,
+	.calibration_val	= 8,
+	.temp_adjust_val	= -103,
+	.crit_temp		= 120,
+};
+
+static struct of_device_id st_syscfg_thermal_of_match[] = {
+	{ .compatible = "st,stih415-sas-thermal", .data = &st_415sas_cdata },
+	{ .compatible = "st,stih415-mpe-thermal", .data = &st_415mpe_cdata },
+	{ .compatible = "st,stih416-sas-thermal", .data = &st_416sas_cdata },
+	{ .compatible = "st,stid127-thermal",     .data = &st_127_cdata },
+	{ /* sentinel */ }
+};
+MODULE_DEVICE_TABLE(of, st_syscfg_thermal_of_match);
+
+int st_syscfg_probe(struct platform_device *pdev)
+{
+	return st_thermal_register(pdev, st_syscfg_thermal_of_match);
+}
+
+int st_syscfg_remove(struct platform_device *pdev)
+{
+	return st_thermal_unregister(pdev);
+}
+
+static struct platform_driver st_syscfg_thermal_driver = {
+	.driver = {
+		.name	= "st_syscfg_thermal",
+		.owner  = THIS_MODULE,
+		.pm     = &st_thermal_pm_ops,
+		.of_match_table =  st_syscfg_thermal_of_match,
+	},
+	.probe		= st_syscfg_probe,
+	.remove		= st_syscfg_remove,
+};
+module_platform_driver(st_syscfg_thermal_driver);
+
+MODULE_AUTHOR("STMicroelectronics (R&D) Limited <ajitpal.singh@st.com>");
+MODULE_DESCRIPTION("STMicroelectronics STi SoC Thermal Sensor Driver");
+MODULE_LICENSE("GPL v2");
diff --git a/fs/Makefile b/fs/Makefile
index 4030cbfbc9af..90c88529892b 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -11,7 +11,7 @@ obj-y :=	open.o read_write.o file_table.o super.o \
 		attr.o bad_inode.o file.o filesystems.o namespace.o \
 		seq_file.o xattr.o libfs.o fs-writeback.o \
 		pnode.o splice.o sync.o utimes.o \
-		stack.o fs_struct.o statfs.o
+		stack.o fs_struct.o statfs.o fs_pin.o
 
 ifeq ($(CONFIG_BLOCK),y)
 obj-y +=	buffer.o block_dev.o direct-io.o mpage.o
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index 7c93953030fb..afd2b4408adf 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -218,8 +218,9 @@ static int bad_inode_mknod (struct inode *dir, struct dentry *dentry,
 	return -EIO;
 }
 
-static int bad_inode_rename (struct inode *old_dir, struct dentry *old_dentry,
-		struct inode *new_dir, struct dentry *new_dentry)
+static int bad_inode_rename2(struct inode *old_dir, struct dentry *old_dentry,
+			     struct inode *new_dir, struct dentry *new_dentry,
+			     unsigned int flags)
 {
 	return -EIO;
 }
@@ -279,7 +280,7 @@ static const struct inode_operations bad_inode_ops =
 	.mkdir		= bad_inode_mkdir,
 	.rmdir		= bad_inode_rmdir,
 	.mknod		= bad_inode_mknod,
-	.rename		= bad_inode_rename,
+	.rename2	= bad_inode_rename2,
 	.readlink	= bad_inode_readlink,
 	/* follow_link must be no-op, otherwise unmounting this inode
 	   won't work */
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3668048e16f8..3183742d6f0d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -8476,6 +8476,16 @@ out_notrans:
 	return ret;
 }
 
+static int btrfs_rename2(struct inode *old_dir, struct dentry *old_dentry,
+			 struct inode *new_dir, struct dentry *new_dentry,
+			 unsigned int flags)
+{
+	if (flags & ~RENAME_NOREPLACE)
+		return -EINVAL;
+
+	return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry);
+}
+
 static void btrfs_run_delalloc_work(struct btrfs_work *work)
 {
 	struct btrfs_delalloc_work *delalloc_work;
@@ -9019,7 +9029,7 @@ static const struct inode_operations btrfs_dir_inode_operations = {
 	.link		= btrfs_link,
 	.mkdir		= btrfs_mkdir,
 	.rmdir		= btrfs_rmdir,
-	.rename		= btrfs_rename,
+	.rename2	= btrfs_rename2,
 	.symlink	= btrfs_symlink,
 	.setattr	= btrfs_setattr,
 	.mknod		= btrfs_mknod,
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 8e16bca69c56..67b48b9a03e0 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -851,7 +851,6 @@ static struct dentry *get_default_root(struct super_block *sb,
 	struct btrfs_path *path;
 	struct btrfs_key location;
 	struct inode *inode;
-	struct dentry *dentry;
 	u64 dir_id;
 	int new = 0;
 
@@ -922,13 +921,7 @@ setup_root:
 		return dget(sb->s_root);
 	}
 
-	dentry = d_obtain_alias(inode);
-	if (!IS_ERR(dentry)) {
-		spin_lock(&dentry->d_lock);
-		dentry->d_flags &= ~DCACHE_DISCONNECTED;
-		spin_unlock(&dentry->d_lock);
-	}
-	return dentry;
+	return d_obtain_root(inode);
 }
 
 static int btrfs_fill_super(struct super_block *sb,
diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c
index 469f2e8657e8..cebf2ebefb55 100644
--- a/fs/ceph/acl.c
+++ b/fs/ceph/acl.c
@@ -172,14 +172,24 @@ out:
 int ceph_init_acl(struct dentry *dentry, struct inode *inode, struct inode *dir)
 {
 	struct posix_acl *default_acl, *acl;
+	umode_t new_mode = inode->i_mode;
 	int error;
 
-	error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
+	error = posix_acl_create(dir, &new_mode, &default_acl, &acl);
 	if (error)
 		return error;
 
-	if (!default_acl && !acl)
+	if (!default_acl && !acl) {
 		cache_no_acl(inode);
+		if (new_mode != inode->i_mode) {
+			struct iattr newattrs = {
+				.ia_mode = new_mode,
+				.ia_valid = ATTR_MODE,
+			};
+			error = ceph_setattr(dentry, &newattrs);
+		}
+		return error;
+	}
 
 	if (default_acl) {
 		error = ceph_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 1fde164b74b5..6d1cd45dca89 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -3277,7 +3277,7 @@ int ceph_encode_inode_release(void **p, struct inode *inode,
 			rel->ino = cpu_to_le64(ceph_ino(inode));
 			rel->cap_id = cpu_to_le64(cap->cap_id);
 			rel->seq = cpu_to_le32(cap->seq);
-			rel->issue_seq = cpu_to_le32(cap->issue_seq),
+			rel->issue_seq = cpu_to_le32(cap->issue_seq);
 			rel->mseq = cpu_to_le32(cap->mseq);
 			rel->caps = cpu_to_le32(cap->implemented);
 			rel->wanted = cpu_to_le32(cap->mds_wanted);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 302085100c28..2eb02f80a0ab 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -423,6 +423,9 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i,
 	dout("sync_read on file %p %llu~%u %s\n", file, off,
 	     (unsigned)len,
 	     (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
+
+	if (!len)
+		return 0;
 	/*
 	 * flush any page cache pages in this range.  this
 	 * will make concurrent normal and sync io slow,
@@ -470,8 +473,11 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i,
 			size_t left = ret;
 
 			while (left) {
-				int copy = min_t(size_t, PAGE_SIZE, left);
-				l = copy_page_to_iter(pages[k++], 0, copy, i);
+				size_t page_off = off & ~PAGE_MASK;
+				size_t copy = min_t(size_t,
+						    PAGE_SIZE - page_off, left);
+				l = copy_page_to_iter(pages[k++], page_off,
+						      copy, i);
 				off += l;
 				left -= l;
 				if (l < copy)
@@ -531,7 +537,7 @@ static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe)
  * objects, rollback on failure, etc.)
  */
 static ssize_t
-ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from)
+ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file_inode(file);
@@ -547,7 +553,6 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from)
 	int check_caps = 0;
 	int ret;
 	struct timespec mtime = CURRENT_TIME;
-	loff_t pos = iocb->ki_pos;
 	size_t count = iov_iter_count(from);
 
 	if (ceph_snap(file_inode(file)) != CEPH_NOSNAP)
@@ -646,7 +651,8 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from)
  * correct atomic write, we should e.g. take write locks on all
  * objects, rollback on failure, etc.)
  */
-static ssize_t ceph_sync_write(struct kiocb *iocb, struct iov_iter *from)
+static ssize_t
+ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file_inode(file);
@@ -663,7 +669,6 @@ static ssize_t ceph_sync_write(struct kiocb *iocb, struct iov_iter *from)
 	int check_caps = 0;
 	int ret;
 	struct timespec mtime = CURRENT_TIME;
-	loff_t pos = iocb->ki_pos;
 	size_t count = iov_iter_count(from);
 
 	if (ceph_snap(file_inode(file)) != CEPH_NOSNAP)
@@ -918,9 +923,9 @@ retry_snap:
 		/* we might need to revert back to that point */
 		data = *from;
 		if (file->f_flags & O_DIRECT)
-			written = ceph_sync_direct_write(iocb, &data);
+			written = ceph_sync_direct_write(iocb, &data, pos);
 		else
-			written = ceph_sync_write(iocb, &data);
+			written = ceph_sync_write(iocb, &data, pos);
 		if (written == -EOLDSNAPC) {
 			dout("aio_write %p %llx.%llx %llu~%u"
 				"got EOLDSNAPC, retrying\n",
@@ -1177,6 +1182,9 @@ static long ceph_fallocate(struct file *file, int mode,
 	loff_t endoff = 0;
 	loff_t size;
 
+	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+		return -EOPNOTSUPP;
+
 	if (!S_ISREG(inode->i_mode))
 		return -EOPNOTSUPP;
 
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 92a2548278fc..bad07c09f91e 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1904,6 +1904,7 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
 	     req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts);
 
 	if (req->r_got_unsafe) {
+		void *p;
 		/*
 		 * Replay.  Do not regenerate message (and rebuild
 		 * paths, etc.); just use the original message.
@@ -1924,8 +1925,13 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
 
 		/* remove cap/dentry releases from message */
 		rhead->num_releases = 0;
-		msg->hdr.front_len = cpu_to_le32(req->r_request_release_offset);
-		msg->front.iov_len = req->r_request_release_offset;
+
+		/* time stamp */
+		p = msg->front.iov_base + req->r_request_release_offset;
+		ceph_encode_copy(&p, &req->r_stamp, sizeof(req->r_stamp));
+
+		msg->front.iov_len = p - msg->front.iov_base;
+		msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
 		return 0;
 	}
 
@@ -2061,11 +2067,12 @@ static void __wake_requests(struct ceph_mds_client *mdsc,
 static void kick_requests(struct ceph_mds_client *mdsc, int mds)
 {
 	struct ceph_mds_request *req;
-	struct rb_node *p;
+	struct rb_node *p = rb_first(&mdsc->request_tree);
 
 	dout("kick_requests mds%d\n", mds);
-	for (p = rb_first(&mdsc->request_tree); p; p = rb_next(p)) {
+	while (p) {
 		req = rb_entry(p, struct ceph_mds_request, r_node);
+		p = rb_next(p);
 		if (req->r_got_unsafe)
 			continue;
 		if (req->r_session &&
@@ -2248,6 +2255,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
 	 */
 	if (result == -ESTALE) {
 		dout("got ESTALE on request %llu", req->r_tid);
+		req->r_resend_mds = -1;
 		if (req->r_direct_mode != USE_AUTH_MDS) {
 			dout("not using auth, setting for that now");
 			req->r_direct_mode = USE_AUTH_MDS;
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 06150fd745ac..f6e12377335c 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -755,7 +755,7 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,
 				goto out;
 			}
 		} else {
-			root = d_obtain_alias(inode);
+			root = d_obtain_root(inode);
 		}
 		ceph_init_dentry(root);
 		dout("open_root_inode success, root dentry is %p\n", root);
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index c9c2b887381e..12f58d22e017 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -592,12 +592,12 @@ start:
 		xattr_version = ci->i_xattrs.version;
 		spin_unlock(&ci->i_ceph_lock);
 
-		xattrs = kcalloc(numattr, sizeof(struct ceph_xattr *),
+		xattrs = kcalloc(numattr, sizeof(struct ceph_inode_xattr *),
 				 GFP_NOFS);
 		err = -ENOMEM;
 		if (!xattrs)
 			goto bad_lock;
-		memset(xattrs, 0, numattr*sizeof(struct ceph_xattr *));
+
 		for (i = 0; i < numattr; i++) {
 			xattrs[i] = kmalloc(sizeof(struct ceph_inode_xattr),
 					    GFP_NOFS);
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 888398067420..ac4f260155c8 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -848,7 +848,7 @@ const struct inode_operations cifs_dir_inode_ops = {
 	.link = cifs_hardlink,
 	.mkdir = cifs_mkdir,
 	.rmdir = cifs_rmdir,
-	.rename = cifs_rename,
+	.rename2 = cifs_rename2,
 	.permission = cifs_permission,
 /*	revalidate:cifs_revalidate,   */
 	.setattr = cifs_setattr,
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 560480263336..b0fafa499505 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -68,8 +68,8 @@ extern int cifs_hardlink(struct dentry *, struct inode *, struct dentry *);
 extern int cifs_mknod(struct inode *, struct dentry *, umode_t, dev_t);
 extern int cifs_mkdir(struct inode *, struct dentry *, umode_t);
 extern int cifs_rmdir(struct inode *, struct dentry *);
-extern int cifs_rename(struct inode *, struct dentry *, struct inode *,
-		       struct dentry *);
+extern int cifs_rename2(struct inode *, struct dentry *, struct inode *,
+			struct dentry *, unsigned int);
 extern int cifs_revalidate_file_attr(struct file *filp);
 extern int cifs_revalidate_dentry_attr(struct dentry *);
 extern int cifs_revalidate_file(struct file *filp);
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 41de3935caa0..426d6c6ad8bf 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1627,8 +1627,9 @@ do_rename_exit:
 }
 
 int
-cifs_rename(struct inode *source_dir, struct dentry *source_dentry,
-	    struct inode *target_dir, struct dentry *target_dentry)
+cifs_rename2(struct inode *source_dir, struct dentry *source_dentry,
+	     struct inode *target_dir, struct dentry *target_dentry,
+	     unsigned int flags)
 {
 	char *from_name = NULL;
 	char *to_name = NULL;
@@ -1640,6 +1641,9 @@ cifs_rename(struct inode *source_dir, struct dentry *source_dentry,
 	unsigned int xid;
 	int rc, tmprc;
 
+	if (flags & ~RENAME_NOREPLACE)
+		return -EINVAL;
+
 	cifs_sb = CIFS_SB(source_dir->i_sb);
 	tlink = cifs_sb_tlink(cifs_sb);
 	if (IS_ERR(tlink))
@@ -1667,6 +1671,12 @@ cifs_rename(struct inode *source_dir, struct dentry *source_dentry,
 	rc = cifs_do_rename(xid, source_dentry, from_name, target_dentry,
 			    to_name);
 
+	/*
+	 * No-replace is the natural behavior for CIFS, so skip unlink hacks.
+	 */
+	if (flags & RENAME_NOREPLACE)
+		goto cifs_rename_exit;
+
 	if (rc == -EEXIST && tcon->unix_ext) {
 		/*
 		 * Are src and dst hardlinks of same inode? We can only tell
diff --git a/fs/dcache.c b/fs/dcache.c
index 06f65857a855..d30ce699ae4b 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -731,8 +731,6 @@ EXPORT_SYMBOL(dget_parent);
 /**
  * d_find_alias - grab a hashed alias of inode
  * @inode: inode in question
- * @want_discon:  flag, used by d_splice_alias, to request
- *          that only a DISCONNECTED alias be returned.
  *
  * If inode has a hashed alias, or is a directory and has any alias,
  * acquire the reference to alias and return it. Otherwise return NULL.
@@ -741,10 +739,9 @@ EXPORT_SYMBOL(dget_parent);
  * of a filesystem.
  *
  * If the inode has an IS_ROOT, DCACHE_DISCONNECTED alias, then prefer
- * any other hashed alias over that one unless @want_discon is set,
- * in which case only return an IS_ROOT, DCACHE_DISCONNECTED alias.
+ * any other hashed alias over that one.
  */
-static struct dentry *__d_find_alias(struct inode *inode, int want_discon)
+static struct dentry *__d_find_alias(struct inode *inode)
 {
 	struct dentry *alias, *discon_alias;
 
@@ -756,7 +753,7 @@ again:
 			if (IS_ROOT(alias) &&
 			    (alias->d_flags & DCACHE_DISCONNECTED)) {
 				discon_alias = alias;
-			} else if (!want_discon) {
+			} else {
 				__dget_dlock(alias);
 				spin_unlock(&alias->d_lock);
 				return alias;
@@ -768,12 +765,9 @@ again:
 		alias = discon_alias;
 		spin_lock(&alias->d_lock);
 		if (S_ISDIR(inode->i_mode) || !d_unhashed(alias)) {
-			if (IS_ROOT(alias) &&
-			    (alias->d_flags & DCACHE_DISCONNECTED)) {
-				__dget_dlock(alias);
-				spin_unlock(&alias->d_lock);
-				return alias;
-			}
+			__dget_dlock(alias);
+			spin_unlock(&alias->d_lock);
+			return alias;
 		}
 		spin_unlock(&alias->d_lock);
 		goto again;
@@ -787,7 +781,7 @@ struct dentry *d_find_alias(struct inode *inode)
 
 	if (!hlist_empty(&inode->i_dentry)) {
 		spin_lock(&inode->i_lock);
-		de = __d_find_alias(inode, 0);
+		de = __d_find_alias(inode);
 		spin_unlock(&inode->i_lock);
 	}
 	return de;
@@ -1781,25 +1775,7 @@ struct dentry *d_find_any_alias(struct inode *inode)
 }
 EXPORT_SYMBOL(d_find_any_alias);
 
-/**
- * d_obtain_alias - find or allocate a dentry for a given inode
- * @inode: inode to allocate the dentry for
- *
- * Obtain a dentry for an inode resulting from NFS filehandle conversion or
- * similar open by handle operations.  The returned dentry may be anonymous,
- * or may have a full name (if the inode was already in the cache).
- *
- * When called on a directory inode, we must ensure that the inode only ever
- * has one dentry.  If a dentry is found, that is returned instead of
- * allocating a new one.
- *
- * On successful return, the reference to the inode has been transferred
- * to the dentry.  In case of an error the reference on the inode is released.
- * To make it easier to use in export operations a %NULL or IS_ERR inode may
- * be passed in and will be the error will be propagate to the return value,
- * with a %NULL @inode replaced by ERR_PTR(-ESTALE).
- */
-struct dentry *d_obtain_alias(struct inode *inode)
+static struct dentry *__d_obtain_alias(struct inode *inode, int disconnected)
 {
 	static const struct qstr anonstring = QSTR_INIT("/", 1);
 	struct dentry *tmp;
@@ -1830,7 +1806,10 @@ struct dentry *d_obtain_alias(struct inode *inode)
 	}
 
 	/* attach a disconnected dentry */
-	add_flags = d_flags_for_inode(inode) | DCACHE_DISCONNECTED;
+	add_flags = d_flags_for_inode(inode);
+
+	if (disconnected)
+		add_flags |= DCACHE_DISCONNECTED;
 
 	spin_lock(&tmp->d_lock);
 	tmp->d_inode = inode;
@@ -1851,59 +1830,51 @@ struct dentry *d_obtain_alias(struct inode *inode)
 	iput(inode);
 	return res;
 }
-EXPORT_SYMBOL(d_obtain_alias);
 
 /**
- * d_splice_alias - splice a disconnected dentry into the tree if one exists
- * @inode:  the inode which may have a disconnected dentry
- * @dentry: a negative dentry which we want to point to the inode.
- *
- * If inode is a directory and has a 'disconnected' dentry (i.e. IS_ROOT and
- * DCACHE_DISCONNECTED), then d_move that in place of the given dentry
- * and return it, else simply d_add the inode to the dentry and return NULL.
+ * d_obtain_alias - find or allocate a DISCONNECTED dentry for a given inode
+ * @inode: inode to allocate the dentry for
  *
- * This is needed in the lookup routine of any filesystem that is exportable
- * (via knfsd) so that we can build dcache paths to directories effectively.
+ * Obtain a dentry for an inode resulting from NFS filehandle conversion or
+ * similar open by handle operations.  The returned dentry may be anonymous,
+ * or may have a full name (if the inode was already in the cache).
  *
- * If a dentry was found and moved, then it is returned.  Otherwise NULL
- * is returned.  This matches the expected return value of ->lookup.
+ * When called on a directory inode, we must ensure that the inode only ever
+ * has one dentry.  If a dentry is found, that is returned instead of
+ * allocating a new one.
  *
- * Cluster filesystems may call this function with a negative, hashed dentry.
- * In that case, we know that the inode will be a regular file, and also this
- * will only occur during atomic_open. So we need to check for the dentry
- * being already hashed only in the final case.
+ * On successful return, the reference to the inode has been transferred
+ * to the dentry.  In case of an error the reference on the inode is released.
+ * To make it easier to use in export operations a %NULL or IS_ERR inode may
+ * be passed in and the error will be propagated to the return value,
+ * with a %NULL @inode replaced by ERR_PTR(-ESTALE).
  */
-struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
+struct dentry *d_obtain_alias(struct inode *inode)
 {
-	struct dentry *new = NULL;
-
-	if (IS_ERR(inode))
-		return ERR_CAST(inode);
+	return __d_obtain_alias(inode, 1);
+}
+EXPORT_SYMBOL(d_obtain_alias);
 
-	if (inode && S_ISDIR(inode->i_mode)) {
-		spin_lock(&inode->i_lock);
-		new = __d_find_alias(inode, 1);
-		if (new) {
-			BUG_ON(!(new->d_flags & DCACHE_DISCONNECTED));
-			spin_unlock(&inode->i_lock);
-			security_d_instantiate(new, inode);
-			d_move(new, dentry);
-			iput(inode);
-		} else {
-			/* already taking inode->i_lock, so d_add() by hand */
-			__d_instantiate(dentry, inode);
-			spin_unlock(&inode->i_lock);
-			security_d_instantiate(dentry, inode);
-			d_rehash(dentry);
-		}
-	} else {
-		d_instantiate(dentry, inode);
-		if (d_unhashed(dentry))
-			d_rehash(dentry);
-	}
-	return new;
+/**
+ * d_obtain_root - find or allocate a dentry for a given inode
+ * @inode: inode to allocate the dentry for
+ *
+ * Obtain an IS_ROOT dentry for the root of a filesystem.
+ *
+ * We must ensure that directory inodes only ever have one dentry.  If a
+ * dentry is found, that is returned instead of allocating a new one.
+ *
+ * On successful return, the reference to the inode has been transferred
+ * to the dentry.  In case of an error the reference on the inode is
+ * released.  A %NULL or IS_ERR inode may be passed in and will be the
+ * error will be propagate to the return value, with a %NULL @inode
+ * replaced by ERR_PTR(-ESTALE).
+ */
+struct dentry *d_obtain_root(struct inode *inode)
+{
+	return __d_obtain_alias(inode, 0);
 }
-EXPORT_SYMBOL(d_splice_alias);
+EXPORT_SYMBOL(d_obtain_root);
 
 /**
  * d_add_ci - lookup or allocate new dentry with case-exact name
@@ -2697,6 +2668,75 @@ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon)
 }
 
 /**
+ * d_splice_alias - splice a disconnected dentry into the tree if one exists
+ * @inode:  the inode which may have a disconnected dentry
+ * @dentry: a negative dentry which we want to point to the inode.
+ *
+ * If inode is a directory and has an IS_ROOT alias, then d_move that in
+ * place of the given dentry and return it, else simply d_add the inode
+ * to the dentry and return NULL.
+ *
+ * If a non-IS_ROOT directory is found, the filesystem is corrupt, and
+ * we should error out: directories can't have multiple aliases.
+ *
+ * This is needed in the lookup routine of any filesystem that is exportable
+ * (via knfsd) so that we can build dcache paths to directories effectively.
+ *
+ * If a dentry was found and moved, then it is returned.  Otherwise NULL
+ * is returned.  This matches the expected return value of ->lookup.
+ *
+ * Cluster filesystems may call this function with a negative, hashed dentry.
+ * In that case, we know that the inode will be a regular file, and also this
+ * will only occur during atomic_open. So we need to check for the dentry
+ * being already hashed only in the final case.
+ */
+struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
+{
+	struct dentry *new = NULL;
+
+	if (IS_ERR(inode))
+		return ERR_CAST(inode);
+
+	if (inode && S_ISDIR(inode->i_mode)) {
+		spin_lock(&inode->i_lock);
+		new = __d_find_any_alias(inode);
+		if (new) {
+			if (!IS_ROOT(new)) {
+				spin_unlock(&inode->i_lock);
+				dput(new);
+				return ERR_PTR(-EIO);
+			}
+			if (d_ancestor(new, dentry)) {
+				spin_unlock(&inode->i_lock);
+				dput(new);
+				return ERR_PTR(-EIO);
+			}
+			write_seqlock(&rename_lock);
+			__d_materialise_dentry(dentry, new);
+			write_sequnlock(&rename_lock);
+			__d_drop(new);
+			_d_rehash(new);
+			spin_unlock(&new->d_lock);
+			spin_unlock(&inode->i_lock);
+			security_d_instantiate(new, inode);
+			iput(inode);
+		} else {
+			/* already taking inode->i_lock, so d_add() by hand */
+			__d_instantiate(dentry, inode);
+			spin_unlock(&inode->i_lock);
+			security_d_instantiate(dentry, inode);
+			d_rehash(dentry);
+		}
+	} else {
+		d_instantiate(dentry, inode);
+		if (d_unhashed(dentry))
+			d_rehash(dentry);
+	}
+	return new;
+}
+EXPORT_SYMBOL(d_splice_alias);
+
+/**
  * d_materialise_unique - introduce an inode into the tree
  * @dentry: candidate dentry
  * @inode: inode to bind to the dentry, to which aliases may be attached
@@ -2724,7 +2764,7 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode)
 		struct dentry *alias;
 
 		/* Does an aliased dentry already exist? */
-		alias = __d_find_alias(inode, 0);
+		alias = __d_find_alias(inode);
 		if (alias) {
 			actual = alias;
 			write_seqlock(&rename_lock);
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 17e39b047de5..c3116404ab49 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -158,7 +158,7 @@ static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio)
 {
 	ssize_t ret;
 
-	ret = iov_iter_get_pages(sdio->iter, dio->pages, DIO_PAGES * PAGE_SIZE,
+	ret = iov_iter_get_pages(sdio->iter, dio->pages, DIO_PAGES,
 				&sdio->from);
 
 	if (ret < 0 && sdio->blocks_available && (dio->rw & WRITE)) {
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 3750031cfa2f..b88edc05c230 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -161,7 +161,7 @@ static struct kmem_cache * ext2_inode_cachep;
 static struct inode *ext2_alloc_inode(struct super_block *sb)
 {
 	struct ext2_inode_info *ei;
-	ei = (struct ext2_inode_info *)kmem_cache_alloc(ext2_inode_cachep, GFP_KERNEL);
+	ei = kmem_cache_alloc(ext2_inode_cachep, GFP_KERNEL);
 	if (!ei)
 		return NULL;
 	ei->i_block_alloc_info = NULL;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 3520ab8a6639..b147a67baa0d 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -3455,7 +3455,6 @@ const struct inode_operations ext4_dir_inode_operations = {
 	.rmdir		= ext4_rmdir,
 	.mknod		= ext4_mknod,
 	.tmpfile	= ext4_tmpfile,
-	.rename		= ext4_rename,
 	.rename2	= ext4_rename2,
 	.setattr	= ext4_setattr,
 	.setxattr	= generic_setxattr,
diff --git a/fs/fs_pin.c b/fs/fs_pin.c
new file mode 100644
index 000000000000..9368236ca100
--- /dev/null
+++ b/fs/fs_pin.c
@@ -0,0 +1,78 @@
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/fs_pin.h>
+#include "internal.h"
+#include "mount.h"
+
+static void pin_free_rcu(struct rcu_head *head)
+{
+	kfree(container_of(head, struct fs_pin, rcu));
+}
+
+static DEFINE_SPINLOCK(pin_lock);
+
+void pin_put(struct fs_pin *p)
+{
+	if (atomic_long_dec_and_test(&p->count))
+		call_rcu(&p->rcu, pin_free_rcu);
+}
+
+void pin_remove(struct fs_pin *pin)
+{
+	spin_lock(&pin_lock);
+	hlist_del(&pin->m_list);
+	hlist_del(&pin->s_list);
+	spin_unlock(&pin_lock);
+}
+
+void pin_insert(struct fs_pin *pin, struct vfsmount *m)
+{
+	spin_lock(&pin_lock);
+	hlist_add_head(&pin->s_list, &m->mnt_sb->s_pins);
+	hlist_add_head(&pin->m_list, &real_mount(m)->mnt_pins);
+	spin_unlock(&pin_lock);
+}
+
+void mnt_pin_kill(struct mount *m)
+{
+	while (1) {
+		struct hlist_node *p;
+		struct fs_pin *pin;
+		rcu_read_lock();
+		p = ACCESS_ONCE(m->mnt_pins.first);
+		if (!p) {
+			rcu_read_unlock();
+			break;
+		}
+		pin = hlist_entry(p, struct fs_pin, m_list);
+		if (!atomic_long_inc_not_zero(&pin->count)) {
+			rcu_read_unlock();
+			cpu_relax();
+			continue;
+		}
+		rcu_read_unlock();
+		pin->kill(pin);
+	}
+}
+
+void sb_pin_kill(struct super_block *sb)
+{
+	while (1) {
+		struct hlist_node *p;
+		struct fs_pin *pin;
+		rcu_read_lock();
+		p = ACCESS_ONCE(sb->s_pins.first);
+		if (!p) {
+			rcu_read_unlock();
+			break;
+		}
+		pin = hlist_entry(p, struct fs_pin, s_list);
+		if (!atomic_long_inc_not_zero(&pin->count)) {
+			rcu_read_unlock();
+			cpu_relax();
+			continue;
+		}
+		rcu_read_unlock();
+		pin->kill(pin);
+	}
+}
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 0c6048247a34..de1d84af9f7c 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -845,12 +845,6 @@ static int fuse_rename2(struct inode *olddir, struct dentry *oldent,
 	return err;
 }
 
-static int fuse_rename(struct inode *olddir, struct dentry *oldent,
-		       struct inode *newdir, struct dentry *newent)
-{
-	return fuse_rename2(olddir, oldent, newdir, newent, 0);
-}
-
 static int fuse_link(struct dentry *entry, struct inode *newdir,
 		     struct dentry *newent)
 {
@@ -2024,7 +2018,6 @@ static const struct inode_operations fuse_dir_inode_operations = {
 	.symlink	= fuse_symlink,
 	.unlink		= fuse_unlink,
 	.rmdir		= fuse_rmdir,
-	.rename		= fuse_rename,
 	.rename2	= fuse_rename2,
 	.link		= fuse_link,
 	.setattr	= fuse_setattr,
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 40ac2628ddcf..912061ac4baf 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1303,10 +1303,10 @@ static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii,
 	while (nbytes < *nbytesp && req->num_pages < req->max_pages) {
 		unsigned npages;
 		size_t start;
-		unsigned n = req->max_pages - req->num_pages;
 		ssize_t ret = iov_iter_get_pages(ii,
 					&req->pages[req->num_pages],
-					n * PAGE_SIZE, &start);
+					req->max_pages - req->num_pages,
+					&start);
 		if (ret < 0)
 			return ret;
 
diff --git a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h
index 9c88da0e855a..4fcd40d6f308 100644
--- a/fs/hostfs/hostfs.h
+++ b/fs/hostfs/hostfs.h
@@ -89,6 +89,7 @@ extern int do_mknod(const char *file, int mode, unsigned int major,
 extern int link_file(const char *from, const char *to);
 extern int hostfs_do_readlink(char *file, char *buf, int size);
 extern int rename_file(char *from, char *to);
+extern int rename2_file(char *from, char *to, unsigned int flags);
 extern int do_statfs(char *root, long *bsize_out, long long *blocks_out,
 		     long long *bfree_out, long long *bavail_out,
 		     long long *files_out, long long *ffree_out,
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index bb529f3b7f2b..fd62cae0fdcb 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -741,21 +741,31 @@ static int hostfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
 	return err;
 }
 
-static int hostfs_rename(struct inode *from_ino, struct dentry *from,
-			 struct inode *to_ino, struct dentry *to)
+static int hostfs_rename2(struct inode *old_dir, struct dentry *old_dentry,
+			  struct inode *new_dir, struct dentry *new_dentry,
+			  unsigned int flags)
 {
-	char *from_name, *to_name;
+	char *old_name, *new_name;
 	int err;
 
-	if ((from_name = dentry_name(from)) == NULL)
+	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
+		return -EINVAL;
+
+	old_name = dentry_name(old_dentry);
+	if (old_name == NULL)
 		return -ENOMEM;
-	if ((to_name = dentry_name(to)) == NULL) {
-		__putname(from_name);
+	new_name = dentry_name(new_dentry);
+	if (new_name == NULL) {
+		__putname(old_name);
 		return -ENOMEM;
 	}
-	err = rename_file(from_name, to_name);
-	__putname(from_name);
-	__putname(to_name);
+	if (!flags)
+		err = rename_file(old_name, new_name);
+	else
+		err = rename2_file(old_name, new_name, flags);
+
+	__putname(old_name);
+	__putname(new_name);
 	return err;
 }
 
@@ -867,7 +877,7 @@ static const struct inode_operations hostfs_dir_iops = {
 	.mkdir		= hostfs_mkdir,
 	.rmdir		= hostfs_rmdir,
 	.mknod		= hostfs_mknod,
-	.rename		= hostfs_rename,
+	.rename2	= hostfs_rename2,
 	.permission	= hostfs_permission,
 	.setattr	= hostfs_setattr,
 };
diff --git a/fs/hostfs/hostfs_user.c b/fs/hostfs/hostfs_user.c
index 67838f3aa20a..9765dab95cbd 100644
--- a/fs/hostfs/hostfs_user.c
+++ b/fs/hostfs/hostfs_user.c
@@ -14,6 +14,7 @@
 #include <sys/time.h>
 #include <sys/types.h>
 #include <sys/vfs.h>
+#include <sys/syscall.h>
 #include "hostfs.h"
 #include <utime.h>
 
@@ -360,6 +361,33 @@ int rename_file(char *from, char *to)
 	return 0;
 }
 
+int rename2_file(char *from, char *to, unsigned int flags)
+{
+	int err;
+
+#ifndef SYS_renameat2
+#  ifdef __x86_64__
+#    define SYS_renameat2 316
+#  endif
+#  ifdef __i386__
+#    define SYS_renameat2 353
+#  endif
+#endif
+
+#ifdef SYS_renameat2
+	err = syscall(SYS_renameat2, AT_FDCWD, from, AT_FDCWD, to, flags);
+	if (err < 0) {
+		if (errno != ENOSYS)
+			return -errno;
+		else
+			return -EINVAL;
+	}
+	return 0;
+#else
+	return -EINVAL;
+#endif
+}
+
 int do_statfs(char *root, long *bsize_out, long long *blocks_out,
 	      long long *bfree_out, long long *bavail_out,
 	      long long *files_out, long long *ffree_out,
diff --git a/fs/internal.h b/fs/internal.h
index 465742407466..e325b4f9c799 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -131,7 +131,6 @@ extern long prune_dcache_sb(struct super_block *sb, unsigned long nr_to_scan,
 /*
  * read_write.c
  */
-extern ssize_t __kernel_write(struct file *, const char *, size_t, loff_t *);
 extern int rw_verify_area(int, struct file *, const loff_t *, size_t);
 
 /*
@@ -144,3 +143,9 @@ extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
  * pipe.c
  */
 extern const struct file_operations pipefifo_fops;
+
+/*
+ * fs_pin.c
+ */
+extern void sb_pin_kill(struct super_block *sb);
+extern void mnt_pin_kill(struct mount *m);
diff --git a/fs/mount.h b/fs/mount.h
index d55297f2fa05..6740a6215529 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -55,7 +55,7 @@ struct mount {
 	int mnt_id;			/* mount identifier */
 	int mnt_group_id;		/* peer group identifier */
 	int mnt_expiry_mark;		/* true if marked for expiry */
-	int mnt_pinned;
+	struct hlist_head mnt_pins;
 	struct path mnt_ex_mountpoint;
 };
 
diff --git a/fs/namei.c b/fs/namei.c
index 9eb787e5c167..a996bb48dfab 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1091,10 +1091,10 @@ int follow_down_one(struct path *path)
 }
 EXPORT_SYMBOL(follow_down_one);
 
-static inline bool managed_dentry_might_block(struct dentry *dentry)
+static inline int managed_dentry_rcu(struct dentry *dentry)
 {
-	return (dentry->d_flags & DCACHE_MANAGE_TRANSIT &&
-		dentry->d_op->d_manage(dentry, true) < 0);
+	return (dentry->d_flags & DCACHE_MANAGE_TRANSIT) ?
+		dentry->d_op->d_manage(dentry, true) : 0;
 }
 
 /*
@@ -1110,11 +1110,18 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
 		 * Don't forget we might have a non-mountpoint managed dentry
 		 * that wants to block transit.
 		 */
-		if (unlikely(managed_dentry_might_block(path->dentry)))
+		switch (managed_dentry_rcu(path->dentry)) {
+		case -ECHILD:
+		default:
 			return false;
+		case -EISDIR:
+			return true;
+		case 0:
+			break;
+		}
 
 		if (!d_mountpoint(path->dentry))
-			return true;
+			return !(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT);
 
 		mounted = __lookup_mnt(path->mnt, path->dentry);
 		if (!mounted)
@@ -1130,7 +1137,8 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
 		 */
 		*inode = path->dentry->d_inode;
 	}
-	return read_seqretry(&mount_lock, nd->m_seq);
+	return read_seqretry(&mount_lock, nd->m_seq) &&
+		!(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT);
 }
 
 static int follow_dotdot_rcu(struct nameidata *nd)
@@ -1402,11 +1410,8 @@ static int lookup_fast(struct nameidata *nd,
 		}
 		path->mnt = mnt;
 		path->dentry = dentry;
-		if (unlikely(!__follow_mount_rcu(nd, path, inode)))
-			goto unlazy;
-		if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
-			goto unlazy;
-		return 0;
+		if (likely(__follow_mount_rcu(nd, path, inode)))
+			return 0;
 unlazy:
 		if (unlazy_walk(nd, dentry))
 			return -ECHILD;
@@ -4019,7 +4024,7 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname
  * The worst of all namespace operations - renaming directory. "Perverted"
  * doesn't even start to describe it. Somebody in UCB had a heck of a trip...
  * Problems:
- *	a) we can get into loop creation. Check is done in is_subdir().
+ *	a) we can get into loop creation.
  *	b) race potential - two innocent renames can create a loop together.
  *	   That's where 4.4 screws up. Current fix: serialization on
  *	   sb->s_vfs_rename_mutex. We might be more accurate, but that's another
@@ -4075,7 +4080,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	if (error)
 		return error;
 
-	if (!old_dir->i_op->rename)
+	if (!old_dir->i_op->rename && !old_dir->i_op->rename2)
 		return -EPERM;
 
 	if (flags && !old_dir->i_op->rename2)
@@ -4134,10 +4139,11 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		if (error)
 			goto out;
 	}
-	if (!flags) {
+	if (!old_dir->i_op->rename2) {
 		error = old_dir->i_op->rename(old_dir, old_dentry,
 					      new_dir, new_dentry);
 	} else {
+		WARN_ON(old_dir->i_op->rename != NULL);
 		error = old_dir->i_op->rename2(old_dir, old_dentry,
 					       new_dir, new_dentry, flags);
 	}
diff --git a/fs/namespace.c b/fs/namespace.c
index 0acabea58319..a01c7730e9af 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -16,7 +16,6 @@
 #include <linux/namei.h>
 #include <linux/security.h>
 #include <linux/idr.h>
-#include <linux/acct.h>		/* acct_auto_close_mnt */
 #include <linux/init.h>		/* init_rootfs */
 #include <linux/fs_struct.h>	/* get_fs_root et.al. */
 #include <linux/fsnotify.h>	/* fsnotify_vfsmount_delete */
@@ -779,6 +778,20 @@ static void attach_mnt(struct mount *mnt,
 	list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
 }
 
+static void attach_shadowed(struct mount *mnt,
+			struct mount *parent,
+			struct mount *shadows)
+{
+	if (shadows) {
+		hlist_add_behind_rcu(&mnt->mnt_hash, &shadows->mnt_hash);
+		list_add(&mnt->mnt_child, &shadows->mnt_child);
+	} else {
+		hlist_add_head_rcu(&mnt->mnt_hash,
+				m_hash(&parent->mnt, mnt->mnt_mountpoint));
+		list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
+	}
+}
+
 /*
  * vfsmount lock must be held for write
  */
@@ -797,12 +810,7 @@ static void commit_tree(struct mount *mnt, struct mount *shadows)
 
 	list_splice(&head, n->list.prev);
 
-	if (shadows)
-		hlist_add_behind_rcu(&mnt->mnt_hash, &shadows->mnt_hash);
-	else
-		hlist_add_head_rcu(&mnt->mnt_hash,
-				m_hash(&parent->mnt, mnt->mnt_mountpoint));
-	list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
+	attach_shadowed(mnt, parent, shadows);
 	touch_mnt_namespace(n);
 }
 
@@ -951,7 +959,6 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
 
 static void mntput_no_expire(struct mount *mnt)
 {
-put_again:
 	rcu_read_lock();
 	mnt_add_count(mnt, -1);
 	if (likely(mnt->mnt_ns)) { /* shouldn't be the last one */
@@ -964,14 +971,6 @@ put_again:
 		unlock_mount_hash();
 		return;
 	}
-	if (unlikely(mnt->mnt_pinned)) {
-		mnt_add_count(mnt, mnt->mnt_pinned + 1);
-		mnt->mnt_pinned = 0;
-		rcu_read_unlock();
-		unlock_mount_hash();
-		acct_auto_close_mnt(&mnt->mnt);
-		goto put_again;
-	}
 	if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) {
 		rcu_read_unlock();
 		unlock_mount_hash();
@@ -994,6 +993,8 @@ put_again:
 	 * so mnt_get_writers() below is safe.
 	 */
 	WARN_ON(mnt_get_writers(mnt));
+	if (unlikely(mnt->mnt_pins.first))
+		mnt_pin_kill(mnt);
 	fsnotify_vfsmount_delete(&mnt->mnt);
 	dput(mnt->mnt.mnt_root);
 	deactivate_super(mnt->mnt.mnt_sb);
@@ -1021,25 +1022,15 @@ struct vfsmount *mntget(struct vfsmount *mnt)
 }
 EXPORT_SYMBOL(mntget);
 
-void mnt_pin(struct vfsmount *mnt)
-{
-	lock_mount_hash();
-	real_mount(mnt)->mnt_pinned++;
-	unlock_mount_hash();
-}
-EXPORT_SYMBOL(mnt_pin);
-
-void mnt_unpin(struct vfsmount *m)
+struct vfsmount *mnt_clone_internal(struct path *path)
 {
-	struct mount *mnt = real_mount(m);
-	lock_mount_hash();
-	if (mnt->mnt_pinned) {
-		mnt_add_count(mnt, 1);
-		mnt->mnt_pinned--;
-	}
-	unlock_mount_hash();
+	struct mount *p;
+	p = clone_mnt(real_mount(path->mnt), path->dentry, CL_PRIVATE);
+	if (IS_ERR(p))
+		return ERR_CAST(p);
+	p->mnt.mnt_flags |= MNT_INTERNAL;
+	return &p->mnt;
 }
-EXPORT_SYMBOL(mnt_unpin);
 
 static inline void mangle(struct seq_file *m, const char *s)
 {
@@ -1505,6 +1496,7 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
 			continue;
 
 		for (s = r; s; s = next_mnt(s, r)) {
+			struct mount *t = NULL;
 			if (!(flag & CL_COPY_UNBINDABLE) &&
 			    IS_MNT_UNBINDABLE(s)) {
 				s = skip_mnt_tree(s);
@@ -1526,7 +1518,14 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
 				goto out;
 			lock_mount_hash();
 			list_add_tail(&q->mnt_list, &res->mnt_list);
-			attach_mnt(q, parent, p->mnt_mp);
+			mnt_set_mountpoint(parent, p->mnt_mp, q);
+			if (!list_empty(&parent->mnt_mounts)) {
+				t = list_last_entry(&parent->mnt_mounts,
+					struct mount, mnt_child);
+				if (t->mnt_mp != p->mnt_mp)
+					t = NULL;
+			}
+			attach_shadowed(q, parent, t);
 			unlock_mount_hash();
 		}
 	}
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 9b431f44fad9..cbb1797149d5 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -210,8 +210,7 @@ static void bl_end_io_read(struct bio *bio, int err)
 			SetPageUptodate(bvec->bv_page);
 
 	if (err) {
-		struct nfs_pgio_data *rdata = par->data;
-		struct nfs_pgio_header *header = rdata->header;
+		struct nfs_pgio_header *header = par->data;
 
 		if (!header->pnfs_error)
 			header->pnfs_error = -EIO;
@@ -224,43 +223,44 @@ static void bl_end_io_read(struct bio *bio, int err)
 static void bl_read_cleanup(struct work_struct *work)
 {
 	struct rpc_task *task;
-	struct nfs_pgio_data *rdata;
+	struct nfs_pgio_header *hdr;
 	dprintk("%s enter\n", __func__);
 	task = container_of(work, struct rpc_task, u.tk_work);
-	rdata = container_of(task, struct nfs_pgio_data, task);
-	pnfs_ld_read_done(rdata);
+	hdr = container_of(task, struct nfs_pgio_header, task);
+	pnfs_ld_read_done(hdr);
 }
 
 static void
 bl_end_par_io_read(void *data, int unused)
 {
-	struct nfs_pgio_data *rdata = data;
+	struct nfs_pgio_header *hdr = data;
 
-	rdata->task.tk_status = rdata->header->pnfs_error;
-	INIT_WORK(&rdata->task.u.tk_work, bl_read_cleanup);
-	schedule_work(&rdata->task.u.tk_work);
+	hdr->task.tk_status = hdr->pnfs_error;
+	INIT_WORK(&hdr->task.u.tk_work, bl_read_cleanup);
+	schedule_work(&hdr->task.u.tk_work);
 }
 
 static enum pnfs_try_status
-bl_read_pagelist(struct nfs_pgio_data *rdata)
+bl_read_pagelist(struct nfs_pgio_header *hdr)
 {
-	struct nfs_pgio_header *header = rdata->header;
+	struct nfs_pgio_header *header = hdr;
 	int i, hole;
 	struct bio *bio = NULL;
 	struct pnfs_block_extent *be = NULL, *cow_read = NULL;
 	sector_t isect, extent_length = 0;
 	struct parallel_io *par;
-	loff_t f_offset = rdata->args.offset;
-	size_t bytes_left = rdata->args.count;
+	loff_t f_offset = hdr->args.offset;
+	size_t bytes_left = hdr->args.count;
 	unsigned int pg_offset, pg_len;
-	struct page **pages = rdata->args.pages;
-	int pg_index = rdata->args.pgbase >> PAGE_CACHE_SHIFT;
+	struct page **pages = hdr->args.pages;
+	int pg_index = hdr->args.pgbase >> PAGE_CACHE_SHIFT;
 	const bool is_dio = (header->dreq != NULL);
 
 	dprintk("%s enter nr_pages %u offset %lld count %u\n", __func__,
-	       rdata->pages.npages, f_offset, (unsigned int)rdata->args.count);
+		hdr->page_array.npages, f_offset,
+		(unsigned int)hdr->args.count);
 
-	par = alloc_parallel(rdata);
+	par = alloc_parallel(hdr);
 	if (!par)
 		goto use_mds;
 	par->pnfs_callback = bl_end_par_io_read;
@@ -268,7 +268,7 @@ bl_read_pagelist(struct nfs_pgio_data *rdata)
 
 	isect = (sector_t) (f_offset >> SECTOR_SHIFT);
 	/* Code assumes extents are page-aligned */
-	for (i = pg_index; i < rdata->pages.npages; i++) {
+	for (i = pg_index; i < hdr->page_array.npages; i++) {
 		if (!extent_length) {
 			/* We've used up the previous extent */
 			bl_put_extent(be);
@@ -317,7 +317,8 @@ bl_read_pagelist(struct nfs_pgio_data *rdata)
 			struct pnfs_block_extent *be_read;
 
 			be_read = (hole && cow_read) ? cow_read : be;
-			bio = do_add_page_to_bio(bio, rdata->pages.npages - i,
+			bio = do_add_page_to_bio(bio,
+						 hdr->page_array.npages - i,
 						 READ,
 						 isect, pages[i], be_read,
 						 bl_end_io_read, par,
@@ -332,10 +333,10 @@ bl_read_pagelist(struct nfs_pgio_data *rdata)
 		extent_length -= PAGE_CACHE_SECTORS;
 	}
 	if ((isect << SECTOR_SHIFT) >= header->inode->i_size) {
-		rdata->res.eof = 1;
-		rdata->res.count = header->inode->i_size - rdata->args.offset;
+		hdr->res.eof = 1;
+		hdr->res.count = header->inode->i_size - hdr->args.offset;
 	} else {
-		rdata->res.count = (isect << SECTOR_SHIFT) - rdata->args.offset;
+		hdr->res.count = (isect << SECTOR_SHIFT) - hdr->args.offset;
 	}
 out:
 	bl_put_extent(be);
@@ -390,8 +391,7 @@ static void bl_end_io_write_zero(struct bio *bio, int err)
 	}
 
 	if (unlikely(err)) {
-		struct nfs_pgio_data *data = par->data;
-		struct nfs_pgio_header *header = data->header;
+		struct nfs_pgio_header *header = par->data;
 
 		if (!header->pnfs_error)
 			header->pnfs_error = -EIO;
@@ -405,8 +405,7 @@ static void bl_end_io_write(struct bio *bio, int err)
 {
 	struct parallel_io *par = bio->bi_private;
 	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
-	struct nfs_pgio_data *data = par->data;
-	struct nfs_pgio_header *header = data->header;
+	struct nfs_pgio_header *header = par->data;
 
 	if (!uptodate) {
 		if (!header->pnfs_error)
@@ -423,32 +422,32 @@ static void bl_end_io_write(struct bio *bio, int err)
 static void bl_write_cleanup(struct work_struct *work)
 {
 	struct rpc_task *task;
-	struct nfs_pgio_data *wdata;
+	struct nfs_pgio_header *hdr;
 	dprintk("%s enter\n", __func__);
 	task = container_of(work, struct rpc_task, u.tk_work);
-	wdata = container_of(task, struct nfs_pgio_data, task);
-	if (likely(!wdata->header->pnfs_error)) {
+	hdr = container_of(task, struct nfs_pgio_header, task);
+	if (likely(!hdr->pnfs_error)) {
 		/* Marks for LAYOUTCOMMIT */
-		mark_extents_written(BLK_LSEG2EXT(wdata->header->lseg),
-				     wdata->args.offset, wdata->args.count);
+		mark_extents_written(BLK_LSEG2EXT(hdr->lseg),
+				     hdr->args.offset, hdr->args.count);
 	}
-	pnfs_ld_write_done(wdata);
+	pnfs_ld_write_done(hdr);
 }
 
 /* Called when last of bios associated with a bl_write_pagelist call finishes */
 static void bl_end_par_io_write(void *data, int num_se)
 {
-	struct nfs_pgio_data *wdata = data;
+	struct nfs_pgio_header *hdr = data;
 
-	if (unlikely(wdata->header->pnfs_error)) {
-		bl_free_short_extents(&BLK_LSEG2EXT(wdata->header->lseg)->bl_inval,
+	if (unlikely(hdr->pnfs_error)) {
+		bl_free_short_extents(&BLK_LSEG2EXT(hdr->lseg)->bl_inval,
 					num_se);
 	}
 
-	wdata->task.tk_status = wdata->header->pnfs_error;
-	wdata->verf.committed = NFS_FILE_SYNC;
-	INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup);
-	schedule_work(&wdata->task.u.tk_work);
+	hdr->task.tk_status = hdr->pnfs_error;
+	hdr->verf.committed = NFS_FILE_SYNC;
+	INIT_WORK(&hdr->task.u.tk_work, bl_write_cleanup);
+	schedule_work(&hdr->task.u.tk_work);
 }
 
 /* FIXME STUB - mark intersection of layout and page as bad, so is not
@@ -673,18 +672,17 @@ check_page:
 }
 
 static enum pnfs_try_status
-bl_write_pagelist(struct nfs_pgio_data *wdata, int sync)
+bl_write_pagelist(struct nfs_pgio_header *header, int sync)
 {
-	struct nfs_pgio_header *header = wdata->header;
 	int i, ret, npg_zero, pg_index, last = 0;
 	struct bio *bio = NULL;
 	struct pnfs_block_extent *be = NULL, *cow_read = NULL;
 	sector_t isect, last_isect = 0, extent_length = 0;
 	struct parallel_io *par = NULL;
-	loff_t offset = wdata->args.offset;
-	size_t count = wdata->args.count;
+	loff_t offset = header->args.offset;
+	size_t count = header->args.count;
 	unsigned int pg_offset, pg_len, saved_len;
-	struct page **pages = wdata->args.pages;
+	struct page **pages = header->args.pages;
 	struct page *page;
 	pgoff_t index;
 	u64 temp;
@@ -699,11 +697,11 @@ bl_write_pagelist(struct nfs_pgio_data *wdata, int sync)
 		dprintk("pnfsblock nonblock aligned DIO writes. Resend MDS\n");
 		goto out_mds;
 	}
-	/* At this point, wdata->pages is a (sequential) list of nfs_pages.
+	/* At this point, header->page_aray is a (sequential) list of nfs_pages.
 	 * We want to write each, and if there is an error set pnfs_error
 	 * to have it redone using nfs.
 	 */
-	par = alloc_parallel(wdata);
+	par = alloc_parallel(header);
 	if (!par)
 		goto out_mds;
 	par->pnfs_callback = bl_end_par_io_write;
@@ -790,8 +788,8 @@ next_page:
 	bio = bl_submit_bio(WRITE, bio);
 
 	/* Middle pages */
-	pg_index = wdata->args.pgbase >> PAGE_CACHE_SHIFT;
-	for (i = pg_index; i < wdata->pages.npages; i++) {
+	pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT;
+	for (i = pg_index; i < header->page_array.npages; i++) {
 		if (!extent_length) {
 			/* We've used up the previous extent */
 			bl_put_extent(be);
@@ -862,7 +860,8 @@ next_page:
 		}
 
 
-		bio = do_add_page_to_bio(bio, wdata->pages.npages - i, WRITE,
+		bio = do_add_page_to_bio(bio, header->page_array.npages - i,
+					 WRITE,
 					 isect, pages[i], be,
 					 bl_end_io_write, par,
 					 pg_offset, pg_len);
@@ -890,7 +889,7 @@ next_page:
 	}
 
 write_done:
-	wdata->res.count = wdata->args.count;
+	header->res.count = header->args.count;
 out:
 	bl_put_extent(be);
 	bl_put_extent(cow_read);
@@ -1063,7 +1062,7 @@ nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh,
 		return ERR_PTR(-ENOMEM);
 	}
 
-	pages = kzalloc(max_pages * sizeof(struct page *), GFP_NOFS);
+	pages = kcalloc(max_pages, sizeof(struct page *), GFP_NOFS);
 	if (pages == NULL) {
 		kfree(dev);
 		return ERR_PTR(-ENOMEM);
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 073b4cf67ed9..54de482143cc 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -428,6 +428,18 @@ check_gss_callback_principal(struct nfs_client *clp, struct svc_rqst *rqstp)
 	if (p == NULL)
 		return 0;
 
+	/*
+	 * Did we get the acceptor from userland during the SETCLIENID
+	 * negotiation?
+	 */
+	if (clp->cl_acceptor)
+		return !strcmp(p, clp->cl_acceptor);
+
+	/*
+	 * Otherwise try to verify it using the cl_hostname. Note that this
+	 * doesn't work if a non-canonical hostname was used in the devname.
+	 */
+
 	/* Expect a GSS_C_NT_HOSTBASED_NAME like "nfs@serverhostname" */
 
 	if (memcmp(p, "nfs@", 4) != 0)
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 180d1ec9c32e..1c5ff6d58385 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -110,8 +110,8 @@ struct nfs_subversion *get_nfs_version(unsigned int version)
 		mutex_unlock(&nfs_version_mutex);
 	}
 
-	if (!IS_ERR(nfs))
-		try_module_get(nfs->owner);
+	if (!IS_ERR(nfs) && !try_module_get(nfs->owner))
+		return ERR_PTR(-EAGAIN);
 	return nfs;
 }
 
@@ -158,7 +158,8 @@ struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init)
 		goto error_0;
 
 	clp->cl_nfs_mod = cl_init->nfs_mod;
-	try_module_get(clp->cl_nfs_mod->owner);
+	if (!try_module_get(clp->cl_nfs_mod->owner))
+		goto error_dealloc;
 
 	clp->rpc_ops = clp->cl_nfs_mod->rpc_ops;
 
@@ -190,6 +191,7 @@ struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init)
 
 error_cleanup:
 	put_nfs_version(clp->cl_nfs_mod);
+error_dealloc:
 	kfree(clp);
 error_0:
 	return ERR_PTR(err);
@@ -252,6 +254,7 @@ void nfs_free_client(struct nfs_client *clp)
 	put_net(clp->cl_net);
 	put_nfs_version(clp->cl_nfs_mod);
 	kfree(clp->cl_hostname);
+	kfree(clp->cl_acceptor);
 	kfree(clp);
 
 	dprintk("<-- nfs_free_client()\n");
@@ -482,8 +485,13 @@ nfs_get_client(const struct nfs_client_initdata *cl_init,
 	struct nfs_net *nn = net_generic(cl_init->net, nfs_net_id);
 	const struct nfs_rpc_ops *rpc_ops = cl_init->nfs_mod->rpc_ops;
 
+	if (cl_init->hostname == NULL) {
+		WARN_ON(1);
+		return NULL;
+	}
+
 	dprintk("--> nfs_get_client(%s,v%u)\n",
-		cl_init->hostname ?: "", rpc_ops->version);
+		cl_init->hostname, rpc_ops->version);
 
 	/* see if the client already exists */
 	do {
@@ -510,7 +518,7 @@ nfs_get_client(const struct nfs_client_initdata *cl_init,
 	} while (!IS_ERR(new));
 
 	dprintk("<-- nfs_get_client() Failed to find %s (%ld)\n",
-		cl_init->hostname ?: "", PTR_ERR(new));
+		cl_init->hostname, PTR_ERR(new));
 	return new;
 }
 EXPORT_SYMBOL_GPL(nfs_get_client);
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 5d8ccecf5f5c..5853f53db732 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -41,14 +41,8 @@ void nfs_mark_delegation_referenced(struct nfs_delegation *delegation)
 	set_bit(NFS_DELEGATION_REFERENCED, &delegation->flags);
 }
 
-/**
- * nfs_have_delegation - check if inode has a delegation
- * @inode: inode to check
- * @flags: delegation types to check for
- *
- * Returns one if inode has the indicated delegation, otherwise zero.
- */
-int nfs4_have_delegation(struct inode *inode, fmode_t flags)
+static int
+nfs4_do_check_delegation(struct inode *inode, fmode_t flags, bool mark)
 {
 	struct nfs_delegation *delegation;
 	int ret = 0;
@@ -58,12 +52,34 @@ int nfs4_have_delegation(struct inode *inode, fmode_t flags)
 	delegation = rcu_dereference(NFS_I(inode)->delegation);
 	if (delegation != NULL && (delegation->type & flags) == flags &&
 	    !test_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) {
-		nfs_mark_delegation_referenced(delegation);
+		if (mark)
+			nfs_mark_delegation_referenced(delegation);
 		ret = 1;
 	}
 	rcu_read_unlock();
 	return ret;
 }
+/**
+ * nfs_have_delegation - check if inode has a delegation, mark it
+ * NFS_DELEGATION_REFERENCED if there is one.
+ * @inode: inode to check
+ * @flags: delegation types to check for
+ *
+ * Returns one if inode has the indicated delegation, otherwise zero.
+ */
+int nfs4_have_delegation(struct inode *inode, fmode_t flags)
+{
+	return nfs4_do_check_delegation(inode, flags, true);
+}
+
+/*
+ * nfs4_check_delegation - check if inode has a delegation, do not mark
+ * NFS_DELEGATION_REFERENCED if it has one.
+ */
+int nfs4_check_delegation(struct inode *inode, fmode_t flags)
+{
+	return nfs4_do_check_delegation(inode, flags, false);
+}
 
 static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid)
 {
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index 9a79c7a99d6d..5c1cce39297f 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -59,6 +59,7 @@ bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode, fmode_
 
 void nfs_mark_delegation_referenced(struct nfs_delegation *delegation);
 int nfs4_have_delegation(struct inode *inode, fmode_t flags);
+int nfs4_check_delegation(struct inode *inode, fmode_t flags);
 
 #endif
 
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 4a3d4ef76127..36d921f0c602 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -988,9 +988,13 @@ EXPORT_SYMBOL_GPL(nfs_force_lookup_revalidate);
  * A check for whether or not the parent directory has changed.
  * In the case it has, we assume that the dentries are untrustworthy
  * and may need to be looked up again.
+ * If rcu_walk prevents us from performing a full check, return 0.
  */
-static int nfs_check_verifier(struct inode *dir, struct dentry *dentry)
+static int nfs_check_verifier(struct inode *dir, struct dentry *dentry,
+			      int rcu_walk)
 {
+	int ret;
+
 	if (IS_ROOT(dentry))
 		return 1;
 	if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONE)
@@ -998,7 +1002,11 @@ static int nfs_check_verifier(struct inode *dir, struct dentry *dentry)
 	if (!nfs_verify_change_attribute(dir, dentry->d_time))
 		return 0;
 	/* Revalidate nfsi->cache_change_attribute before we declare a match */
-	if (nfs_revalidate_inode(NFS_SERVER(dir), dir) < 0)
+	if (rcu_walk)
+		ret = nfs_revalidate_inode_rcu(NFS_SERVER(dir), dir);
+	else
+		ret = nfs_revalidate_inode(NFS_SERVER(dir), dir);
+	if (ret < 0)
 		return 0;
 	if (!nfs_verify_change_attribute(dir, dentry->d_time))
 		return 0;
@@ -1042,6 +1050,8 @@ int nfs_lookup_verify_inode(struct inode *inode, unsigned int flags)
 out:
 	return (inode->i_nlink == 0) ? -ENOENT : 0;
 out_force:
+	if (flags & LOOKUP_RCU)
+		return -ECHILD;
 	ret = __nfs_revalidate_inode(server, inode);
 	if (ret != 0)
 		return ret;
@@ -1054,6 +1064,9 @@ out_force:
  *
  * If parent mtime has changed, we revalidate, else we wait for a
  * period corresponding to the parent's attribute cache timeout value.
+ *
+ * If LOOKUP_RCU prevents us from performing a full check, return 1
+ * suggesting a reval is needed.
  */
 static inline
 int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry,
@@ -1064,7 +1077,7 @@ int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry,
 		return 0;
 	if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONEG)
 		return 1;
-	return !nfs_check_verifier(dir, dentry);
+	return !nfs_check_verifier(dir, dentry, flags & LOOKUP_RCU);
 }
 
 /*
@@ -1088,21 +1101,30 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
 	struct nfs4_label *label = NULL;
 	int error;
 
-	if (flags & LOOKUP_RCU)
-		return -ECHILD;
-
-	parent = dget_parent(dentry);
-	dir = parent->d_inode;
+	if (flags & LOOKUP_RCU) {
+		parent = ACCESS_ONCE(dentry->d_parent);
+		dir = ACCESS_ONCE(parent->d_inode);
+		if (!dir)
+			return -ECHILD;
+	} else {
+		parent = dget_parent(dentry);
+		dir = parent->d_inode;
+	}
 	nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE);
 	inode = dentry->d_inode;
 
 	if (!inode) {
-		if (nfs_neg_need_reval(dir, dentry, flags))
+		if (nfs_neg_need_reval(dir, dentry, flags)) {
+			if (flags & LOOKUP_RCU)
+				return -ECHILD;
 			goto out_bad;
+		}
 		goto out_valid_noent;
 	}
 
 	if (is_bad_inode(inode)) {
+		if (flags & LOOKUP_RCU)
+			return -ECHILD;
 		dfprintk(LOOKUPCACHE, "%s: %pd2 has dud inode\n",
 				__func__, dentry);
 		goto out_bad;
@@ -1112,12 +1134,20 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
 		goto out_set_verifier;
 
 	/* Force a full look up iff the parent directory has changed */
-	if (!nfs_is_exclusive_create(dir, flags) && nfs_check_verifier(dir, dentry)) {
-		if (nfs_lookup_verify_inode(inode, flags))
+	if (!nfs_is_exclusive_create(dir, flags) &&
+	    nfs_check_verifier(dir, dentry, flags & LOOKUP_RCU)) {
+
+		if (nfs_lookup_verify_inode(inode, flags)) {
+			if (flags & LOOKUP_RCU)
+				return -ECHILD;
 			goto out_zap_parent;
+		}
 		goto out_valid;
 	}
 
+	if (flags & LOOKUP_RCU)
+		return -ECHILD;
+
 	if (NFS_STALE(inode))
 		goto out_bad;
 
@@ -1153,13 +1183,18 @@ out_set_verifier:
 	/* Success: notify readdir to use READDIRPLUS */
 	nfs_advise_use_readdirplus(dir);
  out_valid_noent:
-	dput(parent);
+	if (flags & LOOKUP_RCU) {
+		if (parent != ACCESS_ONCE(dentry->d_parent))
+			return -ECHILD;
+	} else
+		dput(parent);
 	dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is valid\n",
 			__func__, dentry);
 	return 1;
 out_zap_parent:
 	nfs_zap_caches(dir);
  out_bad:
+	WARN_ON(flags & LOOKUP_RCU);
 	nfs_free_fattr(fattr);
 	nfs_free_fhandle(fhandle);
 	nfs4_label_free(label);
@@ -1185,6 +1220,7 @@ out_zap_parent:
 			__func__, dentry);
 	return 0;
 out_error:
+	WARN_ON(flags & LOOKUP_RCU);
 	nfs_free_fattr(fattr);
 	nfs_free_fhandle(fhandle);
 	nfs4_label_free(label);
@@ -1529,14 +1565,9 @@ EXPORT_SYMBOL_GPL(nfs_atomic_open);
 
 static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags)
 {
-	struct dentry *parent = NULL;
 	struct inode *inode;
-	struct inode *dir;
 	int ret = 0;
 
-	if (flags & LOOKUP_RCU)
-		return -ECHILD;
-
 	if (!(flags & LOOKUP_OPEN) || (flags & LOOKUP_DIRECTORY))
 		goto no_open;
 	if (d_mountpoint(dentry))
@@ -1545,34 +1576,47 @@ static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags)
 		goto no_open;
 
 	inode = dentry->d_inode;
-	parent = dget_parent(dentry);
-	dir = parent->d_inode;
 
 	/* We can't create new files in nfs_open_revalidate(), so we
 	 * optimize away revalidation of negative dentries.
 	 */
 	if (inode == NULL) {
+		struct dentry *parent;
+		struct inode *dir;
+
+		if (flags & LOOKUP_RCU) {
+			parent = ACCESS_ONCE(dentry->d_parent);
+			dir = ACCESS_ONCE(parent->d_inode);
+			if (!dir)
+				return -ECHILD;
+		} else {
+			parent = dget_parent(dentry);
+			dir = parent->d_inode;
+		}
 		if (!nfs_neg_need_reval(dir, dentry, flags))
 			ret = 1;
+		else if (flags & LOOKUP_RCU)
+			ret = -ECHILD;
+		if (!(flags & LOOKUP_RCU))
+			dput(parent);
+		else if (parent != ACCESS_ONCE(dentry->d_parent))
+			return -ECHILD;
 		goto out;
 	}
 
 	/* NFS only supports OPEN on regular files */
 	if (!S_ISREG(inode->i_mode))
-		goto no_open_dput;
+		goto no_open;
 	/* We cannot do exclusive creation on a positive dentry */
 	if (flags & LOOKUP_EXCL)
-		goto no_open_dput;
+		goto no_open;
 
 	/* Let f_op->open() actually open (and revalidate) the file */
 	ret = 1;
 
 out:
-	dput(parent);
 	return ret;
 
-no_open_dput:
-	dput(parent);
 no_open:
 	return nfs_lookup_revalidate(dentry, flags);
 }
@@ -2028,10 +2072,14 @@ static DEFINE_SPINLOCK(nfs_access_lru_lock);
 static LIST_HEAD(nfs_access_lru_list);
 static atomic_long_t nfs_access_nr_entries;
 
+static unsigned long nfs_access_max_cachesize = ULONG_MAX;
+module_param(nfs_access_max_cachesize, ulong, 0644);
+MODULE_PARM_DESC(nfs_access_max_cachesize, "NFS access maximum total cache length");
+
 static void nfs_access_free_entry(struct nfs_access_entry *entry)
 {
 	put_rpccred(entry->cred);
-	kfree(entry);
+	kfree_rcu(entry, rcu_head);
 	smp_mb__before_atomic();
 	atomic_long_dec(&nfs_access_nr_entries);
 	smp_mb__after_atomic();
@@ -2048,19 +2096,14 @@ static void nfs_access_free_list(struct list_head *head)
 	}
 }
 
-unsigned long
-nfs_access_cache_scan(struct shrinker *shrink, struct shrink_control *sc)
+static unsigned long
+nfs_do_access_cache_scan(unsigned int nr_to_scan)
 {
 	LIST_HEAD(head);
 	struct nfs_inode *nfsi, *next;
 	struct nfs_access_entry *cache;
-	int nr_to_scan = sc->nr_to_scan;
-	gfp_t gfp_mask = sc->gfp_mask;
 	long freed = 0;
 
-	if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL)
-		return SHRINK_STOP;
-
 	spin_lock(&nfs_access_lru_lock);
 	list_for_each_entry_safe(nfsi, next, &nfs_access_lru_list, access_cache_inode_lru) {
 		struct inode *inode;
@@ -2094,11 +2137,39 @@ remove_lru_entry:
 }
 
 unsigned long
+nfs_access_cache_scan(struct shrinker *shrink, struct shrink_control *sc)
+{
+	int nr_to_scan = sc->nr_to_scan;
+	gfp_t gfp_mask = sc->gfp_mask;
+
+	if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL)
+		return SHRINK_STOP;
+	return nfs_do_access_cache_scan(nr_to_scan);
+}
+
+
+unsigned long
 nfs_access_cache_count(struct shrinker *shrink, struct shrink_control *sc)
 {
 	return vfs_pressure_ratio(atomic_long_read(&nfs_access_nr_entries));
 }
 
+static void
+nfs_access_cache_enforce_limit(void)
+{
+	long nr_entries = atomic_long_read(&nfs_access_nr_entries);
+	unsigned long diff;
+	unsigned int nr_to_scan;
+
+	if (nr_entries < 0 || nr_entries <= nfs_access_max_cachesize)
+		return;
+	nr_to_scan = 100;
+	diff = nr_entries - nfs_access_max_cachesize;
+	if (diff < nr_to_scan)
+		nr_to_scan = diff;
+	nfs_do_access_cache_scan(nr_to_scan);
+}
+
 static void __nfs_access_zap_cache(struct nfs_inode *nfsi, struct list_head *head)
 {
 	struct rb_root *root_node = &nfsi->access_cache;
@@ -2186,6 +2257,38 @@ out_zap:
 	return -ENOENT;
 }
 
+static int nfs_access_get_cached_rcu(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res)
+{
+	/* Only check the most recently returned cache entry,
+	 * but do it without locking.
+	 */
+	struct nfs_inode *nfsi = NFS_I(inode);
+	struct nfs_access_entry *cache;
+	int err = -ECHILD;
+	struct list_head *lh;
+
+	rcu_read_lock();
+	if (nfsi->cache_validity & NFS_INO_INVALID_ACCESS)
+		goto out;
+	lh = rcu_dereference(nfsi->access_cache_entry_lru.prev);
+	cache = list_entry(lh, struct nfs_access_entry, lru);
+	if (lh == &nfsi->access_cache_entry_lru ||
+	    cred != cache->cred)
+		cache = NULL;
+	if (cache == NULL)
+		goto out;
+	if (!nfs_have_delegated_attributes(inode) &&
+	    !time_in_range_open(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo))
+		goto out;
+	res->jiffies = cache->jiffies;
+	res->cred = cache->cred;
+	res->mask = cache->mask;
+	err = 0;
+out:
+	rcu_read_unlock();
+	return err;
+}
+
 static void nfs_access_add_rbtree(struct inode *inode, struct nfs_access_entry *set)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
@@ -2229,6 +2332,11 @@ void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set)
 	cache->cred = get_rpccred(set->cred);
 	cache->mask = set->mask;
 
+	/* The above field assignments must be visible
+	 * before this item appears on the lru.  We cannot easily
+	 * use rcu_assign_pointer, so just force the memory barrier.
+	 */
+	smp_wmb();
 	nfs_access_add_rbtree(inode, cache);
 
 	/* Update accounting */
@@ -2244,6 +2352,7 @@ void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set)
 					&nfs_access_lru_list);
 		spin_unlock(&nfs_access_lru_lock);
 	}
+	nfs_access_cache_enforce_limit();
 }
 EXPORT_SYMBOL_GPL(nfs_access_add_cache);
 
@@ -2267,10 +2376,16 @@ static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask)
 
 	trace_nfs_access_enter(inode);
 
-	status = nfs_access_get_cached(inode, cred, &cache);
+	status = nfs_access_get_cached_rcu(inode, cred, &cache);
+	if (status != 0)
+		status = nfs_access_get_cached(inode, cred, &cache);
 	if (status == 0)
 		goto out_cached;
 
+	status = -ECHILD;
+	if (mask & MAY_NOT_BLOCK)
+		goto out;
+
 	/* Be clever: ask server to check for all possible rights */
 	cache.mask = MAY_EXEC | MAY_WRITE | MAY_READ;
 	cache.cred = cred;
@@ -2321,9 +2436,6 @@ int nfs_permission(struct inode *inode, int mask)
 	struct rpc_cred *cred;
 	int res = 0;
 
-	if (mask & MAY_NOT_BLOCK)
-		return -ECHILD;
-
 	nfs_inc_stats(inode, NFSIOS_VFSACCESS);
 
 	if ((mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
@@ -2350,12 +2462,23 @@ force_lookup:
 	if (!NFS_PROTO(inode)->access)
 		goto out_notsup;
 
-	cred = rpc_lookup_cred();
-	if (!IS_ERR(cred)) {
-		res = nfs_do_access(inode, cred, mask);
-		put_rpccred(cred);
-	} else
+	/* Always try fast lookups first */
+	rcu_read_lock();
+	cred = rpc_lookup_cred_nonblock();
+	if (!IS_ERR(cred))
+		res = nfs_do_access(inode, cred, mask|MAY_NOT_BLOCK);
+	else
 		res = PTR_ERR(cred);
+	rcu_read_unlock();
+	if (res == -ECHILD && !(mask & MAY_NOT_BLOCK)) {
+		/* Fast lookup failed, try the slow way */
+		cred = rpc_lookup_cred();
+		if (!IS_ERR(cred)) {
+			res = nfs_do_access(inode, cred, mask);
+			put_rpccred(cred);
+		} else
+			res = PTR_ERR(cred);
+	}
 out:
 	if (!res && (mask & MAY_EXEC) && !execute_ok(inode))
 		res = -EACCES;
@@ -2364,6 +2487,9 @@ out:
 		inode->i_sb->s_id, inode->i_ino, mask, res);
 	return res;
 out_notsup:
+	if (mask & MAY_NOT_BLOCK)
+		return -ECHILD;
+
 	res = nfs_revalidate_inode(NFS_SERVER(inode), inode);
 	if (res == 0)
 		res = generic_permission(inode, mask);
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index f11b9eed0de1..65ef6e00deee 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -148,8 +148,8 @@ static void nfs_direct_set_hdr_verf(struct nfs_direct_req *dreq,
 {
 	struct nfs_writeverf *verfp;
 
-	verfp = nfs_direct_select_verf(dreq, hdr->data->ds_clp,
-				      hdr->data->ds_idx);
+	verfp = nfs_direct_select_verf(dreq, hdr->ds_clp,
+				      hdr->ds_idx);
 	WARN_ON_ONCE(verfp->committed >= 0);
 	memcpy(verfp, &hdr->verf, sizeof(struct nfs_writeverf));
 	WARN_ON_ONCE(verfp->committed < 0);
@@ -169,8 +169,8 @@ static int nfs_direct_set_or_cmp_hdr_verf(struct nfs_direct_req *dreq,
 {
 	struct nfs_writeverf *verfp;
 
-	verfp = nfs_direct_select_verf(dreq, hdr->data->ds_clp,
-					 hdr->data->ds_idx);
+	verfp = nfs_direct_select_verf(dreq, hdr->ds_clp,
+					 hdr->ds_idx);
 	if (verfp->committed < 0) {
 		nfs_direct_set_hdr_verf(dreq, hdr);
 		return 0;
@@ -715,7 +715,7 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
 {
 	struct nfs_direct_req *dreq = hdr->dreq;
 	struct nfs_commit_info cinfo;
-	int bit = -1;
+	bool request_commit = false;
 	struct nfs_page *req = nfs_list_entry(hdr->pages.next);
 
 	if (test_bit(NFS_IOHDR_REDO, &hdr->flags))
@@ -729,27 +729,20 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
 		dreq->flags = 0;
 		dreq->error = hdr->error;
 	}
-	if (dreq->error != 0)
-		bit = NFS_IOHDR_ERROR;
-	else {
+	if (dreq->error == 0) {
 		dreq->count += hdr->good_bytes;
-		if (test_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags)) {
-			dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
-			bit = NFS_IOHDR_NEED_RESCHED;
-		} else if (test_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags)) {
+		if (nfs_write_need_commit(hdr)) {
 			if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES)
-				bit = NFS_IOHDR_NEED_RESCHED;
+				request_commit = true;
 			else if (dreq->flags == 0) {
 				nfs_direct_set_hdr_verf(dreq, hdr);
-				bit = NFS_IOHDR_NEED_COMMIT;
+				request_commit = true;
 				dreq->flags = NFS_ODIRECT_DO_COMMIT;
 			} else if (dreq->flags == NFS_ODIRECT_DO_COMMIT) {
-				if (nfs_direct_set_or_cmp_hdr_verf(dreq, hdr)) {
+				request_commit = true;
+				if (nfs_direct_set_or_cmp_hdr_verf(dreq, hdr))
 					dreq->flags =
 						NFS_ODIRECT_RESCHED_WRITES;
-					bit = NFS_IOHDR_NEED_RESCHED;
-				} else
-					bit = NFS_IOHDR_NEED_COMMIT;
 			}
 		}
 	}
@@ -759,9 +752,7 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
 
 		req = nfs_list_entry(hdr->pages.next);
 		nfs_list_remove_request(req);
-		switch (bit) {
-		case NFS_IOHDR_NEED_RESCHED:
-		case NFS_IOHDR_NEED_COMMIT:
+		if (request_commit) {
 			kref_get(&req->wb_kref);
 			nfs_mark_request_commit(req, hdr->lseg, &cinfo);
 		}
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index d2eba1c13b7e..1359c4a27393 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -84,45 +84,37 @@ filelayout_get_dserver_offset(struct pnfs_layout_segment *lseg, loff_t offset)
 	BUG();
 }
 
-static void filelayout_reset_write(struct nfs_pgio_data *data)
+static void filelayout_reset_write(struct nfs_pgio_header *hdr)
 {
-	struct nfs_pgio_header *hdr = data->header;
-	struct rpc_task *task = &data->task;
+	struct rpc_task *task = &hdr->task;
 
 	if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
 		dprintk("%s Reset task %5u for i/o through MDS "
 			"(req %s/%llu, %u bytes @ offset %llu)\n", __func__,
-			data->task.tk_pid,
+			hdr->task.tk_pid,
 			hdr->inode->i_sb->s_id,
 			(unsigned long long)NFS_FILEID(hdr->inode),
-			data->args.count,
-			(unsigned long long)data->args.offset);
+			hdr->args.count,
+			(unsigned long long)hdr->args.offset);
 
-		task->tk_status = pnfs_write_done_resend_to_mds(hdr->inode,
-							&hdr->pages,
-							hdr->completion_ops,
-							hdr->dreq);
+		task->tk_status = pnfs_write_done_resend_to_mds(hdr);
 	}
 }
 
-static void filelayout_reset_read(struct nfs_pgio_data *data)
+static void filelayout_reset_read(struct nfs_pgio_header *hdr)
 {
-	struct nfs_pgio_header *hdr = data->header;
-	struct rpc_task *task = &data->task;
+	struct rpc_task *task = &hdr->task;
 
 	if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
 		dprintk("%s Reset task %5u for i/o through MDS "
 			"(req %s/%llu, %u bytes @ offset %llu)\n", __func__,
-			data->task.tk_pid,
+			hdr->task.tk_pid,
 			hdr->inode->i_sb->s_id,
 			(unsigned long long)NFS_FILEID(hdr->inode),
-			data->args.count,
-			(unsigned long long)data->args.offset);
+			hdr->args.count,
+			(unsigned long long)hdr->args.offset);
 
-		task->tk_status = pnfs_read_done_resend_to_mds(hdr->inode,
-							&hdr->pages,
-							hdr->completion_ops,
-							hdr->dreq);
+		task->tk_status = pnfs_read_done_resend_to_mds(hdr);
 	}
 }
 
@@ -243,18 +235,17 @@ wait_on_recovery:
 /* NFS_PROTO call done callback routines */
 
 static int filelayout_read_done_cb(struct rpc_task *task,
-				struct nfs_pgio_data *data)
+				struct nfs_pgio_header *hdr)
 {
-	struct nfs_pgio_header *hdr = data->header;
 	int err;
 
-	trace_nfs4_pnfs_read(data, task->tk_status);
-	err = filelayout_async_handle_error(task, data->args.context->state,
-					    data->ds_clp, hdr->lseg);
+	trace_nfs4_pnfs_read(hdr, task->tk_status);
+	err = filelayout_async_handle_error(task, hdr->args.context->state,
+					    hdr->ds_clp, hdr->lseg);
 
 	switch (err) {
 	case -NFS4ERR_RESET_TO_MDS:
-		filelayout_reset_read(data);
+		filelayout_reset_read(hdr);
 		return task->tk_status;
 	case -EAGAIN:
 		rpc_restart_call_prepare(task);
@@ -270,15 +261,14 @@ static int filelayout_read_done_cb(struct rpc_task *task,
  * rfc5661 is not clear about which credential should be used.
  */
 static void
-filelayout_set_layoutcommit(struct nfs_pgio_data *wdata)
+filelayout_set_layoutcommit(struct nfs_pgio_header *hdr)
 {
-	struct nfs_pgio_header *hdr = wdata->header;
 
 	if (FILELAYOUT_LSEG(hdr->lseg)->commit_through_mds ||
-	    wdata->res.verf->committed == NFS_FILE_SYNC)
+	    hdr->res.verf->committed == NFS_FILE_SYNC)
 		return;
 
-	pnfs_set_layoutcommit(wdata);
+	pnfs_set_layoutcommit(hdr);
 	dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino,
 		(unsigned long) NFS_I(hdr->inode)->layout->plh_lwb);
 }
@@ -305,83 +295,82 @@ filelayout_reset_to_mds(struct pnfs_layout_segment *lseg)
  */
 static void filelayout_read_prepare(struct rpc_task *task, void *data)
 {
-	struct nfs_pgio_data *rdata = data;
+	struct nfs_pgio_header *hdr = data;
 
-	if (unlikely(test_bit(NFS_CONTEXT_BAD, &rdata->args.context->flags))) {
+	if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
 		rpc_exit(task, -EIO);
 		return;
 	}
-	if (filelayout_reset_to_mds(rdata->header->lseg)) {
+	if (filelayout_reset_to_mds(hdr->lseg)) {
 		dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid);
-		filelayout_reset_read(rdata);
+		filelayout_reset_read(hdr);
 		rpc_exit(task, 0);
 		return;
 	}
-	rdata->pgio_done_cb = filelayout_read_done_cb;
+	hdr->pgio_done_cb = filelayout_read_done_cb;
 
-	if (nfs41_setup_sequence(rdata->ds_clp->cl_session,
-			&rdata->args.seq_args,
-			&rdata->res.seq_res,
+	if (nfs41_setup_sequence(hdr->ds_clp->cl_session,
+			&hdr->args.seq_args,
+			&hdr->res.seq_res,
 			task))
 		return;
-	if (nfs4_set_rw_stateid(&rdata->args.stateid, rdata->args.context,
-			rdata->args.lock_context, FMODE_READ) == -EIO)
+	if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context,
+			hdr->args.lock_context, FMODE_READ) == -EIO)
 		rpc_exit(task, -EIO); /* lost lock, terminate I/O */
 }
 
 static void filelayout_read_call_done(struct rpc_task *task, void *data)
 {
-	struct nfs_pgio_data *rdata = data;
+	struct nfs_pgio_header *hdr = data;
 
 	dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status);
 
-	if (test_bit(NFS_IOHDR_REDO, &rdata->header->flags) &&
+	if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
 	    task->tk_status == 0) {
-		nfs41_sequence_done(task, &rdata->res.seq_res);
+		nfs41_sequence_done(task, &hdr->res.seq_res);
 		return;
 	}
 
 	/* Note this may cause RPC to be resent */
-	rdata->header->mds_ops->rpc_call_done(task, data);
+	hdr->mds_ops->rpc_call_done(task, data);
 }
 
 static void filelayout_read_count_stats(struct rpc_task *task, void *data)
 {
-	struct nfs_pgio_data *rdata = data;
+	struct nfs_pgio_header *hdr = data;
 
-	rpc_count_iostats(task, NFS_SERVER(rdata->header->inode)->client->cl_metrics);
+	rpc_count_iostats(task, NFS_SERVER(hdr->inode)->client->cl_metrics);
 }
 
 static void filelayout_read_release(void *data)
 {
-	struct nfs_pgio_data *rdata = data;
-	struct pnfs_layout_hdr *lo = rdata->header->lseg->pls_layout;
+	struct nfs_pgio_header *hdr = data;
+	struct pnfs_layout_hdr *lo = hdr->lseg->pls_layout;
 
 	filelayout_fenceme(lo->plh_inode, lo);
-	nfs_put_client(rdata->ds_clp);
-	rdata->header->mds_ops->rpc_release(data);
+	nfs_put_client(hdr->ds_clp);
+	hdr->mds_ops->rpc_release(data);
 }
 
 static int filelayout_write_done_cb(struct rpc_task *task,
-				struct nfs_pgio_data *data)
+				struct nfs_pgio_header *hdr)
 {
-	struct nfs_pgio_header *hdr = data->header;
 	int err;
 
-	trace_nfs4_pnfs_write(data, task->tk_status);
-	err = filelayout_async_handle_error(task, data->args.context->state,
-					    data->ds_clp, hdr->lseg);
+	trace_nfs4_pnfs_write(hdr, task->tk_status);
+	err = filelayout_async_handle_error(task, hdr->args.context->state,
+					    hdr->ds_clp, hdr->lseg);
 
 	switch (err) {
 	case -NFS4ERR_RESET_TO_MDS:
-		filelayout_reset_write(data);
+		filelayout_reset_write(hdr);
 		return task->tk_status;
 	case -EAGAIN:
 		rpc_restart_call_prepare(task);
 		return -EAGAIN;
 	}
 
-	filelayout_set_layoutcommit(data);
+	filelayout_set_layoutcommit(hdr);
 	return 0;
 }
 
@@ -419,57 +408,57 @@ static int filelayout_commit_done_cb(struct rpc_task *task,
 
 static void filelayout_write_prepare(struct rpc_task *task, void *data)
 {
-	struct nfs_pgio_data *wdata = data;
+	struct nfs_pgio_header *hdr = data;
 
-	if (unlikely(test_bit(NFS_CONTEXT_BAD, &wdata->args.context->flags))) {
+	if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
 		rpc_exit(task, -EIO);
 		return;
 	}
-	if (filelayout_reset_to_mds(wdata->header->lseg)) {
+	if (filelayout_reset_to_mds(hdr->lseg)) {
 		dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid);
-		filelayout_reset_write(wdata);
+		filelayout_reset_write(hdr);
 		rpc_exit(task, 0);
 		return;
 	}
-	if (nfs41_setup_sequence(wdata->ds_clp->cl_session,
-			&wdata->args.seq_args,
-			&wdata->res.seq_res,
+	if (nfs41_setup_sequence(hdr->ds_clp->cl_session,
+			&hdr->args.seq_args,
+			&hdr->res.seq_res,
 			task))
 		return;
-	if (nfs4_set_rw_stateid(&wdata->args.stateid, wdata->args.context,
-			wdata->args.lock_context, FMODE_WRITE) == -EIO)
+	if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context,
+			hdr->args.lock_context, FMODE_WRITE) == -EIO)
 		rpc_exit(task, -EIO); /* lost lock, terminate I/O */
 }
 
 static void filelayout_write_call_done(struct rpc_task *task, void *data)
 {
-	struct nfs_pgio_data *wdata = data;
+	struct nfs_pgio_header *hdr = data;
 
-	if (test_bit(NFS_IOHDR_REDO, &wdata->header->flags) &&
+	if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
 	    task->tk_status == 0) {
-		nfs41_sequence_done(task, &wdata->res.seq_res);
+		nfs41_sequence_done(task, &hdr->res.seq_res);
 		return;
 	}
 
 	/* Note this may cause RPC to be resent */
-	wdata->header->mds_ops->rpc_call_done(task, data);
+	hdr->mds_ops->rpc_call_done(task, data);
 }
 
 static void filelayout_write_count_stats(struct rpc_task *task, void *data)
 {
-	struct nfs_pgio_data *wdata = data;
+	struct nfs_pgio_header *hdr = data;
 
-	rpc_count_iostats(task, NFS_SERVER(wdata->header->inode)->client->cl_metrics);
+	rpc_count_iostats(task, NFS_SERVER(hdr->inode)->client->cl_metrics);
 }
 
 static void filelayout_write_release(void *data)
 {
-	struct nfs_pgio_data *wdata = data;
-	struct pnfs_layout_hdr *lo = wdata->header->lseg->pls_layout;
+	struct nfs_pgio_header *hdr = data;
+	struct pnfs_layout_hdr *lo = hdr->lseg->pls_layout;
 
 	filelayout_fenceme(lo->plh_inode, lo);
-	nfs_put_client(wdata->ds_clp);
-	wdata->header->mds_ops->rpc_release(data);
+	nfs_put_client(hdr->ds_clp);
+	hdr->mds_ops->rpc_release(data);
 }
 
 static void filelayout_commit_prepare(struct rpc_task *task, void *data)
@@ -529,19 +518,18 @@ static const struct rpc_call_ops filelayout_commit_call_ops = {
 };
 
 static enum pnfs_try_status
-filelayout_read_pagelist(struct nfs_pgio_data *data)
+filelayout_read_pagelist(struct nfs_pgio_header *hdr)
 {
-	struct nfs_pgio_header *hdr = data->header;
 	struct pnfs_layout_segment *lseg = hdr->lseg;
 	struct nfs4_pnfs_ds *ds;
 	struct rpc_clnt *ds_clnt;
-	loff_t offset = data->args.offset;
+	loff_t offset = hdr->args.offset;
 	u32 j, idx;
 	struct nfs_fh *fh;
 
 	dprintk("--> %s ino %lu pgbase %u req %Zu@%llu\n",
 		__func__, hdr->inode->i_ino,
-		data->args.pgbase, (size_t)data->args.count, offset);
+		hdr->args.pgbase, (size_t)hdr->args.count, offset);
 
 	/* Retrieve the correct rpc_client for the byte range */
 	j = nfs4_fl_calc_j_index(lseg, offset);
@@ -559,30 +547,29 @@ filelayout_read_pagelist(struct nfs_pgio_data *data)
 
 	/* No multipath support. Use first DS */
 	atomic_inc(&ds->ds_clp->cl_count);
-	data->ds_clp = ds->ds_clp;
-	data->ds_idx = idx;
+	hdr->ds_clp = ds->ds_clp;
+	hdr->ds_idx = idx;
 	fh = nfs4_fl_select_ds_fh(lseg, j);
 	if (fh)
-		data->args.fh = fh;
+		hdr->args.fh = fh;
 
-	data->args.offset = filelayout_get_dserver_offset(lseg, offset);
-	data->mds_offset = offset;
+	hdr->args.offset = filelayout_get_dserver_offset(lseg, offset);
+	hdr->mds_offset = offset;
 
 	/* Perform an asynchronous read to ds */
-	nfs_initiate_pgio(ds_clnt, data,
+	nfs_initiate_pgio(ds_clnt, hdr,
 			    &filelayout_read_call_ops, 0, RPC_TASK_SOFTCONN);
 	return PNFS_ATTEMPTED;
 }
 
 /* Perform async writes. */
 static enum pnfs_try_status
-filelayout_write_pagelist(struct nfs_pgio_data *data, int sync)
+filelayout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
 {
-	struct nfs_pgio_header *hdr = data->header;
 	struct pnfs_layout_segment *lseg = hdr->lseg;
 	struct nfs4_pnfs_ds *ds;
 	struct rpc_clnt *ds_clnt;
-	loff_t offset = data->args.offset;
+	loff_t offset = hdr->args.offset;
 	u32 j, idx;
 	struct nfs_fh *fh;
 
@@ -598,21 +585,20 @@ filelayout_write_pagelist(struct nfs_pgio_data *data, int sync)
 		return PNFS_NOT_ATTEMPTED;
 
 	dprintk("%s ino %lu sync %d req %Zu@%llu DS: %s cl_count %d\n",
-		__func__, hdr->inode->i_ino, sync, (size_t) data->args.count,
+		__func__, hdr->inode->i_ino, sync, (size_t) hdr->args.count,
 		offset, ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count));
 
-	data->pgio_done_cb = filelayout_write_done_cb;
+	hdr->pgio_done_cb = filelayout_write_done_cb;
 	atomic_inc(&ds->ds_clp->cl_count);
-	data->ds_clp = ds->ds_clp;
-	data->ds_idx = idx;
+	hdr->ds_clp = ds->ds_clp;
+	hdr->ds_idx = idx;
 	fh = nfs4_fl_select_ds_fh(lseg, j);
 	if (fh)
-		data->args.fh = fh;
-
-	data->args.offset = filelayout_get_dserver_offset(lseg, offset);
+		hdr->args.fh = fh;
+	hdr->args.offset = filelayout_get_dserver_offset(lseg, offset);
 
 	/* Perform an asynchronous write */
-	nfs_initiate_pgio(ds_clnt, data,
+	nfs_initiate_pgio(ds_clnt, hdr,
 				    &filelayout_write_call_ops, sync,
 				    RPC_TASK_SOFTCONN);
 	return PNFS_ATTEMPTED;
@@ -1023,6 +1009,7 @@ static u32 select_bucket_index(struct nfs4_filelayout_segment *fl, u32 j)
 
 /* The generic layer is about to remove the req from the commit list.
  * If this will make the bucket empty, it will need to put the lseg reference.
+ * Note this is must be called holding the inode (/cinfo) lock
  */
 static void
 filelayout_clear_request_commit(struct nfs_page *req,
@@ -1030,7 +1017,6 @@ filelayout_clear_request_commit(struct nfs_page *req,
 {
 	struct pnfs_layout_segment *freeme = NULL;
 
-	spin_lock(cinfo->lock);
 	if (!test_and_clear_bit(PG_COMMIT_TO_DS, &req->wb_flags))
 		goto out;
 	cinfo->ds->nwritten--;
@@ -1045,22 +1031,25 @@ filelayout_clear_request_commit(struct nfs_page *req,
 	}
 out:
 	nfs_request_remove_commit_list(req, cinfo);
-	spin_unlock(cinfo->lock);
-	pnfs_put_lseg(freeme);
+	pnfs_put_lseg_async(freeme);
 }
 
-static struct list_head *
-filelayout_choose_commit_list(struct nfs_page *req,
-			      struct pnfs_layout_segment *lseg,
-			      struct nfs_commit_info *cinfo)
+static void
+filelayout_mark_request_commit(struct nfs_page *req,
+			       struct pnfs_layout_segment *lseg,
+			       struct nfs_commit_info *cinfo)
+
 {
 	struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
 	u32 i, j;
 	struct list_head *list;
 	struct pnfs_commit_bucket *buckets;
 
-	if (fl->commit_through_mds)
-		return &cinfo->mds->list;
+	if (fl->commit_through_mds) {
+		list = &cinfo->mds->list;
+		spin_lock(cinfo->lock);
+		goto mds_commit;
+	}
 
 	/* Note that we are calling nfs4_fl_calc_j_index on each page
 	 * that ends up being committed to a data server.  An attractive
@@ -1084,19 +1073,22 @@ filelayout_choose_commit_list(struct nfs_page *req,
 	}
 	set_bit(PG_COMMIT_TO_DS, &req->wb_flags);
 	cinfo->ds->nwritten++;
-	spin_unlock(cinfo->lock);
-	return list;
-}
 
-static void
-filelayout_mark_request_commit(struct nfs_page *req,
-			       struct pnfs_layout_segment *lseg,
-			       struct nfs_commit_info *cinfo)
-{
-	struct list_head *list;
-
-	list = filelayout_choose_commit_list(req, lseg, cinfo);
-	nfs_request_add_commit_list(req, list, cinfo);
+mds_commit:
+	/* nfs_request_add_commit_list(). We need to add req to list without
+	 * dropping cinfo lock.
+	 */
+	set_bit(PG_CLEAN, &(req)->wb_flags);
+	nfs_list_add_request(req, list);
+	cinfo->mds->ncommit++;
+	spin_unlock(cinfo->lock);
+	if (!cinfo->dreq) {
+		inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
+		inc_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info,
+			     BDI_RECLAIMABLE);
+		__mark_inode_dirty(req->wb_context->dentry->d_inode,
+				   I_DIRTY_DATASYNC);
+	}
 }
 
 static u32 calc_ds_index_from_commit(struct pnfs_layout_segment *lseg, u32 i)
@@ -1244,15 +1236,63 @@ restart:
 	spin_unlock(cinfo->lock);
 }
 
+/* filelayout_search_commit_reqs - Search lists in @cinfo for the head reqest
+ *				   for @page
+ * @cinfo - commit info for current inode
+ * @page - page to search for matching head request
+ *
+ * Returns a the head request if one is found, otherwise returns NULL.
+ */
+static struct nfs_page *
+filelayout_search_commit_reqs(struct nfs_commit_info *cinfo, struct page *page)
+{
+	struct nfs_page *freq, *t;
+	struct pnfs_commit_bucket *b;
+	int i;
+
+	/* Linearly search the commit lists for each bucket until a matching
+	 * request is found */
+	for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) {
+		list_for_each_entry_safe(freq, t, &b->written, wb_list) {
+			if (freq->wb_page == page)
+				return freq->wb_head;
+		}
+		list_for_each_entry_safe(freq, t, &b->committing, wb_list) {
+			if (freq->wb_page == page)
+				return freq->wb_head;
+		}
+	}
+
+	return NULL;
+}
+
+static void filelayout_retry_commit(struct nfs_commit_info *cinfo, int idx)
+{
+	struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
+	struct pnfs_commit_bucket *bucket = fl_cinfo->buckets;
+	struct pnfs_layout_segment *freeme;
+	int i;
+
+	for (i = idx; i < fl_cinfo->nbuckets; i++, bucket++) {
+		if (list_empty(&bucket->committing))
+			continue;
+		nfs_retry_commit(&bucket->committing, bucket->clseg, cinfo);
+		spin_lock(cinfo->lock);
+		freeme = bucket->clseg;
+		bucket->clseg = NULL;
+		spin_unlock(cinfo->lock);
+		pnfs_put_lseg(freeme);
+	}
+}
+
 static unsigned int
 alloc_ds_commits(struct nfs_commit_info *cinfo, struct list_head *list)
 {
 	struct pnfs_ds_commit_info *fl_cinfo;
 	struct pnfs_commit_bucket *bucket;
 	struct nfs_commit_data *data;
-	int i, j;
+	int i;
 	unsigned int nreq = 0;
-	struct pnfs_layout_segment *freeme;
 
 	fl_cinfo = cinfo->ds;
 	bucket = fl_cinfo->buckets;
@@ -1272,16 +1312,7 @@ alloc_ds_commits(struct nfs_commit_info *cinfo, struct list_head *list)
 	}
 
 	/* Clean up on error */
-	for (j = i; j < fl_cinfo->nbuckets; j++, bucket++) {
-		if (list_empty(&bucket->committing))
-			continue;
-		nfs_retry_commit(&bucket->committing, bucket->clseg, cinfo);
-		spin_lock(cinfo->lock);
-		freeme = bucket->clseg;
-		bucket->clseg = NULL;
-		spin_unlock(cinfo->lock);
-		pnfs_put_lseg(freeme);
-	}
+	filelayout_retry_commit(cinfo, i);
 	/* Caller will clean up entries put on list */
 	return nreq;
 }
@@ -1301,8 +1332,12 @@ filelayout_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
 			data->lseg = NULL;
 			list_add(&data->pages, &list);
 			nreq++;
-		} else
+		} else {
 			nfs_retry_commit(mds_pages, NULL, cinfo);
+			filelayout_retry_commit(cinfo, 0);
+			cinfo->completion_ops->error_cleanup(NFS_I(inode));
+			return -ENOMEM;
+		}
 	}
 
 	nreq += alloc_ds_commits(cinfo, &list);
@@ -1380,6 +1415,7 @@ static struct pnfs_layoutdriver_type filelayout_type = {
 	.clear_request_commit	= filelayout_clear_request_commit,
 	.scan_commit_lists	= filelayout_scan_commit_lists,
 	.recover_commit_reqs	= filelayout_recover_commit_reqs,
+	.search_commit_reqs	= filelayout_search_commit_reqs,
 	.commit_pagelist	= filelayout_commit_pagelist,
 	.read_pagelist		= filelayout_read_pagelist,
 	.write_pagelist		= filelayout_write_pagelist,
diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c
index e2a0361e24c6..8540516f4d71 100644
--- a/fs/nfs/filelayout/filelayoutdev.c
+++ b/fs/nfs/filelayout/filelayoutdev.c
@@ -695,7 +695,7 @@ filelayout_get_device_info(struct inode *inode,
 	if (pdev == NULL)
 		return NULL;
 
-	pages = kzalloc(max_pages * sizeof(struct page *), gfp_flags);
+	pages = kcalloc(max_pages, sizeof(struct page *), gfp_flags);
 	if (pages == NULL) {
 		kfree(pdev);
 		return NULL;
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index b94f80420a58..880618a8b048 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -112,7 +112,7 @@ struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh,
 	 * if the dentry tree reaches them; however if the dentry already
 	 * exists, we'll pick it up at this point and use it as the root
 	 */
-	ret = d_obtain_alias(inode);
+	ret = d_obtain_root(inode);
 	if (IS_ERR(ret)) {
 		dprintk("nfs_get_root: get root dentry failed\n");
 		goto out;
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 68921b01b792..577a36f0a510 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1002,6 +1002,15 @@ int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
 }
 EXPORT_SYMBOL_GPL(nfs_revalidate_inode);
 
+int nfs_revalidate_inode_rcu(struct nfs_server *server, struct inode *inode)
+{
+	if (!(NFS_I(inode)->cache_validity &
+			(NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL))
+			&& !nfs_attribute_cache_expired(inode))
+		return NFS_STALE(inode) ? -ESTALE : 0;
+	return -ECHILD;
+}
+
 static int nfs_invalidate_mapping(struct inode *inode, struct address_space *mapping)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index e2a45ae5014e..9056622d2230 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -247,11 +247,11 @@ void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos);
 int nfs_iocounter_wait(struct nfs_io_counter *c);
 
 extern const struct nfs_pageio_ops nfs_pgio_rw_ops;
-struct nfs_rw_header *nfs_rw_header_alloc(const struct nfs_rw_ops *);
-void nfs_rw_header_free(struct nfs_pgio_header *);
-void nfs_pgio_data_release(struct nfs_pgio_data *);
+struct nfs_pgio_header *nfs_pgio_header_alloc(const struct nfs_rw_ops *);
+void nfs_pgio_header_free(struct nfs_pgio_header *);
+void nfs_pgio_data_destroy(struct nfs_pgio_header *);
 int nfs_generic_pgio(struct nfs_pageio_descriptor *, struct nfs_pgio_header *);
-int nfs_initiate_pgio(struct rpc_clnt *, struct nfs_pgio_data *,
+int nfs_initiate_pgio(struct rpc_clnt *, struct nfs_pgio_header *,
 		      const struct rpc_call_ops *, int, int);
 void nfs_free_request(struct nfs_page *req);
 
@@ -451,6 +451,7 @@ int nfs_scan_commit(struct inode *inode, struct list_head *dst,
 void nfs_mark_request_commit(struct nfs_page *req,
 			     struct pnfs_layout_segment *lseg,
 			     struct nfs_commit_info *cinfo);
+int nfs_write_need_commit(struct nfs_pgio_header *);
 int nfs_generic_commit_list(struct inode *inode, struct list_head *head,
 			    int how, struct nfs_commit_info *cinfo);
 void nfs_retry_commit(struct list_head *page_list,
@@ -491,7 +492,7 @@ static inline void nfs_inode_dio_wait(struct inode *inode)
 extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq);
 
 /* nfs4proc.c */
-extern void __nfs4_read_done_cb(struct nfs_pgio_data *);
+extern void __nfs4_read_done_cb(struct nfs_pgio_header *);
 extern struct nfs_client *nfs4_init_client(struct nfs_client *clp,
 			    const struct rpc_timeout *timeparms,
 			    const char *ip_addr);
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index 8f854dde4150..d0fec260132a 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -256,7 +256,7 @@ nfs3_list_one_acl(struct inode *inode, int type, const char *name, void *data,
 	char *p = data + *result;
 
 	acl = get_acl(inode, type);
-	if (!acl)
+	if (IS_ERR_OR_NULL(acl))
 		return 0;
 
 	posix_acl_release(acl);
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index f0afa291fd58..809670eba52a 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -795,41 +795,44 @@ nfs3_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
 	return status;
 }
 
-static int nfs3_read_done(struct rpc_task *task, struct nfs_pgio_data *data)
+static int nfs3_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
 {
-	struct inode *inode = data->header->inode;
+	struct inode *inode = hdr->inode;
 
 	if (nfs3_async_handle_jukebox(task, inode))
 		return -EAGAIN;
 
 	nfs_invalidate_atime(inode);
-	nfs_refresh_inode(inode, &data->fattr);
+	nfs_refresh_inode(inode, &hdr->fattr);
 	return 0;
 }
 
-static void nfs3_proc_read_setup(struct nfs_pgio_data *data, struct rpc_message *msg)
+static void nfs3_proc_read_setup(struct nfs_pgio_header *hdr,
+				 struct rpc_message *msg)
 {
 	msg->rpc_proc = &nfs3_procedures[NFS3PROC_READ];
 }
 
-static int nfs3_proc_pgio_rpc_prepare(struct rpc_task *task, struct nfs_pgio_data *data)
+static int nfs3_proc_pgio_rpc_prepare(struct rpc_task *task,
+				      struct nfs_pgio_header *hdr)
 {
 	rpc_call_start(task);
 	return 0;
 }
 
-static int nfs3_write_done(struct rpc_task *task, struct nfs_pgio_data *data)
+static int nfs3_write_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
 {
-	struct inode *inode = data->header->inode;
+	struct inode *inode = hdr->inode;
 
 	if (nfs3_async_handle_jukebox(task, inode))
 		return -EAGAIN;
 	if (task->tk_status >= 0)
-		nfs_post_op_update_inode_force_wcc(inode, data->res.fattr);
+		nfs_post_op_update_inode_force_wcc(inode, hdr->res.fattr);
 	return 0;
 }
 
-static void nfs3_proc_write_setup(struct nfs_pgio_data *data, struct rpc_message *msg)
+static void nfs3_proc_write_setup(struct nfs_pgio_header *hdr,
+				  struct rpc_message *msg)
 {
 	msg->rpc_proc = &nfs3_procedures[NFS3PROC_WRITE];
 }
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index ba2affa51941..92193eddb41d 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -54,7 +54,7 @@ struct nfs4_minor_version_ops {
 			const nfs4_stateid *);
 	int	(*find_root_sec)(struct nfs_server *, struct nfs_fh *,
 			struct nfs_fsinfo *);
-	int	(*free_lock_state)(struct nfs_server *,
+	void	(*free_lock_state)(struct nfs_server *,
 			struct nfs4_lock_state *);
 	const struct rpc_call_ops *call_sync_ops;
 	const struct nfs4_state_recovery_ops *reboot_recovery_ops;
@@ -129,27 +129,17 @@ enum {
  * LOCK: one nfs4_state (LOCK) to hold the lock stateid nfs4_state(OPEN)
  */
 
-struct nfs4_lock_owner {
-	unsigned int lo_type;
-#define NFS4_ANY_LOCK_TYPE	(0U)
-#define NFS4_FLOCK_LOCK_TYPE	(1U << 0)
-#define NFS4_POSIX_LOCK_TYPE	(1U << 1)
-	union {
-		fl_owner_t posix_owner;
-		pid_t flock_owner;
-	} lo_u;
-};
-
 struct nfs4_lock_state {
-	struct list_head	ls_locks;	/* Other lock stateids */
-	struct nfs4_state *	ls_state;	/* Pointer to open state */
+	struct list_head		ls_locks;   /* Other lock stateids */
+	struct nfs4_state *		ls_state;   /* Pointer to open state */
 #define NFS_LOCK_INITIALIZED 0
 #define NFS_LOCK_LOST        1
-	unsigned long		ls_flags;
+	unsigned long			ls_flags;
 	struct nfs_seqid_counter	ls_seqid;
-	nfs4_stateid		ls_stateid;
-	atomic_t		ls_count;
-	struct nfs4_lock_owner	ls_owner;
+	nfs4_stateid			ls_stateid;
+	atomic_t			ls_count;
+	fl_owner_t			ls_owner;
+	struct work_struct		ls_release;
 };
 
 /* bits for nfs4_state->flags */
@@ -337,11 +327,11 @@ nfs4_state_protect(struct nfs_client *clp, unsigned long sp4_mode,
  */
 static inline void
 nfs4_state_protect_write(struct nfs_client *clp, struct rpc_clnt **clntp,
-			 struct rpc_message *msg, struct nfs_pgio_data *wdata)
+			 struct rpc_message *msg, struct nfs_pgio_header *hdr)
 {
 	if (_nfs4_state_protect(clp, NFS_SP4_MACH_CRED_WRITE, clntp, msg) &&
 	    !test_bit(NFS_SP4_MACH_CRED_COMMIT, &clp->cl_sp4_flags))
-		wdata->args.stable = NFS_FILE_SYNC;
+		hdr->args.stable = NFS_FILE_SYNC;
 }
 #else /* CONFIG_NFS_v4_1 */
 static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server)
@@ -369,7 +359,7 @@ nfs4_state_protect(struct nfs_client *clp, unsigned long sp4_flags,
 
 static inline void
 nfs4_state_protect_write(struct nfs_client *clp, struct rpc_clnt **clntp,
-			 struct rpc_message *msg, struct nfs_pgio_data *wdata)
+			 struct rpc_message *msg, struct nfs_pgio_header *hdr)
 {
 }
 #endif /* CONFIG_NFS_V4_1 */
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index aa9ef4876046..53e435a95260 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -855,6 +855,11 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
 	};
 	struct rpc_timeout ds_timeout;
 	struct nfs_client *clp;
+	char buf[INET6_ADDRSTRLEN + 1];
+
+	if (rpc_ntop(ds_addr, buf, sizeof(buf)) <= 0)
+		return ERR_PTR(-EINVAL);
+	cl_init.hostname = buf;
 
 	/*
 	 * Set an authflavor equual to the MDS value. Use the MDS nfs_client
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 4bf3d97cc5a0..75ae8d22f067 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1952,6 +1952,14 @@ static int _nfs4_recover_proc_open(struct nfs4_opendata *data)
 	return status;
 }
 
+/*
+ * Additional permission checks in order to distinguish between an
+ * open for read, and an open for execute. This works around the
+ * fact that NFSv4 OPEN treats read and execute permissions as being
+ * the same.
+ * Note that in the non-execute case, we want to turn off permission
+ * checking if we just created a new file (POSIX open() semantics).
+ */
 static int nfs4_opendata_access(struct rpc_cred *cred,
 				struct nfs4_opendata *opendata,
 				struct nfs4_state *state, fmode_t fmode,
@@ -1966,14 +1974,14 @@ static int nfs4_opendata_access(struct rpc_cred *cred,
 		return 0;
 
 	mask = 0;
-	/* don't check MAY_WRITE - a newly created file may not have
-	 * write mode bits, but POSIX allows the creating process to write.
-	 * use openflags to check for exec, because fmode won't
-	 * always have FMODE_EXEC set when file open for exec. */
+	/*
+	 * Use openflags to check for exec, because fmode won't
+	 * always have FMODE_EXEC set when file open for exec.
+	 */
 	if (openflags & __FMODE_EXEC) {
 		/* ONLY check for exec rights */
 		mask = MAY_EXEC;
-	} else if (fmode & FMODE_READ)
+	} else if ((fmode & FMODE_READ) && !opendata->file_created)
 		mask = MAY_READ;
 
 	cache.cred = cred;
@@ -2216,8 +2224,15 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
 	seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
 
 	ret = _nfs4_proc_open(opendata);
-	if (ret != 0)
+	if (ret != 0) {
+		if (ret == -ENOENT) {
+			d_drop(opendata->dentry);
+			d_add(opendata->dentry, NULL);
+			nfs_set_verifier(opendata->dentry,
+					 nfs_save_change_attribute(opendata->dir->d_inode));
+		}
 		goto out;
+	}
 
 	state = nfs4_opendata_to_nfs4_state(opendata);
 	ret = PTR_ERR(state);
@@ -2647,6 +2662,48 @@ static const struct rpc_call_ops nfs4_close_ops = {
 	.rpc_release = nfs4_free_closedata,
 };
 
+static bool nfs4_state_has_opener(struct nfs4_state *state)
+{
+	/* first check existing openers */
+	if (test_bit(NFS_O_RDONLY_STATE, &state->flags) != 0 &&
+	    state->n_rdonly != 0)
+		return true;
+
+	if (test_bit(NFS_O_WRONLY_STATE, &state->flags) != 0 &&
+	    state->n_wronly != 0)
+		return true;
+
+	if (test_bit(NFS_O_RDWR_STATE, &state->flags) != 0 &&
+	    state->n_rdwr != 0)
+		return true;
+
+	return false;
+}
+
+static bool nfs4_roc(struct inode *inode)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+	struct nfs_open_context *ctx;
+	struct nfs4_state *state;
+
+	spin_lock(&inode->i_lock);
+	list_for_each_entry(ctx, &nfsi->open_files, list) {
+		state = ctx->state;
+		if (state == NULL)
+			continue;
+		if (nfs4_state_has_opener(state)) {
+			spin_unlock(&inode->i_lock);
+			return false;
+		}
+	}
+	spin_unlock(&inode->i_lock);
+
+	if (nfs4_check_delegation(inode, FMODE_READ))
+		return false;
+
+	return pnfs_roc(inode);
+}
+
 /* 
  * It is possible for data to be read/written from a mem-mapped file 
  * after the sys_close call (which hits the vfs layer as a flush).
@@ -2697,7 +2754,7 @@ int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait)
 	calldata->res.fattr = &calldata->fattr;
 	calldata->res.seqid = calldata->arg.seqid;
 	calldata->res.server = server;
-	calldata->roc = pnfs_roc(state->inode);
+	calldata->roc = nfs4_roc(state->inode);
 	nfs_sb_active(calldata->inode->i_sb);
 
 	msg.rpc_argp = &calldata->arg;
@@ -4033,24 +4090,25 @@ static bool nfs4_error_stateid_expired(int err)
 	return false;
 }
 
-void __nfs4_read_done_cb(struct nfs_pgio_data *data)
+void __nfs4_read_done_cb(struct nfs_pgio_header *hdr)
 {
-	nfs_invalidate_atime(data->header->inode);
+	nfs_invalidate_atime(hdr->inode);
 }
 
-static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_pgio_data *data)
+static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_pgio_header *hdr)
 {
-	struct nfs_server *server = NFS_SERVER(data->header->inode);
+	struct nfs_server *server = NFS_SERVER(hdr->inode);
 
-	trace_nfs4_read(data, task->tk_status);
-	if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) {
+	trace_nfs4_read(hdr, task->tk_status);
+	if (nfs4_async_handle_error(task, server,
+				    hdr->args.context->state) == -EAGAIN) {
 		rpc_restart_call_prepare(task);
 		return -EAGAIN;
 	}
 
-	__nfs4_read_done_cb(data);
+	__nfs4_read_done_cb(hdr);
 	if (task->tk_status > 0)
-		renew_lease(server, data->timestamp);
+		renew_lease(server, hdr->timestamp);
 	return 0;
 }
 
@@ -4068,54 +4126,59 @@ static bool nfs4_read_stateid_changed(struct rpc_task *task,
 	return true;
 }
 
-static int nfs4_read_done(struct rpc_task *task, struct nfs_pgio_data *data)
+static int nfs4_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
 {
 
 	dprintk("--> %s\n", __func__);
 
-	if (!nfs4_sequence_done(task, &data->res.seq_res))
+	if (!nfs4_sequence_done(task, &hdr->res.seq_res))
 		return -EAGAIN;
-	if (nfs4_read_stateid_changed(task, &data->args))
+	if (nfs4_read_stateid_changed(task, &hdr->args))
 		return -EAGAIN;
-	return data->pgio_done_cb ? data->pgio_done_cb(task, data) :
-				    nfs4_read_done_cb(task, data);
+	return hdr->pgio_done_cb ? hdr->pgio_done_cb(task, hdr) :
+				    nfs4_read_done_cb(task, hdr);
 }
 
-static void nfs4_proc_read_setup(struct nfs_pgio_data *data, struct rpc_message *msg)
+static void nfs4_proc_read_setup(struct nfs_pgio_header *hdr,
+				 struct rpc_message *msg)
 {
-	data->timestamp   = jiffies;
-	data->pgio_done_cb = nfs4_read_done_cb;
+	hdr->timestamp   = jiffies;
+	hdr->pgio_done_cb = nfs4_read_done_cb;
 	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
-	nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0);
+	nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 0);
 }
 
-static int nfs4_proc_pgio_rpc_prepare(struct rpc_task *task, struct nfs_pgio_data *data)
+static int nfs4_proc_pgio_rpc_prepare(struct rpc_task *task,
+				      struct nfs_pgio_header *hdr)
 {
-	if (nfs4_setup_sequence(NFS_SERVER(data->header->inode),
-			&data->args.seq_args,
-			&data->res.seq_res,
+	if (nfs4_setup_sequence(NFS_SERVER(hdr->inode),
+			&hdr->args.seq_args,
+			&hdr->res.seq_res,
 			task))
 		return 0;
-	if (nfs4_set_rw_stateid(&data->args.stateid, data->args.context,
-				data->args.lock_context, data->header->rw_ops->rw_mode) == -EIO)
+	if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context,
+				hdr->args.lock_context,
+				hdr->rw_ops->rw_mode) == -EIO)
 		return -EIO;
-	if (unlikely(test_bit(NFS_CONTEXT_BAD, &data->args.context->flags)))
+	if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags)))
 		return -EIO;
 	return 0;
 }
 
-static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_pgio_data *data)
+static int nfs4_write_done_cb(struct rpc_task *task,
+			      struct nfs_pgio_header *hdr)
 {
-	struct inode *inode = data->header->inode;
+	struct inode *inode = hdr->inode;
 	
-	trace_nfs4_write(data, task->tk_status);
-	if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) {
+	trace_nfs4_write(hdr, task->tk_status);
+	if (nfs4_async_handle_error(task, NFS_SERVER(inode),
+				    hdr->args.context->state) == -EAGAIN) {
 		rpc_restart_call_prepare(task);
 		return -EAGAIN;
 	}
 	if (task->tk_status >= 0) {
-		renew_lease(NFS_SERVER(inode), data->timestamp);
-		nfs_post_op_update_inode_force_wcc(inode, &data->fattr);
+		renew_lease(NFS_SERVER(inode), hdr->timestamp);
+		nfs_post_op_update_inode_force_wcc(inode, &hdr->fattr);
 	}
 	return 0;
 }
@@ -4134,23 +4197,21 @@ static bool nfs4_write_stateid_changed(struct rpc_task *task,
 	return true;
 }
 
-static int nfs4_write_done(struct rpc_task *task, struct nfs_pgio_data *data)
+static int nfs4_write_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
 {
-	if (!nfs4_sequence_done(task, &data->res.seq_res))
+	if (!nfs4_sequence_done(task, &hdr->res.seq_res))
 		return -EAGAIN;
-	if (nfs4_write_stateid_changed(task, &data->args))
+	if (nfs4_write_stateid_changed(task, &hdr->args))
 		return -EAGAIN;
-	return data->pgio_done_cb ? data->pgio_done_cb(task, data) :
-		nfs4_write_done_cb(task, data);
+	return hdr->pgio_done_cb ? hdr->pgio_done_cb(task, hdr) :
+		nfs4_write_done_cb(task, hdr);
 }
 
 static
-bool nfs4_write_need_cache_consistency_data(const struct nfs_pgio_data *data)
+bool nfs4_write_need_cache_consistency_data(struct nfs_pgio_header *hdr)
 {
-	const struct nfs_pgio_header *hdr = data->header;
-
 	/* Don't request attributes for pNFS or O_DIRECT writes */
-	if (data->ds_clp != NULL || hdr->dreq != NULL)
+	if (hdr->ds_clp != NULL || hdr->dreq != NULL)
 		return false;
 	/* Otherwise, request attributes if and only if we don't hold
 	 * a delegation
@@ -4158,23 +4219,24 @@ bool nfs4_write_need_cache_consistency_data(const struct nfs_pgio_data *data)
 	return nfs4_have_delegation(hdr->inode, FMODE_READ) == 0;
 }
 
-static void nfs4_proc_write_setup(struct nfs_pgio_data *data, struct rpc_message *msg)
+static void nfs4_proc_write_setup(struct nfs_pgio_header *hdr,
+				  struct rpc_message *msg)
 {
-	struct nfs_server *server = NFS_SERVER(data->header->inode);
+	struct nfs_server *server = NFS_SERVER(hdr->inode);
 
-	if (!nfs4_write_need_cache_consistency_data(data)) {
-		data->args.bitmask = NULL;
-		data->res.fattr = NULL;
+	if (!nfs4_write_need_cache_consistency_data(hdr)) {
+		hdr->args.bitmask = NULL;
+		hdr->res.fattr = NULL;
 	} else
-		data->args.bitmask = server->cache_consistency_bitmask;
+		hdr->args.bitmask = server->cache_consistency_bitmask;
 
-	if (!data->pgio_done_cb)
-		data->pgio_done_cb = nfs4_write_done_cb;
-	data->res.server = server;
-	data->timestamp   = jiffies;
+	if (!hdr->pgio_done_cb)
+		hdr->pgio_done_cb = nfs4_write_done_cb;
+	hdr->res.server = server;
+	hdr->timestamp   = jiffies;
 
 	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_WRITE];
-	nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1);
+	nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 1);
 }
 
 static void nfs4_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data)
@@ -4881,6 +4943,18 @@ nfs4_init_callback_netid(const struct nfs_client *clp, char *buf, size_t len)
 		return scnprintf(buf, len, "tcp");
 }
 
+static void nfs4_setclientid_done(struct rpc_task *task, void *calldata)
+{
+	struct nfs4_setclientid *sc = calldata;
+
+	if (task->tk_status == 0)
+		sc->sc_cred = get_rpccred(task->tk_rqstp->rq_cred);
+}
+
+static const struct rpc_call_ops nfs4_setclientid_ops = {
+	.rpc_call_done = nfs4_setclientid_done,
+};
+
 /**
  * nfs4_proc_setclientid - Negotiate client ID
  * @clp: state data structure
@@ -4907,6 +4981,14 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
 		.rpc_resp = res,
 		.rpc_cred = cred,
 	};
+	struct rpc_task *task;
+	struct rpc_task_setup task_setup_data = {
+		.rpc_client = clp->cl_rpcclient,
+		.rpc_message = &msg,
+		.callback_ops = &nfs4_setclientid_ops,
+		.callback_data = &setclientid,
+		.flags = RPC_TASK_TIMEOUT,
+	};
 	int status;
 
 	/* nfs_client_id4 */
@@ -4933,7 +5015,18 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
 	dprintk("NFS call  setclientid auth=%s, '%.*s'\n",
 		clp->cl_rpcclient->cl_auth->au_ops->au_name,
 		setclientid.sc_name_len, setclientid.sc_name);
-	status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
+	task = rpc_run_task(&task_setup_data);
+	if (IS_ERR(task)) {
+		status = PTR_ERR(task);
+		goto out;
+	}
+	status = task->tk_status;
+	if (setclientid.sc_cred) {
+		clp->cl_acceptor = rpcauth_stringify_acceptor(setclientid.sc_cred);
+		put_rpccred(setclientid.sc_cred);
+	}
+	rpc_put_task(task);
+out:
 	trace_nfs4_setclientid(clp, status);
 	dprintk("NFS reply setclientid: %d\n", status);
 	return status;
@@ -4975,6 +5068,9 @@ struct nfs4_delegreturndata {
 	unsigned long timestamp;
 	struct nfs_fattr fattr;
 	int rpc_status;
+	struct inode *inode;
+	bool roc;
+	u32 roc_barrier;
 };
 
 static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
@@ -4988,7 +5084,6 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
 	switch (task->tk_status) {
 	case 0:
 		renew_lease(data->res.server, data->timestamp);
-		break;
 	case -NFS4ERR_ADMIN_REVOKED:
 	case -NFS4ERR_DELEG_REVOKED:
 	case -NFS4ERR_BAD_STATEID:
@@ -4996,6 +5091,8 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
 	case -NFS4ERR_STALE_STATEID:
 	case -NFS4ERR_EXPIRED:
 		task->tk_status = 0;
+		if (data->roc)
+			pnfs_roc_set_barrier(data->inode, data->roc_barrier);
 		break;
 	default:
 		if (nfs4_async_handle_error(task, data->res.server, NULL) ==
@@ -5009,6 +5106,10 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
 
 static void nfs4_delegreturn_release(void *calldata)
 {
+	struct nfs4_delegreturndata *data = calldata;
+
+	if (data->roc)
+		pnfs_roc_release(data->inode);
 	kfree(calldata);
 }
 
@@ -5018,6 +5119,10 @@ static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data)
 
 	d_data = (struct nfs4_delegreturndata *)data;
 
+	if (d_data->roc &&
+	    pnfs_roc_drain(d_data->inode, &d_data->roc_barrier, task))
+		return;
+
 	nfs4_setup_sequence(d_data->res.server,
 			&d_data->args.seq_args,
 			&d_data->res.seq_res,
@@ -5061,6 +5166,9 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
 	nfs_fattr_init(data->res.fattr);
 	data->timestamp = jiffies;
 	data->rpc_status = 0;
+	data->inode = inode;
+	data->roc = list_empty(&NFS_I(inode)->open_files) ?
+		    pnfs_roc(inode) : false;
 
 	task_setup_data.callback_data = data;
 	msg.rpc_argp = &data->args;
@@ -5834,8 +5942,10 @@ struct nfs_release_lockowner_data {
 static void nfs4_release_lockowner_prepare(struct rpc_task *task, void *calldata)
 {
 	struct nfs_release_lockowner_data *data = calldata;
-	nfs40_setup_sequence(data->server,
-				&data->args.seq_args, &data->res.seq_res, task);
+	struct nfs_server *server = data->server;
+	nfs40_setup_sequence(server, &data->args.seq_args,
+				&data->res.seq_res, task);
+	data->args.lock_owner.clientid = server->nfs_client->cl_clientid;
 	data->timestamp = jiffies;
 }
 
@@ -5852,6 +5962,8 @@ static void nfs4_release_lockowner_done(struct rpc_task *task, void *calldata)
 		break;
 	case -NFS4ERR_STALE_CLIENTID:
 	case -NFS4ERR_EXPIRED:
+		nfs4_schedule_lease_recovery(server->nfs_client);
+		break;
 	case -NFS4ERR_LEASE_MOVED:
 	case -NFS4ERR_DELAY:
 		if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN)
@@ -5872,7 +5984,8 @@ static const struct rpc_call_ops nfs4_release_lockowner_ops = {
 	.rpc_release = nfs4_release_lockowner_release,
 };
 
-static int nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_state *lsp)
+static void
+nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_state *lsp)
 {
 	struct nfs_release_lockowner_data *data;
 	struct rpc_message msg = {
@@ -5880,11 +5993,11 @@ static int nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_st
 	};
 
 	if (server->nfs_client->cl_mvops->minor_version != 0)
-		return -EINVAL;
+		return;
 
 	data = kmalloc(sizeof(*data), GFP_NOFS);
 	if (!data)
-		return -ENOMEM;
+		return;
 	data->lsp = lsp;
 	data->server = server;
 	data->args.lock_owner.clientid = server->nfs_client->cl_clientid;
@@ -5895,7 +6008,6 @@ static int nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_st
 	msg.rpc_resp = &data->res;
 	nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0);
 	rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, data);
-	return 0;
 }
 
 #define XATTR_NAME_NFSV4_ACL "system.nfs4_acl"
@@ -8182,7 +8294,8 @@ static int nfs41_free_stateid(struct nfs_server *server,
 	return ret;
 }
 
-static int nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp)
+static void
+nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp)
 {
 	struct rpc_task *task;
 	struct rpc_cred *cred = lsp->ls_state->owner->so_cred;
@@ -8190,9 +8303,8 @@ static int nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_sta
 	task = _nfs41_free_stateid(server, &lsp->ls_stateid, cred, false);
 	nfs4_free_lock_state(server, lsp);
 	if (IS_ERR(task))
-		return PTR_ERR(task);
+		return;
 	rpc_put_task(task);
-	return 0;
 }
 
 static bool nfs41_match_stateid(const nfs4_stateid *s1,
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 42f121182167..a043f618cd5a 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -787,33 +787,36 @@ void nfs4_close_sync(struct nfs4_state *state, fmode_t fmode)
  * that is compatible with current->files
  */
 static struct nfs4_lock_state *
-__nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid, unsigned int type)
+__nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner)
 {
 	struct nfs4_lock_state *pos;
 	list_for_each_entry(pos, &state->lock_states, ls_locks) {
-		if (type != NFS4_ANY_LOCK_TYPE && pos->ls_owner.lo_type != type)
+		if (pos->ls_owner != fl_owner)
 			continue;
-		switch (pos->ls_owner.lo_type) {
-		case NFS4_POSIX_LOCK_TYPE:
-			if (pos->ls_owner.lo_u.posix_owner != fl_owner)
-				continue;
-			break;
-		case NFS4_FLOCK_LOCK_TYPE:
-			if (pos->ls_owner.lo_u.flock_owner != fl_pid)
-				continue;
-		}
 		atomic_inc(&pos->ls_count);
 		return pos;
 	}
 	return NULL;
 }
 
+static void
+free_lock_state_work(struct work_struct *work)
+{
+	struct nfs4_lock_state *lsp = container_of(work,
+					struct nfs4_lock_state, ls_release);
+	struct nfs4_state *state = lsp->ls_state;
+	struct nfs_server *server = state->owner->so_server;
+	struct nfs_client *clp = server->nfs_client;
+
+	clp->cl_mvops->free_lock_state(server, lsp);
+}
+
 /*
  * Return a compatible lock_state. If no initialized lock_state structure
  * exists, return an uninitialized one.
  *
  */
-static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid, unsigned int type)
+static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner)
 {
 	struct nfs4_lock_state *lsp;
 	struct nfs_server *server = state->owner->so_server;
@@ -824,21 +827,12 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f
 	nfs4_init_seqid_counter(&lsp->ls_seqid);
 	atomic_set(&lsp->ls_count, 1);
 	lsp->ls_state = state;
-	lsp->ls_owner.lo_type = type;
-	switch (lsp->ls_owner.lo_type) {
-	case NFS4_FLOCK_LOCK_TYPE:
-		lsp->ls_owner.lo_u.flock_owner = fl_pid;
-		break;
-	case NFS4_POSIX_LOCK_TYPE:
-		lsp->ls_owner.lo_u.posix_owner = fl_owner;
-		break;
-	default:
-		goto out_free;
-	}
+	lsp->ls_owner = fl_owner;
 	lsp->ls_seqid.owner_id = ida_simple_get(&server->lockowner_id, 0, 0, GFP_NOFS);
 	if (lsp->ls_seqid.owner_id < 0)
 		goto out_free;
 	INIT_LIST_HEAD(&lsp->ls_locks);
+	INIT_WORK(&lsp->ls_release, free_lock_state_work);
 	return lsp;
 out_free:
 	kfree(lsp);
@@ -857,13 +851,13 @@ void nfs4_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp
  * exists, return an uninitialized one.
  *
  */
-static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t owner, pid_t pid, unsigned int type)
+static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t owner)
 {
 	struct nfs4_lock_state *lsp, *new = NULL;
 	
 	for(;;) {
 		spin_lock(&state->state_lock);
-		lsp = __nfs4_find_lock_state(state, owner, pid, type);
+		lsp = __nfs4_find_lock_state(state, owner);
 		if (lsp != NULL)
 			break;
 		if (new != NULL) {
@@ -874,7 +868,7 @@ static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_
 			break;
 		}
 		spin_unlock(&state->state_lock);
-		new = nfs4_alloc_lock_state(state, owner, pid, type);
+		new = nfs4_alloc_lock_state(state, owner);
 		if (new == NULL)
 			return NULL;
 	}
@@ -902,13 +896,12 @@ void nfs4_put_lock_state(struct nfs4_lock_state *lsp)
 	if (list_empty(&state->lock_states))
 		clear_bit(LK_STATE_IN_USE, &state->flags);
 	spin_unlock(&state->state_lock);
-	server = state->owner->so_server;
-	if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) {
-		struct nfs_client *clp = server->nfs_client;
-
-		clp->cl_mvops->free_lock_state(server, lsp);
-	} else
+	if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags))
+		queue_work(nfsiod_workqueue, &lsp->ls_release);
+	else {
+		server = state->owner->so_server;
 		nfs4_free_lock_state(server, lsp);
+	}
 }
 
 static void nfs4_fl_copy_lock(struct file_lock *dst, struct file_lock *src)
@@ -935,13 +928,7 @@ int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl)
 
 	if (fl->fl_ops != NULL)
 		return 0;
-	if (fl->fl_flags & FL_POSIX)
-		lsp = nfs4_get_lock_state(state, fl->fl_owner, 0, NFS4_POSIX_LOCK_TYPE);
-	else if (fl->fl_flags & FL_FLOCK)
-		lsp = nfs4_get_lock_state(state, NULL, fl->fl_pid,
-				NFS4_FLOCK_LOCK_TYPE);
-	else
-		return -EINVAL;
+	lsp = nfs4_get_lock_state(state, fl->fl_owner);
 	if (lsp == NULL)
 		return -ENOMEM;
 	fl->fl_u.nfs4_fl.owner = lsp;
@@ -955,7 +942,6 @@ static int nfs4_copy_lock_stateid(nfs4_stateid *dst,
 {
 	struct nfs4_lock_state *lsp;
 	fl_owner_t fl_owner;
-	pid_t fl_pid;
 	int ret = -ENOENT;
 
 
@@ -966,9 +952,8 @@ static int nfs4_copy_lock_stateid(nfs4_stateid *dst,
 		goto out;
 
 	fl_owner = lockowner->l_owner;
-	fl_pid = lockowner->l_pid;
 	spin_lock(&state->state_lock);
-	lsp = __nfs4_find_lock_state(state, fl_owner, fl_pid, NFS4_ANY_LOCK_TYPE);
+	lsp = __nfs4_find_lock_state(state, fl_owner);
 	if (lsp && test_bit(NFS_LOCK_LOST, &lsp->ls_flags))
 		ret = -EIO;
 	else if (lsp != NULL && test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) != 0) {
diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h
index 0a744f3a86f6..1c32adbe728d 100644
--- a/fs/nfs/nfs4trace.h
+++ b/fs/nfs/nfs4trace.h
@@ -932,11 +932,11 @@ DEFINE_NFS4_IDMAP_EVENT(nfs4_map_gid_to_group);
 
 DECLARE_EVENT_CLASS(nfs4_read_event,
 		TP_PROTO(
-			const struct nfs_pgio_data *data,
+			const struct nfs_pgio_header *hdr,
 			int error
 		),
 
-		TP_ARGS(data, error),
+		TP_ARGS(hdr, error),
 
 		TP_STRUCT__entry(
 			__field(dev_t, dev)
@@ -948,12 +948,12 @@ DECLARE_EVENT_CLASS(nfs4_read_event,
 		),
 
 		TP_fast_assign(
-			const struct inode *inode = data->header->inode;
+			const struct inode *inode = hdr->inode;
 			__entry->dev = inode->i_sb->s_dev;
 			__entry->fileid = NFS_FILEID(inode);
 			__entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
-			__entry->offset = data->args.offset;
-			__entry->count = data->args.count;
+			__entry->offset = hdr->args.offset;
+			__entry->count = hdr->args.count;
 			__entry->error = error;
 		),
 
@@ -972,10 +972,10 @@ DECLARE_EVENT_CLASS(nfs4_read_event,
 #define DEFINE_NFS4_READ_EVENT(name) \
 	DEFINE_EVENT(nfs4_read_event, name, \
 			TP_PROTO( \
-				const struct nfs_pgio_data *data, \
+				const struct nfs_pgio_header *hdr, \
 				int error \
 			), \
-			TP_ARGS(data, error))
+			TP_ARGS(hdr, error))
 DEFINE_NFS4_READ_EVENT(nfs4_read);
 #ifdef CONFIG_NFS_V4_1
 DEFINE_NFS4_READ_EVENT(nfs4_pnfs_read);
@@ -983,11 +983,11 @@ DEFINE_NFS4_READ_EVENT(nfs4_pnfs_read);
 
 DECLARE_EVENT_CLASS(nfs4_write_event,
 		TP_PROTO(
-			const struct nfs_pgio_data *data,
+			const struct nfs_pgio_header *hdr,
 			int error
 		),
 
-		TP_ARGS(data, error),
+		TP_ARGS(hdr, error),
 
 		TP_STRUCT__entry(
 			__field(dev_t, dev)
@@ -999,12 +999,12 @@ DECLARE_EVENT_CLASS(nfs4_write_event,
 		),
 
 		TP_fast_assign(
-			const struct inode *inode = data->header->inode;
+			const struct inode *inode = hdr->inode;
 			__entry->dev = inode->i_sb->s_dev;
 			__entry->fileid = NFS_FILEID(inode);
 			__entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
-			__entry->offset = data->args.offset;
-			__entry->count = data->args.count;
+			__entry->offset = hdr->args.offset;
+			__entry->count = hdr->args.count;
 			__entry->error = error;
 		),
 
@@ -1024,10 +1024,10 @@ DECLARE_EVENT_CLASS(nfs4_write_event,
 #define DEFINE_NFS4_WRITE_EVENT(name) \
 	DEFINE_EVENT(nfs4_write_event, name, \
 			TP_PROTO( \
-				const struct nfs_pgio_data *data, \
+				const struct nfs_pgio_header *hdr, \
 				int error \
 			), \
-			TP_ARGS(data, error))
+			TP_ARGS(hdr, error))
 DEFINE_NFS4_WRITE_EVENT(nfs4_write);
 #ifdef CONFIG_NFS_V4_1
 DEFINE_NFS4_WRITE_EVENT(nfs4_pnfs_write);
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 939ae606cfa4..e13b59d8d9aa 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -7092,7 +7092,7 @@ static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp,
 	if (!status)
 		status = decode_sequence(xdr, &res->seq_res, rqstp);
 	if (!status)
-		status = decode_reclaim_complete(xdr, (void *)NULL);
+		status = decode_reclaim_complete(xdr, NULL);
 	return status;
 }
 
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index 611320753db2..ae05278b3761 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -439,22 +439,21 @@ static void _read_done(struct ore_io_state *ios, void *private)
 	objlayout_read_done(&objios->oir, status, objios->sync);
 }
 
-int objio_read_pagelist(struct nfs_pgio_data *rdata)
+int objio_read_pagelist(struct nfs_pgio_header *hdr)
 {
-	struct nfs_pgio_header *hdr = rdata->header;
 	struct objio_state *objios;
 	int ret;
 
 	ret = objio_alloc_io_state(NFS_I(hdr->inode)->layout, true,
-			hdr->lseg, rdata->args.pages, rdata->args.pgbase,
-			rdata->args.offset, rdata->args.count, rdata,
+			hdr->lseg, hdr->args.pages, hdr->args.pgbase,
+			hdr->args.offset, hdr->args.count, hdr,
 			GFP_KERNEL, &objios);
 	if (unlikely(ret))
 		return ret;
 
 	objios->ios->done = _read_done;
 	dprintk("%s: offset=0x%llx length=0x%x\n", __func__,
-		rdata->args.offset, rdata->args.count);
+		hdr->args.offset, hdr->args.count);
 	ret = ore_read(objios->ios);
 	if (unlikely(ret))
 		objio_free_result(&objios->oir);
@@ -487,11 +486,11 @@ static void _write_done(struct ore_io_state *ios, void *private)
 static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate)
 {
 	struct objio_state *objios = priv;
-	struct nfs_pgio_data *wdata = objios->oir.rpcdata;
-	struct address_space *mapping = wdata->header->inode->i_mapping;
+	struct nfs_pgio_header *hdr = objios->oir.rpcdata;
+	struct address_space *mapping = hdr->inode->i_mapping;
 	pgoff_t index = offset / PAGE_SIZE;
 	struct page *page;
-	loff_t i_size = i_size_read(wdata->header->inode);
+	loff_t i_size = i_size_read(hdr->inode);
 
 	if (offset >= i_size) {
 		*uptodate = true;
@@ -531,15 +530,14 @@ static const struct _ore_r4w_op _r4w_op = {
 	.put_page = &__r4w_put_page,
 };
 
-int objio_write_pagelist(struct nfs_pgio_data *wdata, int how)
+int objio_write_pagelist(struct nfs_pgio_header *hdr, int how)
 {
-	struct nfs_pgio_header *hdr = wdata->header;
 	struct objio_state *objios;
 	int ret;
 
 	ret = objio_alloc_io_state(NFS_I(hdr->inode)->layout, false,
-			hdr->lseg, wdata->args.pages, wdata->args.pgbase,
-			wdata->args.offset, wdata->args.count, wdata, GFP_NOFS,
+			hdr->lseg, hdr->args.pages, hdr->args.pgbase,
+			hdr->args.offset, hdr->args.count, hdr, GFP_NOFS,
 			&objios);
 	if (unlikely(ret))
 		return ret;
@@ -551,7 +549,7 @@ int objio_write_pagelist(struct nfs_pgio_data *wdata, int how)
 		objios->ios->done = _write_done;
 
 	dprintk("%s: offset=0x%llx length=0x%x\n", __func__,
-		wdata->args.offset, wdata->args.count);
+		hdr->args.offset, hdr->args.count);
 	ret = ore_write(objios->ios);
 	if (unlikely(ret)) {
 		objio_free_result(&objios->oir);
diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
index 765d3f54e986..697a16d11fac 100644
--- a/fs/nfs/objlayout/objlayout.c
+++ b/fs/nfs/objlayout/objlayout.c
@@ -229,36 +229,36 @@ objlayout_io_set_result(struct objlayout_io_res *oir, unsigned index,
 static void _rpc_read_complete(struct work_struct *work)
 {
 	struct rpc_task *task;
-	struct nfs_pgio_data *rdata;
+	struct nfs_pgio_header *hdr;
 
 	dprintk("%s enter\n", __func__);
 	task = container_of(work, struct rpc_task, u.tk_work);
-	rdata = container_of(task, struct nfs_pgio_data, task);
+	hdr = container_of(task, struct nfs_pgio_header, task);
 
-	pnfs_ld_read_done(rdata);
+	pnfs_ld_read_done(hdr);
 }
 
 void
 objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
 {
-	struct nfs_pgio_data *rdata = oir->rpcdata;
+	struct nfs_pgio_header *hdr = oir->rpcdata;
 
-	oir->status = rdata->task.tk_status = status;
+	oir->status = hdr->task.tk_status = status;
 	if (status >= 0)
-		rdata->res.count = status;
+		hdr->res.count = status;
 	else
-		rdata->header->pnfs_error = status;
+		hdr->pnfs_error = status;
 	objlayout_iodone(oir);
 	/* must not use oir after this point */
 
 	dprintk("%s: Return status=%zd eof=%d sync=%d\n", __func__,
-		status, rdata->res.eof, sync);
+		status, hdr->res.eof, sync);
 
 	if (sync)
-		pnfs_ld_read_done(rdata);
+		pnfs_ld_read_done(hdr);
 	else {
-		INIT_WORK(&rdata->task.u.tk_work, _rpc_read_complete);
-		schedule_work(&rdata->task.u.tk_work);
+		INIT_WORK(&hdr->task.u.tk_work, _rpc_read_complete);
+		schedule_work(&hdr->task.u.tk_work);
 	}
 }
 
@@ -266,12 +266,11 @@ objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
  * Perform sync or async reads.
  */
 enum pnfs_try_status
-objlayout_read_pagelist(struct nfs_pgio_data *rdata)
+objlayout_read_pagelist(struct nfs_pgio_header *hdr)
 {
-	struct nfs_pgio_header *hdr = rdata->header;
 	struct inode *inode = hdr->inode;
-	loff_t offset = rdata->args.offset;
-	size_t count = rdata->args.count;
+	loff_t offset = hdr->args.offset;
+	size_t count = hdr->args.count;
 	int err;
 	loff_t eof;
 
@@ -279,23 +278,23 @@ objlayout_read_pagelist(struct nfs_pgio_data *rdata)
 	if (unlikely(offset + count > eof)) {
 		if (offset >= eof) {
 			err = 0;
-			rdata->res.count = 0;
-			rdata->res.eof = 1;
+			hdr->res.count = 0;
+			hdr->res.eof = 1;
 			/*FIXME: do we need to call pnfs_ld_read_done() */
 			goto out;
 		}
 		count = eof - offset;
 	}
 
-	rdata->res.eof = (offset + count) >= eof;
-	_fix_verify_io_params(hdr->lseg, &rdata->args.pages,
-			      &rdata->args.pgbase,
-			      rdata->args.offset, rdata->args.count);
+	hdr->res.eof = (offset + count) >= eof;
+	_fix_verify_io_params(hdr->lseg, &hdr->args.pages,
+			      &hdr->args.pgbase,
+			      hdr->args.offset, hdr->args.count);
 
 	dprintk("%s: inode(%lx) offset 0x%llx count 0x%Zx eof=%d\n",
-		__func__, inode->i_ino, offset, count, rdata->res.eof);
+		__func__, inode->i_ino, offset, count, hdr->res.eof);
 
-	err = objio_read_pagelist(rdata);
+	err = objio_read_pagelist(hdr);
  out:
 	if (unlikely(err)) {
 		hdr->pnfs_error = err;
@@ -312,38 +311,38 @@ objlayout_read_pagelist(struct nfs_pgio_data *rdata)
 static void _rpc_write_complete(struct work_struct *work)
 {
 	struct rpc_task *task;
-	struct nfs_pgio_data *wdata;
+	struct nfs_pgio_header *hdr;
 
 	dprintk("%s enter\n", __func__);
 	task = container_of(work, struct rpc_task, u.tk_work);
-	wdata = container_of(task, struct nfs_pgio_data, task);
+	hdr = container_of(task, struct nfs_pgio_header, task);
 
-	pnfs_ld_write_done(wdata);
+	pnfs_ld_write_done(hdr);
 }
 
 void
 objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
 {
-	struct nfs_pgio_data *wdata = oir->rpcdata;
+	struct nfs_pgio_header *hdr = oir->rpcdata;
 
-	oir->status = wdata->task.tk_status = status;
+	oir->status = hdr->task.tk_status = status;
 	if (status >= 0) {
-		wdata->res.count = status;
-		wdata->verf.committed = oir->committed;
+		hdr->res.count = status;
+		hdr->verf.committed = oir->committed;
 	} else {
-		wdata->header->pnfs_error = status;
+		hdr->pnfs_error = status;
 	}
 	objlayout_iodone(oir);
 	/* must not use oir after this point */
 
 	dprintk("%s: Return status %zd committed %d sync=%d\n", __func__,
-		status, wdata->verf.committed, sync);
+		status, hdr->verf.committed, sync);
 
 	if (sync)
-		pnfs_ld_write_done(wdata);
+		pnfs_ld_write_done(hdr);
 	else {
-		INIT_WORK(&wdata->task.u.tk_work, _rpc_write_complete);
-		schedule_work(&wdata->task.u.tk_work);
+		INIT_WORK(&hdr->task.u.tk_work, _rpc_write_complete);
+		schedule_work(&hdr->task.u.tk_work);
 	}
 }
 
@@ -351,17 +350,15 @@ objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
  * Perform sync or async writes.
  */
 enum pnfs_try_status
-objlayout_write_pagelist(struct nfs_pgio_data *wdata,
-			 int how)
+objlayout_write_pagelist(struct nfs_pgio_header *hdr, int how)
 {
-	struct nfs_pgio_header *hdr = wdata->header;
 	int err;
 
-	_fix_verify_io_params(hdr->lseg, &wdata->args.pages,
-			      &wdata->args.pgbase,
-			      wdata->args.offset, wdata->args.count);
+	_fix_verify_io_params(hdr->lseg, &hdr->args.pages,
+			      &hdr->args.pgbase,
+			      hdr->args.offset, hdr->args.count);
 
-	err = objio_write_pagelist(wdata, how);
+	err = objio_write_pagelist(hdr, how);
 	if (unlikely(err)) {
 		hdr->pnfs_error = err;
 		dprintk("%s: Returned Error %d\n", __func__, err);
diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h
index 01e041029a6c..fd13f1d2f136 100644
--- a/fs/nfs/objlayout/objlayout.h
+++ b/fs/nfs/objlayout/objlayout.h
@@ -119,8 +119,8 @@ extern void objio_free_lseg(struct pnfs_layout_segment *lseg);
  */
 extern void objio_free_result(struct objlayout_io_res *oir);
 
-extern int objio_read_pagelist(struct nfs_pgio_data *rdata);
-extern int objio_write_pagelist(struct nfs_pgio_data *wdata, int how);
+extern int objio_read_pagelist(struct nfs_pgio_header *rdata);
+extern int objio_write_pagelist(struct nfs_pgio_header *wdata, int how);
 
 /*
  * callback API
@@ -168,10 +168,10 @@ extern struct pnfs_layout_segment *objlayout_alloc_lseg(
 extern void objlayout_free_lseg(struct pnfs_layout_segment *);
 
 extern enum pnfs_try_status objlayout_read_pagelist(
-	struct nfs_pgio_data *);
+	struct nfs_pgio_header *);
 
 extern enum pnfs_try_status objlayout_write_pagelist(
-	struct nfs_pgio_data *,
+	struct nfs_pgio_header *,
 	int how);
 
 extern void objlayout_encode_layoutcommit(
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 0be5050638f7..ba491926df5f 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -141,16 +141,24 @@ nfs_iocounter_wait(struct nfs_io_counter *c)
  * @req - request in group that is to be locked
  *
  * this lock must be held if modifying the page group list
+ *
+ * returns result from wait_on_bit_lock: 0 on success, < 0 on error
  */
-void
-nfs_page_group_lock(struct nfs_page *req)
+int
+nfs_page_group_lock(struct nfs_page *req, bool wait)
 {
 	struct nfs_page *head = req->wb_head;
+	int ret;
 
 	WARN_ON_ONCE(head != head->wb_head);
 
-	wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK,
+	do {
+		ret = wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK,
 			TASK_UNINTERRUPTIBLE);
+	} while (wait && ret != 0);
+
+	WARN_ON_ONCE(ret > 0);
+	return ret;
 }
 
 /*
@@ -211,7 +219,7 @@ bool nfs_page_group_sync_on_bit(struct nfs_page *req, unsigned int bit)
 {
 	bool ret;
 
-	nfs_page_group_lock(req);
+	nfs_page_group_lock(req, true);
 	ret = nfs_page_group_sync_on_bit_locked(req, bit);
 	nfs_page_group_unlock(req);
 
@@ -454,123 +462,72 @@ size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc,
 }
 EXPORT_SYMBOL_GPL(nfs_generic_pg_test);
 
-static inline struct nfs_rw_header *NFS_RW_HEADER(struct nfs_pgio_header *hdr)
-{
-	return container_of(hdr, struct nfs_rw_header, header);
-}
-
-/**
- * nfs_rw_header_alloc - Allocate a header for a read or write
- * @ops: Read or write function vector
- */
-struct nfs_rw_header *nfs_rw_header_alloc(const struct nfs_rw_ops *ops)
+struct nfs_pgio_header *nfs_pgio_header_alloc(const struct nfs_rw_ops *ops)
 {
-	struct nfs_rw_header *header = ops->rw_alloc_header();
-
-	if (header) {
-		struct nfs_pgio_header *hdr = &header->header;
+	struct nfs_pgio_header *hdr = ops->rw_alloc_header();
 
+	if (hdr) {
 		INIT_LIST_HEAD(&hdr->pages);
 		spin_lock_init(&hdr->lock);
-		atomic_set(&hdr->refcnt, 0);
 		hdr->rw_ops = ops;
 	}
-	return header;
+	return hdr;
 }
-EXPORT_SYMBOL_GPL(nfs_rw_header_alloc);
+EXPORT_SYMBOL_GPL(nfs_pgio_header_alloc);
 
 /*
- * nfs_rw_header_free - Free a read or write header
+ * nfs_pgio_header_free - Free a read or write header
  * @hdr: The header to free
  */
-void nfs_rw_header_free(struct nfs_pgio_header *hdr)
+void nfs_pgio_header_free(struct nfs_pgio_header *hdr)
 {
-	hdr->rw_ops->rw_free_header(NFS_RW_HEADER(hdr));
+	hdr->rw_ops->rw_free_header(hdr);
 }
-EXPORT_SYMBOL_GPL(nfs_rw_header_free);
+EXPORT_SYMBOL_GPL(nfs_pgio_header_free);
 
 /**
- * nfs_pgio_data_alloc - Allocate pageio data
- * @hdr: The header making a request
- * @pagecount: Number of pages to create
- */
-static struct nfs_pgio_data *nfs_pgio_data_alloc(struct nfs_pgio_header *hdr,
-						 unsigned int pagecount)
-{
-	struct nfs_pgio_data *data, *prealloc;
-
-	prealloc = &NFS_RW_HEADER(hdr)->rpc_data;
-	if (prealloc->header == NULL)
-		data = prealloc;
-	else
-		data = kzalloc(sizeof(*data), GFP_KERNEL);
-	if (!data)
-		goto out;
-
-	if (nfs_pgarray_set(&data->pages, pagecount)) {
-		data->header = hdr;
-		atomic_inc(&hdr->refcnt);
-	} else {
-		if (data != prealloc)
-			kfree(data);
-		data = NULL;
-	}
-out:
-	return data;
-}
-
-/**
- * nfs_pgio_data_release - Properly free pageio data
- * @data: The data to release
+ * nfs_pgio_data_destroy - make @hdr suitable for reuse
+ *
+ * Frees memory and releases refs from nfs_generic_pgio, so that it may
+ * be called again.
+ *
+ * @hdr: A header that has had nfs_generic_pgio called
  */
-void nfs_pgio_data_release(struct nfs_pgio_data *data)
+void nfs_pgio_data_destroy(struct nfs_pgio_header *hdr)
 {
-	struct nfs_pgio_header *hdr = data->header;
-	struct nfs_rw_header *pageio_header = NFS_RW_HEADER(hdr);
-
-	put_nfs_open_context(data->args.context);
-	if (data->pages.pagevec != data->pages.page_array)
-		kfree(data->pages.pagevec);
-	if (data == &pageio_header->rpc_data) {
-		data->header = NULL;
-		data = NULL;
-	}
-	if (atomic_dec_and_test(&hdr->refcnt))
-		hdr->completion_ops->completion(hdr);
-	/* Note: we only free the rpc_task after callbacks are done.
-	 * See the comment in rpc_free_task() for why
-	 */
-	kfree(data);
+	put_nfs_open_context(hdr->args.context);
+	if (hdr->page_array.pagevec != hdr->page_array.page_array)
+		kfree(hdr->page_array.pagevec);
 }
-EXPORT_SYMBOL_GPL(nfs_pgio_data_release);
+EXPORT_SYMBOL_GPL(nfs_pgio_data_destroy);
 
 /**
  * nfs_pgio_rpcsetup - Set up arguments for a pageio call
- * @data: The pageio data
+ * @hdr: The pageio hdr
  * @count: Number of bytes to read
  * @offset: Initial offset
  * @how: How to commit data (writes only)
  * @cinfo: Commit information for the call (writes only)
  */
-static void nfs_pgio_rpcsetup(struct nfs_pgio_data *data,
+static void nfs_pgio_rpcsetup(struct nfs_pgio_header *hdr,
 			      unsigned int count, unsigned int offset,
 			      int how, struct nfs_commit_info *cinfo)
 {
-	struct nfs_page *req = data->header->req;
+	struct nfs_page *req = hdr->req;
 
 	/* Set up the RPC argument and reply structs
-	 * NB: take care not to mess about with data->commit et al. */
+	 * NB: take care not to mess about with hdr->commit et al. */
 
-	data->args.fh     = NFS_FH(data->header->inode);
-	data->args.offset = req_offset(req) + offset;
+	hdr->args.fh     = NFS_FH(hdr->inode);
+	hdr->args.offset = req_offset(req) + offset;
 	/* pnfs_set_layoutcommit needs this */
-	data->mds_offset = data->args.offset;
-	data->args.pgbase = req->wb_pgbase + offset;
-	data->args.pages  = data->pages.pagevec;
-	data->args.count  = count;
-	data->args.context = get_nfs_open_context(req->wb_context);
-	data->args.lock_context = req->wb_lock_context;
-	data->args.stable  = NFS_UNSTABLE;
+	hdr->mds_offset = hdr->args.offset;
+	hdr->args.pgbase = req->wb_pgbase + offset;
+	hdr->args.pages  = hdr->page_array.pagevec;
+	hdr->args.count  = count;
+	hdr->args.context = get_nfs_open_context(req->wb_context);
+	hdr->args.lock_context = req->wb_lock_context;
+	hdr->args.stable  = NFS_UNSTABLE;
 	switch (how & (FLUSH_STABLE | FLUSH_COND_STABLE)) {
 	case 0:
 		break;
@@ -578,59 +535,59 @@ static void nfs_pgio_rpcsetup(struct nfs_pgio_data *data,
 		if (nfs_reqs_to_commit(cinfo))
 			break;
 	default:
-		data->args.stable = NFS_FILE_SYNC;
+		hdr->args.stable = NFS_FILE_SYNC;
 	}
 
-	data->res.fattr   = &data->fattr;
-	data->res.count   = count;
-	data->res.eof     = 0;
-	data->res.verf    = &data->verf;
-	nfs_fattr_init(&data->fattr);
+	hdr->res.fattr   = &hdr->fattr;
+	hdr->res.count   = count;
+	hdr->res.eof     = 0;
+	hdr->res.verf    = &hdr->verf;
+	nfs_fattr_init(&hdr->fattr);
 }
 
 /**
- * nfs_pgio_prepare - Prepare pageio data to go over the wire
+ * nfs_pgio_prepare - Prepare pageio hdr to go over the wire
  * @task: The current task
- * @calldata: pageio data to prepare
+ * @calldata: pageio header to prepare
  */
 static void nfs_pgio_prepare(struct rpc_task *task, void *calldata)
 {
-	struct nfs_pgio_data *data = calldata;
+	struct nfs_pgio_header *hdr = calldata;
 	int err;
-	err = NFS_PROTO(data->header->inode)->pgio_rpc_prepare(task, data);
+	err = NFS_PROTO(hdr->inode)->pgio_rpc_prepare(task, hdr);
 	if (err)
 		rpc_exit(task, err);
 }
 
-int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_data *data,
+int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr,
 		      const struct rpc_call_ops *call_ops, int how, int flags)
 {
 	struct rpc_task *task;
 	struct rpc_message msg = {
-		.rpc_argp = &data->args,
-		.rpc_resp = &data->res,
-		.rpc_cred = data->header->cred,
+		.rpc_argp = &hdr->args,
+		.rpc_resp = &hdr->res,
+		.rpc_cred = hdr->cred,
 	};
 	struct rpc_task_setup task_setup_data = {
 		.rpc_client = clnt,
-		.task = &data->task,
+		.task = &hdr->task,
 		.rpc_message = &msg,
 		.callback_ops = call_ops,
-		.callback_data = data,
+		.callback_data = hdr,
 		.workqueue = nfsiod_workqueue,
 		.flags = RPC_TASK_ASYNC | flags,
 	};
 	int ret = 0;
 
-	data->header->rw_ops->rw_initiate(data, &msg, &task_setup_data, how);
+	hdr->rw_ops->rw_initiate(hdr, &msg, &task_setup_data, how);
 
 	dprintk("NFS: %5u initiated pgio call "
 		"(req %s/%llu, %u bytes @ offset %llu)\n",
-		data->task.tk_pid,
-		data->header->inode->i_sb->s_id,
-		(unsigned long long)NFS_FILEID(data->header->inode),
-		data->args.count,
-		(unsigned long long)data->args.offset);
+		hdr->task.tk_pid,
+		hdr->inode->i_sb->s_id,
+		(unsigned long long)NFS_FILEID(hdr->inode),
+		hdr->args.count,
+		(unsigned long long)hdr->args.offset);
 
 	task = rpc_run_task(&task_setup_data);
 	if (IS_ERR(task)) {
@@ -657,22 +614,23 @@ static int nfs_pgio_error(struct nfs_pageio_descriptor *desc,
 			  struct nfs_pgio_header *hdr)
 {
 	set_bit(NFS_IOHDR_REDO, &hdr->flags);
-	nfs_pgio_data_release(hdr->data);
-	hdr->data = NULL;
+	nfs_pgio_data_destroy(hdr);
+	hdr->completion_ops->completion(hdr);
 	desc->pg_completion_ops->error_cleanup(&desc->pg_list);
 	return -ENOMEM;
 }
 
 /**
  * nfs_pgio_release - Release pageio data
- * @calldata: The pageio data to release
+ * @calldata: The pageio header to release
  */
 static void nfs_pgio_release(void *calldata)
 {
-	struct nfs_pgio_data *data = calldata;
-	if (data->header->rw_ops->rw_release)
-		data->header->rw_ops->rw_release(data);
-	nfs_pgio_data_release(data);
+	struct nfs_pgio_header *hdr = calldata;
+	if (hdr->rw_ops->rw_release)
+		hdr->rw_ops->rw_release(hdr);
+	nfs_pgio_data_destroy(hdr);
+	hdr->completion_ops->completion(hdr);
 }
 
 /**
@@ -713,22 +671,22 @@ EXPORT_SYMBOL_GPL(nfs_pageio_init);
 /**
  * nfs_pgio_result - Basic pageio error handling
  * @task: The task that ran
- * @calldata: Pageio data to check
+ * @calldata: Pageio header to check
  */
 static void nfs_pgio_result(struct rpc_task *task, void *calldata)
 {
-	struct nfs_pgio_data *data = calldata;
-	struct inode *inode = data->header->inode;
+	struct nfs_pgio_header *hdr = calldata;
+	struct inode *inode = hdr->inode;
 
 	dprintk("NFS: %s: %5u, (status %d)\n", __func__,
 		task->tk_pid, task->tk_status);
 
-	if (data->header->rw_ops->rw_done(task, data, inode) != 0)
+	if (hdr->rw_ops->rw_done(task, hdr, inode) != 0)
 		return;
 	if (task->tk_status < 0)
-		nfs_set_pgio_error(data->header, task->tk_status, data->args.offset);
+		nfs_set_pgio_error(hdr, task->tk_status, hdr->args.offset);
 	else
-		data->header->rw_ops->rw_result(task, data);
+		hdr->rw_ops->rw_result(task, hdr);
 }
 
 /*
@@ -744,17 +702,16 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc,
 {
 	struct nfs_page		*req;
 	struct page		**pages;
-	struct nfs_pgio_data	*data;
 	struct list_head *head = &desc->pg_list;
 	struct nfs_commit_info cinfo;
+	unsigned int pagecount;
 
-	data = nfs_pgio_data_alloc(hdr, nfs_page_array_len(desc->pg_base,
-							   desc->pg_count));
-	if (!data)
+	pagecount = nfs_page_array_len(desc->pg_base, desc->pg_count);
+	if (!nfs_pgarray_set(&hdr->page_array, pagecount))
 		return nfs_pgio_error(desc, hdr);
 
 	nfs_init_cinfo(&cinfo, desc->pg_inode, desc->pg_dreq);
-	pages = data->pages.pagevec;
+	pages = hdr->page_array.pagevec;
 	while (!list_empty(head)) {
 		req = nfs_list_entry(head->next);
 		nfs_list_remove_request(req);
@@ -767,8 +724,7 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc,
 		desc->pg_ioflags &= ~FLUSH_COND_STABLE;
 
 	/* Set up the argument struct */
-	nfs_pgio_rpcsetup(data, desc->pg_count, 0, desc->pg_ioflags, &cinfo);
-	hdr->data = data;
+	nfs_pgio_rpcsetup(hdr, desc->pg_count, 0, desc->pg_ioflags, &cinfo);
 	desc->pg_rpc_callops = &nfs_pgio_common_ops;
 	return 0;
 }
@@ -776,25 +732,20 @@ EXPORT_SYMBOL_GPL(nfs_generic_pgio);
 
 static int nfs_generic_pg_pgios(struct nfs_pageio_descriptor *desc)
 {
-	struct nfs_rw_header *rw_hdr;
 	struct nfs_pgio_header *hdr;
 	int ret;
 
-	rw_hdr = nfs_rw_header_alloc(desc->pg_rw_ops);
-	if (!rw_hdr) {
+	hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
+	if (!hdr) {
 		desc->pg_completion_ops->error_cleanup(&desc->pg_list);
 		return -ENOMEM;
 	}
-	hdr = &rw_hdr->header;
-	nfs_pgheader_init(desc, hdr, nfs_rw_header_free);
-	atomic_inc(&hdr->refcnt);
+	nfs_pgheader_init(desc, hdr, nfs_pgio_header_free);
 	ret = nfs_generic_pgio(desc, hdr);
 	if (ret == 0)
 		ret = nfs_initiate_pgio(NFS_CLIENT(hdr->inode),
-					hdr->data, desc->pg_rpc_callops,
+					hdr, desc->pg_rpc_callops,
 					desc->pg_ioflags, 0);
-	if (atomic_dec_and_test(&hdr->refcnt))
-		hdr->completion_ops->completion(hdr);
 	return ret;
 }
 
@@ -907,8 +858,13 @@ static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
 	struct nfs_page *subreq;
 	unsigned int bytes_left = 0;
 	unsigned int offset, pgbase;
+	int ret;
 
-	nfs_page_group_lock(req);
+	ret = nfs_page_group_lock(req, false);
+	if (ret < 0) {
+		desc->pg_error = ret;
+		return 0;
+	}
 
 	subreq = req;
 	bytes_left = subreq->wb_bytes;
@@ -930,7 +886,11 @@ static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
 			if (desc->pg_recoalesce)
 				return 0;
 			/* retry add_request for this subreq */
-			nfs_page_group_lock(req);
+			ret = nfs_page_group_lock(req, false);
+			if (ret < 0) {
+				desc->pg_error = ret;
+				return 0;
+			}
 			continue;
 		}
 
@@ -1005,7 +965,38 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
 	} while (ret);
 	return ret;
 }
-EXPORT_SYMBOL_GPL(nfs_pageio_add_request);
+
+/*
+ * nfs_pageio_resend - Transfer requests to new descriptor and resend
+ * @hdr - the pgio header to move request from
+ * @desc - the pageio descriptor to add requests to
+ *
+ * Try to move each request (nfs_page) from @hdr to @desc then attempt
+ * to send them.
+ *
+ * Returns 0 on success and < 0 on error.
+ */
+int nfs_pageio_resend(struct nfs_pageio_descriptor *desc,
+		      struct nfs_pgio_header *hdr)
+{
+	LIST_HEAD(failed);
+
+	desc->pg_dreq = hdr->dreq;
+	while (!list_empty(&hdr->pages)) {
+		struct nfs_page *req = nfs_list_entry(hdr->pages.next);
+
+		nfs_list_remove_request(req);
+		if (!nfs_pageio_add_request(desc, req))
+			nfs_list_add_request(req, &failed);
+	}
+	nfs_pageio_complete(desc);
+	if (!list_empty(&failed)) {
+		list_move(&failed, &hdr->pages);
+		return -EIO;
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nfs_pageio_resend);
 
 /**
  * nfs_pageio_complete - Complete I/O on an nfs_pageio_descriptor
@@ -1021,7 +1012,6 @@ void nfs_pageio_complete(struct nfs_pageio_descriptor *desc)
 			break;
 	}
 }
-EXPORT_SYMBOL_GPL(nfs_pageio_complete);
 
 /**
  * nfs_pageio_cond_complete - Conditional I/O completion
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index a8914b335617..a3851debf8a2 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -361,6 +361,23 @@ pnfs_put_lseg(struct pnfs_layout_segment *lseg)
 }
 EXPORT_SYMBOL_GPL(pnfs_put_lseg);
 
+static void pnfs_put_lseg_async_work(struct work_struct *work)
+{
+	struct pnfs_layout_segment *lseg;
+
+	lseg = container_of(work, struct pnfs_layout_segment, pls_work);
+
+	pnfs_put_lseg(lseg);
+}
+
+void
+pnfs_put_lseg_async(struct pnfs_layout_segment *lseg)
+{
+	INIT_WORK(&lseg->pls_work, pnfs_put_lseg_async_work);
+	schedule_work(&lseg->pls_work);
+}
+EXPORT_SYMBOL_GPL(pnfs_put_lseg_async);
+
 static u64
 end_offset(u64 start, u64 len)
 {
@@ -1470,41 +1487,19 @@ pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
 }
 EXPORT_SYMBOL_GPL(pnfs_generic_pg_test);
 
-int pnfs_write_done_resend_to_mds(struct inode *inode,
-				struct list_head *head,
-				const struct nfs_pgio_completion_ops *compl_ops,
-				struct nfs_direct_req *dreq)
+int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *hdr)
 {
 	struct nfs_pageio_descriptor pgio;
-	LIST_HEAD(failed);
 
 	/* Resend all requests through the MDS */
-	nfs_pageio_init_write(&pgio, inode, FLUSH_STABLE, true, compl_ops);
-	pgio.pg_dreq = dreq;
-	while (!list_empty(head)) {
-		struct nfs_page *req = nfs_list_entry(head->next);
-
-		nfs_list_remove_request(req);
-		if (!nfs_pageio_add_request(&pgio, req))
-			nfs_list_add_request(req, &failed);
-	}
-	nfs_pageio_complete(&pgio);
-
-	if (!list_empty(&failed)) {
-		/* For some reason our attempt to resend pages. Mark the
-		 * overall send request as having failed, and let
-		 * nfs_writeback_release_full deal with the error.
-		 */
-		list_move(&failed, head);
-		return -EIO;
-	}
-	return 0;
+	nfs_pageio_init_write(&pgio, hdr->inode, FLUSH_STABLE, true,
+			      hdr->completion_ops);
+	return nfs_pageio_resend(&pgio, hdr);
 }
 EXPORT_SYMBOL_GPL(pnfs_write_done_resend_to_mds);
 
-static void pnfs_ld_handle_write_error(struct nfs_pgio_data *data)
+static void pnfs_ld_handle_write_error(struct nfs_pgio_header *hdr)
 {
-	struct nfs_pgio_header *hdr = data->header;
 
 	dprintk("pnfs write error = %d\n", hdr->pnfs_error);
 	if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags &
@@ -1512,50 +1507,42 @@ static void pnfs_ld_handle_write_error(struct nfs_pgio_data *data)
 		pnfs_return_layout(hdr->inode);
 	}
 	if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags))
-		data->task.tk_status = pnfs_write_done_resend_to_mds(hdr->inode,
-							&hdr->pages,
-							hdr->completion_ops,
-							hdr->dreq);
+		hdr->task.tk_status = pnfs_write_done_resend_to_mds(hdr);
 }
 
 /*
  * Called by non rpc-based layout drivers
  */
-void pnfs_ld_write_done(struct nfs_pgio_data *data)
+void pnfs_ld_write_done(struct nfs_pgio_header *hdr)
 {
-	struct nfs_pgio_header *hdr = data->header;
-
-	trace_nfs4_pnfs_write(data, hdr->pnfs_error);
+	trace_nfs4_pnfs_write(hdr, hdr->pnfs_error);
 	if (!hdr->pnfs_error) {
-		pnfs_set_layoutcommit(data);
-		hdr->mds_ops->rpc_call_done(&data->task, data);
+		pnfs_set_layoutcommit(hdr);
+		hdr->mds_ops->rpc_call_done(&hdr->task, hdr);
 	} else
-		pnfs_ld_handle_write_error(data);
-	hdr->mds_ops->rpc_release(data);
+		pnfs_ld_handle_write_error(hdr);
+	hdr->mds_ops->rpc_release(hdr);
 }
 EXPORT_SYMBOL_GPL(pnfs_ld_write_done);
 
 static void
 pnfs_write_through_mds(struct nfs_pageio_descriptor *desc,
-		struct nfs_pgio_data *data)
+		struct nfs_pgio_header *hdr)
 {
-	struct nfs_pgio_header *hdr = data->header;
-
 	if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
 		list_splice_tail_init(&hdr->pages, &desc->pg_list);
 		nfs_pageio_reset_write_mds(desc);
 		desc->pg_recoalesce = 1;
 	}
-	nfs_pgio_data_release(data);
+	nfs_pgio_data_destroy(hdr);
 }
 
 static enum pnfs_try_status
-pnfs_try_to_write_data(struct nfs_pgio_data *wdata,
+pnfs_try_to_write_data(struct nfs_pgio_header *hdr,
 			const struct rpc_call_ops *call_ops,
 			struct pnfs_layout_segment *lseg,
 			int how)
 {
-	struct nfs_pgio_header *hdr = wdata->header;
 	struct inode *inode = hdr->inode;
 	enum pnfs_try_status trypnfs;
 	struct nfs_server *nfss = NFS_SERVER(inode);
@@ -1563,8 +1550,8 @@ pnfs_try_to_write_data(struct nfs_pgio_data *wdata,
 	hdr->mds_ops = call_ops;
 
 	dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__,
-		inode->i_ino, wdata->args.count, wdata->args.offset, how);
-	trypnfs = nfss->pnfs_curr_ld->write_pagelist(wdata, how);
+		inode->i_ino, hdr->args.count, hdr->args.offset, how);
+	trypnfs = nfss->pnfs_curr_ld->write_pagelist(hdr, how);
 	if (trypnfs != PNFS_NOT_ATTEMPTED)
 		nfs_inc_stats(inode, NFSIOS_PNFS_WRITE);
 	dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
@@ -1575,139 +1562,105 @@ static void
 pnfs_do_write(struct nfs_pageio_descriptor *desc,
 	      struct nfs_pgio_header *hdr, int how)
 {
-	struct nfs_pgio_data *data = hdr->data;
 	const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
 	struct pnfs_layout_segment *lseg = desc->pg_lseg;
 	enum pnfs_try_status trypnfs;
 
 	desc->pg_lseg = NULL;
-	trypnfs = pnfs_try_to_write_data(data, call_ops, lseg, how);
+	trypnfs = pnfs_try_to_write_data(hdr, call_ops, lseg, how);
 	if (trypnfs == PNFS_NOT_ATTEMPTED)
-		pnfs_write_through_mds(desc, data);
+		pnfs_write_through_mds(desc, hdr);
 	pnfs_put_lseg(lseg);
 }
 
 static void pnfs_writehdr_free(struct nfs_pgio_header *hdr)
 {
 	pnfs_put_lseg(hdr->lseg);
-	nfs_rw_header_free(hdr);
+	nfs_pgio_header_free(hdr);
 }
 EXPORT_SYMBOL_GPL(pnfs_writehdr_free);
 
 int
 pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
 {
-	struct nfs_rw_header *whdr;
 	struct nfs_pgio_header *hdr;
 	int ret;
 
-	whdr = nfs_rw_header_alloc(desc->pg_rw_ops);
-	if (!whdr) {
+	hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
+	if (!hdr) {
 		desc->pg_completion_ops->error_cleanup(&desc->pg_list);
 		pnfs_put_lseg(desc->pg_lseg);
 		desc->pg_lseg = NULL;
 		return -ENOMEM;
 	}
-	hdr = &whdr->header;
 	nfs_pgheader_init(desc, hdr, pnfs_writehdr_free);
 	hdr->lseg = pnfs_get_lseg(desc->pg_lseg);
-	atomic_inc(&hdr->refcnt);
 	ret = nfs_generic_pgio(desc, hdr);
 	if (ret != 0) {
 		pnfs_put_lseg(desc->pg_lseg);
 		desc->pg_lseg = NULL;
 	} else
 		pnfs_do_write(desc, hdr, desc->pg_ioflags);
-	if (atomic_dec_and_test(&hdr->refcnt))
-		hdr->completion_ops->completion(hdr);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages);
 
-int pnfs_read_done_resend_to_mds(struct inode *inode,
-				struct list_head *head,
-				const struct nfs_pgio_completion_ops *compl_ops,
-				struct nfs_direct_req *dreq)
+int pnfs_read_done_resend_to_mds(struct nfs_pgio_header *hdr)
 {
 	struct nfs_pageio_descriptor pgio;
-	LIST_HEAD(failed);
 
 	/* Resend all requests through the MDS */
-	nfs_pageio_init_read(&pgio, inode, true, compl_ops);
-	pgio.pg_dreq = dreq;
-	while (!list_empty(head)) {
-		struct nfs_page *req = nfs_list_entry(head->next);
-
-		nfs_list_remove_request(req);
-		if (!nfs_pageio_add_request(&pgio, req))
-			nfs_list_add_request(req, &failed);
-	}
-	nfs_pageio_complete(&pgio);
-
-	if (!list_empty(&failed)) {
-		list_move(&failed, head);
-		return -EIO;
-	}
-	return 0;
+	nfs_pageio_init_read(&pgio, hdr->inode, true, hdr->completion_ops);
+	return nfs_pageio_resend(&pgio, hdr);
 }
 EXPORT_SYMBOL_GPL(pnfs_read_done_resend_to_mds);
 
-static void pnfs_ld_handle_read_error(struct nfs_pgio_data *data)
+static void pnfs_ld_handle_read_error(struct nfs_pgio_header *hdr)
 {
-	struct nfs_pgio_header *hdr = data->header;
-
 	dprintk("pnfs read error = %d\n", hdr->pnfs_error);
 	if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags &
 	    PNFS_LAYOUTRET_ON_ERROR) {
 		pnfs_return_layout(hdr->inode);
 	}
 	if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags))
-		data->task.tk_status = pnfs_read_done_resend_to_mds(hdr->inode,
-							&hdr->pages,
-							hdr->completion_ops,
-							hdr->dreq);
+		hdr->task.tk_status = pnfs_read_done_resend_to_mds(hdr);
 }
 
 /*
  * Called by non rpc-based layout drivers
  */
-void pnfs_ld_read_done(struct nfs_pgio_data *data)
+void pnfs_ld_read_done(struct nfs_pgio_header *hdr)
 {
-	struct nfs_pgio_header *hdr = data->header;
-
-	trace_nfs4_pnfs_read(data, hdr->pnfs_error);
+	trace_nfs4_pnfs_read(hdr, hdr->pnfs_error);
 	if (likely(!hdr->pnfs_error)) {
-		__nfs4_read_done_cb(data);
-		hdr->mds_ops->rpc_call_done(&data->task, data);
+		__nfs4_read_done_cb(hdr);
+		hdr->mds_ops->rpc_call_done(&hdr->task, hdr);
 	} else
-		pnfs_ld_handle_read_error(data);
-	hdr->mds_ops->rpc_release(data);
+		pnfs_ld_handle_read_error(hdr);
+	hdr->mds_ops->rpc_release(hdr);
 }
 EXPORT_SYMBOL_GPL(pnfs_ld_read_done);
 
 static void
 pnfs_read_through_mds(struct nfs_pageio_descriptor *desc,
-		struct nfs_pgio_data *data)
+		struct nfs_pgio_header *hdr)
 {
-	struct nfs_pgio_header *hdr = data->header;
-
 	if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
 		list_splice_tail_init(&hdr->pages, &desc->pg_list);
 		nfs_pageio_reset_read_mds(desc);
 		desc->pg_recoalesce = 1;
 	}
-	nfs_pgio_data_release(data);
+	nfs_pgio_data_destroy(hdr);
 }
 
 /*
  * Call the appropriate parallel I/O subsystem read function.
  */
 static enum pnfs_try_status
-pnfs_try_to_read_data(struct nfs_pgio_data *rdata,
+pnfs_try_to_read_data(struct nfs_pgio_header *hdr,
 		       const struct rpc_call_ops *call_ops,
 		       struct pnfs_layout_segment *lseg)
 {
-	struct nfs_pgio_header *hdr = rdata->header;
 	struct inode *inode = hdr->inode;
 	struct nfs_server *nfss = NFS_SERVER(inode);
 	enum pnfs_try_status trypnfs;
@@ -1715,9 +1668,9 @@ pnfs_try_to_read_data(struct nfs_pgio_data *rdata,
 	hdr->mds_ops = call_ops;
 
 	dprintk("%s: Reading ino:%lu %u@%llu\n",
-		__func__, inode->i_ino, rdata->args.count, rdata->args.offset);
+		__func__, inode->i_ino, hdr->args.count, hdr->args.offset);
 
-	trypnfs = nfss->pnfs_curr_ld->read_pagelist(rdata);
+	trypnfs = nfss->pnfs_curr_ld->read_pagelist(hdr);
 	if (trypnfs != PNFS_NOT_ATTEMPTED)
 		nfs_inc_stats(inode, NFSIOS_PNFS_READ);
 	dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
@@ -1727,52 +1680,46 @@ pnfs_try_to_read_data(struct nfs_pgio_data *rdata,
 static void
 pnfs_do_read(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr)
 {
-	struct nfs_pgio_data *data = hdr->data;
 	const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
 	struct pnfs_layout_segment *lseg = desc->pg_lseg;
 	enum pnfs_try_status trypnfs;
 
 	desc->pg_lseg = NULL;
-	trypnfs = pnfs_try_to_read_data(data, call_ops, lseg);
+	trypnfs = pnfs_try_to_read_data(hdr, call_ops, lseg);
 	if (trypnfs == PNFS_NOT_ATTEMPTED)
-		pnfs_read_through_mds(desc, data);
+		pnfs_read_through_mds(desc, hdr);
 	pnfs_put_lseg(lseg);
 }
 
 static void pnfs_readhdr_free(struct nfs_pgio_header *hdr)
 {
 	pnfs_put_lseg(hdr->lseg);
-	nfs_rw_header_free(hdr);
+	nfs_pgio_header_free(hdr);
 }
 EXPORT_SYMBOL_GPL(pnfs_readhdr_free);
 
 int
 pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
 {
-	struct nfs_rw_header *rhdr;
 	struct nfs_pgio_header *hdr;
 	int ret;
 
-	rhdr = nfs_rw_header_alloc(desc->pg_rw_ops);
-	if (!rhdr) {
+	hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
+	if (!hdr) {
 		desc->pg_completion_ops->error_cleanup(&desc->pg_list);
 		ret = -ENOMEM;
 		pnfs_put_lseg(desc->pg_lseg);
 		desc->pg_lseg = NULL;
 		return ret;
 	}
-	hdr = &rhdr->header;
 	nfs_pgheader_init(desc, hdr, pnfs_readhdr_free);
 	hdr->lseg = pnfs_get_lseg(desc->pg_lseg);
-	atomic_inc(&hdr->refcnt);
 	ret = nfs_generic_pgio(desc, hdr);
 	if (ret != 0) {
 		pnfs_put_lseg(desc->pg_lseg);
 		desc->pg_lseg = NULL;
 	} else
 		pnfs_do_read(desc, hdr);
-	if (atomic_dec_and_test(&hdr->refcnt))
-		hdr->completion_ops->completion(hdr);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(pnfs_generic_pg_readpages);
@@ -1820,12 +1767,11 @@ void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg)
 EXPORT_SYMBOL_GPL(pnfs_set_lo_fail);
 
 void
-pnfs_set_layoutcommit(struct nfs_pgio_data *wdata)
+pnfs_set_layoutcommit(struct nfs_pgio_header *hdr)
 {
-	struct nfs_pgio_header *hdr = wdata->header;
 	struct inode *inode = hdr->inode;
 	struct nfs_inode *nfsi = NFS_I(inode);
-	loff_t end_pos = wdata->mds_offset + wdata->res.count;
+	loff_t end_pos = hdr->mds_offset + hdr->res.count;
 	bool mark_as_dirty = false;
 
 	spin_lock(&inode->i_lock);
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 4fb309a2b4c4..aca3dff5dae6 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -32,6 +32,7 @@
 
 #include <linux/nfs_fs.h>
 #include <linux/nfs_page.h>
+#include <linux/workqueue.h>
 
 enum {
 	NFS_LSEG_VALID = 0,	/* cleared when lseg is recalled/returned */
@@ -46,6 +47,7 @@ struct pnfs_layout_segment {
 	atomic_t pls_refcount;
 	unsigned long pls_flags;
 	struct pnfs_layout_hdr *pls_layout;
+	struct work_struct pls_work;
 };
 
 enum pnfs_try_status {
@@ -104,6 +106,8 @@ struct pnfs_layoutdriver_type {
 				  int max);
 	void (*recover_commit_reqs) (struct list_head *list,
 				     struct nfs_commit_info *cinfo);
+	struct nfs_page * (*search_commit_reqs)(struct nfs_commit_info *cinfo,
+						struct page *page);
 	int (*commit_pagelist)(struct inode *inode,
 			       struct list_head *mds_pages,
 			       int how,
@@ -113,8 +117,8 @@ struct pnfs_layoutdriver_type {
 	 * Return PNFS_ATTEMPTED to indicate the layout code has attempted
 	 * I/O, else return PNFS_NOT_ATTEMPTED to fall back to normal NFS
 	 */
-	enum pnfs_try_status (*read_pagelist) (struct nfs_pgio_data *nfs_data);
-	enum pnfs_try_status (*write_pagelist) (struct nfs_pgio_data *nfs_data, int how);
+	enum pnfs_try_status (*read_pagelist)(struct nfs_pgio_header *);
+	enum pnfs_try_status (*write_pagelist)(struct nfs_pgio_header *, int);
 
 	void (*free_deviceid_node) (struct nfs4_deviceid_node *);
 
@@ -179,6 +183,7 @@ extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp);
 /* pnfs.c */
 void pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo);
 void pnfs_put_lseg(struct pnfs_layout_segment *lseg);
+void pnfs_put_lseg_async(struct pnfs_layout_segment *lseg);
 
 void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, u32);
 void unset_pnfs_layoutdriver(struct nfs_server *);
@@ -213,13 +218,13 @@ bool pnfs_roc(struct inode *ino);
 void pnfs_roc_release(struct inode *ino);
 void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
 bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task);
-void pnfs_set_layoutcommit(struct nfs_pgio_data *wdata);
+void pnfs_set_layoutcommit(struct nfs_pgio_header *);
 void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data);
 int pnfs_layoutcommit_inode(struct inode *inode, bool sync);
 int _pnfs_return_layout(struct inode *);
 int pnfs_commit_and_return_layout(struct inode *);
-void pnfs_ld_write_done(struct nfs_pgio_data *);
-void pnfs_ld_read_done(struct nfs_pgio_data *);
+void pnfs_ld_write_done(struct nfs_pgio_header *);
+void pnfs_ld_read_done(struct nfs_pgio_header *);
 struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino,
 					       struct nfs_open_context *ctx,
 					       loff_t pos,
@@ -228,12 +233,8 @@ struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino,
 					       gfp_t gfp_flags);
 
 void nfs4_deviceid_mark_client_invalid(struct nfs_client *clp);
-int pnfs_read_done_resend_to_mds(struct inode *inode, struct list_head *head,
-			const struct nfs_pgio_completion_ops *compl_ops,
-			struct nfs_direct_req *dreq);
-int pnfs_write_done_resend_to_mds(struct inode *inode, struct list_head *head,
-			const struct nfs_pgio_completion_ops *compl_ops,
-			struct nfs_direct_req *dreq);
+int pnfs_read_done_resend_to_mds(struct nfs_pgio_header *);
+int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *);
 struct nfs4_threshold *pnfs_mdsthreshold_alloc(void);
 
 /* nfs4_deviceid_flags */
@@ -345,6 +346,17 @@ pnfs_recover_commit_reqs(struct inode *inode, struct list_head *list,
 	NFS_SERVER(inode)->pnfs_curr_ld->recover_commit_reqs(list, cinfo);
 }
 
+static inline struct nfs_page *
+pnfs_search_commit_reqs(struct inode *inode, struct nfs_commit_info *cinfo,
+			struct page *page)
+{
+	struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
+
+	if (ld == NULL || ld->search_commit_reqs == NULL)
+		return NULL;
+	return ld->search_commit_reqs(cinfo, page);
+}
+
 /* Should the pNFS client commit and return the layout upon a setattr */
 static inline bool
 pnfs_ld_layoutret_on_setattr(struct inode *inode)
@@ -410,6 +422,10 @@ static inline void pnfs_put_lseg(struct pnfs_layout_segment *lseg)
 {
 }
 
+static inline void pnfs_put_lseg_async(struct pnfs_layout_segment *lseg)
+{
+}
+
 static inline int pnfs_return_layout(struct inode *ino)
 {
 	return 0;
@@ -496,6 +512,13 @@ pnfs_recover_commit_reqs(struct inode *inode, struct list_head *list,
 {
 }
 
+static inline struct nfs_page *
+pnfs_search_commit_reqs(struct inode *inode, struct nfs_commit_info *cinfo,
+			struct page *page)
+{
+	return NULL;
+}
+
 static inline int pnfs_layoutcommit_inode(struct inode *inode, bool sync)
 {
 	return 0;
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index c171ce1a8a30..b09cc23d6f43 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -578,46 +578,49 @@ nfs_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
 	return 0;
 }
 
-static int nfs_read_done(struct rpc_task *task, struct nfs_pgio_data *data)
+static int nfs_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
 {
-	struct inode *inode = data->header->inode;
+	struct inode *inode = hdr->inode;
 
 	nfs_invalidate_atime(inode);
 	if (task->tk_status >= 0) {
-		nfs_refresh_inode(inode, data->res.fattr);
+		nfs_refresh_inode(inode, hdr->res.fattr);
 		/* Emulate the eof flag, which isn't normally needed in NFSv2
 		 * as it is guaranteed to always return the file attributes
 		 */
-		if (data->args.offset + data->res.count >= data->res.fattr->size)
-			data->res.eof = 1;
+		if (hdr->args.offset + hdr->res.count >= hdr->res.fattr->size)
+			hdr->res.eof = 1;
 	}
 	return 0;
 }
 
-static void nfs_proc_read_setup(struct nfs_pgio_data *data, struct rpc_message *msg)
+static void nfs_proc_read_setup(struct nfs_pgio_header *hdr,
+				struct rpc_message *msg)
 {
 	msg->rpc_proc = &nfs_procedures[NFSPROC_READ];
 }
 
-static int nfs_proc_pgio_rpc_prepare(struct rpc_task *task, struct nfs_pgio_data *data)
+static int nfs_proc_pgio_rpc_prepare(struct rpc_task *task,
+				     struct nfs_pgio_header *hdr)
 {
 	rpc_call_start(task);
 	return 0;
 }
 
-static int nfs_write_done(struct rpc_task *task, struct nfs_pgio_data *data)
+static int nfs_write_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
 {
-	struct inode *inode = data->header->inode;
+	struct inode *inode = hdr->inode;
 
 	if (task->tk_status >= 0)
-		nfs_post_op_update_inode_force_wcc(inode, data->res.fattr);
+		nfs_post_op_update_inode_force_wcc(inode, hdr->res.fattr);
 	return 0;
 }
 
-static void nfs_proc_write_setup(struct nfs_pgio_data *data, struct rpc_message *msg)
+static void nfs_proc_write_setup(struct nfs_pgio_header *hdr,
+				 struct rpc_message *msg)
 {
 	/* Note: NFSv2 ignores @stable and always uses NFS_FILE_SYNC */
-	data->args.stable = NFS_FILE_SYNC;
+	hdr->args.stable = NFS_FILE_SYNC;
 	msg->rpc_proc = &nfs_procedures[NFSPROC_WRITE];
 }
 
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index e818a475ca64..beff2769c5c5 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -33,12 +33,12 @@ static const struct nfs_rw_ops nfs_rw_read_ops;
 
 static struct kmem_cache *nfs_rdata_cachep;
 
-static struct nfs_rw_header *nfs_readhdr_alloc(void)
+static struct nfs_pgio_header *nfs_readhdr_alloc(void)
 {
 	return kmem_cache_zalloc(nfs_rdata_cachep, GFP_KERNEL);
 }
 
-static void nfs_readhdr_free(struct nfs_rw_header *rhdr)
+static void nfs_readhdr_free(struct nfs_pgio_header *rhdr)
 {
 	kmem_cache_free(nfs_rdata_cachep, rhdr);
 }
@@ -115,12 +115,6 @@ static void nfs_readpage_release(struct nfs_page *req)
 
 		unlock_page(req->wb_page);
 	}
-
-	dprintk("NFS: read done (%s/%Lu %d@%Ld)\n",
-			req->wb_context->dentry->d_inode->i_sb->s_id,
-			(unsigned long long)NFS_FILEID(req->wb_context->dentry->d_inode),
-			req->wb_bytes,
-			(long long)req_offset(req));
 	nfs_release_request(req);
 }
 
@@ -172,14 +166,15 @@ out:
 	hdr->release(hdr);
 }
 
-static void nfs_initiate_read(struct nfs_pgio_data *data, struct rpc_message *msg,
+static void nfs_initiate_read(struct nfs_pgio_header *hdr,
+			      struct rpc_message *msg,
 			      struct rpc_task_setup *task_setup_data, int how)
 {
-	struct inode *inode = data->header->inode;
+	struct inode *inode = hdr->inode;
 	int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0;
 
 	task_setup_data->flags |= swap_flags;
-	NFS_PROTO(inode)->read_setup(data, msg);
+	NFS_PROTO(inode)->read_setup(hdr, msg);
 }
 
 static void
@@ -203,14 +198,15 @@ static const struct nfs_pgio_completion_ops nfs_async_read_completion_ops = {
  * This is the callback from RPC telling us whether a reply was
  * received or some error occurred (timeout or socket shutdown).
  */
-static int nfs_readpage_done(struct rpc_task *task, struct nfs_pgio_data *data,
+static int nfs_readpage_done(struct rpc_task *task,
+			     struct nfs_pgio_header *hdr,
 			     struct inode *inode)
 {
-	int status = NFS_PROTO(inode)->read_done(task, data);
+	int status = NFS_PROTO(inode)->read_done(task, hdr);
 	if (status != 0)
 		return status;
 
-	nfs_add_stats(inode, NFSIOS_SERVERREADBYTES, data->res.count);
+	nfs_add_stats(inode, NFSIOS_SERVERREADBYTES, hdr->res.count);
 
 	if (task->tk_status == -ESTALE) {
 		set_bit(NFS_INO_STALE, &NFS_I(inode)->flags);
@@ -219,34 +215,34 @@ static int nfs_readpage_done(struct rpc_task *task, struct nfs_pgio_data *data,
 	return 0;
 }
 
-static void nfs_readpage_retry(struct rpc_task *task, struct nfs_pgio_data *data)
+static void nfs_readpage_retry(struct rpc_task *task,
+			       struct nfs_pgio_header *hdr)
 {
-	struct nfs_pgio_args *argp = &data->args;
-	struct nfs_pgio_res  *resp = &data->res;
+	struct nfs_pgio_args *argp = &hdr->args;
+	struct nfs_pgio_res  *resp = &hdr->res;
 
 	/* This is a short read! */
-	nfs_inc_stats(data->header->inode, NFSIOS_SHORTREAD);
+	nfs_inc_stats(hdr->inode, NFSIOS_SHORTREAD);
 	/* Has the server at least made some progress? */
 	if (resp->count == 0) {
-		nfs_set_pgio_error(data->header, -EIO, argp->offset);
+		nfs_set_pgio_error(hdr, -EIO, argp->offset);
 		return;
 	}
-	/* Yes, so retry the read at the end of the data */
-	data->mds_offset += resp->count;
+	/* Yes, so retry the read at the end of the hdr */
+	hdr->mds_offset += resp->count;
 	argp->offset += resp->count;
 	argp->pgbase += resp->count;
 	argp->count -= resp->count;
 	rpc_restart_call_prepare(task);
 }
 
-static void nfs_readpage_result(struct rpc_task *task, struct nfs_pgio_data *data)
+static void nfs_readpage_result(struct rpc_task *task,
+				struct nfs_pgio_header *hdr)
 {
-	struct nfs_pgio_header *hdr = data->header;
-
-	if (data->res.eof) {
+	if (hdr->res.eof) {
 		loff_t bound;
 
-		bound = data->args.offset + data->res.count;
+		bound = hdr->args.offset + hdr->res.count;
 		spin_lock(&hdr->lock);
 		if (bound < hdr->io_start + hdr->good_bytes) {
 			set_bit(NFS_IOHDR_EOF, &hdr->flags);
@@ -254,8 +250,8 @@ static void nfs_readpage_result(struct rpc_task *task, struct nfs_pgio_data *dat
 			hdr->good_bytes = bound - hdr->io_start;
 		}
 		spin_unlock(&hdr->lock);
-	} else if (data->res.count != data->args.count)
-		nfs_readpage_retry(task, data);
+	} else if (hdr->res.count != hdr->args.count)
+		nfs_readpage_retry(task, hdr);
 }
 
 /*
@@ -404,7 +400,7 @@ out:
 int __init nfs_init_readpagecache(void)
 {
 	nfs_rdata_cachep = kmem_cache_create("nfs_read_data",
-					     sizeof(struct nfs_rw_header),
+					     sizeof(struct nfs_pgio_header),
 					     0, SLAB_HWCACHE_ALIGN,
 					     NULL);
 	if (nfs_rdata_cachep == NULL)
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 084af1060d79..e4499d5b51e8 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1027,8 +1027,7 @@ static bool nfs_auth_info_add(struct nfs_auth_info *auth_info,
 			      rpc_authflavor_t flavor)
 {
 	unsigned int i;
-	unsigned int max_flavor_len = (sizeof(auth_info->flavors) /
-				       sizeof(auth_info->flavors[0]));
+	unsigned int max_flavor_len = ARRAY_SIZE(auth_info->flavors);
 
 	/* make sure this flavor isn't already in the list */
 	for (i = 0; i < auth_info->flavor_len; i++) {
@@ -2180,7 +2179,7 @@ out_no_address:
 	return -EINVAL;
 }
 
-#define NFS_MOUNT_CMP_FLAGMASK ~(NFS_MOUNT_INTR \
+#define NFS_REMOUNT_CMP_FLAGMASK ~(NFS_MOUNT_INTR \
 		| NFS_MOUNT_SECURE \
 		| NFS_MOUNT_TCP \
 		| NFS_MOUNT_VER3 \
@@ -2188,15 +2187,16 @@ out_no_address:
 		| NFS_MOUNT_NONLM \
 		| NFS_MOUNT_BROKEN_SUID \
 		| NFS_MOUNT_STRICTLOCK \
-		| NFS_MOUNT_UNSHARED \
-		| NFS_MOUNT_NORESVPORT \
 		| NFS_MOUNT_LEGACY_INTERFACE)
 
+#define NFS_MOUNT_CMP_FLAGMASK (NFS_REMOUNT_CMP_FLAGMASK & \
+		~(NFS_MOUNT_UNSHARED | NFS_MOUNT_NORESVPORT))
+
 static int
 nfs_compare_remount_data(struct nfs_server *nfss,
 			 struct nfs_parsed_mount_data *data)
 {
-	if ((data->flags ^ nfss->flags) & NFS_MOUNT_CMP_FLAGMASK ||
+	if ((data->flags ^ nfss->flags) & NFS_REMOUNT_CMP_FLAGMASK ||
 	    data->rsize != nfss->rsize ||
 	    data->wsize != nfss->wsize ||
 	    data->version != nfss->nfs_client->rpc_ops->version ||
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 962c9ee758be..e3b5cf28bdc5 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -47,6 +47,8 @@ static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops;
 static const struct nfs_commit_completion_ops nfs_commit_completion_ops;
 static const struct nfs_rw_ops nfs_rw_write_ops;
 static void nfs_clear_request_commit(struct nfs_page *req);
+static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo,
+				      struct inode *inode);
 
 static struct kmem_cache *nfs_wdata_cachep;
 static mempool_t *nfs_wdata_mempool;
@@ -71,18 +73,18 @@ void nfs_commit_free(struct nfs_commit_data *p)
 }
 EXPORT_SYMBOL_GPL(nfs_commit_free);
 
-static struct nfs_rw_header *nfs_writehdr_alloc(void)
+static struct nfs_pgio_header *nfs_writehdr_alloc(void)
 {
-	struct nfs_rw_header *p = mempool_alloc(nfs_wdata_mempool, GFP_NOIO);
+	struct nfs_pgio_header *p = mempool_alloc(nfs_wdata_mempool, GFP_NOIO);
 
 	if (p)
 		memset(p, 0, sizeof(*p));
 	return p;
 }
 
-static void nfs_writehdr_free(struct nfs_rw_header *whdr)
+static void nfs_writehdr_free(struct nfs_pgio_header *hdr)
 {
-	mempool_free(whdr, nfs_wdata_mempool);
+	mempool_free(hdr, nfs_wdata_mempool);
 }
 
 static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error)
@@ -93,6 +95,38 @@ static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error)
 }
 
 /*
+ * nfs_page_search_commits_for_head_request_locked
+ *
+ * Search through commit lists on @inode for the head request for @page.
+ * Must be called while holding the inode (which is cinfo) lock.
+ *
+ * Returns the head request if found, or NULL if not found.
+ */
+static struct nfs_page *
+nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi,
+						struct page *page)
+{
+	struct nfs_page *freq, *t;
+	struct nfs_commit_info cinfo;
+	struct inode *inode = &nfsi->vfs_inode;
+
+	nfs_init_cinfo_from_inode(&cinfo, inode);
+
+	/* search through pnfs commit lists */
+	freq = pnfs_search_commit_reqs(inode, &cinfo, page);
+	if (freq)
+		return freq->wb_head;
+
+	/* Linearly search the commit list for the correct request */
+	list_for_each_entry_safe(freq, t, &cinfo.mds->list, wb_list) {
+		if (freq->wb_page == page)
+			return freq->wb_head;
+	}
+
+	return NULL;
+}
+
+/*
  * nfs_page_find_head_request_locked - find head request associated with @page
  *
  * must be called while holding the inode lock.
@@ -106,21 +140,12 @@ nfs_page_find_head_request_locked(struct nfs_inode *nfsi, struct page *page)
 
 	if (PagePrivate(page))
 		req = (struct nfs_page *)page_private(page);
-	else if (unlikely(PageSwapCache(page))) {
-		struct nfs_page *freq, *t;
-
-		/* Linearly search the commit list for the correct req */
-		list_for_each_entry_safe(freq, t, &nfsi->commit_info.list, wb_list) {
-			if (freq->wb_page == page) {
-				req = freq->wb_head;
-				break;
-			}
-		}
-	}
+	else if (unlikely(PageSwapCache(page)))
+		req = nfs_page_search_commits_for_head_request_locked(nfsi,
+			page);
 
 	if (req) {
 		WARN_ON_ONCE(req->wb_head != req);
-
 		kref_get(&req->wb_kref);
 	}
 
@@ -216,7 +241,7 @@ static bool nfs_page_group_covers_page(struct nfs_page *req)
 	unsigned int pos = 0;
 	unsigned int len = nfs_page_length(req->wb_page);
 
-	nfs_page_group_lock(req);
+	nfs_page_group_lock(req, true);
 
 	do {
 		tmp = nfs_page_group_search_locked(req->wb_head, pos);
@@ -379,8 +404,6 @@ nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list,
 		subreq->wb_head = subreq;
 		subreq->wb_this_page = subreq;
 
-		nfs_clear_request_commit(subreq);
-
 		/* subreq is now totally disconnected from page group or any
 		 * write / commit lists. last chance to wake any waiters */
 		nfs_unlock_request(subreq);
@@ -456,7 +479,9 @@ try_again:
 	}
 
 	/* lock each request in the page group */
-	nfs_page_group_lock(head);
+	ret = nfs_page_group_lock(head, false);
+	if (ret < 0)
+		return ERR_PTR(ret);
 	subreq = head;
 	do {
 		/*
@@ -488,7 +513,7 @@ try_again:
 	 * Commit list removal accounting is done after locks are dropped */
 	subreq = head;
 	do {
-		nfs_list_remove_request(subreq);
+		nfs_clear_request_commit(subreq);
 		subreq = subreq->wb_this_page;
 	} while (subreq != head);
 
@@ -518,15 +543,11 @@ try_again:
 
 	nfs_page_group_unlock(head);
 
-	/* drop lock to clear_request_commit the head req and clean up
-	 * requests on destroy list */
+	/* drop lock to clean uprequests on destroy list */
 	spin_unlock(&inode->i_lock);
 
 	nfs_destroy_unlinked_subrequests(destroy_list, head);
 
-	/* clean up commit list state */
-	nfs_clear_request_commit(head);
-
 	/* still holds ref on head from nfs_page_find_head_request_locked
 	 * and still has lock on head from lock loop */
 	return head;
@@ -705,6 +726,8 @@ static void nfs_inode_remove_request(struct nfs_page *req)
 
 	if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags))
 		nfs_release_request(req);
+	else
+		WARN_ON_ONCE(1);
 }
 
 static void
@@ -808,6 +831,7 @@ nfs_clear_page_commit(struct page *page)
 	dec_bdi_stat(page_file_mapping(page)->backing_dev_info, BDI_RECLAIMABLE);
 }
 
+/* Called holding inode (/cinfo) lock */
 static void
 nfs_clear_request_commit(struct nfs_page *req)
 {
@@ -817,20 +841,17 @@ nfs_clear_request_commit(struct nfs_page *req)
 
 		nfs_init_cinfo_from_inode(&cinfo, inode);
 		if (!pnfs_clear_request_commit(req, &cinfo)) {
-			spin_lock(cinfo.lock);
 			nfs_request_remove_commit_list(req, &cinfo);
-			spin_unlock(cinfo.lock);
 		}
 		nfs_clear_page_commit(req->wb_page);
 	}
 }
 
-static inline
-int nfs_write_need_commit(struct nfs_pgio_data *data)
+int nfs_write_need_commit(struct nfs_pgio_header *hdr)
 {
-	if (data->verf.committed == NFS_DATA_SYNC)
-		return data->header->lseg == NULL;
-	return data->verf.committed != NFS_FILE_SYNC;
+	if (hdr->verf.committed == NFS_DATA_SYNC)
+		return hdr->lseg == NULL;
+	return hdr->verf.committed != NFS_FILE_SYNC;
 }
 
 #else
@@ -856,8 +877,7 @@ nfs_clear_request_commit(struct nfs_page *req)
 {
 }
 
-static inline
-int nfs_write_need_commit(struct nfs_pgio_data *data)
+int nfs_write_need_commit(struct nfs_pgio_header *hdr)
 {
 	return 0;
 }
@@ -883,11 +903,7 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr)
 			nfs_context_set_write_error(req->wb_context, hdr->error);
 			goto remove_req;
 		}
-		if (test_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags)) {
-			nfs_mark_request_dirty(req);
-			goto next;
-		}
-		if (test_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags)) {
+		if (nfs_write_need_commit(hdr)) {
 			memcpy(&req->wb_verf, &hdr->verf.verifier, sizeof(req->wb_verf));
 			nfs_mark_request_commit(req, hdr->lseg, &cinfo);
 			goto next;
@@ -1038,9 +1054,9 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode,
 	else
 		req->wb_bytes = rqend - req->wb_offset;
 out_unlock:
-	spin_unlock(&inode->i_lock);
 	if (req)
 		nfs_clear_request_commit(req);
+	spin_unlock(&inode->i_lock);
 	return req;
 out_flushme:
 	spin_unlock(&inode->i_lock);
@@ -1241,17 +1257,18 @@ static int flush_task_priority(int how)
 	return RPC_PRIORITY_NORMAL;
 }
 
-static void nfs_initiate_write(struct nfs_pgio_data *data, struct rpc_message *msg,
+static void nfs_initiate_write(struct nfs_pgio_header *hdr,
+			       struct rpc_message *msg,
 			       struct rpc_task_setup *task_setup_data, int how)
 {
-	struct inode *inode = data->header->inode;
+	struct inode *inode = hdr->inode;
 	int priority = flush_task_priority(how);
 
 	task_setup_data->priority = priority;
-	NFS_PROTO(inode)->write_setup(data, msg);
+	NFS_PROTO(inode)->write_setup(hdr, msg);
 
 	nfs4_state_protect_write(NFS_SERVER(inode)->nfs_client,
-				 &task_setup_data->rpc_client, msg, data);
+				 &task_setup_data->rpc_client, msg, hdr);
 }
 
 /* If a nfs_flush_* function fails, it should remove reqs from @head and
@@ -1313,21 +1330,9 @@ void nfs_commit_prepare(struct rpc_task *task, void *calldata)
 	NFS_PROTO(data->inode)->commit_rpc_prepare(task, data);
 }
 
-static void nfs_writeback_release_common(struct nfs_pgio_data *data)
+static void nfs_writeback_release_common(struct nfs_pgio_header *hdr)
 {
-	struct nfs_pgio_header *hdr = data->header;
-	int status = data->task.tk_status;
-
-	if ((status >= 0) && nfs_write_need_commit(data)) {
-		spin_lock(&hdr->lock);
-		if (test_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags))
-			; /* Do nothing */
-		else if (!test_and_set_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags))
-			memcpy(&hdr->verf, &data->verf, sizeof(hdr->verf));
-		else if (memcmp(&hdr->verf, &data->verf, sizeof(hdr->verf)))
-			set_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags);
-		spin_unlock(&hdr->lock);
-	}
+	/* do nothing! */
 }
 
 /*
@@ -1358,7 +1363,8 @@ static int nfs_should_remove_suid(const struct inode *inode)
 /*
  * This function is called when the WRITE call is complete.
  */
-static int nfs_writeback_done(struct rpc_task *task, struct nfs_pgio_data *data,
+static int nfs_writeback_done(struct rpc_task *task,
+			      struct nfs_pgio_header *hdr,
 			      struct inode *inode)
 {
 	int status;
@@ -1370,13 +1376,14 @@ static int nfs_writeback_done(struct rpc_task *task, struct nfs_pgio_data *data,
 	 * another writer had changed the file, but some applications
 	 * depend on tighter cache coherency when writing.
 	 */
-	status = NFS_PROTO(inode)->write_done(task, data);
+	status = NFS_PROTO(inode)->write_done(task, hdr);
 	if (status != 0)
 		return status;
-	nfs_add_stats(inode, NFSIOS_SERVERWRITTENBYTES, data->res.count);
+	nfs_add_stats(inode, NFSIOS_SERVERWRITTENBYTES, hdr->res.count);
 
 #if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
-	if (data->res.verf->committed < data->args.stable && task->tk_status >= 0) {
+	if (hdr->res.verf->committed < hdr->args.stable &&
+	    task->tk_status >= 0) {
 		/* We tried a write call, but the server did not
 		 * commit data to stable storage even though we
 		 * requested it.
@@ -1392,7 +1399,7 @@ static int nfs_writeback_done(struct rpc_task *task, struct nfs_pgio_data *data,
 			dprintk("NFS:       faulty NFS server %s:"
 				" (committed = %d) != (stable = %d)\n",
 				NFS_SERVER(inode)->nfs_client->cl_hostname,
-				data->res.verf->committed, data->args.stable);
+				hdr->res.verf->committed, hdr->args.stable);
 			complain = jiffies + 300 * HZ;
 		}
 	}
@@ -1407,16 +1414,17 @@ static int nfs_writeback_done(struct rpc_task *task, struct nfs_pgio_data *data,
 /*
  * This function is called when the WRITE call is complete.
  */
-static void nfs_writeback_result(struct rpc_task *task, struct nfs_pgio_data *data)
+static void nfs_writeback_result(struct rpc_task *task,
+				 struct nfs_pgio_header *hdr)
 {
-	struct nfs_pgio_args	*argp = &data->args;
-	struct nfs_pgio_res	*resp = &data->res;
+	struct nfs_pgio_args	*argp = &hdr->args;
+	struct nfs_pgio_res	*resp = &hdr->res;
 
 	if (resp->count < argp->count) {
 		static unsigned long    complain;
 
 		/* This a short write! */
-		nfs_inc_stats(data->header->inode, NFSIOS_SHORTWRITE);
+		nfs_inc_stats(hdr->inode, NFSIOS_SHORTWRITE);
 
 		/* Has the server at least made some progress? */
 		if (resp->count == 0) {
@@ -1426,14 +1434,14 @@ static void nfs_writeback_result(struct rpc_task *task, struct nfs_pgio_data *da
 				       argp->count);
 				complain = jiffies + 300 * HZ;
 			}
-			nfs_set_pgio_error(data->header, -EIO, argp->offset);
+			nfs_set_pgio_error(hdr, -EIO, argp->offset);
 			task->tk_status = -EIO;
 			return;
 		}
 		/* Was this an NFSv2 write or an NFSv3 stable write? */
 		if (resp->verf->committed != NFS_UNSTABLE) {
 			/* Resend from where the server left off */
-			data->mds_offset += resp->count;
+			hdr->mds_offset += resp->count;
 			argp->offset += resp->count;
 			argp->pgbase += resp->count;
 			argp->count -= resp->count;
@@ -1884,7 +1892,7 @@ int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
 int __init nfs_init_writepagecache(void)
 {
 	nfs_wdata_cachep = kmem_cache_create("nfs_write_data",
-					     sizeof(struct nfs_rw_header),
+					     sizeof(struct nfs_pgio_header),
 					     0, SLAB_HWCACHE_ALIGN,
 					     NULL);
 	if (nfs_wdata_cachep == NULL)
diff --git a/fs/nfs_common/nfsacl.c b/fs/nfs_common/nfsacl.c
index ed628f71274c..538f142935ea 100644
--- a/fs/nfs_common/nfsacl.c
+++ b/fs/nfs_common/nfsacl.c
@@ -30,9 +30,6 @@
 
 MODULE_LICENSE("GPL");
 
-EXPORT_SYMBOL_GPL(nfsacl_encode);
-EXPORT_SYMBOL_GPL(nfsacl_decode);
-
 struct nfsacl_encode_desc {
 	struct xdr_array2_desc desc;
 	unsigned int count;
@@ -136,6 +133,7 @@ int nfsacl_encode(struct xdr_buf *buf, unsigned int base, struct inode *inode,
 			  nfsacl_desc.desc.array_len;
 	return err;
 }
+EXPORT_SYMBOL_GPL(nfsacl_encode);
 
 struct nfsacl_decode_desc {
 	struct xdr_array2_desc desc;
@@ -295,3 +293,4 @@ int nfsacl_decode(struct xdr_buf *buf, unsigned int base, unsigned int *aclcnt,
 	return 8 + nfsacl_desc.desc.elem_size *
 		   nfsacl_desc.desc.array_len;
 }
+EXPORT_SYMBOL_GPL(nfsacl_decode);
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index c519927b7b5e..228f5bdf0772 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -942,7 +942,7 @@ static int nilfs_get_root_dentry(struct super_block *sb,
 			iput(inode);
 		}
 	} else {
-		dentry = d_obtain_alias(inode);
+		dentry = d_obtain_root(inode);
 		if (IS_ERR(dentry)) {
 			ret = PTR_ERR(dentry);
 			goto failed_dentry;
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 7f30bdc57d13..f2d0eee9d1f1 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -96,13 +96,16 @@
  * Note that some things (eg. sb pointer, type, id) doesn't change during
  * the life of the dquot structure and so needn't to be protected by a lock
  *
- * Any operation working on dquots via inode pointers must hold dqptr_sem.  If
- * operation is just reading pointers from inode (or not using them at all) the
- * read lock is enough. If pointers are altered function must hold write lock.
+ * Operation accessing dquots via inode pointers are protected by dquot_srcu.
+ * Operation of reading pointer needs srcu_read_lock(&dquot_srcu), and
+ * synchronize_srcu(&dquot_srcu) is called after clearing pointers from
+ * inode and before dropping dquot references to avoid use of dquots after
+ * they are freed. dq_data_lock is used to serialize the pointer setting and
+ * clearing operations.
  * Special care needs to be taken about S_NOQUOTA inode flag (marking that
  * inode is a quota file). Functions adding pointers from inode to dquots have
- * to check this flag under dqptr_sem and then (if S_NOQUOTA is not set) they
- * have to do all pointer modifications before dropping dqptr_sem. This makes
+ * to check this flag under dq_data_lock and then (if S_NOQUOTA is not set) they
+ * have to do all pointer modifications before dropping dq_data_lock. This makes
  * sure they cannot race with quotaon which first sets S_NOQUOTA flag and
  * then drops all pointers to dquots from an inode.
  *
@@ -116,21 +119,15 @@
  * spinlock to internal buffers before writing.
  *
  * Lock ordering (including related VFS locks) is the following:
- *   dqonoff_mutex > i_mutex > journal_lock > dqptr_sem > dquot->dq_lock >
- *   dqio_mutex
+ *   dqonoff_mutex > i_mutex > journal_lock > dquot->dq_lock > dqio_mutex
  * dqonoff_mutex > i_mutex comes from dquot_quota_sync, dquot_enable, etc.
- * The lock ordering of dqptr_sem imposed by quota code is only dqonoff_sem >
- * dqptr_sem. But filesystem has to count with the fact that functions such as
- * dquot_alloc_space() acquire dqptr_sem and they usually have to be called
- * from inside a transaction to keep filesystem consistency after a crash. Also
- * filesystems usually want to do some IO on dquot from ->mark_dirty which is
- * called with dqptr_sem held.
  */
 
 static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_list_lock);
 static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_state_lock);
 __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_data_lock);
 EXPORT_SYMBOL(dq_data_lock);
+DEFINE_STATIC_SRCU(dquot_srcu);
 
 void __quota_error(struct super_block *sb, const char *func,
 		   const char *fmt, ...)
@@ -733,7 +730,6 @@ static struct shrinker dqcache_shrinker = {
 
 /*
  * Put reference to dquot
- * NOTE: If you change this function please check whether dqput_blocks() works right...
  */
 void dqput(struct dquot *dquot)
 {
@@ -963,46 +959,33 @@ static void add_dquot_ref(struct super_block *sb, int type)
 }
 
 /*
- * Return 0 if dqput() won't block.
- * (note that 1 doesn't necessarily mean blocking)
- */
-static inline int dqput_blocks(struct dquot *dquot)
-{
-	if (atomic_read(&dquot->dq_count) <= 1)
-		return 1;
-	return 0;
-}
-
-/*
  * Remove references to dquots from inode and add dquot to list for freeing
  * if we have the last reference to dquot
- * We can't race with anybody because we hold dqptr_sem for writing...
  */
-static int remove_inode_dquot_ref(struct inode *inode, int type,
-				  struct list_head *tofree_head)
+static void remove_inode_dquot_ref(struct inode *inode, int type,
+				   struct list_head *tofree_head)
 {
 	struct dquot *dquot = inode->i_dquot[type];
 
 	inode->i_dquot[type] = NULL;
-	if (dquot) {
-		if (dqput_blocks(dquot)) {
-#ifdef CONFIG_QUOTA_DEBUG
-			if (atomic_read(&dquot->dq_count) != 1)
-				quota_error(inode->i_sb, "Adding dquot with "
-					    "dq_count %d to dispose list",
-					    atomic_read(&dquot->dq_count));
-#endif
-			spin_lock(&dq_list_lock);
-			/* As dquot must have currently users it can't be on
-			 * the free list... */
-			list_add(&dquot->dq_free, tofree_head);
-			spin_unlock(&dq_list_lock);
-			return 1;
-		}
-		else
-			dqput(dquot);   /* We have guaranteed we won't block */
+	if (!dquot)
+		return;
+
+	if (list_empty(&dquot->dq_free)) {
+		/*
+		 * The inode still has reference to dquot so it can't be in the
+		 * free list
+		 */
+		spin_lock(&dq_list_lock);
+		list_add(&dquot->dq_free, tofree_head);
+		spin_unlock(&dq_list_lock);
+	} else {
+		/*
+		 * Dquot is already in a list to put so we won't drop the last
+		 * reference here.
+		 */
+		dqput(dquot);
 	}
-	return 0;
 }
 
 /*
@@ -1037,13 +1020,15 @@ static void remove_dquot_ref(struct super_block *sb, int type,
 		 *  We have to scan also I_NEW inodes because they can already
 		 *  have quota pointer initialized. Luckily, we need to touch
 		 *  only quota pointers and these have separate locking
-		 *  (dqptr_sem).
+		 *  (dq_data_lock).
 		 */
+		spin_lock(&dq_data_lock);
 		if (!IS_NOQUOTA(inode)) {
 			if (unlikely(inode_get_rsv_space(inode) > 0))
 				reserved = 1;
 			remove_inode_dquot_ref(inode, type, tofree_head);
 		}
+		spin_unlock(&dq_data_lock);
 	}
 	spin_unlock(&inode_sb_list_lock);
 #ifdef CONFIG_QUOTA_DEBUG
@@ -1061,9 +1046,8 @@ static void drop_dquot_ref(struct super_block *sb, int type)
 	LIST_HEAD(tofree_head);
 
 	if (sb->dq_op) {
-		down_write(&sb_dqopt(sb)->dqptr_sem);
 		remove_dquot_ref(sb, type, &tofree_head);
-		up_write(&sb_dqopt(sb)->dqptr_sem);
+		synchronize_srcu(&dquot_srcu);
 		put_dquot_list(&tofree_head);
 	}
 }
@@ -1394,21 +1378,16 @@ static int dquot_active(const struct inode *inode)
 /*
  * Initialize quota pointers in inode
  *
- * We do things in a bit complicated way but by that we avoid calling
- * dqget() and thus filesystem callbacks under dqptr_sem.
- *
  * It is better to call this function outside of any transaction as it
  * might need a lot of space in journal for dquot structure allocation.
  */
 static void __dquot_initialize(struct inode *inode, int type)
 {
-	int cnt;
+	int cnt, init_needed = 0;
 	struct dquot *got[MAXQUOTAS];
 	struct super_block *sb = inode->i_sb;
 	qsize_t rsv;
 
-	/* First test before acquiring mutex - solves deadlocks when we
-         * re-enter the quota code and are already holding the mutex */
 	if (!dquot_active(inode))
 		return;
 
@@ -1418,6 +1397,15 @@ static void __dquot_initialize(struct inode *inode, int type)
 		got[cnt] = NULL;
 		if (type != -1 && cnt != type)
 			continue;
+		/*
+		 * The i_dquot should have been initialized in most cases,
+		 * we check it without locking here to avoid unnecessary
+		 * dqget()/dqput() calls.
+		 */
+		if (inode->i_dquot[cnt])
+			continue;
+		init_needed = 1;
+
 		switch (cnt) {
 		case USRQUOTA:
 			qid = make_kqid_uid(inode->i_uid);
@@ -1429,7 +1417,11 @@ static void __dquot_initialize(struct inode *inode, int type)
 		got[cnt] = dqget(sb, qid);
 	}
 
-	down_write(&sb_dqopt(sb)->dqptr_sem);
+	/* All required i_dquot has been initialized */
+	if (!init_needed)
+		return;
+
+	spin_lock(&dq_data_lock);
 	if (IS_NOQUOTA(inode))
 		goto out_err;
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
@@ -1449,15 +1441,12 @@ static void __dquot_initialize(struct inode *inode, int type)
 			 * did a write before quota was turned on
 			 */
 			rsv = inode_get_rsv_space(inode);
-			if (unlikely(rsv)) {
-				spin_lock(&dq_data_lock);
+			if (unlikely(rsv))
 				dquot_resv_space(inode->i_dquot[cnt], rsv);
-				spin_unlock(&dq_data_lock);
-			}
 		}
 	}
 out_err:
-	up_write(&sb_dqopt(sb)->dqptr_sem);
+	spin_unlock(&dq_data_lock);
 	/* Drop unused references */
 	dqput_all(got);
 }
@@ -1469,19 +1458,24 @@ void dquot_initialize(struct inode *inode)
 EXPORT_SYMBOL(dquot_initialize);
 
 /*
- * 	Release all quotas referenced by inode
+ * Release all quotas referenced by inode.
+ *
+ * This function only be called on inode free or converting
+ * a file to quota file, no other users for the i_dquot in
+ * both cases, so we needn't call synchronize_srcu() after
+ * clearing i_dquot.
  */
 static void __dquot_drop(struct inode *inode)
 {
 	int cnt;
 	struct dquot *put[MAXQUOTAS];
 
-	down_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
+	spin_lock(&dq_data_lock);
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
 		put[cnt] = inode->i_dquot[cnt];
 		inode->i_dquot[cnt] = NULL;
 	}
-	up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
+	spin_unlock(&dq_data_lock);
 	dqput_all(put);
 }
 
@@ -1599,15 +1593,11 @@ static void inode_decr_space(struct inode *inode, qsize_t number, int reserve)
  */
 int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags)
 {
-	int cnt, ret = 0;
+	int cnt, ret = 0, index;
 	struct dquot_warn warn[MAXQUOTAS];
 	struct dquot **dquots = inode->i_dquot;
 	int reserve = flags & DQUOT_SPACE_RESERVE;
 
-	/*
-	 * First test before acquiring mutex - solves deadlocks when we
-	 * re-enter the quota code and are already holding the mutex
-	 */
 	if (!dquot_active(inode)) {
 		inode_incr_space(inode, number, reserve);
 		goto out;
@@ -1616,7 +1606,7 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags)
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++)
 		warn[cnt].w_type = QUOTA_NL_NOWARN;
 
-	down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
+	index = srcu_read_lock(&dquot_srcu);
 	spin_lock(&dq_data_lock);
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
 		if (!dquots[cnt])
@@ -1643,7 +1633,7 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags)
 		goto out_flush_warn;
 	mark_all_dquot_dirty(dquots);
 out_flush_warn:
-	up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
+	srcu_read_unlock(&dquot_srcu, index);
 	flush_warnings(warn);
 out:
 	return ret;
@@ -1655,17 +1645,16 @@ EXPORT_SYMBOL(__dquot_alloc_space);
  */
 int dquot_alloc_inode(const struct inode *inode)
 {
-	int cnt, ret = 0;
+	int cnt, ret = 0, index;
 	struct dquot_warn warn[MAXQUOTAS];
 	struct dquot * const *dquots = inode->i_dquot;
 
-	/* First test before acquiring mutex - solves deadlocks when we
-         * re-enter the quota code and are already holding the mutex */
 	if (!dquot_active(inode))
 		return 0;
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++)
 		warn[cnt].w_type = QUOTA_NL_NOWARN;
-	down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
+
+	index = srcu_read_lock(&dquot_srcu);
 	spin_lock(&dq_data_lock);
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
 		if (!dquots[cnt])
@@ -1685,7 +1674,7 @@ warn_put_all:
 	spin_unlock(&dq_data_lock);
 	if (ret == 0)
 		mark_all_dquot_dirty(dquots);
-	up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
+	srcu_read_unlock(&dquot_srcu, index);
 	flush_warnings(warn);
 	return ret;
 }
@@ -1696,14 +1685,14 @@ EXPORT_SYMBOL(dquot_alloc_inode);
  */
 int dquot_claim_space_nodirty(struct inode *inode, qsize_t number)
 {
-	int cnt;
+	int cnt, index;
 
 	if (!dquot_active(inode)) {
 		inode_claim_rsv_space(inode, number);
 		return 0;
 	}
 
-	down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
+	index = srcu_read_lock(&dquot_srcu);
 	spin_lock(&dq_data_lock);
 	/* Claim reserved quotas to allocated quotas */
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
@@ -1715,7 +1704,7 @@ int dquot_claim_space_nodirty(struct inode *inode, qsize_t number)
 	inode_claim_rsv_space(inode, number);
 	spin_unlock(&dq_data_lock);
 	mark_all_dquot_dirty(inode->i_dquot);
-	up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
+	srcu_read_unlock(&dquot_srcu, index);
 	return 0;
 }
 EXPORT_SYMBOL(dquot_claim_space_nodirty);
@@ -1725,14 +1714,14 @@ EXPORT_SYMBOL(dquot_claim_space_nodirty);
  */
 void dquot_reclaim_space_nodirty(struct inode *inode, qsize_t number)
 {
-	int cnt;
+	int cnt, index;
 
 	if (!dquot_active(inode)) {
 		inode_reclaim_rsv_space(inode, number);
 		return;
 	}
 
-	down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
+	index = srcu_read_lock(&dquot_srcu);
 	spin_lock(&dq_data_lock);
 	/* Claim reserved quotas to allocated quotas */
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
@@ -1744,7 +1733,7 @@ void dquot_reclaim_space_nodirty(struct inode *inode, qsize_t number)
 	inode_reclaim_rsv_space(inode, number);
 	spin_unlock(&dq_data_lock);
 	mark_all_dquot_dirty(inode->i_dquot);
-	up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
+	srcu_read_unlock(&dquot_srcu, index);
 	return;
 }
 EXPORT_SYMBOL(dquot_reclaim_space_nodirty);
@@ -1757,16 +1746,14 @@ void __dquot_free_space(struct inode *inode, qsize_t number, int flags)
 	unsigned int cnt;
 	struct dquot_warn warn[MAXQUOTAS];
 	struct dquot **dquots = inode->i_dquot;
-	int reserve = flags & DQUOT_SPACE_RESERVE;
+	int reserve = flags & DQUOT_SPACE_RESERVE, index;
 
-	/* First test before acquiring mutex - solves deadlocks when we
-         * re-enter the quota code and are already holding the mutex */
 	if (!dquot_active(inode)) {
 		inode_decr_space(inode, number, reserve);
 		return;
 	}
 
-	down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
+	index = srcu_read_lock(&dquot_srcu);
 	spin_lock(&dq_data_lock);
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
 		int wtype;
@@ -1789,7 +1776,7 @@ void __dquot_free_space(struct inode *inode, qsize_t number, int flags)
 		goto out_unlock;
 	mark_all_dquot_dirty(dquots);
 out_unlock:
-	up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
+	srcu_read_unlock(&dquot_srcu, index);
 	flush_warnings(warn);
 }
 EXPORT_SYMBOL(__dquot_free_space);
@@ -1802,13 +1789,12 @@ void dquot_free_inode(const struct inode *inode)
 	unsigned int cnt;
 	struct dquot_warn warn[MAXQUOTAS];
 	struct dquot * const *dquots = inode->i_dquot;
+	int index;
 
-	/* First test before acquiring mutex - solves deadlocks when we
-         * re-enter the quota code and are already holding the mutex */
 	if (!dquot_active(inode))
 		return;
 
-	down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
+	index = srcu_read_lock(&dquot_srcu);
 	spin_lock(&dq_data_lock);
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
 		int wtype;
@@ -1823,7 +1809,7 @@ void dquot_free_inode(const struct inode *inode)
 	}
 	spin_unlock(&dq_data_lock);
 	mark_all_dquot_dirty(dquots);
-	up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
+	srcu_read_unlock(&dquot_srcu, index);
 	flush_warnings(warn);
 }
 EXPORT_SYMBOL(dquot_free_inode);
@@ -1837,6 +1823,8 @@ EXPORT_SYMBOL(dquot_free_inode);
  * This operation can block, but only after everything is updated
  * A transaction must be started when entering this function.
  *
+ * We are holding reference on transfer_from & transfer_to, no need to
+ * protect them by srcu_read_lock().
  */
 int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
 {
@@ -1849,8 +1837,6 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
 	struct dquot_warn warn_from_inodes[MAXQUOTAS];
 	struct dquot_warn warn_from_space[MAXQUOTAS];
 
-	/* First test before acquiring mutex - solves deadlocks when we
-         * re-enter the quota code and are already holding the mutex */
 	if (IS_NOQUOTA(inode))
 		return 0;
 	/* Initialize the arrays */
@@ -1859,12 +1845,12 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
 		warn_from_inodes[cnt].w_type = QUOTA_NL_NOWARN;
 		warn_from_space[cnt].w_type = QUOTA_NL_NOWARN;
 	}
-	down_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
+
+	spin_lock(&dq_data_lock);
 	if (IS_NOQUOTA(inode)) {	/* File without quota accounting? */
-		up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
+		spin_unlock(&dq_data_lock);
 		return 0;
 	}
-	spin_lock(&dq_data_lock);
 	cur_space = inode_get_bytes(inode);
 	rsv_space = inode_get_rsv_space(inode);
 	space = cur_space + rsv_space;
@@ -1918,7 +1904,6 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
 		inode->i_dquot[cnt] = transfer_to[cnt];
 	}
 	spin_unlock(&dq_data_lock);
-	up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
 
 	mark_all_dquot_dirty(transfer_from);
 	mark_all_dquot_dirty(transfer_to);
@@ -1932,7 +1917,6 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
 	return 0;
 over_quota:
 	spin_unlock(&dq_data_lock);
-	up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
 	flush_warnings(warn_to);
 	return ret;
 }
diff --git a/fs/quota/kqid.c b/fs/quota/kqid.c
index 2f97b0e2c501..ebc5e6285800 100644
--- a/fs/quota/kqid.c
+++ b/fs/quota/kqid.c
@@ -55,7 +55,7 @@ EXPORT_SYMBOL(qid_lt);
 /**
  *	from_kqid - Create a qid from a kqid user-namespace pair.
  *	@targ: The user namespace we want a qid in.
- *	@kuid: The kernel internal quota identifier to start with.
+ *	@kqid: The kernel internal quota identifier to start with.
  *
  *	Map @kqid into the user-namespace specified by @targ and
  *	return the resulting qid.
diff --git a/fs/quota/netlink.c b/fs/quota/netlink.c
index 72d29177998e..bb2869f5dfd8 100644
--- a/fs/quota/netlink.c
+++ b/fs/quota/netlink.c
@@ -32,8 +32,7 @@ static struct genl_family quota_genl_family = {
 
 /**
  * quota_send_warning - Send warning to userspace about exceeded quota
- * @type: The quota type: USRQQUOTA, GRPQUOTA,...
- * @id: The user or group id of the quota that was exceeded
+ * @qid: The kernel internal quota identifier.
  * @dev: The device on which the fs is mounted (sb->s_dev)
  * @warntype: The type of the warning: QUOTA_NL_...
  *
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index ff3f0b3cfdb3..75621649dbd7 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -79,13 +79,13 @@ static int quota_getfmt(struct super_block *sb, int type, void __user *addr)
 {
 	__u32 fmt;
 
-	down_read(&sb_dqopt(sb)->dqptr_sem);
+	mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
 	if (!sb_has_quota_active(sb, type)) {
-		up_read(&sb_dqopt(sb)->dqptr_sem);
+		mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
 		return -ESRCH;
 	}
 	fmt = sb_dqopt(sb)->info[type].dqi_format->qf_fmt_id;
-	up_read(&sb_dqopt(sb)->dqptr_sem);
+	mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
 	if (copy_to_user(addr, &fmt, sizeof(fmt)))
 		return -EFAULT;
 	return 0;
diff --git a/fs/reiserfs/do_balan.c b/fs/reiserfs/do_balan.c
index 5739cb99de7b..9c02d96d3a42 100644
--- a/fs/reiserfs/do_balan.c
+++ b/fs/reiserfs/do_balan.c
@@ -286,12 +286,14 @@ static int balance_leaf_when_delete(struct tree_balance *tb, int flag)
 	return 0;
 }
 
-static void balance_leaf_insert_left(struct tree_balance *tb,
-				     struct item_head *ih, const char *body)
+static unsigned int balance_leaf_insert_left(struct tree_balance *tb,
+					     struct item_head *const ih,
+					     const char * const body)
 {
 	int ret;
 	struct buffer_info bi;
 	int n = B_NR_ITEMS(tb->L[0]);
+	unsigned body_shift_bytes = 0;
 
 	if (tb->item_pos == tb->lnum[0] - 1 && tb->lbytes != -1) {
 		/* part of new item falls into L[0] */
@@ -329,7 +331,7 @@ static void balance_leaf_insert_left(struct tree_balance *tb,
 
 		put_ih_item_len(ih, new_item_len);
 		if (tb->lbytes > tb->zeroes_num) {
-			body += (tb->lbytes - tb->zeroes_num);
+			body_shift_bytes = tb->lbytes - tb->zeroes_num;
 			tb->zeroes_num = 0;
 		} else
 			tb->zeroes_num -= tb->lbytes;
@@ -349,11 +351,12 @@ static void balance_leaf_insert_left(struct tree_balance *tb,
 		tb->insert_size[0] = 0;
 		tb->zeroes_num = 0;
 	}
+	return body_shift_bytes;
 }
 
 static void balance_leaf_paste_left_shift_dirent(struct tree_balance *tb,
-						 struct item_head *ih,
-						 const char *body)
+						 struct item_head * const ih,
+						 const char * const body)
 {
 	int n = B_NR_ITEMS(tb->L[0]);
 	struct buffer_info bi;
@@ -413,17 +416,18 @@ static void balance_leaf_paste_left_shift_dirent(struct tree_balance *tb,
 	tb->pos_in_item -= tb->lbytes;
 }
 
-static void balance_leaf_paste_left_shift(struct tree_balance *tb,
-					  struct item_head *ih,
-					  const char *body)
+static unsigned int balance_leaf_paste_left_shift(struct tree_balance *tb,
+						  struct item_head * const ih,
+						  const char * const body)
 {
 	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
 	int n = B_NR_ITEMS(tb->L[0]);
 	struct buffer_info bi;
+	int body_shift_bytes = 0;
 
 	if (is_direntry_le_ih(item_head(tbS0, tb->item_pos))) {
 		balance_leaf_paste_left_shift_dirent(tb, ih, body);
-		return;
+		return 0;
 	}
 
 	RFALSE(tb->lbytes <= 0,
@@ -497,7 +501,7 @@ static void balance_leaf_paste_left_shift(struct tree_balance *tb,
 		 * insert_size[0]
 		 */
 		if (l_n > tb->zeroes_num) {
-			body += (l_n - tb->zeroes_num);
+			body_shift_bytes = l_n - tb->zeroes_num;
 			tb->zeroes_num = 0;
 		} else
 			tb->zeroes_num -= l_n;
@@ -526,13 +530,14 @@ static void balance_leaf_paste_left_shift(struct tree_balance *tb,
 		 */
 		leaf_shift_left(tb, tb->lnum[0], tb->lbytes);
 	}
+	return body_shift_bytes;
 }
 
 
 /* appended item will be in L[0] in whole */
 static void balance_leaf_paste_left_whole(struct tree_balance *tb,
-					  struct item_head *ih,
-					  const char *body)
+					  struct item_head * const ih,
+					  const char * const body)
 {
 	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
 	int n = B_NR_ITEMS(tb->L[0]);
@@ -584,39 +589,44 @@ static void balance_leaf_paste_left_whole(struct tree_balance *tb,
 	tb->zeroes_num = 0;
 }
 
-static void balance_leaf_paste_left(struct tree_balance *tb,
-				    struct item_head *ih, const char *body)
+static unsigned int balance_leaf_paste_left(struct tree_balance *tb,
+					    struct item_head * const ih,
+					    const char * const body)
 {
 	/* we must shift the part of the appended item */
 	if (tb->item_pos == tb->lnum[0] - 1 && tb->lbytes != -1)
-		balance_leaf_paste_left_shift(tb, ih, body);
+		return balance_leaf_paste_left_shift(tb, ih, body);
 	else
 		balance_leaf_paste_left_whole(tb, ih, body);
+	return 0;
 }
 
 /* Shift lnum[0] items from S[0] to the left neighbor L[0] */
-static void balance_leaf_left(struct tree_balance *tb, struct item_head *ih,
-			      const char *body, int flag)
+static unsigned int balance_leaf_left(struct tree_balance *tb,
+				      struct item_head * const ih,
+				      const char * const body, int flag)
 {
 	if (tb->lnum[0] <= 0)
-		return;
+		return 0;
 
 	/* new item or it part falls to L[0], shift it too */
 	if (tb->item_pos < tb->lnum[0]) {
 		BUG_ON(flag != M_INSERT && flag != M_PASTE);
 
 		if (flag == M_INSERT)
-			balance_leaf_insert_left(tb, ih, body);
+			return balance_leaf_insert_left(tb, ih, body);
 		else /* M_PASTE */
-			balance_leaf_paste_left(tb, ih, body);
+			return balance_leaf_paste_left(tb, ih, body);
 	} else
 		/* new item doesn't fall into L[0] */
 		leaf_shift_left(tb, tb->lnum[0], tb->lbytes);
+	return 0;
 }
 
 
 static void balance_leaf_insert_right(struct tree_balance *tb,
-				      struct item_head *ih, const char *body)
+				      struct item_head * const ih,
+				      const char * const body)
 {
 
 	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
@@ -704,7 +714,8 @@ static void balance_leaf_insert_right(struct tree_balance *tb,
 
 
 static void balance_leaf_paste_right_shift_dirent(struct tree_balance *tb,
-				     struct item_head *ih, const char *body)
+				     struct item_head * const ih,
+				     const char * const body)
 {
 	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
 	struct buffer_info bi;
@@ -754,7 +765,8 @@ static void balance_leaf_paste_right_shift_dirent(struct tree_balance *tb,
 }
 
 static void balance_leaf_paste_right_shift(struct tree_balance *tb,
-				     struct item_head *ih, const char *body)
+				     struct item_head * const ih,
+				     const char * const body)
 {
 	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
 	int n_shift, n_rem, r_zeroes_number, version;
@@ -831,7 +843,8 @@ static void balance_leaf_paste_right_shift(struct tree_balance *tb,
 }
 
 static void balance_leaf_paste_right_whole(struct tree_balance *tb,
-				     struct item_head *ih, const char *body)
+				     struct item_head * const ih,
+				     const char * const body)
 {
 	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
 	int n = B_NR_ITEMS(tbS0);
@@ -874,7 +887,8 @@ static void balance_leaf_paste_right_whole(struct tree_balance *tb,
 }
 
 static void balance_leaf_paste_right(struct tree_balance *tb,
-				     struct item_head *ih, const char *body)
+				     struct item_head * const ih,
+				     const char * const body)
 {
 	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
 	int n = B_NR_ITEMS(tbS0);
@@ -896,8 +910,9 @@ static void balance_leaf_paste_right(struct tree_balance *tb,
 }
 
 /* shift rnum[0] items from S[0] to the right neighbor R[0] */
-static void balance_leaf_right(struct tree_balance *tb, struct item_head *ih,
-			       const char *body, int flag)
+static void balance_leaf_right(struct tree_balance *tb,
+			       struct item_head * const ih,
+			       const char * const body, int flag)
 {
 	if (tb->rnum[0] <= 0)
 		return;
@@ -911,8 +926,8 @@ static void balance_leaf_right(struct tree_balance *tb, struct item_head *ih,
 }
 
 static void balance_leaf_new_nodes_insert(struct tree_balance *tb,
-					  struct item_head *ih,
-					  const char *body,
+					  struct item_head * const ih,
+					  const char * const body,
 					  struct item_head *insert_key,
 					  struct buffer_head **insert_ptr,
 					  int i)
@@ -1003,8 +1018,8 @@ static void balance_leaf_new_nodes_insert(struct tree_balance *tb,
 
 /* we append to directory item */
 static void balance_leaf_new_nodes_paste_dirent(struct tree_balance *tb,
-					 struct item_head *ih,
-					 const char *body,
+					 struct item_head * const ih,
+					 const char * const body,
 					 struct item_head *insert_key,
 					 struct buffer_head **insert_ptr,
 					 int i)
@@ -1058,8 +1073,8 @@ static void balance_leaf_new_nodes_paste_dirent(struct tree_balance *tb,
 }
 
 static void balance_leaf_new_nodes_paste_shift(struct tree_balance *tb,
-					 struct item_head *ih,
-					 const char *body,
+					 struct item_head * const ih,
+					 const char * const body,
 					 struct item_head *insert_key,
 					 struct buffer_head **insert_ptr,
 					 int i)
@@ -1131,8 +1146,8 @@ static void balance_leaf_new_nodes_paste_shift(struct tree_balance *tb,
 }
 
 static void balance_leaf_new_nodes_paste_whole(struct tree_balance *tb,
-					       struct item_head *ih,
-					       const char *body,
+					       struct item_head * const ih,
+					       const char * const body,
 					       struct item_head *insert_key,
 					       struct buffer_head **insert_ptr,
 					       int i)
@@ -1184,8 +1199,8 @@ static void balance_leaf_new_nodes_paste_whole(struct tree_balance *tb,
 
 }
 static void balance_leaf_new_nodes_paste(struct tree_balance *tb,
-					 struct item_head *ih,
-					 const char *body,
+					 struct item_head * const ih,
+					 const char * const body,
 					 struct item_head *insert_key,
 					 struct buffer_head **insert_ptr,
 					 int i)
@@ -1214,8 +1229,8 @@ static void balance_leaf_new_nodes_paste(struct tree_balance *tb,
 
 /* Fill new nodes that appear in place of S[0] */
 static void balance_leaf_new_nodes(struct tree_balance *tb,
-				   struct item_head *ih,
-				   const char *body,
+				   struct item_head * const ih,
+				   const char * const body,
 				   struct item_head *insert_key,
 				   struct buffer_head **insert_ptr,
 				   int flag)
@@ -1254,8 +1269,8 @@ static void balance_leaf_new_nodes(struct tree_balance *tb,
 }
 
 static void balance_leaf_finish_node_insert(struct tree_balance *tb,
-					    struct item_head *ih,
-					    const char *body)
+					    struct item_head * const ih,
+					    const char * const body)
 {
 	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
 	struct buffer_info bi;
@@ -1271,8 +1286,8 @@ static void balance_leaf_finish_node_insert(struct tree_balance *tb,
 }
 
 static void balance_leaf_finish_node_paste_dirent(struct tree_balance *tb,
-						  struct item_head *ih,
-						  const char *body)
+						  struct item_head * const ih,
+						  const char * const body)
 {
 	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
 	struct item_head *pasted = item_head(tbS0, tb->item_pos);
@@ -1305,8 +1320,8 @@ static void balance_leaf_finish_node_paste_dirent(struct tree_balance *tb,
 }
 
 static void balance_leaf_finish_node_paste(struct tree_balance *tb,
-					   struct item_head *ih,
-					   const char *body)
+					   struct item_head * const ih,
+					   const char * const body)
 {
 	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
 	struct buffer_info bi;
@@ -1349,8 +1364,8 @@ static void balance_leaf_finish_node_paste(struct tree_balance *tb,
  * of the affected item which remains in S
  */
 static void balance_leaf_finish_node(struct tree_balance *tb,
-				      struct item_head *ih,
-				      const char *body, int flag)
+				      struct item_head * const ih,
+				      const char * const body, int flag)
 {
 	/* if we must insert or append into buffer S[0] */
 	if (0 <= tb->item_pos && tb->item_pos < tb->s0num) {
@@ -1402,7 +1417,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih,
 	    && is_indirect_le_ih(item_head(tbS0, tb->item_pos)))
 		tb->pos_in_item *= UNFM_P_SIZE;
 
-	balance_leaf_left(tb, ih, body, flag);
+	body += balance_leaf_left(tb, ih, body, flag);
 
 	/* tb->lnum[0] > 0 */
 	/* Calculate new item position */
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index e8870de4627e..a88b1b3e7db3 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -1947,8 +1947,6 @@ static int do_journal_release(struct reiserfs_transaction_handle *th,
 		}
 	}
 
-	/* wait for all commits to finish */
-	cancel_delayed_work(&SB_JOURNAL(sb)->j_work);
 
 	/*
 	 * We must release the write lock here because
@@ -1956,8 +1954,14 @@ static int do_journal_release(struct reiserfs_transaction_handle *th,
 	 */
 	reiserfs_write_unlock(sb);
 
+	/*
+	 * Cancel flushing of old commits. Note that neither of these works
+	 * will be requeued because superblock is being shutdown and doesn't
+	 * have MS_ACTIVE set.
+	 */
 	cancel_delayed_work_sync(&REISERFS_SB(sb)->old_work);
-	flush_workqueue(REISERFS_SB(sb)->commit_wq);
+	/* wait for all commits to finish */
+	cancel_delayed_work_sync(&SB_JOURNAL(sb)->j_work);
 
 	free_journal_ram(sb);
 
@@ -4292,9 +4296,15 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, int flags)
 	if (flush) {
 		flush_commit_list(sb, jl, 1);
 		flush_journal_list(sb, jl, 1);
-	} else if (!(jl->j_state & LIST_COMMIT_PENDING))
-		queue_delayed_work(REISERFS_SB(sb)->commit_wq,
-				   &journal->j_work, HZ / 10);
+	} else if (!(jl->j_state & LIST_COMMIT_PENDING)) {
+		/*
+		 * Avoid queueing work when sb is being shut down. Transaction
+		 * will be flushed on journal shutdown.
+		 */
+		if (sb->s_flags & MS_ACTIVE)
+			queue_delayed_work(REISERFS_SB(sb)->commit_wq,
+					   &journal->j_work, HZ / 10);
+	}
 
 	/*
 	 * if the next transaction has any chance of wrapping, flush
diff --git a/fs/reiserfs/lbalance.c b/fs/reiserfs/lbalance.c
index 814dda3ec998..249594a821e0 100644
--- a/fs/reiserfs/lbalance.c
+++ b/fs/reiserfs/lbalance.c
@@ -899,8 +899,9 @@ void leaf_delete_items(struct buffer_info *cur_bi, int last_first,
 
 /* insert item into the leaf node in position before */
 void leaf_insert_into_buf(struct buffer_info *bi, int before,
-			  struct item_head *inserted_item_ih,
-			  const char *inserted_item_body, int zeros_number)
+			  struct item_head * const inserted_item_ih,
+			  const char * const inserted_item_body,
+			  int zeros_number)
 {
 	struct buffer_head *bh = bi->bi_bh;
 	int nr, free_space;
diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h
index bf53888c7f59..735c2c2b4536 100644
--- a/fs/reiserfs/reiserfs.h
+++ b/fs/reiserfs/reiserfs.h
@@ -3216,11 +3216,12 @@ int leaf_shift_right(struct tree_balance *tb, int shift_num, int shift_bytes);
 void leaf_delete_items(struct buffer_info *cur_bi, int last_first, int first,
 		       int del_num, int del_bytes);
 void leaf_insert_into_buf(struct buffer_info *bi, int before,
-			  struct item_head *inserted_item_ih,
-			  const char *inserted_item_body, int zeros_number);
-void leaf_paste_in_buffer(struct buffer_info *bi, int pasted_item_num,
-			  int pos_in_item, int paste_size, const char *body,
+			  struct item_head * const inserted_item_ih,
+			  const char * const inserted_item_body,
 			  int zeros_number);
+void leaf_paste_in_buffer(struct buffer_info *bi, int pasted_item_num,
+			  int pos_in_item, int paste_size,
+			  const char * const body, int zeros_number);
 void leaf_cut_from_buffer(struct buffer_info *bi, int cut_item_num,
 			  int pos_in_item, int cut_size);
 void leaf_paste_entries(struct buffer_info *bi, int item_num, int before,
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 709ea92d716f..d46e88a33b02 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -100,7 +100,11 @@ void reiserfs_schedule_old_flush(struct super_block *s)
 	struct reiserfs_sb_info *sbi = REISERFS_SB(s);
 	unsigned long delay;
 
-	if (s->s_flags & MS_RDONLY)
+	/*
+	 * Avoid scheduling flush when sb is being shut down. It can race
+	 * with journal shutdown and free still queued delayed work.
+	 */
+	if (s->s_flags & MS_RDONLY || !(s->s_flags & MS_ACTIVE))
 		return;
 
 	spin_lock(&sbi->old_work_lock);
diff --git a/fs/super.c b/fs/super.c
index d20d5b11dedf..b9a214d2fe98 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -22,7 +22,6 @@
 
 #include <linux/export.h>
 #include <linux/slab.h>
-#include <linux/acct.h>
 #include <linux/blkdev.h>
 #include <linux/mount.h>
 #include <linux/security.h>
@@ -218,7 +217,6 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
 	lockdep_set_class(&s->s_vfs_rename_mutex, &type->s_vfs_rename_key);
 	mutex_init(&s->s_dquot.dqio_mutex);
 	mutex_init(&s->s_dquot.dqonoff_mutex);
-	init_rwsem(&s->s_dquot.dqptr_sem);
 	s->s_maxbytes = MAX_NON_LFS;
 	s->s_op = &default_op;
 	s->s_time_gran = 1000000000;
@@ -702,12 +700,22 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
 		return -EACCES;
 #endif
 
-	if (flags & MS_RDONLY)
-		acct_auto_close(sb);
-	shrink_dcache_sb(sb);
-
 	remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY);
 
+	if (remount_ro) {
+		if (sb->s_pins.first) {
+			up_write(&sb->s_umount);
+			sb_pin_kill(sb);
+			down_write(&sb->s_umount);
+			if (!sb->s_root)
+				return 0;
+			if (sb->s_writers.frozen != SB_UNFROZEN)
+				return -EBUSY;
+			remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY);
+		}
+	}
+	shrink_dcache_sb(sb);
+
 	/* If we are remounting RDONLY and current sb is read/write,
 	   make sure there are no rw files opened */
 	if (remount_ro) {
diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c
index ff8229340cd5..aa13ad053b14 100644
--- a/fs/ubifs/commit.c
+++ b/fs/ubifs/commit.c
@@ -174,7 +174,6 @@ static int do_commit(struct ubifs_info *c)
 	if (err)
 		goto out;
 
-	mutex_lock(&c->mst_mutex);
 	c->mst_node->cmt_no      = cpu_to_le64(c->cmt_no);
 	c->mst_node->log_lnum    = cpu_to_le32(new_ltail_lnum);
 	c->mst_node->root_lnum   = cpu_to_le32(zroot.lnum);
@@ -204,7 +203,6 @@ static int do_commit(struct ubifs_info *c)
 	else
 		c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_NO_ORPHS);
 	err = ubifs_write_master(c);
-	mutex_unlock(&c->mst_mutex);
 	if (err)
 		goto out;
 
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index 2290d5866725..fb08b0c514b6 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -431,7 +431,7 @@ void ubifs_prep_grp_node(struct ubifs_info *c, void *node, int len, int last)
 
 /**
  * wbuf_timer_callback - write-buffer timer callback function.
- * @data: timer data (write-buffer descriptor)
+ * @timer: timer data (write-buffer descriptor)
  *
  * This function is called when the write-buffer timer expires.
  */
diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c
index a902c5919e42..a47ddfc9be6b 100644
--- a/fs/ubifs/log.c
+++ b/fs/ubifs/log.c
@@ -240,6 +240,7 @@ int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs)
 
 	if (c->lhead_offs > c->leb_size - c->ref_node_alsz) {
 		c->lhead_lnum = ubifs_next_log_lnum(c, c->lhead_lnum);
+		ubifs_assert(c->lhead_lnum != c->ltail_lnum);
 		c->lhead_offs = 0;
 	}
 
@@ -404,15 +405,14 @@ int ubifs_log_start_commit(struct ubifs_info *c, int *ltail_lnum)
 	/* Switch to the next log LEB */
 	if (c->lhead_offs) {
 		c->lhead_lnum = ubifs_next_log_lnum(c, c->lhead_lnum);
+		ubifs_assert(c->lhead_lnum != c->ltail_lnum);
 		c->lhead_offs = 0;
 	}
 
-	if (c->lhead_offs == 0) {
-		/* Must ensure next LEB has been unmapped */
-		err = ubifs_leb_unmap(c, c->lhead_lnum);
-		if (err)
-			goto out;
-	}
+	/* Must ensure next LEB has been unmapped */
+	err = ubifs_leb_unmap(c, c->lhead_lnum);
+	if (err)
+		goto out;
 
 	len = ALIGN(len, c->min_io_size);
 	dbg_log("writing commit start at LEB %d:0, len %d", c->lhead_lnum, len);
diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c
index d46b19ec1815..421bd0a80424 100644
--- a/fs/ubifs/lpt.c
+++ b/fs/ubifs/lpt.c
@@ -1464,7 +1464,6 @@ struct ubifs_lprops *ubifs_lpt_lookup(struct ubifs_info *c, int lnum)
 			return ERR_CAST(nnode);
 	}
 	iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1));
-	shft -= UBIFS_LPT_FANOUT_SHIFT;
 	pnode = ubifs_get_pnode(c, nnode, iip);
 	if (IS_ERR(pnode))
 		return ERR_CAST(pnode);
@@ -1604,7 +1603,6 @@ struct ubifs_lprops *ubifs_lpt_lookup_dirty(struct ubifs_info *c, int lnum)
 			return ERR_CAST(nnode);
 	}
 	iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1));
-	shft -= UBIFS_LPT_FANOUT_SHIFT;
 	pnode = ubifs_get_pnode(c, nnode, iip);
 	if (IS_ERR(pnode))
 		return ERR_CAST(pnode);
@@ -1964,7 +1962,6 @@ again:
 		}
 	}
 	iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1));
-	shft -= UBIFS_LPT_FANOUT_SHIFT;
 	pnode = scan_get_pnode(c, path + h, nnode, iip);
 	if (IS_ERR(pnode)) {
 		err = PTR_ERR(pnode);
@@ -2198,6 +2195,7 @@ static int dbg_chk_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
 					  lprops->dirty);
 				return -EINVAL;
 			}
+			break;
 		case LPROPS_FREEABLE:
 		case LPROPS_FRDI_IDX:
 			if (lprops->free + lprops->dirty != c->leb_size) {
@@ -2206,6 +2204,7 @@ static int dbg_chk_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
 					  lprops->dirty);
 				return -EINVAL;
 			}
+			break;
 		}
 	}
 	return 0;
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index 45d4e96a6bac..d9c02928e992 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -304,7 +304,6 @@ static int layout_cnodes(struct ubifs_info *c)
 			ubifs_assert(lnum >= c->lpt_first &&
 				     lnum <= c->lpt_last);
 		}
-		done_ltab = 1;
 		c->ltab_lnum = lnum;
 		c->ltab_offs = offs;
 		offs += c->ltab_sz;
@@ -514,7 +513,6 @@ static int write_cnodes(struct ubifs_info *c)
 			if (err)
 				return err;
 		}
-		done_ltab = 1;
 		ubifs_pack_ltab(c, buf + offs, c->ltab_cmt);
 		offs += c->ltab_sz;
 		dbg_chk_lpt_sz(c, 1, c->ltab_sz);
@@ -1941,6 +1939,11 @@ static void dump_lpt_leb(const struct ubifs_info *c, int lnum)
 				pr_err("LEB %d:%d, nnode, ",
 				       lnum, offs);
 			err = ubifs_unpack_nnode(c, p, &nnode);
+			if (err) {
+				pr_err("failed to unpack_node, error %d\n",
+				       err);
+				break;
+			}
 			for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
 				pr_cont("%d:%d", nnode.nbranch[i].lnum,
 				       nnode.nbranch[i].offs);
diff --git a/fs/ubifs/master.c b/fs/ubifs/master.c
index ab83ace9910a..1a4bb9e8b3b8 100644
--- a/fs/ubifs/master.c
+++ b/fs/ubifs/master.c
@@ -352,10 +352,9 @@ int ubifs_read_master(struct ubifs_info *c)
  * ubifs_write_master - write master node.
  * @c: UBIFS file-system description object
  *
- * This function writes the master node. The caller has to take the
- * @c->mst_mutex lock before calling this function. Returns zero in case of
- * success and a negative error code in case of failure. The master node is
- * written twice to enable recovery.
+ * This function writes the master node. Returns zero in case of success and a
+ * negative error code in case of failure. The master node is written twice to
+ * enable recovery.
  */
 int ubifs_write_master(struct ubifs_info *c)
 {
diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c
index f1c3e5a1b315..4409f486ecef 100644
--- a/fs/ubifs/orphan.c
+++ b/fs/ubifs/orphan.c
@@ -346,7 +346,6 @@ static int write_orph_nodes(struct ubifs_info *c, int atomic)
 		int lnum;
 
 		/* Unmap any unused LEBs after consolidation */
-		lnum = c->ohead_lnum + 1;
 		for (lnum = c->ohead_lnum + 1; lnum <= c->orph_last; lnum++) {
 			err = ubifs_leb_unmap(c, lnum);
 			if (err)
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index c14adb2f420c..c640938f62f0 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -596,7 +596,6 @@ static void drop_last_group(struct ubifs_scan_leb *sleb, int *offs)
  * drop_last_node - drop the last node.
  * @sleb: scanned LEB information
  * @offs: offset of dropped nodes is returned here
- * @grouped: non-zero if whole group of nodes have to be dropped
  *
  * This is a helper function for 'ubifs_recover_leb()' which drops the last
  * node of the scanned LEB.
@@ -629,8 +628,8 @@ static void drop_last_node(struct ubifs_scan_leb *sleb, int *offs)
  *
  * This function does a scan of a LEB, but caters for errors that might have
  * been caused by the unclean unmount from which we are attempting to recover.
- * Returns %0 in case of success, %-EUCLEAN if an unrecoverable corruption is
- * found, and a negative error code in case of failure.
+ * Returns the scanned information on success and a negative error code on
+ * failure.
  */
 struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
 					 int offs, void *sbuf, int jhead)
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index 4c37607a958e..79c6dbbc0e04 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -332,6 +332,8 @@ static int create_default_filesystem(struct ubifs_info *c)
 	cs->ch.node_type = UBIFS_CS_NODE;
 	err = ubifs_write_node(c, cs, UBIFS_CS_NODE_SZ, UBIFS_LOG_LNUM, 0);
 	kfree(cs);
+	if (err)
+		return err;
 
 	ubifs_msg("default file-system created");
 	return 0;
@@ -447,7 +449,7 @@ static int validate_sb(struct ubifs_info *c, struct ubifs_sb_node *sup)
 		goto failed;
 	}
 
-	if (c->default_compr < 0 || c->default_compr >= UBIFS_COMPR_TYPES_CNT) {
+	if (c->default_compr >= UBIFS_COMPR_TYPES_CNT) {
 		err = 13;
 		goto failed;
 	}
diff --git a/fs/ubifs/scan.c b/fs/ubifs/scan.c
index 58aa05df2bb6..89adbc4d08ac 100644
--- a/fs/ubifs/scan.c
+++ b/fs/ubifs/scan.c
@@ -131,7 +131,8 @@ int ubifs_scan_a_node(const struct ubifs_info *c, void *buf, int len, int lnum,
  * @offs: offset to start at (usually zero)
  * @sbuf: scan buffer (must be c->leb_size)
  *
- * This function returns %0 on success and a negative error code on failure.
+ * This function returns the scanned information on success and a negative error
+ * code on failure.
  */
 struct ubifs_scan_leb *ubifs_start_scan(const struct ubifs_info *c, int lnum,
 					int offs, void *sbuf)
@@ -157,9 +158,10 @@ struct ubifs_scan_leb *ubifs_start_scan(const struct ubifs_info *c, int lnum,
 		return ERR_PTR(err);
 	}
 
-	if (err == -EBADMSG)
-		sleb->ecc = 1;
-
+	/*
+	 * Note, we ignore integrity errors (EBASMSG) because all the nodes are
+	 * protected by CRC checksums.
+	 */
 	return sleb;
 }
 
@@ -169,8 +171,6 @@ struct ubifs_scan_leb *ubifs_start_scan(const struct ubifs_info *c, int lnum,
  * @sleb: scanning information
  * @lnum: logical eraseblock number
  * @offs: offset to start at (usually zero)
- *
- * This function returns %0 on success and a negative error code on failure.
  */
 void ubifs_end_scan(const struct ubifs_info *c, struct ubifs_scan_leb *sleb,
 		    int lnum, int offs)
@@ -257,7 +257,7 @@ void ubifs_scanned_corruption(const struct ubifs_info *c, int lnum, int offs,
  * @quiet: print no messages
  *
  * This function scans LEB number @lnum and returns complete information about
- * its contents. Returns the scaned information in case of success and,
+ * its contents. Returns the scanned information in case of success and,
  * %-EUCLEAN if the LEB neads recovery, and other negative error codes in case
  * of failure.
  *
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 3904c8574ef9..106bf20629ce 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -75,7 +75,7 @@ static int validate_inode(struct ubifs_info *c, const struct inode *inode)
 		return 1;
 	}
 
-	if (ui->compr_type < 0 || ui->compr_type >= UBIFS_COMPR_TYPES_CNT) {
+	if (ui->compr_type >= UBIFS_COMPR_TYPES_CNT) {
 		ubifs_err("unknown compression type %d", ui->compr_type);
 		return 2;
 	}
@@ -424,19 +424,19 @@ static int ubifs_show_options(struct seq_file *s, struct dentry *root)
 	struct ubifs_info *c = root->d_sb->s_fs_info;
 
 	if (c->mount_opts.unmount_mode == 2)
-		seq_printf(s, ",fast_unmount");
+		seq_puts(s, ",fast_unmount");
 	else if (c->mount_opts.unmount_mode == 1)
-		seq_printf(s, ",norm_unmount");
+		seq_puts(s, ",norm_unmount");
 
 	if (c->mount_opts.bulk_read == 2)
-		seq_printf(s, ",bulk_read");
+		seq_puts(s, ",bulk_read");
 	else if (c->mount_opts.bulk_read == 1)
-		seq_printf(s, ",no_bulk_read");
+		seq_puts(s, ",no_bulk_read");
 
 	if (c->mount_opts.chk_data_crc == 2)
-		seq_printf(s, ",chk_data_crc");
+		seq_puts(s, ",chk_data_crc");
 	else if (c->mount_opts.chk_data_crc == 1)
-		seq_printf(s, ",no_chk_data_crc");
+		seq_puts(s, ",no_chk_data_crc");
 
 	if (c->mount_opts.override_compr) {
 		seq_printf(s, ",compr=%s",
@@ -796,8 +796,8 @@ static int alloc_wbufs(struct ubifs_info *c)
 {
 	int i, err;
 
-	c->jheads = kzalloc(c->jhead_cnt * sizeof(struct ubifs_jhead),
-			   GFP_KERNEL);
+	c->jheads = kcalloc(c->jhead_cnt, sizeof(struct ubifs_jhead),
+			    GFP_KERNEL);
 	if (!c->jheads)
 		return -ENOMEM;
 
@@ -1963,7 +1963,6 @@ static struct ubifs_info *alloc_ubifs_info(struct ubi_volume_desc *ubi)
 		mutex_init(&c->lp_mutex);
 		mutex_init(&c->tnc_mutex);
 		mutex_init(&c->log_mutex);
-		mutex_init(&c->mst_mutex);
 		mutex_init(&c->umount_mutex);
 		mutex_init(&c->bu_mutex);
 		mutex_init(&c->write_reserve_mutex);
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index 8a40cf9c02d7..6793db0754f6 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -3294,7 +3294,6 @@ int dbg_check_inode_size(struct ubifs_info *c, const struct inode *inode,
 		goto out_unlock;
 
 	if (err) {
-		err = -EINVAL;
 		key = &from_key;
 		goto out_dump;
 	}
diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c
index 3600994f8411..7a205e046776 100644
--- a/fs/ubifs/tnc_commit.c
+++ b/fs/ubifs/tnc_commit.c
@@ -389,7 +389,6 @@ static int layout_in_gaps(struct ubifs_info *c, int cnt)
 				ubifs_dump_lprops(c);
 			}
 			/* Try to commit anyway */
-			err = 0;
 			break;
 		}
 		p++;
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index c1f71fe17cc0..c4fe900c67ab 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -314,7 +314,6 @@ struct ubifs_scan_node {
  * @nodes_cnt: number of nodes scanned
  * @nodes: list of struct ubifs_scan_node
  * @endpt: end point (and therefore the start of empty space)
- * @ecc: read returned -EBADMSG
  * @buf: buffer containing entire LEB scanned
  */
 struct ubifs_scan_leb {
@@ -322,7 +321,6 @@ struct ubifs_scan_leb {
 	int nodes_cnt;
 	struct list_head nodes;
 	int endpt;
-	int ecc;
 	void *buf;
 };
 
@@ -1051,7 +1049,6 @@ struct ubifs_debug_info;
  *
  * @mst_node: master node
  * @mst_offs: offset of valid master node
- * @mst_mutex: protects the master node area, @mst_node, and @mst_offs
  *
  * @max_bu_buf_len: maximum bulk-read buffer length
  * @bu_mutex: protects the pre-allocated bulk-read buffer and @c->bu
@@ -1292,7 +1289,6 @@ struct ubifs_info {
 
 	struct ubifs_mst_node *mst_node;
 	int mst_offs;
-	struct mutex mst_mutex;
 
 	int max_bu_buf_len;
 	struct mutex bu_mutex;
diff --git a/fs/udf/file.c b/fs/udf/file.c
index d80738fdf424..86c6743ec1fe 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -27,7 +27,7 @@
 
 #include "udfdecl.h"
 #include <linux/fs.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 #include <linux/kernel.h>
 #include <linux/string.h> /* memset */
 #include <linux/capability.h>
@@ -100,24 +100,6 @@ static int udf_adinicb_write_begin(struct file *file,
 	return 0;
 }
 
-static int udf_adinicb_write_end(struct file *file,
-			struct address_space *mapping,
-			loff_t pos, unsigned len, unsigned copied,
-			struct page *page, void *fsdata)
-{
-	struct inode *inode = mapping->host;
-	unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
-	char *kaddr;
-	struct udf_inode_info *iinfo = UDF_I(inode);
-
-	kaddr = kmap_atomic(page);
-	memcpy(iinfo->i_ext.i_data + iinfo->i_lenEAttr + offset,
-		kaddr + offset, copied);
-	kunmap_atomic(kaddr);
-
-	return simple_write_end(file, mapping, pos, len, copied, page, fsdata);
-}
-
 static ssize_t udf_adinicb_direct_IO(int rw, struct kiocb *iocb,
 				     struct iov_iter *iter,
 				     loff_t offset)
@@ -130,7 +112,7 @@ const struct address_space_operations udf_adinicb_aops = {
 	.readpage	= udf_adinicb_readpage,
 	.writepage	= udf_adinicb_writepage,
 	.write_begin	= udf_adinicb_write_begin,
-	.write_end	= udf_adinicb_write_end,
+	.write_end	= simple_write_end,
 	.direct_IO	= udf_adinicb_direct_IO,
 };
 
diff --git a/fs/udf/lowlevel.c b/fs/udf/lowlevel.c
index 6583fe9b0645..6ad5a453af97 100644
--- a/fs/udf/lowlevel.c
+++ b/fs/udf/lowlevel.c
@@ -21,7 +21,7 @@
 
 #include <linux/blkdev.h>
 #include <linux/cdrom.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 
 #include "udf_sb.h"
 
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 3286db047a40..813da94d447b 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -63,7 +63,7 @@
 #include "udf_i.h"
 
 #include <linux/init.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 
 #define VDS_POS_PRIMARY_VOL_DESC	0
 #define VDS_POS_UNALLOC_SPACE_DESC	1
diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c
index d7c6dbe4194b..6fb7945c1e6e 100644
--- a/fs/udf/symlink.c
+++ b/fs/udf/symlink.c
@@ -20,7 +20,7 @@
  */
 
 #include "udfdecl.h"
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 #include <linux/errno.h>
 #include <linux/fs.h>
 #include <linux/time.h>
diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c
index 44b815e57f94..afd470e588ff 100644
--- a/fs/udf/unicode.c
+++ b/fs/udf/unicode.c
@@ -412,7 +412,6 @@ static int udf_translate_to_linux(uint8_t *newName, uint8_t *udfName,
 	int extIndex = 0, newExtIndex = 0, hasExt = 0;
 	unsigned short valueCRC;
 	uint8_t curr;
-	const uint8_t hexChar[] = "0123456789ABCDEF";
 
 	if (udfName[0] == '.' &&
 	    (udfLen == 1 || (udfLen == 2 && udfName[1] == '.'))) {
@@ -477,10 +476,10 @@ static int udf_translate_to_linux(uint8_t *newName, uint8_t *udfName,
 			newIndex = 250;
 		newName[newIndex++] = CRC_MARK;
 		valueCRC = crc_itu_t(0, fidName, fidNameLen);
-		newName[newIndex++] = hexChar[(valueCRC & 0xf000) >> 12];
-		newName[newIndex++] = hexChar[(valueCRC & 0x0f00) >> 8];
-		newName[newIndex++] = hexChar[(valueCRC & 0x00f0) >> 4];
-		newName[newIndex++] = hexChar[(valueCRC & 0x000f)];
+		newName[newIndex++] = hex_asc_upper_hi(valueCRC >> 8);
+		newName[newIndex++] = hex_asc_upper_lo(valueCRC >> 8);
+		newName[newIndex++] = hex_asc_upper_hi(valueCRC);
+		newName[newIndex++] = hex_asc_upper_lo(valueCRC);
 
 		if (hasExt) {
 			newName[newIndex++] = EXT_MARK;
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index 399e8cec6e60..5d47b4df61ea 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -1,6 +1,7 @@
 config XFS_FS
 	tristate "XFS filesystem support"
 	depends on BLOCK
+	depends on (64BIT || LBDAF)
 	select EXPORTFS
 	select LIBCRC32C
 	help
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index c21f43506661..d61799949580 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -17,6 +17,7 @@
 #
 
 ccflags-y += -I$(src)			# needed for trace events
+ccflags-y += -I$(src)/libxfs
 
 ccflags-$(CONFIG_XFS_DEBUG) += -g
 
@@ -25,6 +26,39 @@ obj-$(CONFIG_XFS_FS)		+= xfs.o
 # this one should be compiled first, as the tracing macros can easily blow up
 xfs-y				+= xfs_trace.o
 
+# build the libxfs code first
+xfs-y				+= $(addprefix libxfs/, \
+				   xfs_alloc.o \
+				   xfs_alloc_btree.o \
+				   xfs_attr.o \
+				   xfs_attr_leaf.o \
+				   xfs_attr_remote.o \
+				   xfs_bmap.o \
+				   xfs_bmap_btree.o \
+				   xfs_btree.o \
+				   xfs_da_btree.o \
+				   xfs_da_format.o \
+				   xfs_dir2.o \
+				   xfs_dir2_block.o \
+				   xfs_dir2_data.o \
+				   xfs_dir2_leaf.o \
+				   xfs_dir2_node.o \
+				   xfs_dir2_sf.o \
+				   xfs_dquot_buf.o \
+				   xfs_ialloc.o \
+				   xfs_ialloc_btree.o \
+				   xfs_inode_fork.o \
+				   xfs_inode_buf.o \
+				   xfs_log_rlimit.o \
+				   xfs_sb.o \
+				   xfs_symlink_remote.o \
+				   xfs_trans_resv.o \
+				   )
+# xfs_rtbitmap is shared with libxfs
+xfs-$(CONFIG_XFS_RT)		+= $(addprefix libxfs/, \
+				   xfs_rtbitmap.o \
+				   )
+
 # highlevel code
 xfs-y				+= xfs_aops.o \
 				   xfs_attr_inactive.o \
@@ -45,53 +79,27 @@ xfs-y				+= xfs_aops.o \
 				   xfs_ioctl.o \
 				   xfs_iomap.o \
 				   xfs_iops.o \
+				   xfs_inode.o \
 				   xfs_itable.o \
 				   xfs_message.o \
 				   xfs_mount.o \
 				   xfs_mru_cache.o \
 				   xfs_super.o \
 				   xfs_symlink.o \
+				   xfs_sysfs.o \
 				   xfs_trans.o \
 				   xfs_xattr.o \
 				   kmem.o \
 				   uuid.o
 
-# code shared with libxfs
-xfs-y				+= xfs_alloc.o \
-				   xfs_alloc_btree.o \
-				   xfs_attr.o \
-				   xfs_attr_leaf.o \
-				   xfs_attr_remote.o \
-				   xfs_bmap.o \
-				   xfs_bmap_btree.o \
-				   xfs_btree.o \
-				   xfs_da_btree.o \
-				   xfs_da_format.o \
-				   xfs_dir2.o \
-				   xfs_dir2_block.o \
-				   xfs_dir2_data.o \
-				   xfs_dir2_leaf.o \
-				   xfs_dir2_node.o \
-				   xfs_dir2_sf.o \
-				   xfs_dquot_buf.o \
-				   xfs_ialloc.o \
-				   xfs_ialloc_btree.o \
-				   xfs_icreate_item.o \
-				   xfs_inode.o \
-				   xfs_inode_fork.o \
-				   xfs_inode_buf.o \
-				   xfs_log_recover.o \
-				   xfs_log_rlimit.o \
-				   xfs_sb.o \
-				   xfs_symlink_remote.o \
-				   xfs_trans_resv.o
-
 # low-level transaction/log code
 xfs-y				+= xfs_log.o \
 				   xfs_log_cil.o \
 				   xfs_buf_item.o \
 				   xfs_extfree_item.o \
+				   xfs_icreate_item.o \
 				   xfs_inode_item.o \
+				   xfs_log_recover.o \
 				   xfs_trans_ail.o \
 				   xfs_trans_buf.o \
 				   xfs_trans_extfree.o \
@@ -107,8 +115,7 @@ xfs-$(CONFIG_XFS_QUOTA)		+= xfs_dquot.o \
 				   xfs_quotaops.o
 
 # xfs_rtbitmap is shared with libxfs
-xfs-$(CONFIG_XFS_RT)		+= xfs_rtalloc.o \
-				   xfs_rtbitmap.o
+xfs-$(CONFIG_XFS_RT)		+= xfs_rtalloc.o
 
 xfs-$(CONFIG_XFS_POSIX_ACL)	+= xfs_acl.o
 xfs-$(CONFIG_PROC_FS)		+= xfs_stats.o
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h
index 6e247a99f5db..6e247a99f5db 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/libxfs/xfs_ag.h
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index d43813267a80..4bffffe038a1 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -483,9 +483,9 @@ xfs_agfl_read_verify(
 		return;
 
 	if (!xfs_buf_verify_cksum(bp, XFS_AGFL_CRC_OFF))
-		xfs_buf_ioerror(bp, EFSBADCRC);
+		xfs_buf_ioerror(bp, -EFSBADCRC);
 	else if (!xfs_agfl_verify(bp))
-		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		xfs_buf_ioerror(bp, -EFSCORRUPTED);
 
 	if (bp->b_error)
 		xfs_verifier_error(bp);
@@ -503,7 +503,7 @@ xfs_agfl_write_verify(
 		return;
 
 	if (!xfs_agfl_verify(bp)) {
-		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		xfs_buf_ioerror(bp, -EFSCORRUPTED);
 		xfs_verifier_error(bp);
 		return;
 	}
@@ -559,7 +559,7 @@ xfs_alloc_update_counters(
 	xfs_trans_agblocks_delta(tp, len);
 	if (unlikely(be32_to_cpu(agf->agf_freeblks) >
 		     be32_to_cpu(agf->agf_length)))
-		return EFSCORRUPTED;
+		return -EFSCORRUPTED;
 
 	xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS);
 	return 0;
@@ -2234,11 +2234,11 @@ xfs_agf_read_verify(
 
 	if (xfs_sb_version_hascrc(&mp->m_sb) &&
 	    !xfs_buf_verify_cksum(bp, XFS_AGF_CRC_OFF))
-		xfs_buf_ioerror(bp, EFSBADCRC);
+		xfs_buf_ioerror(bp, -EFSBADCRC);
 	else if (XFS_TEST_ERROR(!xfs_agf_verify(mp, bp), mp,
 				XFS_ERRTAG_ALLOC_READ_AGF,
 				XFS_RANDOM_ALLOC_READ_AGF))
-		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		xfs_buf_ioerror(bp, -EFSCORRUPTED);
 
 	if (bp->b_error)
 		xfs_verifier_error(bp);
@@ -2252,7 +2252,7 @@ xfs_agf_write_verify(
 	struct xfs_buf_log_item	*bip = bp->b_fspriv;
 
 	if (!xfs_agf_verify(mp, bp)) {
-		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		xfs_buf_ioerror(bp, -EFSCORRUPTED);
 		xfs_verifier_error(bp);
 		return;
 	}
@@ -2601,11 +2601,11 @@ xfs_free_extent(
 	 */
 	args.agno = XFS_FSB_TO_AGNO(args.mp, bno);
 	if (args.agno >= args.mp->m_sb.sb_agcount)
-		return EFSCORRUPTED;
+		return -EFSCORRUPTED;
 
 	args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno);
 	if (args.agbno >= args.mp->m_sb.sb_agblocks)
-		return EFSCORRUPTED;
+		return -EFSCORRUPTED;
 
 	args.pag = xfs_perag_get(args.mp, args.agno);
 	ASSERT(args.pag);
@@ -2617,7 +2617,7 @@ xfs_free_extent(
 	/* validate the extent size is legal now we have the agf locked */
 	if (args.agbno + len >
 			be32_to_cpu(XFS_BUF_TO_AGF(args.agbp)->agf_length)) {
-		error = EFSCORRUPTED;
+		error = -EFSCORRUPTED;
 		goto error0;
 	}
 
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index feacb061bab7..feacb061bab7 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c
index 8358f1ded94d..e0e83e24d3ef 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/libxfs/xfs_alloc_btree.c
@@ -355,9 +355,9 @@ xfs_allocbt_read_verify(
 	struct xfs_buf	*bp)
 {
 	if (!xfs_btree_sblock_verify_crc(bp))
-		xfs_buf_ioerror(bp, EFSBADCRC);
+		xfs_buf_ioerror(bp, -EFSBADCRC);
 	else if (!xfs_allocbt_verify(bp))
-		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		xfs_buf_ioerror(bp, -EFSCORRUPTED);
 
 	if (bp->b_error) {
 		trace_xfs_btree_corrupt(bp, _RET_IP_);
@@ -371,7 +371,7 @@ xfs_allocbt_write_verify(
 {
 	if (!xfs_allocbt_verify(bp)) {
 		trace_xfs_btree_corrupt(bp, _RET_IP_);
-		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		xfs_buf_ioerror(bp, -EFSCORRUPTED);
 		xfs_verifier_error(bp);
 		return;
 	}
diff --git a/fs/xfs/xfs_alloc_btree.h b/fs/xfs/libxfs/xfs_alloc_btree.h
index 45e189e7e81c..45e189e7e81c 100644
--- a/fs/xfs/xfs_alloc_btree.h
+++ b/fs/xfs/libxfs/xfs_alloc_btree.h
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index bfe36fc2cdc2..353fb425faef 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -85,7 +85,7 @@ xfs_attr_args_init(
 {
 
 	if (!name)
-		return EINVAL;
+		return -EINVAL;
 
 	memset(args, 0, sizeof(*args));
 	args->geo = dp->i_mount->m_attr_geo;
@@ -95,7 +95,7 @@ xfs_attr_args_init(
 	args->name = name;
 	args->namelen = strlen((const char *)name);
 	if (args->namelen >= MAXNAMELEN)
-		return EFAULT;		/* match IRIX behaviour */
+		return -EFAULT;		/* match IRIX behaviour */
 
 	args->hashval = xfs_da_hashname(args->name, args->namelen);
 	return 0;
@@ -131,10 +131,10 @@ xfs_attr_get(
 	XFS_STATS_INC(xs_attr_get);
 
 	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
-		return EIO;
+		return -EIO;
 
 	if (!xfs_inode_hasattr(ip))
-		return ENOATTR;
+		return -ENOATTR;
 
 	error = xfs_attr_args_init(&args, ip, name, flags);
 	if (error)
@@ -145,7 +145,7 @@ xfs_attr_get(
 
 	lock_mode = xfs_ilock_attr_map_shared(ip);
 	if (!xfs_inode_hasattr(ip))
-		error = ENOATTR;
+		error = -ENOATTR;
 	else if (ip->i_d.di_aformat == XFS_DINODE_FMT_LOCAL)
 		error = xfs_attr_shortform_getvalue(&args);
 	else if (xfs_bmap_one_block(ip, XFS_ATTR_FORK))
@@ -155,7 +155,7 @@ xfs_attr_get(
 	xfs_iunlock(ip, lock_mode);
 
 	*valuelenp = args.valuelen;
-	return error == EEXIST ? 0 : error;
+	return error == -EEXIST ? 0 : error;
 }
 
 /*
@@ -213,7 +213,7 @@ xfs_attr_set(
 	XFS_STATS_INC(xs_attr_set);
 
 	if (XFS_FORCED_SHUTDOWN(dp->i_mount))
-		return EIO;
+		return -EIO;
 
 	error = xfs_attr_args_init(&args, dp, name, flags);
 	if (error)
@@ -304,7 +304,7 @@ xfs_attr_set(
 		 * the inode.
 		 */
 		error = xfs_attr_shortform_addname(&args);
-		if (error != ENOSPC) {
+		if (error != -ENOSPC) {
 			/*
 			 * Commit the shortform mods, and we're done.
 			 * NOTE: this is also the error path (EEXIST, etc).
@@ -419,10 +419,10 @@ xfs_attr_remove(
 	XFS_STATS_INC(xs_attr_remove);
 
 	if (XFS_FORCED_SHUTDOWN(dp->i_mount))
-		return EIO;
+		return -EIO;
 
 	if (!xfs_inode_hasattr(dp))
-		return ENOATTR;
+		return -ENOATTR;
 
 	error = xfs_attr_args_init(&args, dp, name, flags);
 	if (error)
@@ -477,7 +477,7 @@ xfs_attr_remove(
 	xfs_trans_ijoin(args.trans, dp, 0);
 
 	if (!xfs_inode_hasattr(dp)) {
-		error = XFS_ERROR(ENOATTR);
+		error = -ENOATTR;
 	} else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
 		ASSERT(dp->i_afp->if_flags & XFS_IFINLINE);
 		error = xfs_attr_shortform_remove(&args);
@@ -534,28 +534,28 @@ xfs_attr_shortform_addname(xfs_da_args_t *args)
 	trace_xfs_attr_sf_addname(args);
 
 	retval = xfs_attr_shortform_lookup(args);
-	if ((args->flags & ATTR_REPLACE) && (retval == ENOATTR)) {
-		return(retval);
-	} else if (retval == EEXIST) {
+	if ((args->flags & ATTR_REPLACE) && (retval == -ENOATTR)) {
+		return retval;
+	} else if (retval == -EEXIST) {
 		if (args->flags & ATTR_CREATE)
-			return(retval);
+			return retval;
 		retval = xfs_attr_shortform_remove(args);
 		ASSERT(retval == 0);
 	}
 
 	if (args->namelen >= XFS_ATTR_SF_ENTSIZE_MAX ||
 	    args->valuelen >= XFS_ATTR_SF_ENTSIZE_MAX)
-		return(XFS_ERROR(ENOSPC));
+		return -ENOSPC;
 
 	newsize = XFS_ATTR_SF_TOTSIZE(args->dp);
 	newsize += XFS_ATTR_SF_ENTSIZE_BYNAME(args->namelen, args->valuelen);
 
 	forkoff = xfs_attr_shortform_bytesfit(args->dp, newsize);
 	if (!forkoff)
-		return(XFS_ERROR(ENOSPC));
+		return -ENOSPC;
 
 	xfs_attr_shortform_add(args, forkoff);
-	return(0);
+	return 0;
 }
 
 
@@ -592,10 +592,10 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
 	 * the given flags produce an error or call for an atomic rename.
 	 */
 	retval = xfs_attr3_leaf_lookup_int(bp, args);
-	if ((args->flags & ATTR_REPLACE) && (retval == ENOATTR)) {
+	if ((args->flags & ATTR_REPLACE) && (retval == -ENOATTR)) {
 		xfs_trans_brelse(args->trans, bp);
 		return retval;
-	} else if (retval == EEXIST) {
+	} else if (retval == -EEXIST) {
 		if (args->flags & ATTR_CREATE) {	/* pure create op */
 			xfs_trans_brelse(args->trans, bp);
 			return retval;
@@ -626,7 +626,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
 	 * if required.
 	 */
 	retval = xfs_attr3_leaf_add(bp, args);
-	if (retval == ENOSPC) {
+	if (retval == -ENOSPC) {
 		/*
 		 * Promote the attribute list to the Btree format, then
 		 * Commit that transaction so that the node_addname() call
@@ -642,7 +642,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
 			ASSERT(committed);
 			args->trans = NULL;
 			xfs_bmap_cancel(args->flist);
-			return(error);
+			return error;
 		}
 
 		/*
@@ -658,13 +658,13 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
 		 */
 		error = xfs_trans_roll(&args->trans, dp);
 		if (error)
-			return (error);
+			return error;
 
 		/*
 		 * Fob the whole rest of the problem off on the Btree code.
 		 */
 		error = xfs_attr_node_addname(args);
-		return(error);
+		return error;
 	}
 
 	/*
@@ -673,7 +673,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
 	 */
 	error = xfs_trans_roll(&args->trans, dp);
 	if (error)
-		return (error);
+		return error;
 
 	/*
 	 * If there was an out-of-line value, allocate the blocks we
@@ -684,7 +684,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
 	if (args->rmtblkno > 0) {
 		error = xfs_attr_rmtval_set(args);
 		if (error)
-			return(error);
+			return error;
 	}
 
 	/*
@@ -700,7 +700,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
 		 */
 		error = xfs_attr3_leaf_flipflags(args);
 		if (error)
-			return(error);
+			return error;
 
 		/*
 		 * Dismantle the "old" attribute/value pair by removing
@@ -714,7 +714,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
 		if (args->rmtblkno) {
 			error = xfs_attr_rmtval_remove(args);
 			if (error)
-				return(error);
+				return error;
 		}
 
 		/*
@@ -744,7 +744,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
 				ASSERT(committed);
 				args->trans = NULL;
 				xfs_bmap_cancel(args->flist);
-				return(error);
+				return error;
 			}
 
 			/*
@@ -795,7 +795,7 @@ xfs_attr_leaf_removename(xfs_da_args_t *args)
 		return error;
 
 	error = xfs_attr3_leaf_lookup_int(bp, args);
-	if (error == ENOATTR) {
+	if (error == -ENOATTR) {
 		xfs_trans_brelse(args->trans, bp);
 		return error;
 	}
@@ -850,7 +850,7 @@ xfs_attr_leaf_get(xfs_da_args_t *args)
 		return error;
 
 	error = xfs_attr3_leaf_lookup_int(bp, args);
-	if (error != EEXIST)  {
+	if (error != -EEXIST)  {
 		xfs_trans_brelse(args->trans, bp);
 		return error;
 	}
@@ -906,9 +906,9 @@ restart:
 		goto out;
 	blk = &state->path.blk[ state->path.active-1 ];
 	ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
-	if ((args->flags & ATTR_REPLACE) && (retval == ENOATTR)) {
+	if ((args->flags & ATTR_REPLACE) && (retval == -ENOATTR)) {
 		goto out;
-	} else if (retval == EEXIST) {
+	} else if (retval == -EEXIST) {
 		if (args->flags & ATTR_CREATE)
 			goto out;
 
@@ -933,7 +933,7 @@ restart:
 	}
 
 	retval = xfs_attr3_leaf_add(blk->bp, state->args);
-	if (retval == ENOSPC) {
+	if (retval == -ENOSPC) {
 		if (state->path.active == 1) {
 			/*
 			 * Its really a single leaf node, but it had
@@ -1031,7 +1031,7 @@ restart:
 	if (args->rmtblkno > 0) {
 		error = xfs_attr_rmtval_set(args);
 		if (error)
-			return(error);
+			return error;
 	}
 
 	/*
@@ -1061,7 +1061,7 @@ restart:
 		if (args->rmtblkno) {
 			error = xfs_attr_rmtval_remove(args);
 			if (error)
-				return(error);
+				return error;
 		}
 
 		/*
@@ -1134,8 +1134,8 @@ out:
 	if (state)
 		xfs_da_state_free(state);
 	if (error)
-		return(error);
-	return(retval);
+		return error;
+	return retval;
 }
 
 /*
@@ -1168,7 +1168,7 @@ xfs_attr_node_removename(xfs_da_args_t *args)
 	 * Search to see if name exists, and get back a pointer to it.
 	 */
 	error = xfs_da3_node_lookup_int(state, &retval);
-	if (error || (retval != EEXIST)) {
+	if (error || (retval != -EEXIST)) {
 		if (error == 0)
 			error = retval;
 		goto out;
@@ -1297,7 +1297,7 @@ xfs_attr_node_removename(xfs_da_args_t *args)
 
 out:
 	xfs_da_state_free(state);
-	return(error);
+	return error;
 }
 
 /*
@@ -1345,7 +1345,7 @@ xfs_attr_fillstate(xfs_da_state_t *state)
 		}
 	}
 
-	return(0);
+	return 0;
 }
 
 /*
@@ -1376,7 +1376,7 @@ xfs_attr_refillstate(xfs_da_state_t *state)
 						blk->blkno, blk->disk_blkno,
 						&blk->bp, XFS_ATTR_FORK);
 			if (error)
-				return(error);
+				return error;
 		} else {
 			blk->bp = NULL;
 		}
@@ -1395,13 +1395,13 @@ xfs_attr_refillstate(xfs_da_state_t *state)
 						blk->blkno, blk->disk_blkno,
 						&blk->bp, XFS_ATTR_FORK);
 			if (error)
-				return(error);
+				return error;
 		} else {
 			blk->bp = NULL;
 		}
 	}
 
-	return(0);
+	return 0;
 }
 
 /*
@@ -1431,7 +1431,7 @@ xfs_attr_node_get(xfs_da_args_t *args)
 	error = xfs_da3_node_lookup_int(state, &retval);
 	if (error) {
 		retval = error;
-	} else if (retval == EEXIST) {
+	} else if (retval == -EEXIST) {
 		blk = &state->path.blk[ state->path.active-1 ];
 		ASSERT(blk->bp != NULL);
 		ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
@@ -1455,5 +1455,5 @@ xfs_attr_node_get(xfs_da_args_t *args)
 	}
 
 	xfs_da_state_free(state);
-	return(retval);
+	return retval;
 }
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index 28712d29e43c..b1f73dbbf3d8 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -214,7 +214,7 @@ xfs_attr3_leaf_write_verify(
 	struct xfs_attr3_leaf_hdr *hdr3 = bp->b_addr;
 
 	if (!xfs_attr3_leaf_verify(bp)) {
-		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		xfs_buf_ioerror(bp, -EFSCORRUPTED);
 		xfs_verifier_error(bp);
 		return;
 	}
@@ -242,9 +242,9 @@ xfs_attr3_leaf_read_verify(
 
 	if (xfs_sb_version_hascrc(&mp->m_sb) &&
 	     !xfs_buf_verify_cksum(bp, XFS_ATTR3_LEAF_CRC_OFF))
-		xfs_buf_ioerror(bp, EFSBADCRC);
+		xfs_buf_ioerror(bp, -EFSBADCRC);
 	else if (!xfs_attr3_leaf_verify(bp))
-		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		xfs_buf_ioerror(bp, -EFSCORRUPTED);
 
 	if (bp->b_error)
 		xfs_verifier_error(bp);
@@ -547,7 +547,7 @@ xfs_attr_shortform_remove(xfs_da_args_t *args)
 		break;
 	}
 	if (i == end)
-		return(XFS_ERROR(ENOATTR));
+		return -ENOATTR;
 
 	/*
 	 * Fix up the attribute fork data, covering the hole
@@ -582,7 +582,7 @@ xfs_attr_shortform_remove(xfs_da_args_t *args)
 
 	xfs_sbversion_add_attr2(mp, args->trans);
 
-	return(0);
+	return 0;
 }
 
 /*
@@ -611,9 +611,9 @@ xfs_attr_shortform_lookup(xfs_da_args_t *args)
 			continue;
 		if (!xfs_attr_namesp_match(args->flags, sfe->flags))
 			continue;
-		return(XFS_ERROR(EEXIST));
+		return -EEXIST;
 	}
-	return(XFS_ERROR(ENOATTR));
+	return -ENOATTR;
 }
 
 /*
@@ -640,18 +640,18 @@ xfs_attr_shortform_getvalue(xfs_da_args_t *args)
 			continue;
 		if (args->flags & ATTR_KERNOVAL) {
 			args->valuelen = sfe->valuelen;
-			return(XFS_ERROR(EEXIST));
+			return -EEXIST;
 		}
 		if (args->valuelen < sfe->valuelen) {
 			args->valuelen = sfe->valuelen;
-			return(XFS_ERROR(ERANGE));
+			return -ERANGE;
 		}
 		args->valuelen = sfe->valuelen;
 		memcpy(args->value, &sfe->nameval[args->namelen],
 						    args->valuelen);
-		return(XFS_ERROR(EEXIST));
+		return -EEXIST;
 	}
-	return(XFS_ERROR(ENOATTR));
+	return -ENOATTR;
 }
 
 /*
@@ -691,7 +691,7 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
 		 * If we hit an IO error middle of the transaction inside
 		 * grow_inode(), we may have inconsistent data. Bail out.
 		 */
-		if (error == EIO)
+		if (error == -EIO)
 			goto out;
 		xfs_idata_realloc(dp, size, XFS_ATTR_FORK);	/* try to put */
 		memcpy(ifp->if_u1.if_data, tmpbuffer, size);	/* it back */
@@ -730,9 +730,9 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
 						sfe->namelen);
 		nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(sfe->flags);
 		error = xfs_attr3_leaf_lookup_int(bp, &nargs); /* set a->index */
-		ASSERT(error == ENOATTR);
+		ASSERT(error == -ENOATTR);
 		error = xfs_attr3_leaf_add(bp, &nargs);
-		ASSERT(error != ENOSPC);
+		ASSERT(error != -ENOSPC);
 		if (error)
 			goto out;
 		sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
@@ -741,7 +741,7 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
 
 out:
 	kmem_free(tmpbuffer);
-	return(error);
+	return error;
 }
 
 /*
@@ -769,12 +769,12 @@ xfs_attr_shortform_allfit(
 		if (entry->flags & XFS_ATTR_INCOMPLETE)
 			continue;		/* don't copy partial entries */
 		if (!(entry->flags & XFS_ATTR_LOCAL))
-			return(0);
+			return 0;
 		name_loc = xfs_attr3_leaf_name_local(leaf, i);
 		if (name_loc->namelen >= XFS_ATTR_SF_ENTSIZE_MAX)
-			return(0);
+			return 0;
 		if (be16_to_cpu(name_loc->valuelen) >= XFS_ATTR_SF_ENTSIZE_MAX)
-			return(0);
+			return 0;
 		bytes += sizeof(struct xfs_attr_sf_entry) - 1
 				+ name_loc->namelen
 				+ be16_to_cpu(name_loc->valuelen);
@@ -809,7 +809,7 @@ xfs_attr3_leaf_to_shortform(
 
 	tmpbuffer = kmem_alloc(args->geo->blksize, KM_SLEEP);
 	if (!tmpbuffer)
-		return ENOMEM;
+		return -ENOMEM;
 
 	memcpy(tmpbuffer, bp->b_addr, args->geo->blksize);
 
@@ -1017,10 +1017,10 @@ xfs_attr3_leaf_split(
 	ASSERT(oldblk->magic == XFS_ATTR_LEAF_MAGIC);
 	error = xfs_da_grow_inode(state->args, &blkno);
 	if (error)
-		return(error);
+		return error;
 	error = xfs_attr3_leaf_create(state->args, blkno, &newblk->bp);
 	if (error)
-		return(error);
+		return error;
 	newblk->blkno = blkno;
 	newblk->magic = XFS_ATTR_LEAF_MAGIC;
 
@@ -1031,7 +1031,7 @@ xfs_attr3_leaf_split(
 	xfs_attr3_leaf_rebalance(state, oldblk, newblk);
 	error = xfs_da3_blk_link(state, oldblk, newblk);
 	if (error)
-		return(error);
+		return error;
 
 	/*
 	 * Save info on "old" attribute for "atomic rename" ops, leaf_add()
@@ -1053,7 +1053,7 @@ xfs_attr3_leaf_split(
 	 */
 	oldblk->hashval = xfs_attr_leaf_lasthash(oldblk->bp, NULL);
 	newblk->hashval = xfs_attr_leaf_lasthash(newblk->bp, NULL);
-	return(error);
+	return error;
 }
 
 /*
@@ -1108,7 +1108,7 @@ xfs_attr3_leaf_add(
 	 * no good and we should just give up.
 	 */
 	if (!ichdr.holes && sum < entsize)
-		return XFS_ERROR(ENOSPC);
+		return -ENOSPC;
 
 	/*
 	 * Compact the entries to coalesce free space.
@@ -1121,7 +1121,7 @@ xfs_attr3_leaf_add(
 	 * free region, in freemap[0].  If it is not big enough, give up.
 	 */
 	if (ichdr.freemap[0].size < (entsize + sizeof(xfs_attr_leaf_entry_t))) {
-		tmp = ENOSPC;
+		tmp = -ENOSPC;
 		goto out_log_hdr;
 	}
 
@@ -1692,7 +1692,7 @@ xfs_attr3_leaf_toosmall(
 		ichdr.usedbytes;
 	if (bytes > (state->args->geo->blksize >> 1)) {
 		*action = 0;	/* blk over 50%, don't try to join */
-		return(0);
+		return 0;
 	}
 
 	/*
@@ -1711,7 +1711,7 @@ xfs_attr3_leaf_toosmall(
 		error = xfs_da3_path_shift(state, &state->altpath, forward,
 						 0, &retval);
 		if (error)
-			return(error);
+			return error;
 		if (retval) {
 			*action = 0;
 		} else {
@@ -1740,7 +1740,7 @@ xfs_attr3_leaf_toosmall(
 		error = xfs_attr3_leaf_read(state->args->trans, state->args->dp,
 					blkno, -1, &bp);
 		if (error)
-			return(error);
+			return error;
 
 		xfs_attr3_leaf_hdr_from_disk(&ichdr2, bp->b_addr);
 
@@ -1757,7 +1757,7 @@ xfs_attr3_leaf_toosmall(
 	}
 	if (i >= 2) {
 		*action = 0;
-		return(0);
+		return 0;
 	}
 
 	/*
@@ -1773,13 +1773,13 @@ xfs_attr3_leaf_toosmall(
 						 0, &retval);
 	}
 	if (error)
-		return(error);
+		return error;
 	if (retval) {
 		*action = 0;
 	} else {
 		*action = 1;
 	}
-	return(0);
+	return 0;
 }
 
 /*
@@ -2123,7 +2123,7 @@ xfs_attr3_leaf_lookup_int(
 	}
 	if (probe == ichdr.count || be32_to_cpu(entry->hashval) != hashval) {
 		args->index = probe;
-		return XFS_ERROR(ENOATTR);
+		return -ENOATTR;
 	}
 
 	/*
@@ -2152,7 +2152,7 @@ xfs_attr3_leaf_lookup_int(
 			if (!xfs_attr_namesp_match(args->flags, entry->flags))
 				continue;
 			args->index = probe;
-			return XFS_ERROR(EEXIST);
+			return -EEXIST;
 		} else {
 			name_rmt = xfs_attr3_leaf_name_remote(leaf, probe);
 			if (name_rmt->namelen != args->namelen)
@@ -2168,11 +2168,11 @@ xfs_attr3_leaf_lookup_int(
 			args->rmtblkcnt = xfs_attr3_rmt_blocks(
 							args->dp->i_mount,
 							args->rmtvaluelen);
-			return XFS_ERROR(EEXIST);
+			return -EEXIST;
 		}
 	}
 	args->index = probe;
-	return XFS_ERROR(ENOATTR);
+	return -ENOATTR;
 }
 
 /*
@@ -2208,7 +2208,7 @@ xfs_attr3_leaf_getvalue(
 		}
 		if (args->valuelen < valuelen) {
 			args->valuelen = valuelen;
-			return XFS_ERROR(ERANGE);
+			return -ERANGE;
 		}
 		args->valuelen = valuelen;
 		memcpy(args->value, &name_loc->nameval[args->namelen], valuelen);
@@ -2226,7 +2226,7 @@ xfs_attr3_leaf_getvalue(
 		}
 		if (args->valuelen < args->rmtvaluelen) {
 			args->valuelen = args->rmtvaluelen;
-			return XFS_ERROR(ERANGE);
+			return -ERANGE;
 		}
 		args->valuelen = args->rmtvaluelen;
 	}
@@ -2481,7 +2481,7 @@ xfs_attr3_leaf_clearflag(
 	 */
 	error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
 	if (error)
-		return(error);
+		return error;
 
 	leaf = bp->b_addr;
 	entry = &xfs_attr3_leaf_entryp(leaf)[args->index];
@@ -2548,7 +2548,7 @@ xfs_attr3_leaf_setflag(
 	 */
 	error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
 	if (error)
-		return(error);
+		return error;
 
 	leaf = bp->b_addr;
 #ifdef DEBUG
diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h
index e2929da7c3ba..e2929da7c3ba 100644
--- a/fs/xfs/xfs_attr_leaf.h
+++ b/fs/xfs/libxfs/xfs_attr_leaf.h
diff --git a/fs/xfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
index b5adfecbb8ee..7510ab8058a4 100644
--- a/fs/xfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -138,11 +138,11 @@ xfs_attr3_rmt_read_verify(
 
 	while (len > 0) {
 		if (!xfs_verify_cksum(ptr, blksize, XFS_ATTR3_RMT_CRC_OFF)) {
-			xfs_buf_ioerror(bp, EFSBADCRC);
+			xfs_buf_ioerror(bp, -EFSBADCRC);
 			break;
 		}
 		if (!xfs_attr3_rmt_verify(mp, ptr, blksize, bno)) {
-			xfs_buf_ioerror(bp, EFSCORRUPTED);
+			xfs_buf_ioerror(bp, -EFSCORRUPTED);
 			break;
 		}
 		len -= blksize;
@@ -178,7 +178,7 @@ xfs_attr3_rmt_write_verify(
 
 	while (len > 0) {
 		if (!xfs_attr3_rmt_verify(mp, ptr, blksize, bno)) {
-			xfs_buf_ioerror(bp, EFSCORRUPTED);
+			xfs_buf_ioerror(bp, -EFSCORRUPTED);
 			xfs_verifier_error(bp);
 			return;
 		}
@@ -257,7 +257,7 @@ xfs_attr_rmtval_copyout(
 				xfs_alert(mp,
 "remote attribute header mismatch bno/off/len/owner (0x%llx/0x%x/Ox%x/0x%llx)",
 					bno, *offset, byte_cnt, ino);
-				return EFSCORRUPTED;
+				return -EFSCORRUPTED;
 			}
 			hdr_size = sizeof(struct xfs_attr3_rmt_hdr);
 		}
@@ -452,7 +452,7 @@ xfs_attr_rmtval_set(
 			ASSERT(committed);
 			args->trans = NULL;
 			xfs_bmap_cancel(args->flist);
-			return(error);
+			return error;
 		}
 
 		/*
@@ -473,7 +473,7 @@ xfs_attr_rmtval_set(
 		 */
 		error = xfs_trans_roll(&args->trans, dp);
 		if (error)
-			return (error);
+			return error;
 	}
 
 	/*
@@ -498,7 +498,7 @@ xfs_attr_rmtval_set(
 				       blkcnt, &map, &nmap,
 				       XFS_BMAPI_ATTRFORK);
 		if (error)
-			return(error);
+			return error;
 		ASSERT(nmap == 1);
 		ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
 		       (map.br_startblock != HOLESTARTBLOCK));
@@ -508,7 +508,7 @@ xfs_attr_rmtval_set(
 
 		bp = xfs_buf_get(mp->m_ddev_targp, dblkno, dblkcnt, 0);
 		if (!bp)
-			return ENOMEM;
+			return -ENOMEM;
 		bp->b_ops = &xfs_attr3_rmt_buf_ops;
 
 		xfs_attr_rmtval_copyin(mp, bp, args->dp->i_ino, &offset,
@@ -563,7 +563,7 @@ xfs_attr_rmtval_remove(
 		error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno,
 				       blkcnt, &map, &nmap, XFS_BMAPI_ATTRFORK);
 		if (error)
-			return(error);
+			return error;
 		ASSERT(nmap == 1);
 		ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
 		       (map.br_startblock != HOLESTARTBLOCK));
@@ -622,7 +622,7 @@ xfs_attr_rmtval_remove(
 		 */
 		error = xfs_trans_roll(&args->trans, args->dp);
 		if (error)
-			return (error);
+			return error;
 	}
-	return(0);
+	return 0;
 }
diff --git a/fs/xfs/xfs_attr_remote.h b/fs/xfs/libxfs/xfs_attr_remote.h
index 5a9acfa156d7..5a9acfa156d7 100644
--- a/fs/xfs/xfs_attr_remote.h
+++ b/fs/xfs/libxfs/xfs_attr_remote.h
diff --git a/fs/xfs/xfs_attr_sf.h b/fs/xfs/libxfs/xfs_attr_sf.h
index 919756e3ba53..919756e3ba53 100644
--- a/fs/xfs/xfs_attr_sf.h
+++ b/fs/xfs/libxfs/xfs_attr_sf.h
diff --git a/fs/xfs/xfs_bit.h b/fs/xfs/libxfs/xfs_bit.h
index e1649c0d3e02..e1649c0d3e02 100644
--- a/fs/xfs/xfs_bit.h
+++ b/fs/xfs/libxfs/xfs_bit.h
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 75c3fe5f3d9d..de2d26d32844 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -392,7 +392,7 @@ xfs_bmap_check_leaf_extents(
 	pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
 	bno = be64_to_cpu(*pp);
 
-	ASSERT(bno != NULLDFSBNO);
+	ASSERT(bno != NULLFSBLOCK);
 	ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
 	ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
 
@@ -1033,7 +1033,7 @@ xfs_bmap_add_attrfork_btree(
 			goto error0;
 		if (stat == 0) {
 			xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
-			return XFS_ERROR(ENOSPC);
+			return -ENOSPC;
 		}
 		*firstblock = cur->bc_private.b.firstblock;
 		cur->bc_private.b.allocated = 0;
@@ -1115,7 +1115,7 @@ xfs_bmap_add_attrfork_local(
 
 	/* should only be called for types that support local format data */
 	ASSERT(0);
-	return EFSCORRUPTED;
+	return -EFSCORRUPTED;
 }
 
 /*
@@ -1192,7 +1192,7 @@ xfs_bmap_add_attrfork(
 		break;
 	default:
 		ASSERT(0);
-		error = XFS_ERROR(EINVAL);
+		error = -EINVAL;
 		goto trans_cancel;
 	}
 
@@ -1299,7 +1299,7 @@ xfs_bmap_read_extents(
 	ASSERT(level > 0);
 	pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
 	bno = be64_to_cpu(*pp);
-	ASSERT(bno != NULLDFSBNO);
+	ASSERT(bno != NULLFSBLOCK);
 	ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
 	ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
 	/*
@@ -1399,7 +1399,7 @@ xfs_bmap_read_extents(
 	return 0;
 error0:
 	xfs_trans_brelse(tp, bp);
-	return XFS_ERROR(EFSCORRUPTED);
+	return -EFSCORRUPTED;
 }
 
 
@@ -1429,11 +1429,7 @@ xfs_bmap_search_multi_extents(
 	gotp->br_startoff = 0xffa5a5a5a5a5a5a5LL;
 	gotp->br_blockcount = 0xa55a5a5a5a5a5a5aLL;
 	gotp->br_state = XFS_EXT_INVALID;
-#if XFS_BIG_BLKNOS
 	gotp->br_startblock = 0xffffa5a5a5a5a5a5LL;
-#else
-	gotp->br_startblock = 0xffffa5a5;
-#endif
 	prevp->br_startoff = NULLFILEOFF;
 
 	ep = xfs_iext_bno_to_ext(ifp, bno, &lastx);
@@ -1576,7 +1572,7 @@ xfs_bmap_last_before(
 	if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
 	    XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
 	    XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL)
-	       return XFS_ERROR(EIO);
+	       return -EIO;
 	if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
 		*last_block = 0;
 		return 0;
@@ -1690,7 +1686,7 @@ xfs_bmap_last_offset(
 
 	if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
 	    XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
-	       return XFS_ERROR(EIO);
+	       return -EIO;
 
 	error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, &is_empty);
 	if (error || is_empty)
@@ -3323,7 +3319,7 @@ xfs_bmap_extsize_align(
 		if (orig_off < align_off ||
 		    orig_end > align_off + align_alen ||
 		    align_alen - temp < orig_alen)
-			return XFS_ERROR(EINVAL);
+			return -EINVAL;
 		/*
 		 * Try to fix it by moving the start up.
 		 */
@@ -3348,7 +3344,7 @@ xfs_bmap_extsize_align(
 		 * Result doesn't cover the request, fail it.
 		 */
 		if (orig_off < align_off || orig_end > align_off + align_alen)
-			return XFS_ERROR(EINVAL);
+			return -EINVAL;
 	} else {
 		ASSERT(orig_off >= align_off);
 		ASSERT(orig_end <= align_off + align_alen);
@@ -4051,11 +4047,11 @@ xfs_bmapi_read(
 	     XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
 	     mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
 		XFS_ERROR_REPORT("xfs_bmapi_read", XFS_ERRLEVEL_LOW, mp);
-		return XFS_ERROR(EFSCORRUPTED);
+		return -EFSCORRUPTED;
 	}
 
 	if (XFS_FORCED_SHUTDOWN(mp))
-		return XFS_ERROR(EIO);
+		return -EIO;
 
 	XFS_STATS_INC(xs_blk_mapr);
 
@@ -4246,11 +4242,11 @@ xfs_bmapi_delay(
 	     XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_BTREE),
 	     mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
 		XFS_ERROR_REPORT("xfs_bmapi_delay", XFS_ERRLEVEL_LOW, mp);
-		return XFS_ERROR(EFSCORRUPTED);
+		return -EFSCORRUPTED;
 	}
 
 	if (XFS_FORCED_SHUTDOWN(mp))
-		return XFS_ERROR(EIO);
+		return -EIO;
 
 	XFS_STATS_INC(xs_blk_mapw);
 
@@ -4469,7 +4465,7 @@ xfs_bmapi_convert_unwritten(
 	 * so generate another request.
 	 */
 	if (mval->br_blockcount < len)
-		return EAGAIN;
+		return -EAGAIN;
 	return 0;
 }
 
@@ -4540,11 +4536,11 @@ xfs_bmapi_write(
 	     XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
 	     mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
 		XFS_ERROR_REPORT("xfs_bmapi_write", XFS_ERRLEVEL_LOW, mp);
-		return XFS_ERROR(EFSCORRUPTED);
+		return -EFSCORRUPTED;
 	}
 
 	if (XFS_FORCED_SHUTDOWN(mp))
-		return XFS_ERROR(EIO);
+		return -EIO;
 
 	ifp = XFS_IFORK_PTR(ip, whichfork);
 
@@ -4620,7 +4616,7 @@ xfs_bmapi_write(
 
 		/* Execute unwritten extent conversion if necessary */
 		error = xfs_bmapi_convert_unwritten(&bma, mval, len, flags);
-		if (error == EAGAIN)
+		if (error == -EAGAIN)
 			continue;
 		if (error)
 			goto error0;
@@ -4922,7 +4918,7 @@ xfs_bmap_del_extent(
 					goto done;
 				cur->bc_rec.b = new;
 				error = xfs_btree_insert(cur, &i);
-				if (error && error != ENOSPC)
+				if (error && error != -ENOSPC)
 					goto done;
 				/*
 				 * If get no-space back from btree insert,
@@ -4930,7 +4926,7 @@ xfs_bmap_del_extent(
 				 * block reservation.
 				 * Fix up our state and return the error.
 				 */
-				if (error == ENOSPC) {
+				if (error == -ENOSPC) {
 					/*
 					 * Reset the cursor, don't trust
 					 * it after any insert operation.
@@ -4958,7 +4954,7 @@ xfs_bmap_del_extent(
 					xfs_bmbt_set_blockcount(ep,
 						got.br_blockcount);
 					flags = 0;
-					error = XFS_ERROR(ENOSPC);
+					error = -ENOSPC;
 					goto done;
 				}
 				XFS_WANT_CORRUPTED_GOTO(i == 1, done);
@@ -5076,11 +5072,11 @@ xfs_bunmapi(
 	    XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) {
 		XFS_ERROR_REPORT("xfs_bunmapi", XFS_ERRLEVEL_LOW,
 				 ip->i_mount);
-		return XFS_ERROR(EFSCORRUPTED);
+		return -EFSCORRUPTED;
 	}
 	mp = ip->i_mount;
 	if (XFS_FORCED_SHUTDOWN(mp))
-		return XFS_ERROR(EIO);
+		return -EIO;
 
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
 	ASSERT(len > 0);
@@ -5325,7 +5321,7 @@ xfs_bunmapi(
 		    del.br_startoff > got.br_startoff &&
 		    del.br_startoff + del.br_blockcount <
 		    got.br_startoff + got.br_blockcount) {
-			error = XFS_ERROR(ENOSPC);
+			error = -ENOSPC;
 			goto error0;
 		}
 		error = xfs_bmap_del_extent(ip, tp, &lastx, flist, cur, &del,
@@ -5449,11 +5445,11 @@ xfs_bmap_shift_extents(
 	     mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
 		XFS_ERROR_REPORT("xfs_bmap_shift_extents",
 				 XFS_ERRLEVEL_LOW, mp);
-		return XFS_ERROR(EFSCORRUPTED);
+		return -EFSCORRUPTED;
 	}
 
 	if (XFS_FORCED_SHUTDOWN(mp))
-		return XFS_ERROR(EIO);
+		return -EIO;
 
 	ASSERT(current_ext != NULL);
 
@@ -5516,14 +5512,14 @@ xfs_bmap_shift_extents(
 						*current_ext - 1), &left);
 
 			if (startoff < left.br_startoff + left.br_blockcount)
-				error = XFS_ERROR(EINVAL);
+				error = -EINVAL;
 		} else if (offset_shift_fsb > got.br_startoff) {
 			/*
 			 * When first extent is shifted, offset_shift_fsb
 			 * should be less than the stating offset of
 			 * the first extent.
 			 */
-			error = XFS_ERROR(EINVAL);
+			error = -EINVAL;
 		}
 
 		if (error)
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index b879ca56a64c..b879ca56a64c 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index 948836c4fd90..fba753308f31 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -111,23 +111,8 @@ __xfs_bmbt_get_all(
 	ext_flag = (int)(l0 >> (64 - BMBT_EXNTFLAG_BITLEN));
 	s->br_startoff = ((xfs_fileoff_t)l0 &
 			   xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
-#if XFS_BIG_BLKNOS
 	s->br_startblock = (((xfs_fsblock_t)l0 & xfs_mask64lo(9)) << 43) |
 			   (((xfs_fsblock_t)l1) >> 21);
-#else
-#ifdef DEBUG
-	{
-		xfs_dfsbno_t	b;
-
-		b = (((xfs_dfsbno_t)l0 & xfs_mask64lo(9)) << 43) |
-		    (((xfs_dfsbno_t)l1) >> 21);
-		ASSERT((b >> 32) == 0 || isnulldstartblock(b));
-		s->br_startblock = (xfs_fsblock_t)b;
-	}
-#else	/* !DEBUG */
-	s->br_startblock = (xfs_fsblock_t)(((xfs_dfsbno_t)l1) >> 21);
-#endif	/* DEBUG */
-#endif	/* XFS_BIG_BLKNOS */
 	s->br_blockcount = (xfs_filblks_t)(l1 & xfs_mask64lo(21));
 	/* This is xfs_extent_state() in-line */
 	if (ext_flag) {
@@ -163,21 +148,8 @@ xfs_fsblock_t
 xfs_bmbt_get_startblock(
 	xfs_bmbt_rec_host_t	*r)
 {
-#if XFS_BIG_BLKNOS
 	return (((xfs_fsblock_t)r->l0 & xfs_mask64lo(9)) << 43) |
 	       (((xfs_fsblock_t)r->l1) >> 21);
-#else
-#ifdef DEBUG
-	xfs_dfsbno_t	b;
-
-	b = (((xfs_dfsbno_t)r->l0 & xfs_mask64lo(9)) << 43) |
-	    (((xfs_dfsbno_t)r->l1) >> 21);
-	ASSERT((b >> 32) == 0 || isnulldstartblock(b));
-	return (xfs_fsblock_t)b;
-#else	/* !DEBUG */
-	return (xfs_fsblock_t)(((xfs_dfsbno_t)r->l1) >> 21);
-#endif	/* DEBUG */
-#endif	/* XFS_BIG_BLKNOS */
 }
 
 /*
@@ -241,7 +213,6 @@ xfs_bmbt_set_allf(
 	ASSERT((startoff & xfs_mask64hi(64-BMBT_STARTOFF_BITLEN)) == 0);
 	ASSERT((blockcount & xfs_mask64hi(64-BMBT_BLOCKCOUNT_BITLEN)) == 0);
 
-#if XFS_BIG_BLKNOS
 	ASSERT((startblock & xfs_mask64hi(64-BMBT_STARTBLOCK_BITLEN)) == 0);
 
 	r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) |
@@ -250,23 +221,6 @@ xfs_bmbt_set_allf(
 	r->l1 = ((xfs_bmbt_rec_base_t)startblock << 21) |
 		((xfs_bmbt_rec_base_t)blockcount &
 		(xfs_bmbt_rec_base_t)xfs_mask64lo(21));
-#else	/* !XFS_BIG_BLKNOS */
-	if (isnullstartblock(startblock)) {
-		r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) |
-			((xfs_bmbt_rec_base_t)startoff << 9) |
-			 (xfs_bmbt_rec_base_t)xfs_mask64lo(9);
-		r->l1 = xfs_mask64hi(11) |
-			  ((xfs_bmbt_rec_base_t)startblock << 21) |
-			  ((xfs_bmbt_rec_base_t)blockcount &
-			   (xfs_bmbt_rec_base_t)xfs_mask64lo(21));
-	} else {
-		r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) |
-			((xfs_bmbt_rec_base_t)startoff << 9);
-		r->l1 = ((xfs_bmbt_rec_base_t)startblock << 21) |
-			 ((xfs_bmbt_rec_base_t)blockcount &
-			 (xfs_bmbt_rec_base_t)xfs_mask64lo(21));
-	}
-#endif	/* XFS_BIG_BLKNOS */
 }
 
 /*
@@ -298,8 +252,6 @@ xfs_bmbt_disk_set_allf(
 	ASSERT(state == XFS_EXT_NORM || state == XFS_EXT_UNWRITTEN);
 	ASSERT((startoff & xfs_mask64hi(64-BMBT_STARTOFF_BITLEN)) == 0);
 	ASSERT((blockcount & xfs_mask64hi(64-BMBT_BLOCKCOUNT_BITLEN)) == 0);
-
-#if XFS_BIG_BLKNOS
 	ASSERT((startblock & xfs_mask64hi(64-BMBT_STARTBLOCK_BITLEN)) == 0);
 
 	r->l0 = cpu_to_be64(
@@ -310,26 +262,6 @@ xfs_bmbt_disk_set_allf(
 		((xfs_bmbt_rec_base_t)startblock << 21) |
 		 ((xfs_bmbt_rec_base_t)blockcount &
 		  (xfs_bmbt_rec_base_t)xfs_mask64lo(21)));
-#else	/* !XFS_BIG_BLKNOS */
-	if (isnullstartblock(startblock)) {
-		r->l0 = cpu_to_be64(
-			((xfs_bmbt_rec_base_t)extent_flag << 63) |
-			 ((xfs_bmbt_rec_base_t)startoff << 9) |
-			  (xfs_bmbt_rec_base_t)xfs_mask64lo(9));
-		r->l1 = cpu_to_be64(xfs_mask64hi(11) |
-			  ((xfs_bmbt_rec_base_t)startblock << 21) |
-			  ((xfs_bmbt_rec_base_t)blockcount &
-			   (xfs_bmbt_rec_base_t)xfs_mask64lo(21)));
-	} else {
-		r->l0 = cpu_to_be64(
-			((xfs_bmbt_rec_base_t)extent_flag << 63) |
-			 ((xfs_bmbt_rec_base_t)startoff << 9));
-		r->l1 = cpu_to_be64(
-			((xfs_bmbt_rec_base_t)startblock << 21) |
-			 ((xfs_bmbt_rec_base_t)blockcount &
-			  (xfs_bmbt_rec_base_t)xfs_mask64lo(21)));
-	}
-#endif	/* XFS_BIG_BLKNOS */
 }
 
 /*
@@ -365,24 +297,11 @@ xfs_bmbt_set_startblock(
 	xfs_bmbt_rec_host_t *r,
 	xfs_fsblock_t	v)
 {
-#if XFS_BIG_BLKNOS
 	ASSERT((v & xfs_mask64hi(12)) == 0);
 	r->l0 = (r->l0 & (xfs_bmbt_rec_base_t)xfs_mask64hi(55)) |
 		  (xfs_bmbt_rec_base_t)(v >> 43);
 	r->l1 = (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64lo(21)) |
 		  (xfs_bmbt_rec_base_t)(v << 21);
-#else	/* !XFS_BIG_BLKNOS */
-	if (isnullstartblock(v)) {
-		r->l0 |= (xfs_bmbt_rec_base_t)xfs_mask64lo(9);
-		r->l1 = (xfs_bmbt_rec_base_t)xfs_mask64hi(11) |
-			  ((xfs_bmbt_rec_base_t)v << 21) |
-			  (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64lo(21));
-	} else {
-		r->l0 &= ~(xfs_bmbt_rec_base_t)xfs_mask64lo(9);
-		r->l1 = ((xfs_bmbt_rec_base_t)v << 21) |
-			  (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64lo(21));
-	}
-#endif	/* XFS_BIG_BLKNOS */
 }
 
 /*
@@ -438,8 +357,8 @@ xfs_bmbt_to_bmdr(
 		       cpu_to_be64(XFS_BUF_DADDR_NULL));
 	} else
 		ASSERT(rblock->bb_magic == cpu_to_be32(XFS_BMAP_MAGIC));
-	ASSERT(rblock->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO));
-	ASSERT(rblock->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO));
+	ASSERT(rblock->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK));
+	ASSERT(rblock->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK));
 	ASSERT(rblock->bb_level != 0);
 	dblock->bb_level = rblock->bb_level;
 	dblock->bb_numrecs = rblock->bb_numrecs;
@@ -554,7 +473,7 @@ xfs_bmbt_alloc_block(
 	args.minlen = args.maxlen = args.prod = 1;
 	args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL;
 	if (!args.wasdel && xfs_trans_get_block_res(args.tp) == 0) {
-		error = XFS_ERROR(ENOSPC);
+		error = -ENOSPC;
 		goto error0;
 	}
 	error = xfs_alloc_vextent(&args);
@@ -763,11 +682,11 @@ xfs_bmbt_verify(
 
 	/* sibling pointer verification */
 	if (!block->bb_u.l.bb_leftsib ||
-	    (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLDFSBNO) &&
+	    (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK) &&
 	     !XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(block->bb_u.l.bb_leftsib))))
 		return false;
 	if (!block->bb_u.l.bb_rightsib ||
-	    (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLDFSBNO) &&
+	    (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK) &&
 	     !XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(block->bb_u.l.bb_rightsib))))
 		return false;
 
@@ -779,9 +698,9 @@ xfs_bmbt_read_verify(
 	struct xfs_buf	*bp)
 {
 	if (!xfs_btree_lblock_verify_crc(bp))
-		xfs_buf_ioerror(bp, EFSBADCRC);
+		xfs_buf_ioerror(bp, -EFSBADCRC);
 	else if (!xfs_bmbt_verify(bp))
-		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		xfs_buf_ioerror(bp, -EFSCORRUPTED);
 
 	if (bp->b_error) {
 		trace_xfs_btree_corrupt(bp, _RET_IP_);
@@ -795,7 +714,7 @@ xfs_bmbt_write_verify(
 {
 	if (!xfs_bmbt_verify(bp)) {
 		trace_xfs_btree_corrupt(bp, _RET_IP_);
-		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		xfs_buf_ioerror(bp, -EFSCORRUPTED);
 		xfs_verifier_error(bp);
 		return;
 	}
@@ -959,7 +878,7 @@ xfs_bmbt_change_owner(
 
 	cur = xfs_bmbt_init_cursor(ip->i_mount, tp, ip, whichfork);
 	if (!cur)
-		return ENOMEM;
+		return -ENOMEM;
 
 	error = xfs_btree_change_owner(cur, new_owner, buffer_list);
 	xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/libxfs/xfs_bmap_btree.h
index 819a8a4dee95..819a8a4dee95 100644
--- a/fs/xfs/xfs_bmap_btree.h
+++ b/fs/xfs/libxfs/xfs_bmap_btree.h
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index cf893bc1e373..8fe6a93ff473 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -78,11 +78,11 @@ xfs_btree_check_lblock(
 		be16_to_cpu(block->bb_numrecs) <=
 			cur->bc_ops->get_maxrecs(cur, level) &&
 		block->bb_u.l.bb_leftsib &&
-		(block->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO) ||
+		(block->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK) ||
 		 XFS_FSB_SANITY_CHECK(mp,
 			be64_to_cpu(block->bb_u.l.bb_leftsib))) &&
 		block->bb_u.l.bb_rightsib &&
-		(block->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO) ||
+		(block->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK) ||
 		 XFS_FSB_SANITY_CHECK(mp,
 			be64_to_cpu(block->bb_u.l.bb_rightsib)));
 
@@ -92,7 +92,7 @@ xfs_btree_check_lblock(
 		if (bp)
 			trace_xfs_btree_corrupt(bp, _RET_IP_);
 		XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
-		return XFS_ERROR(EFSCORRUPTED);
+		return -EFSCORRUPTED;
 	}
 	return 0;
 }
@@ -140,7 +140,7 @@ xfs_btree_check_sblock(
 		if (bp)
 			trace_xfs_btree_corrupt(bp, _RET_IP_);
 		XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
-		return XFS_ERROR(EFSCORRUPTED);
+		return -EFSCORRUPTED;
 	}
 	return 0;
 }
@@ -167,12 +167,12 @@ xfs_btree_check_block(
 int					/* error (0 or EFSCORRUPTED) */
 xfs_btree_check_lptr(
 	struct xfs_btree_cur	*cur,	/* btree cursor */
-	xfs_dfsbno_t		bno,	/* btree block disk address */
+	xfs_fsblock_t		bno,	/* btree block disk address */
 	int			level)	/* btree block level */
 {
 	XFS_WANT_CORRUPTED_RETURN(
 		level > 0 &&
-		bno != NULLDFSBNO &&
+		bno != NULLFSBLOCK &&
 		XFS_FSB_SANITY_CHECK(cur->bc_mp, bno));
 	return 0;
 }
@@ -595,7 +595,7 @@ xfs_btree_islastblock(
 	block = xfs_btree_get_block(cur, level, &bp);
 	xfs_btree_check_block(cur, block, level, bp);
 	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
-		return block->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO);
+		return block->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK);
 	else
 		return block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK);
 }
@@ -771,16 +771,16 @@ xfs_btree_readahead_lblock(
 	struct xfs_btree_block	*block)
 {
 	int			rval = 0;
-	xfs_dfsbno_t		left = be64_to_cpu(block->bb_u.l.bb_leftsib);
-	xfs_dfsbno_t		right = be64_to_cpu(block->bb_u.l.bb_rightsib);
+	xfs_fsblock_t		left = be64_to_cpu(block->bb_u.l.bb_leftsib);
+	xfs_fsblock_t		right = be64_to_cpu(block->bb_u.l.bb_rightsib);
 
-	if ((lr & XFS_BTCUR_LEFTRA) && left != NULLDFSBNO) {
+	if ((lr & XFS_BTCUR_LEFTRA) && left != NULLFSBLOCK) {
 		xfs_btree_reada_bufl(cur->bc_mp, left, 1,
 				     cur->bc_ops->buf_ops);
 		rval++;
 	}
 
-	if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLDFSBNO) {
+	if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLFSBLOCK) {
 		xfs_btree_reada_bufl(cur->bc_mp, right, 1,
 				     cur->bc_ops->buf_ops);
 		rval++;
@@ -852,7 +852,7 @@ xfs_btree_ptr_to_daddr(
 	union xfs_btree_ptr	*ptr)
 {
 	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
-		ASSERT(ptr->l != cpu_to_be64(NULLDFSBNO));
+		ASSERT(ptr->l != cpu_to_be64(NULLFSBLOCK));
 
 		return XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l));
 	} else {
@@ -900,9 +900,9 @@ xfs_btree_setbuf(
 
 	b = XFS_BUF_TO_BLOCK(bp);
 	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
-		if (b->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO))
+		if (b->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK))
 			cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
-		if (b->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO))
+		if (b->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK))
 			cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
 	} else {
 		if (b->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK))
@@ -918,7 +918,7 @@ xfs_btree_ptr_is_null(
 	union xfs_btree_ptr	*ptr)
 {
 	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
-		return ptr->l == cpu_to_be64(NULLDFSBNO);
+		return ptr->l == cpu_to_be64(NULLFSBLOCK);
 	else
 		return ptr->s == cpu_to_be32(NULLAGBLOCK);
 }
@@ -929,7 +929,7 @@ xfs_btree_set_ptr_null(
 	union xfs_btree_ptr	*ptr)
 {
 	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
-		ptr->l = cpu_to_be64(NULLDFSBNO);
+		ptr->l = cpu_to_be64(NULLFSBLOCK);
 	else
 		ptr->s = cpu_to_be32(NULLAGBLOCK);
 }
@@ -997,8 +997,8 @@ xfs_btree_init_block_int(
 	buf->bb_numrecs = cpu_to_be16(numrecs);
 
 	if (flags & XFS_BTREE_LONG_PTRS) {
-		buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO);
-		buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO);
+		buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLFSBLOCK);
+		buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLFSBLOCK);
 		if (flags & XFS_BTREE_CRC_BLOCKS) {
 			buf->bb_u.l.bb_blkno = cpu_to_be64(blkno);
 			buf->bb_u.l.bb_owner = cpu_to_be64(owner);
@@ -1140,7 +1140,7 @@ xfs_btree_get_buf_block(
 				 mp->m_bsize, flags);
 
 	if (!*bpp)
-		return ENOMEM;
+		return -ENOMEM;
 
 	(*bpp)->b_ops = cur->bc_ops->buf_ops;
 	*block = XFS_BUF_TO_BLOCK(*bpp);
@@ -1498,7 +1498,7 @@ xfs_btree_increment(
 		if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
 			goto out0;
 		ASSERT(0);
-		error = EFSCORRUPTED;
+		error = -EFSCORRUPTED;
 		goto error0;
 	}
 	ASSERT(lev < cur->bc_nlevels);
@@ -1597,7 +1597,7 @@ xfs_btree_decrement(
 		if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
 			goto out0;
 		ASSERT(0);
-		error = EFSCORRUPTED;
+		error = -EFSCORRUPTED;
 		goto error0;
 	}
 	ASSERT(lev < cur->bc_nlevels);
@@ -4018,7 +4018,7 @@ xfs_btree_block_change_owner(
 	/* now read rh sibling block for next iteration */
 	xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
 	if (xfs_btree_ptr_is_null(cur, &rptr))
-		return ENOENT;
+		return -ENOENT;
 
 	return xfs_btree_lookup_get_block(cur, level, &rptr, &block);
 }
@@ -4061,7 +4061,7 @@ xfs_btree_change_owner(
 							     buffer_list);
 		} while (!error);
 
-		if (error != ENOENT)
+		if (error != -ENOENT)
 			return error;
 	}
 
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index a04b69422f67..8f18bab73ea5 100644
--- a/fs/xfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -258,7 +258,7 @@ xfs_btree_check_block(
 int					/* error (0 or EFSCORRUPTED) */
 xfs_btree_check_lptr(
 	struct xfs_btree_cur	*cur,	/* btree cursor */
-	xfs_dfsbno_t		ptr,	/* btree block disk address */
+	xfs_fsblock_t		ptr,	/* btree block disk address */
 	int			level);	/* btree block level */
 
 /*
diff --git a/fs/xfs/xfs_cksum.h b/fs/xfs/libxfs/xfs_cksum.h
index fad1676ad8cd..fad1676ad8cd 100644
--- a/fs/xfs/xfs_cksum.h
+++ b/fs/xfs/libxfs/xfs_cksum.h
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index a514ab616650..2c42ae28d027 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -185,7 +185,7 @@ xfs_da3_node_write_verify(
 	struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
 
 	if (!xfs_da3_node_verify(bp)) {
-		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		xfs_buf_ioerror(bp, -EFSCORRUPTED);
 		xfs_verifier_error(bp);
 		return;
 	}
@@ -214,13 +214,13 @@ xfs_da3_node_read_verify(
 	switch (be16_to_cpu(info->magic)) {
 		case XFS_DA3_NODE_MAGIC:
 			if (!xfs_buf_verify_cksum(bp, XFS_DA3_NODE_CRC_OFF)) {
-				xfs_buf_ioerror(bp, EFSBADCRC);
+				xfs_buf_ioerror(bp, -EFSBADCRC);
 				break;
 			}
 			/* fall through */
 		case XFS_DA_NODE_MAGIC:
 			if (!xfs_da3_node_verify(bp)) {
-				xfs_buf_ioerror(bp, EFSCORRUPTED);
+				xfs_buf_ioerror(bp, -EFSCORRUPTED);
 				break;
 			}
 			return;
@@ -315,7 +315,7 @@ xfs_da3_node_create(
 
 	error = xfs_da_get_buf(tp, dp, blkno, -1, &bp, whichfork);
 	if (error)
-		return(error);
+		return error;
 	bp->b_ops = &xfs_da3_node_buf_ops;
 	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DA_NODE_BUF);
 	node = bp->b_addr;
@@ -337,7 +337,7 @@ xfs_da3_node_create(
 		XFS_DA_LOGRANGE(node, &node->hdr, dp->d_ops->node_hdr_size));
 
 	*bpp = bp;
-	return(0);
+	return 0;
 }
 
 /*
@@ -385,8 +385,8 @@ xfs_da3_split(
 		switch (oldblk->magic) {
 		case XFS_ATTR_LEAF_MAGIC:
 			error = xfs_attr3_leaf_split(state, oldblk, newblk);
-			if ((error != 0) && (error != ENOSPC)) {
-				return(error);	/* GROT: attr is inconsistent */
+			if ((error != 0) && (error != -ENOSPC)) {
+				return error;	/* GROT: attr is inconsistent */
 			}
 			if (!error) {
 				addblk = newblk;
@@ -408,7 +408,7 @@ xfs_da3_split(
 							    &state->extrablk);
 			}
 			if (error)
-				return(error);	/* GROT: attr inconsistent */
+				return error;	/* GROT: attr inconsistent */
 			addblk = newblk;
 			break;
 		case XFS_DIR2_LEAFN_MAGIC:
@@ -422,7 +422,7 @@ xfs_da3_split(
 							 max - i, &action);
 			addblk->bp = NULL;
 			if (error)
-				return(error);	/* GROT: dir is inconsistent */
+				return error;	/* GROT: dir is inconsistent */
 			/*
 			 * Record the newly split block for the next time thru?
 			 */
@@ -439,7 +439,7 @@ xfs_da3_split(
 		xfs_da3_fixhashpath(state, &state->path);
 	}
 	if (!addblk)
-		return(0);
+		return 0;
 
 	/*
 	 * Split the root node.
@@ -449,7 +449,7 @@ xfs_da3_split(
 	error = xfs_da3_root_split(state, oldblk, addblk);
 	if (error) {
 		addblk->bp = NULL;
-		return(error);	/* GROT: dir is inconsistent */
+		return error;	/* GROT: dir is inconsistent */
 	}
 
 	/*
@@ -492,7 +492,7 @@ xfs_da3_split(
 		    sizeof(node->hdr.info)));
 	}
 	addblk->bp = NULL;
-	return(0);
+	return 0;
 }
 
 /*
@@ -670,18 +670,18 @@ xfs_da3_node_split(
 		 */
 		error = xfs_da_grow_inode(state->args, &blkno);
 		if (error)
-			return(error);	/* GROT: dir is inconsistent */
+			return error;	/* GROT: dir is inconsistent */
 
 		error = xfs_da3_node_create(state->args, blkno, treelevel,
 					   &newblk->bp, state->args->whichfork);
 		if (error)
-			return(error);	/* GROT: dir is inconsistent */
+			return error;	/* GROT: dir is inconsistent */
 		newblk->blkno = blkno;
 		newblk->magic = XFS_DA_NODE_MAGIC;
 		xfs_da3_node_rebalance(state, oldblk, newblk);
 		error = xfs_da3_blk_link(state, oldblk, newblk);
 		if (error)
-			return(error);
+			return error;
 		*result = 1;
 	} else {
 		*result = 0;
@@ -721,7 +721,7 @@ xfs_da3_node_split(
 		}
 	}
 
-	return(0);
+	return 0;
 }
 
 /*
@@ -963,9 +963,9 @@ xfs_da3_join(
 		case XFS_ATTR_LEAF_MAGIC:
 			error = xfs_attr3_leaf_toosmall(state, &action);
 			if (error)
-				return(error);
+				return error;
 			if (action == 0)
-				return(0);
+				return 0;
 			xfs_attr3_leaf_unbalance(state, drop_blk, save_blk);
 			break;
 		case XFS_DIR2_LEAFN_MAGIC:
@@ -985,7 +985,7 @@ xfs_da3_join(
 			xfs_da3_fixhashpath(state, &state->path);
 			error = xfs_da3_node_toosmall(state, &action);
 			if (error)
-				return(error);
+				return error;
 			if (action == 0)
 				return 0;
 			xfs_da3_node_unbalance(state, drop_blk, save_blk);
@@ -995,12 +995,12 @@ xfs_da3_join(
 		error = xfs_da3_blk_unlink(state, drop_blk, save_blk);
 		xfs_da_state_kill_altpath(state);
 		if (error)
-			return(error);
+			return error;
 		error = xfs_da_shrink_inode(state->args, drop_blk->blkno,
 							 drop_blk->bp);
 		drop_blk->bp = NULL;
 		if (error)
-			return(error);
+			return error;
 	}
 	/*
 	 * We joined all the way to the top.  If it turns out that
@@ -1010,7 +1010,7 @@ xfs_da3_join(
 	xfs_da3_node_remove(state, drop_blk);
 	xfs_da3_fixhashpath(state, &state->path);
 	error = xfs_da3_root_join(state, &state->path.blk[0]);
-	return(error);
+	return error;
 }
 
 #ifdef	DEBUG
@@ -1099,7 +1099,7 @@ xfs_da3_root_join(
 	xfs_trans_log_buf(args->trans, root_blk->bp, 0,
 			  args->geo->blksize - 1);
 	error = xfs_da_shrink_inode(args, child, bp);
-	return(error);
+	return error;
 }
 
 /*
@@ -1142,7 +1142,7 @@ xfs_da3_node_toosmall(
 	dp->d_ops->node_hdr_from_disk(&nodehdr, node);
 	if (nodehdr.count > (state->args->geo->node_ents >> 1)) {
 		*action = 0;	/* blk over 50%, don't try to join */
-		return(0);	/* blk over 50%, don't try to join */
+		return 0;	/* blk over 50%, don't try to join */
 	}
 
 	/*
@@ -1161,13 +1161,13 @@ xfs_da3_node_toosmall(
 		error = xfs_da3_path_shift(state, &state->altpath, forward,
 						 0, &retval);
 		if (error)
-			return(error);
+			return error;
 		if (retval) {
 			*action = 0;
 		} else {
 			*action = 2;
 		}
-		return(0);
+		return 0;
 	}
 
 	/*
@@ -1194,7 +1194,7 @@ xfs_da3_node_toosmall(
 		error = xfs_da3_node_read(state->args->trans, dp,
 					blkno, -1, &bp, state->args->whichfork);
 		if (error)
-			return(error);
+			return error;
 
 		node = bp->b_addr;
 		dp->d_ops->node_hdr_from_disk(&thdr, node);
@@ -1486,7 +1486,7 @@ xfs_da3_node_lookup_int(
 		if (error) {
 			blk->blkno = 0;
 			state->path.active--;
-			return(error);
+			return error;
 		}
 		curr = blk->bp->b_addr;
 		blk->magic = be16_to_cpu(curr->magic);
@@ -1579,25 +1579,25 @@ xfs_da3_node_lookup_int(
 			args->blkno = blk->blkno;
 		} else {
 			ASSERT(0);
-			return XFS_ERROR(EFSCORRUPTED);
+			return -EFSCORRUPTED;
 		}
-		if (((retval == ENOENT) || (retval == ENOATTR)) &&
+		if (((retval == -ENOENT) || (retval == -ENOATTR)) &&
 		    (blk->hashval == args->hashval)) {
 			error = xfs_da3_path_shift(state, &state->path, 1, 1,
 							 &retval);
 			if (error)
-				return(error);
+				return error;
 			if (retval == 0) {
 				continue;
 			} else if (blk->magic == XFS_ATTR_LEAF_MAGIC) {
 				/* path_shift() gives ENOENT */
-				retval = XFS_ERROR(ENOATTR);
+				retval = -ENOATTR;
 			}
 		}
 		break;
 	}
 	*result = retval;
-	return(0);
+	return 0;
 }
 
 /*========================================================================
@@ -1692,7 +1692,7 @@ xfs_da3_blk_link(
 						be32_to_cpu(old_info->back),
 						-1, &bp, args->whichfork);
 			if (error)
-				return(error);
+				return error;
 			ASSERT(bp != NULL);
 			tmp_info = bp->b_addr;
 			ASSERT(tmp_info->magic == old_info->magic);
@@ -1713,7 +1713,7 @@ xfs_da3_blk_link(
 						be32_to_cpu(old_info->forw),
 						-1, &bp, args->whichfork);
 			if (error)
-				return(error);
+				return error;
 			ASSERT(bp != NULL);
 			tmp_info = bp->b_addr;
 			ASSERT(tmp_info->magic == old_info->magic);
@@ -1726,7 +1726,7 @@ xfs_da3_blk_link(
 
 	xfs_trans_log_buf(args->trans, old_blk->bp, 0, sizeof(*tmp_info) - 1);
 	xfs_trans_log_buf(args->trans, new_blk->bp, 0, sizeof(*tmp_info) - 1);
-	return(0);
+	return 0;
 }
 
 /*
@@ -1772,7 +1772,7 @@ xfs_da3_blk_unlink(
 						be32_to_cpu(drop_info->back),
 						-1, &bp, args->whichfork);
 			if (error)
-				return(error);
+				return error;
 			ASSERT(bp != NULL);
 			tmp_info = bp->b_addr;
 			ASSERT(tmp_info->magic == save_info->magic);
@@ -1789,7 +1789,7 @@ xfs_da3_blk_unlink(
 						be32_to_cpu(drop_info->forw),
 						-1, &bp, args->whichfork);
 			if (error)
-				return(error);
+				return error;
 			ASSERT(bp != NULL);
 			tmp_info = bp->b_addr;
 			ASSERT(tmp_info->magic == save_info->magic);
@@ -1801,7 +1801,7 @@ xfs_da3_blk_unlink(
 	}
 
 	xfs_trans_log_buf(args->trans, save_blk->bp, 0, sizeof(*save_info) - 1);
-	return(0);
+	return 0;
 }
 
 /*
@@ -1859,9 +1859,9 @@ xfs_da3_path_shift(
 		}
 	}
 	if (level < 0) {
-		*result = XFS_ERROR(ENOENT);	/* we're out of our tree */
+		*result = -ENOENT;	/* we're out of our tree */
 		ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
-		return(0);
+		return 0;
 	}
 
 	/*
@@ -1883,7 +1883,7 @@ xfs_da3_path_shift(
 		error = xfs_da3_node_read(args->trans, dp, blkno, -1,
 					&blk->bp, args->whichfork);
 		if (error)
-			return(error);
+			return error;
 		info = blk->bp->b_addr;
 		ASSERT(info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC) ||
 		       info->magic == cpu_to_be16(XFS_DA3_NODE_MAGIC) ||
@@ -2004,7 +2004,7 @@ xfs_da_grow_inode_int(
 	struct xfs_trans	*tp = args->trans;
 	struct xfs_inode	*dp = args->dp;
 	int			w = args->whichfork;
-	xfs_drfsbno_t		nblks = dp->i_d.di_nblocks;
+	xfs_rfsblock_t		nblks = dp->i_d.di_nblocks;
 	struct xfs_bmbt_irec	map, *mapp;
 	int			nmap, error, got, i, mapi;
 
@@ -2068,7 +2068,7 @@ xfs_da_grow_inode_int(
 	if (got != count || mapp[0].br_startoff != *bno ||
 	    mapp[mapi - 1].br_startoff + mapp[mapi - 1].br_blockcount !=
 	    *bno + count) {
-		error = XFS_ERROR(ENOSPC);
+		error = -ENOSPC;
 		goto out_free_map;
 	}
 
@@ -2158,7 +2158,7 @@ xfs_da3_swap_lastblock(
 	if (unlikely(lastoff == 0)) {
 		XFS_ERROR_REPORT("xfs_da_swap_lastblock(1)", XFS_ERRLEVEL_LOW,
 				 mp);
-		return XFS_ERROR(EFSCORRUPTED);
+		return -EFSCORRUPTED;
 	}
 	/*
 	 * Read the last block in the btree space.
@@ -2209,7 +2209,7 @@ xfs_da3_swap_lastblock(
 		    sib_info->magic != dead_info->magic)) {
 			XFS_ERROR_REPORT("xfs_da_swap_lastblock(2)",
 					 XFS_ERRLEVEL_LOW, mp);
-			error = XFS_ERROR(EFSCORRUPTED);
+			error = -EFSCORRUPTED;
 			goto done;
 		}
 		sib_info->forw = cpu_to_be32(dead_blkno);
@@ -2231,7 +2231,7 @@ xfs_da3_swap_lastblock(
 		       sib_info->magic != dead_info->magic)) {
 			XFS_ERROR_REPORT("xfs_da_swap_lastblock(3)",
 					 XFS_ERRLEVEL_LOW, mp);
-			error = XFS_ERROR(EFSCORRUPTED);
+			error = -EFSCORRUPTED;
 			goto done;
 		}
 		sib_info->back = cpu_to_be32(dead_blkno);
@@ -2254,7 +2254,7 @@ xfs_da3_swap_lastblock(
 		if (level >= 0 && level != par_hdr.level + 1) {
 			XFS_ERROR_REPORT("xfs_da_swap_lastblock(4)",
 					 XFS_ERRLEVEL_LOW, mp);
-			error = XFS_ERROR(EFSCORRUPTED);
+			error = -EFSCORRUPTED;
 			goto done;
 		}
 		level = par_hdr.level;
@@ -2267,7 +2267,7 @@ xfs_da3_swap_lastblock(
 		if (entno == par_hdr.count) {
 			XFS_ERROR_REPORT("xfs_da_swap_lastblock(5)",
 					 XFS_ERRLEVEL_LOW, mp);
-			error = XFS_ERROR(EFSCORRUPTED);
+			error = -EFSCORRUPTED;
 			goto done;
 		}
 		par_blkno = be32_to_cpu(btree[entno].before);
@@ -2294,7 +2294,7 @@ xfs_da3_swap_lastblock(
 		if (unlikely(par_blkno == 0)) {
 			XFS_ERROR_REPORT("xfs_da_swap_lastblock(6)",
 					 XFS_ERRLEVEL_LOW, mp);
-			error = XFS_ERROR(EFSCORRUPTED);
+			error = -EFSCORRUPTED;
 			goto done;
 		}
 		error = xfs_da3_node_read(tp, dp, par_blkno, -1, &par_buf, w);
@@ -2305,7 +2305,7 @@ xfs_da3_swap_lastblock(
 		if (par_hdr.level != level) {
 			XFS_ERROR_REPORT("xfs_da_swap_lastblock(7)",
 					 XFS_ERRLEVEL_LOW, mp);
-			error = XFS_ERROR(EFSCORRUPTED);
+			error = -EFSCORRUPTED;
 			goto done;
 		}
 		btree = dp->d_ops->node_tree_p(par_node);
@@ -2359,7 +2359,7 @@ xfs_da_shrink_inode(
 		error = xfs_bunmapi(tp, dp, dead_blkno, count,
 				    xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA,
 				    0, args->firstblock, args->flist, &done);
-		if (error == ENOSPC) {
+		if (error == -ENOSPC) {
 			if (w != XFS_DATA_FORK)
 				break;
 			error = xfs_da3_swap_lastblock(args, &dead_blkno,
@@ -2427,7 +2427,7 @@ xfs_buf_map_from_irec(
 		map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map),
 				  KM_SLEEP | KM_NOFS);
 		if (!map)
-			return ENOMEM;
+			return -ENOMEM;
 		*mapp = map;
 	}
 
@@ -2500,8 +2500,8 @@ xfs_dabuf_map(
 	}
 
 	if (!xfs_da_map_covers_blocks(nirecs, irecs, bno, nfsb)) {
-		error = mappedbno == -2 ? -1 : XFS_ERROR(EFSCORRUPTED);
-		if (unlikely(error == EFSCORRUPTED)) {
+		error = mappedbno == -2 ? -1 : -EFSCORRUPTED;
+		if (unlikely(error == -EFSCORRUPTED)) {
 			if (xfs_error_level >= XFS_ERRLEVEL_LOW) {
 				int i;
 				xfs_alert(mp, "%s: bno %lld dir: inode %lld",
@@ -2561,7 +2561,7 @@ xfs_da_get_buf(
 
 	bp = xfs_trans_get_buf_map(trans, dp->i_mount->m_ddev_targp,
 				    mapp, nmap, 0);
-	error = bp ? bp->b_error : XFS_ERROR(EIO);
+	error = bp ? bp->b_error : -EIO;
 	if (error) {
 		xfs_trans_brelse(trans, bp);
 		goto out_free;
diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h
index 6e153e399a77..6e153e399a77 100644
--- a/fs/xfs/xfs_da_btree.h
+++ b/fs/xfs/libxfs/xfs_da_btree.h
diff --git a/fs/xfs/xfs_da_format.c b/fs/xfs/libxfs/xfs_da_format.c
index c9aee52a37e2..c9aee52a37e2 100644
--- a/fs/xfs/xfs_da_format.c
+++ b/fs/xfs/libxfs/xfs_da_format.c
diff --git a/fs/xfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h
index 0a49b0286372..0a49b0286372 100644
--- a/fs/xfs/xfs_da_format.h
+++ b/fs/xfs/libxfs/xfs_da_format.h
diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/libxfs/xfs_dinode.h
index 623bbe8fd921..623bbe8fd921 100644
--- a/fs/xfs/xfs_dinode.h
+++ b/fs/xfs/libxfs/xfs_dinode.h
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index 79670cda48ae..6cef22152fd6 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -108,7 +108,7 @@ xfs_da_mount(
 	if (!mp->m_dir_geo || !mp->m_attr_geo) {
 		kmem_free(mp->m_dir_geo);
 		kmem_free(mp->m_attr_geo);
-		return ENOMEM;
+		return -ENOMEM;
 	}
 
 	/* set up directory geometry */
@@ -202,7 +202,7 @@ xfs_dir_ino_validate(
 		xfs_warn(mp, "Invalid inode number 0x%Lx",
 				(unsigned long long) ino);
 		XFS_ERROR_REPORT("xfs_dir_ino_validate", XFS_ERRLEVEL_LOW, mp);
-		return XFS_ERROR(EFSCORRUPTED);
+		return -EFSCORRUPTED;
 	}
 	return 0;
 }
@@ -226,7 +226,7 @@ xfs_dir_init(
 
 	args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
 	if (!args)
-		return ENOMEM;
+		return -ENOMEM;
 
 	args->geo = dp->i_mount->m_dir_geo;
 	args->dp = dp;
@@ -261,7 +261,7 @@ xfs_dir_createname(
 
 	args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
 	if (!args)
-		return ENOMEM;
+		return -ENOMEM;
 
 	args->geo = dp->i_mount->m_dir_geo;
 	args->name = name->name;
@@ -314,18 +314,18 @@ xfs_dir_cilookup_result(
 	int		len)
 {
 	if (args->cmpresult == XFS_CMP_DIFFERENT)
-		return ENOENT;
+		return -ENOENT;
 	if (args->cmpresult != XFS_CMP_CASE ||
 					!(args->op_flags & XFS_DA_OP_CILOOKUP))
-		return EEXIST;
+		return -EEXIST;
 
 	args->value = kmem_alloc(len, KM_NOFS | KM_MAYFAIL);
 	if (!args->value)
-		return ENOMEM;
+		return -ENOMEM;
 
 	memcpy(args->value, name, len);
 	args->valuelen = len;
-	return EEXIST;
+	return -EEXIST;
 }
 
 /*
@@ -392,7 +392,7 @@ xfs_dir_lookup(
 		rval = xfs_dir2_node_lookup(args);
 
 out_check_rval:
-	if (rval == EEXIST)
+	if (rval == -EEXIST)
 		rval = 0;
 	if (!rval) {
 		*inum = args->inumber;
@@ -428,7 +428,7 @@ xfs_dir_removename(
 
 	args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
 	if (!args)
-		return ENOMEM;
+		return -ENOMEM;
 
 	args->geo = dp->i_mount->m_dir_geo;
 	args->name = name->name;
@@ -493,7 +493,7 @@ xfs_dir_replace(
 
 	args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
 	if (!args)
-		return ENOMEM;
+		return -ENOMEM;
 
 	args->geo = dp->i_mount->m_dir_geo;
 	args->name = name->name;
@@ -555,7 +555,7 @@ xfs_dir_canenter(
 
 	args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
 	if (!args)
-		return ENOMEM;
+		return -ENOMEM;
 
 	args->geo = dp->i_mount->m_dir_geo;
 	args->name = name->name;
diff --git a/fs/xfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h
index c8e86b0b5e99..c8e86b0b5e99 100644
--- a/fs/xfs/xfs_dir2.h
+++ b/fs/xfs/libxfs/xfs_dir2.h
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c
index c7cd3154026a..9628ceccfa02 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/libxfs/xfs_dir2_block.c
@@ -91,9 +91,9 @@ xfs_dir3_block_read_verify(
 
 	if (xfs_sb_version_hascrc(&mp->m_sb) &&
 	     !xfs_buf_verify_cksum(bp, XFS_DIR3_DATA_CRC_OFF))
-		xfs_buf_ioerror(bp, EFSBADCRC);
+		xfs_buf_ioerror(bp, -EFSBADCRC);
 	else if (!xfs_dir3_block_verify(bp))
-		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		xfs_buf_ioerror(bp, -EFSCORRUPTED);
 
 	if (bp->b_error)
 		xfs_verifier_error(bp);
@@ -108,7 +108,7 @@ xfs_dir3_block_write_verify(
 	struct xfs_dir3_blk_hdr	*hdr3 = bp->b_addr;
 
 	if (!xfs_dir3_block_verify(bp)) {
-		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		xfs_buf_ioerror(bp, -EFSCORRUPTED);
 		xfs_verifier_error(bp);
 		return;
 	}
@@ -392,7 +392,7 @@ xfs_dir2_block_addname(
 	if (args->op_flags & XFS_DA_OP_JUSTCHECK) {
 		xfs_trans_brelse(tp, bp);
 		if (!dup)
-			return XFS_ERROR(ENOSPC);
+			return -ENOSPC;
 		return 0;
 	}
 
@@ -402,7 +402,7 @@ xfs_dir2_block_addname(
 	if (!dup) {
 		/* Don't have a space reservation: return no-space.  */
 		if (args->total == 0)
-			return XFS_ERROR(ENOSPC);
+			return -ENOSPC;
 		/*
 		 * Convert to the next larger format.
 		 * Then add the new entry in that format.
@@ -647,7 +647,7 @@ xfs_dir2_block_lookup(
 	args->filetype = dp->d_ops->data_get_ftype(dep);
 	error = xfs_dir_cilookup_result(args, dep->name, dep->namelen);
 	xfs_trans_brelse(args->trans, bp);
-	return XFS_ERROR(error);
+	return error;
 }
 
 /*
@@ -703,7 +703,7 @@ xfs_dir2_block_lookup_int(
 		if (low > high) {
 			ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
 			xfs_trans_brelse(tp, bp);
-			return XFS_ERROR(ENOENT);
+			return -ENOENT;
 		}
 	}
 	/*
@@ -751,7 +751,7 @@ xfs_dir2_block_lookup_int(
 	 * No match, release the buffer and return ENOENT.
 	 */
 	xfs_trans_brelse(tp, bp);
-	return XFS_ERROR(ENOENT);
+	return -ENOENT;
 }
 
 /*
@@ -1091,7 +1091,7 @@ xfs_dir2_sf_to_block(
 	 */
 	if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
 		ASSERT(XFS_FORCED_SHUTDOWN(mp));
-		return XFS_ERROR(EIO);
+		return -EIO;
 	}
 
 	oldsfp = (xfs_dir2_sf_hdr_t *)ifp->if_u1.if_data;
diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c
index 8c2f6422648e..fdd803fecb8e 100644
--- a/fs/xfs/xfs_dir2_data.c
+++ b/fs/xfs/libxfs/xfs_dir2_data.c
@@ -100,7 +100,7 @@ __xfs_dir3_data_check(
 		break;
 	default:
 		XFS_ERROR_REPORT("Bad Magic", XFS_ERRLEVEL_LOW, mp);
-		return EFSCORRUPTED;
+		return -EFSCORRUPTED;
 	}
 
 	/*
@@ -256,7 +256,7 @@ xfs_dir3_data_reada_verify(
 		xfs_dir3_data_verify(bp);
 		return;
 	default:
-		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		xfs_buf_ioerror(bp, -EFSCORRUPTED);
 		xfs_verifier_error(bp);
 		break;
 	}
@@ -270,9 +270,9 @@ xfs_dir3_data_read_verify(
 
 	if (xfs_sb_version_hascrc(&mp->m_sb) &&
 	     !xfs_buf_verify_cksum(bp, XFS_DIR3_DATA_CRC_OFF))
-		 xfs_buf_ioerror(bp, EFSBADCRC);
+		 xfs_buf_ioerror(bp, -EFSBADCRC);
 	else if (!xfs_dir3_data_verify(bp))
-		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		xfs_buf_ioerror(bp, -EFSCORRUPTED);
 
 	if (bp->b_error)
 		xfs_verifier_error(bp);
@@ -287,7 +287,7 @@ xfs_dir3_data_write_verify(
 	struct xfs_dir3_blk_hdr	*hdr3 = bp->b_addr;
 
 	if (!xfs_dir3_data_verify(bp)) {
-		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		xfs_buf_ioerror(bp, -EFSCORRUPTED);
 		xfs_verifier_error(bp);
 		return;
 	}
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c
index fb0aad4440c1..a19174eb3cb2 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/libxfs/xfs_dir2_leaf.c
@@ -183,9 +183,9 @@ __read_verify(
 
 	if (xfs_sb_version_hascrc(&mp->m_sb) &&
 	     !xfs_buf_verify_cksum(bp, XFS_DIR3_LEAF_CRC_OFF))
-		xfs_buf_ioerror(bp, EFSBADCRC);
+		xfs_buf_ioerror(bp, -EFSBADCRC);
 	else if (!xfs_dir3_leaf_verify(bp, magic))
-		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		xfs_buf_ioerror(bp, -EFSCORRUPTED);
 
 	if (bp->b_error)
 		xfs_verifier_error(bp);
@@ -201,7 +201,7 @@ __write_verify(
 	struct xfs_dir3_leaf_hdr *hdr3 = bp->b_addr;
 
 	if (!xfs_dir3_leaf_verify(bp, magic)) {
-		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		xfs_buf_ioerror(bp, -EFSCORRUPTED);
 		xfs_verifier_error(bp);
 		return;
 	}
@@ -731,7 +731,7 @@ xfs_dir2_leaf_addname(
 		if ((args->op_flags & XFS_DA_OP_JUSTCHECK) ||
 							args->total == 0) {
 			xfs_trans_brelse(tp, lbp);
-			return XFS_ERROR(ENOSPC);
+			return -ENOSPC;
 		}
 		/*
 		 * Convert to node form.
@@ -755,7 +755,7 @@ xfs_dir2_leaf_addname(
 	 */
 	if (args->op_flags & XFS_DA_OP_JUSTCHECK) {
 		xfs_trans_brelse(tp, lbp);
-		return use_block == -1 ? XFS_ERROR(ENOSPC) : 0;
+		return use_block == -1 ? -ENOSPC : 0;
 	}
 	/*
 	 * If no allocations are allowed, return now before we've
@@ -763,7 +763,7 @@ xfs_dir2_leaf_addname(
 	 */
 	if (args->total == 0 && use_block == -1) {
 		xfs_trans_brelse(tp, lbp);
-		return XFS_ERROR(ENOSPC);
+		return -ENOSPC;
 	}
 	/*
 	 * Need to compact the leaf entries, removing stale ones.
@@ -1198,7 +1198,7 @@ xfs_dir2_leaf_lookup(
 	error = xfs_dir_cilookup_result(args, dep->name, dep->namelen);
 	xfs_trans_brelse(tp, dbp);
 	xfs_trans_brelse(tp, lbp);
-	return XFS_ERROR(error);
+	return error;
 }
 
 /*
@@ -1327,13 +1327,13 @@ xfs_dir2_leaf_lookup_int(
 		return 0;
 	}
 	/*
-	 * No match found, return ENOENT.
+	 * No match found, return -ENOENT.
 	 */
 	ASSERT(cidb == -1);
 	if (dbp)
 		xfs_trans_brelse(tp, dbp);
 	xfs_trans_brelse(tp, lbp);
-	return XFS_ERROR(ENOENT);
+	return -ENOENT;
 }
 
 /*
@@ -1440,7 +1440,7 @@ xfs_dir2_leaf_removename(
 			 * Just go on, returning success, leaving the
 			 * empty block in place.
 			 */
-			if (error == ENOSPC && args->total == 0)
+			if (error == -ENOSPC && args->total == 0)
 				error = 0;
 			xfs_dir3_leaf_check(dp, lbp);
 			return error;
@@ -1641,7 +1641,7 @@ xfs_dir2_leaf_trim_data(
 	 * Get rid of the data block.
 	 */
 	if ((error = xfs_dir2_shrink_inode(args, db, dbp))) {
-		ASSERT(error != ENOSPC);
+		ASSERT(error != -ENOSPC);
 		xfs_trans_brelse(tp, dbp);
 		return error;
 	}
@@ -1815,7 +1815,7 @@ xfs_dir2_node_to_leaf(
 		 * punching out the middle of an extent, and this is an
 		 * isolated block.
 		 */
-		ASSERT(error != ENOSPC);
+		ASSERT(error != -ENOSPC);
 		return error;
 	}
 	fbp = NULL;
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c
index da43d304fca2..2ae6ac2c11ae 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/libxfs/xfs_dir2_node.c
@@ -117,9 +117,9 @@ xfs_dir3_free_read_verify(
 
 	if (xfs_sb_version_hascrc(&mp->m_sb) &&
 	    !xfs_buf_verify_cksum(bp, XFS_DIR3_FREE_CRC_OFF))
-		xfs_buf_ioerror(bp, EFSBADCRC);
+		xfs_buf_ioerror(bp, -EFSBADCRC);
 	else if (!xfs_dir3_free_verify(bp))
-		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		xfs_buf_ioerror(bp, -EFSCORRUPTED);
 
 	if (bp->b_error)
 		xfs_verifier_error(bp);
@@ -134,7 +134,7 @@ xfs_dir3_free_write_verify(
 	struct xfs_dir3_blk_hdr	*hdr3 = bp->b_addr;
 
 	if (!xfs_dir3_free_verify(bp)) {
-		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		xfs_buf_ioerror(bp, -EFSCORRUPTED);
 		xfs_verifier_error(bp);
 		return;
 	}
@@ -406,7 +406,7 @@ xfs_dir2_leafn_add(
 	 * into other peoples memory
 	 */
 	if (index < 0)
-		return XFS_ERROR(EFSCORRUPTED);
+		return -EFSCORRUPTED;
 
 	/*
 	 * If there are already the maximum number of leaf entries in
@@ -417,7 +417,7 @@ xfs_dir2_leafn_add(
 
 	if (leafhdr.count == dp->d_ops->leaf_max_ents(args->geo)) {
 		if (!leafhdr.stale)
-			return XFS_ERROR(ENOSPC);
+			return -ENOSPC;
 		compact = leafhdr.stale > 1;
 	} else
 		compact = 0;
@@ -629,7 +629,7 @@ xfs_dir2_leafn_lookup_for_addname(
 							XFS_ERRLEVEL_LOW, mp);
 				if (curfdb != newfdb)
 					xfs_trans_brelse(tp, curbp);
-				return XFS_ERROR(EFSCORRUPTED);
+				return -EFSCORRUPTED;
 			}
 			curfdb = newfdb;
 			if (be16_to_cpu(bests[fi]) >= length)
@@ -660,7 +660,7 @@ out:
 	 * Return the index, that will be the insertion point.
 	 */
 	*indexp = index;
-	return XFS_ERROR(ENOENT);
+	return -ENOENT;
 }
 
 /*
@@ -789,7 +789,7 @@ xfs_dir2_leafn_lookup_for_entry(
 			curbp->b_ops = &xfs_dir3_data_buf_ops;
 			xfs_trans_buf_set_type(tp, curbp, XFS_BLFT_DIR_DATA_BUF);
 			if (cmp == XFS_CMP_EXACT)
-				return XFS_ERROR(EEXIST);
+				return -EEXIST;
 		}
 	}
 	ASSERT(index == leafhdr.count || (args->op_flags & XFS_DA_OP_OKNOENT));
@@ -812,7 +812,7 @@ xfs_dir2_leafn_lookup_for_entry(
 		state->extravalid = 0;
 	}
 	*indexp = index;
-	return XFS_ERROR(ENOENT);
+	return -ENOENT;
 }
 
 /*
@@ -1133,7 +1133,7 @@ xfs_dir3_data_block_free(
 		if (error == 0) {
 			fbp = NULL;
 			logfree = 0;
-		} else if (error != ENOSPC || args->total != 0)
+		} else if (error != -ENOSPC || args->total != 0)
 			return error;
 		/*
 		 * It's possible to get ENOSPC if there is no
@@ -1287,7 +1287,7 @@ xfs_dir2_leafn_remove(
 			 * In this case just drop the buffer and some one else
 			 * will eventually get rid of the empty block.
 			 */
-			else if (!(error == ENOSPC && args->total == 0))
+			else if (!(error == -ENOSPC && args->total == 0))
 				return error;
 		}
 		/*
@@ -1599,7 +1599,7 @@ xfs_dir2_node_addname(
 	error = xfs_da3_node_lookup_int(state, &rval);
 	if (error)
 		rval = error;
-	if (rval != ENOENT) {
+	if (rval != -ENOENT) {
 		goto done;
 	}
 	/*
@@ -1628,7 +1628,7 @@ xfs_dir2_node_addname(
 		 * It didn't work, we need to split the leaf block.
 		 */
 		if (args->total == 0) {
-			ASSERT(rval == ENOSPC);
+			ASSERT(rval == -ENOSPC);
 			goto done;
 		}
 		/*
@@ -1815,7 +1815,7 @@ xfs_dir2_node_addname_int(
 		 * Not allowed to allocate, return failure.
 		 */
 		if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0)
-			return XFS_ERROR(ENOSPC);
+			return -ENOSPC;
 
 		/*
 		 * Allocate and initialize the new data block.
@@ -1876,7 +1876,7 @@ xfs_dir2_node_addname_int(
 				}
 				XFS_ERROR_REPORT("xfs_dir2_node_addname_int",
 						 XFS_ERRLEVEL_LOW, mp);
-				return XFS_ERROR(EFSCORRUPTED);
+				return -EFSCORRUPTED;
 			}
 
 			/*
@@ -2042,8 +2042,8 @@ xfs_dir2_node_lookup(
 	error = xfs_da3_node_lookup_int(state, &rval);
 	if (error)
 		rval = error;
-	else if (rval == ENOENT && args->cmpresult == XFS_CMP_CASE) {
-		/* If a CI match, dup the actual name and return EEXIST */
+	else if (rval == -ENOENT && args->cmpresult == XFS_CMP_CASE) {
+		/* If a CI match, dup the actual name and return -EEXIST */
 		xfs_dir2_data_entry_t	*dep;
 
 		dep = (xfs_dir2_data_entry_t *)
@@ -2096,7 +2096,7 @@ xfs_dir2_node_removename(
 		goto out_free;
 
 	/* Didn't find it, upper layer screwed up. */
-	if (rval != EEXIST) {
+	if (rval != -EEXIST) {
 		error = rval;
 		goto out_free;
 	}
@@ -2169,7 +2169,7 @@ xfs_dir2_node_replace(
 	 * It should be found, since the vnodeops layer has looked it up
 	 * and locked it.  But paranoia is good.
 	 */
-	if (rval == EEXIST) {
+	if (rval == -EEXIST) {
 		struct xfs_dir2_leaf_entry *ents;
 		/*
 		 * Find the leaf entry.
@@ -2272,7 +2272,7 @@ xfs_dir2_node_trim_free(
 		 * space reservation, when breaking up an extent into two
 		 * pieces.  This is the last block of an extent.
 		 */
-		ASSERT(error != ENOSPC);
+		ASSERT(error != -ENOSPC);
 		xfs_trans_brelse(tp, bp);
 		return error;
 	}
diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h
index 27ce0794d196..27ce0794d196 100644
--- a/fs/xfs/xfs_dir2_priv.h
+++ b/fs/xfs/libxfs/xfs_dir2_priv.h
diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c
index 53c3be619db5..5079e051ef08 100644
--- a/fs/xfs/xfs_dir2_sf.c
+++ b/fs/xfs/libxfs/xfs_dir2_sf.c
@@ -51,10 +51,9 @@ static void xfs_dir2_sf_check(xfs_da_args_t *args);
 #else
 #define	xfs_dir2_sf_check(args)
 #endif /* DEBUG */
-#if XFS_BIG_INUMS
+
 static void xfs_dir2_sf_toino4(xfs_da_args_t *args);
 static void xfs_dir2_sf_toino8(xfs_da_args_t *args);
-#endif /* XFS_BIG_INUMS */
 
 /*
  * Given a block directory (dp/block), calculate its size as a shortform (sf)
@@ -117,10 +116,10 @@ xfs_dir2_block_sfsize(
 		isdotdot =
 			dep->namelen == 2 &&
 			dep->name[0] == '.' && dep->name[1] == '.';
-#if XFS_BIG_INUMS
+
 		if (!isdot)
 			i8count += be64_to_cpu(dep->inumber) > XFS_DIR2_MAX_SHORT_INUM;
-#endif
+
 		/* take into account the file type field */
 		if (!isdot && !isdotdot) {
 			count++;
@@ -251,7 +250,7 @@ xfs_dir2_block_to_sf(
 	logflags = XFS_ILOG_CORE;
 	error = xfs_dir2_shrink_inode(args, args->geo->datablk, bp);
 	if (error) {
-		ASSERT(error != ENOSPC);
+		ASSERT(error != -ENOSPC);
 		goto out;
 	}
 
@@ -299,7 +298,7 @@ xfs_dir2_sf_addname(
 
 	trace_xfs_dir2_sf_addname(args);
 
-	ASSERT(xfs_dir2_sf_lookup(args) == ENOENT);
+	ASSERT(xfs_dir2_sf_lookup(args) == -ENOENT);
 	dp = args->dp;
 	ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
 	/*
@@ -307,7 +306,7 @@ xfs_dir2_sf_addname(
 	 */
 	if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
 		ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
-		return XFS_ERROR(EIO);
+		return -EIO;
 	}
 	ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
 	ASSERT(dp->i_df.if_u1.if_data != NULL);
@@ -318,7 +317,7 @@ xfs_dir2_sf_addname(
 	 */
 	incr_isize = dp->d_ops->sf_entsize(sfp, args->namelen);
 	objchange = 0;
-#if XFS_BIG_INUMS
+
 	/*
 	 * Do we have to change to 8 byte inodes?
 	 */
@@ -332,7 +331,7 @@ xfs_dir2_sf_addname(
 			 (uint)sizeof(xfs_dir2_ino4_t));
 		objchange = 1;
 	}
-#endif
+
 	new_isize = (int)dp->i_d.di_size + incr_isize;
 	/*
 	 * Won't fit as shortform any more (due to size),
@@ -345,7 +344,7 @@ xfs_dir2_sf_addname(
 		 * Just checking or no space reservation, it doesn't fit.
 		 */
 		if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0)
-			return XFS_ERROR(ENOSPC);
+			return -ENOSPC;
 		/*
 		 * Convert to block form then add the name.
 		 */
@@ -370,10 +369,8 @@ xfs_dir2_sf_addname(
 	 */
 	else {
 		ASSERT(pick == 2);
-#if XFS_BIG_INUMS
 		if (objchange)
 			xfs_dir2_sf_toino8(args);
-#endif
 		xfs_dir2_sf_addname_hard(args, objchange, new_isize);
 	}
 	xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
@@ -425,10 +422,8 @@ xfs_dir2_sf_addname_easy(
 	 * Update the header and inode.
 	 */
 	sfp->count++;
-#if XFS_BIG_INUMS
 	if (args->inumber > XFS_DIR2_MAX_SHORT_INUM)
 		sfp->i8count++;
-#endif
 	dp->i_d.di_size = new_isize;
 	xfs_dir2_sf_check(args);
 }
@@ -516,10 +511,8 @@ xfs_dir2_sf_addname_hard(
 	dp->d_ops->sf_put_ino(sfp, sfep, args->inumber);
 	dp->d_ops->sf_put_ftype(sfep, args->filetype);
 	sfp->count++;
-#if XFS_BIG_INUMS
 	if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && !objchange)
 		sfp->i8count++;
-#endif
 	/*
 	 * If there's more left to copy, do that.
 	 */
@@ -593,13 +586,8 @@ xfs_dir2_sf_addname_pick(
 	/*
 	 * If changing the inode number size, do it the hard way.
 	 */
-#if XFS_BIG_INUMS
-	if (objchange) {
+	if (objchange)
 		return 2;
-	}
-#else
-	ASSERT(objchange == 0);
-#endif
 	/*
 	 * If it won't fit at the end then do it the hard way (use the hole).
 	 */
@@ -650,7 +638,6 @@ xfs_dir2_sf_check(
 		ASSERT(dp->d_ops->sf_get_ftype(sfep) < XFS_DIR3_FT_MAX);
 	}
 	ASSERT(i8count == sfp->i8count);
-	ASSERT(XFS_BIG_INUMS || i8count == 0);
 	ASSERT((char *)sfep - (char *)sfp == dp->i_d.di_size);
 	ASSERT(offset +
 	       (sfp->count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t) +
@@ -738,7 +725,7 @@ xfs_dir2_sf_lookup(
 	 */
 	if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
 		ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
-		return XFS_ERROR(EIO);
+		return -EIO;
 	}
 	ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
 	ASSERT(dp->i_df.if_u1.if_data != NULL);
@@ -751,7 +738,7 @@ xfs_dir2_sf_lookup(
 		args->inumber = dp->i_ino;
 		args->cmpresult = XFS_CMP_EXACT;
 		args->filetype = XFS_DIR3_FT_DIR;
-		return XFS_ERROR(EEXIST);
+		return -EEXIST;
 	}
 	/*
 	 * Special case for ..
@@ -761,7 +748,7 @@ xfs_dir2_sf_lookup(
 		args->inumber = dp->d_ops->sf_get_parent_ino(sfp);
 		args->cmpresult = XFS_CMP_EXACT;
 		args->filetype = XFS_DIR3_FT_DIR;
-		return XFS_ERROR(EEXIST);
+		return -EEXIST;
 	}
 	/*
 	 * Loop over all the entries trying to match ours.
@@ -781,20 +768,20 @@ xfs_dir2_sf_lookup(
 			args->inumber = dp->d_ops->sf_get_ino(sfp, sfep);
 			args->filetype = dp->d_ops->sf_get_ftype(sfep);
 			if (cmp == XFS_CMP_EXACT)
-				return XFS_ERROR(EEXIST);
+				return -EEXIST;
 			ci_sfep = sfep;
 		}
 	}
 	ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
 	/*
 	 * Here, we can only be doing a lookup (not a rename or replace).
-	 * If a case-insensitive match was not found, return ENOENT.
+	 * If a case-insensitive match was not found, return -ENOENT.
 	 */
 	if (!ci_sfep)
-		return XFS_ERROR(ENOENT);
+		return -ENOENT;
 	/* otherwise process the CI match as required by the caller */
 	error = xfs_dir_cilookup_result(args, ci_sfep->name, ci_sfep->namelen);
-	return XFS_ERROR(error);
+	return error;
 }
 
 /*
@@ -824,7 +811,7 @@ xfs_dir2_sf_removename(
 	 */
 	if (oldsize < offsetof(xfs_dir2_sf_hdr_t, parent)) {
 		ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
-		return XFS_ERROR(EIO);
+		return -EIO;
 	}
 	ASSERT(dp->i_df.if_bytes == oldsize);
 	ASSERT(dp->i_df.if_u1.if_data != NULL);
@@ -847,7 +834,7 @@ xfs_dir2_sf_removename(
 	 * Didn't find it.
 	 */
 	if (i == sfp->count)
-		return XFS_ERROR(ENOENT);
+		return -ENOENT;
 	/*
 	 * Calculate sizes.
 	 */
@@ -870,7 +857,6 @@ xfs_dir2_sf_removename(
 	 */
 	xfs_idata_realloc(dp, newsize - oldsize, XFS_DATA_FORK);
 	sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
-#if XFS_BIG_INUMS
 	/*
 	 * Are we changing inode number size?
 	 */
@@ -880,7 +866,6 @@ xfs_dir2_sf_removename(
 		else
 			sfp->i8count--;
 	}
-#endif
 	xfs_dir2_sf_check(args);
 	xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
 	return 0;
@@ -895,12 +880,8 @@ xfs_dir2_sf_replace(
 {
 	xfs_inode_t		*dp;		/* incore directory inode */
 	int			i;		/* entry index */
-#if XFS_BIG_INUMS || defined(DEBUG)
 	xfs_ino_t		ino=0;		/* entry old inode number */
-#endif
-#if XFS_BIG_INUMS
 	int			i8elevated;	/* sf_toino8 set i8count=1 */
-#endif
 	xfs_dir2_sf_entry_t	*sfep;		/* shortform directory entry */
 	xfs_dir2_sf_hdr_t	*sfp;		/* shortform structure */
 
@@ -914,13 +895,13 @@ xfs_dir2_sf_replace(
 	 */
 	if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
 		ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
-		return XFS_ERROR(EIO);
+		return -EIO;
 	}
 	ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
 	ASSERT(dp->i_df.if_u1.if_data != NULL);
 	sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
 	ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->i8count));
-#if XFS_BIG_INUMS
+
 	/*
 	 * New inode number is large, and need to convert to 8-byte inodes.
 	 */
@@ -951,17 +932,15 @@ xfs_dir2_sf_replace(
 		sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
 	} else
 		i8elevated = 0;
-#endif
+
 	ASSERT(args->namelen != 1 || args->name[0] != '.');
 	/*
 	 * Replace ..'s entry.
 	 */
 	if (args->namelen == 2 &&
 	    args->name[0] == '.' && args->name[1] == '.') {
-#if XFS_BIG_INUMS || defined(DEBUG)
 		ino = dp->d_ops->sf_get_parent_ino(sfp);
 		ASSERT(args->inumber != ino);
-#endif
 		dp->d_ops->sf_put_parent_ino(sfp, args->inumber);
 	}
 	/*
@@ -972,10 +951,8 @@ xfs_dir2_sf_replace(
 		     i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) {
 			if (xfs_da_compname(args, sfep->name, sfep->namelen) ==
 								XFS_CMP_EXACT) {
-#if XFS_BIG_INUMS || defined(DEBUG)
 				ino = dp->d_ops->sf_get_ino(sfp, sfep);
 				ASSERT(args->inumber != ino);
-#endif
 				dp->d_ops->sf_put_ino(sfp, sfep, args->inumber);
 				dp->d_ops->sf_put_ftype(sfep, args->filetype);
 				break;
@@ -986,14 +963,11 @@ xfs_dir2_sf_replace(
 		 */
 		if (i == sfp->count) {
 			ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
-#if XFS_BIG_INUMS
 			if (i8elevated)
 				xfs_dir2_sf_toino4(args);
-#endif
-			return XFS_ERROR(ENOENT);
+			return -ENOENT;
 		}
 	}
-#if XFS_BIG_INUMS
 	/*
 	 * See if the old number was large, the new number is small.
 	 */
@@ -1020,13 +994,11 @@ xfs_dir2_sf_replace(
 		if (!i8elevated)
 			sfp->i8count++;
 	}
-#endif
 	xfs_dir2_sf_check(args);
 	xfs_trans_log_inode(args->trans, dp, XFS_ILOG_DDATA);
 	return 0;
 }
 
-#if XFS_BIG_INUMS
 /*
  * Convert from 8-byte inode numbers to 4-byte inode numbers.
  * The last 8-byte inode number is gone, but the count is still 1.
@@ -1181,4 +1153,3 @@ xfs_dir2_sf_toino8(
 	dp->i_d.di_size = newsize;
 	xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
 }
-#endif	/* XFS_BIG_INUMS */
diff --git a/fs/xfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c
index c2ac0c611ad8..bb969337efc8 100644
--- a/fs/xfs/xfs_dquot_buf.c
+++ b/fs/xfs/libxfs/xfs_dquot_buf.c
@@ -257,9 +257,9 @@ xfs_dquot_buf_read_verify(
 	struct xfs_mount	*mp = bp->b_target->bt_mount;
 
 	if (!xfs_dquot_buf_verify_crc(mp, bp))
-		xfs_buf_ioerror(bp, EFSBADCRC);
+		xfs_buf_ioerror(bp, -EFSBADCRC);
 	else if (!xfs_dquot_buf_verify(mp, bp))
-		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		xfs_buf_ioerror(bp, -EFSCORRUPTED);
 
 	if (bp->b_error)
 		xfs_verifier_error(bp);
@@ -277,7 +277,7 @@ xfs_dquot_buf_write_verify(
 	struct xfs_mount	*mp = bp->b_target->bt_mount;
 
 	if (!xfs_dquot_buf_verify(mp, bp)) {
-		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		xfs_buf_ioerror(bp, -EFSCORRUPTED);
 		xfs_verifier_error(bp);
 		return;
 	}
diff --git a/fs/xfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 34d85aca3058..7e42bba9a420 100644
--- a/fs/xfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -68,11 +68,7 @@ struct xfs_ifork;
 #define	XFS_RTLOBIT(w)	xfs_lowbit32(w)
 #define	XFS_RTHIBIT(w)	xfs_highbit32(w)
 
-#if XFS_BIG_BLKNOS
 #define	XFS_RTBLOCKLOG(b)	xfs_highbit64(b)
-#else
-#define	XFS_RTBLOCKLOG(b)	xfs_highbit32(b)
-#endif
 
 /*
  * Dquot and dquot block format definitions
@@ -304,23 +300,15 @@ typedef struct xfs_bmbt_rec_host {
  * Values and macros for delayed-allocation startblock fields.
  */
 #define STARTBLOCKVALBITS	17
-#define STARTBLOCKMASKBITS	(15 + XFS_BIG_BLKNOS * 20)
-#define DSTARTBLOCKMASKBITS	(15 + 20)
+#define STARTBLOCKMASKBITS	(15 + 20)
 #define STARTBLOCKMASK		\
 	(((((xfs_fsblock_t)1) << STARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS)
-#define DSTARTBLOCKMASK		\
-	(((((xfs_dfsbno_t)1) << DSTARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS)
 
 static inline int isnullstartblock(xfs_fsblock_t x)
 {
 	return ((x) & STARTBLOCKMASK) == STARTBLOCKMASK;
 }
 
-static inline int isnulldstartblock(xfs_dfsbno_t x)
-{
-	return ((x) & DSTARTBLOCKMASK) == DSTARTBLOCKMASK;
-}
-
 static inline xfs_fsblock_t nullstartblock(int k)
 {
 	ASSERT(k < (1 << STARTBLOCKVALBITS));
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 5960e5593fe0..b62771f1f4b5 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -292,7 +292,7 @@ xfs_ialloc_inode_init(
 					 mp->m_bsize * blks_per_cluster,
 					 XBF_UNMAPPED);
 		if (!fbuf)
-			return ENOMEM;
+			return -ENOMEM;
 
 		/* Initialize the inode buffers and log them appropriately. */
 		fbuf->b_ops = &xfs_inode_buf_ops;
@@ -380,7 +380,7 @@ xfs_ialloc_ag_alloc(
 	newlen = args.mp->m_ialloc_inos;
 	if (args.mp->m_maxicount &&
 	    args.mp->m_sb.sb_icount + newlen > args.mp->m_maxicount)
-		return XFS_ERROR(ENOSPC);
+		return -ENOSPC;
 	args.minlen = args.maxlen = args.mp->m_ialloc_blks;
 	/*
 	 * First try to allocate inodes contiguous with the last-allocated
@@ -1385,7 +1385,7 @@ xfs_dialloc(
 		if (error) {
 			xfs_trans_brelse(tp, agbp);
 
-			if (error != ENOSPC)
+			if (error != -ENOSPC)
 				goto out_error;
 
 			xfs_perag_put(pag);
@@ -1416,7 +1416,7 @@ nextag:
 			agno = 0;
 		if (agno == start_agno) {
 			*inop = NULLFSINO;
-			return noroom ? ENOSPC : 0;
+			return noroom ? -ENOSPC : 0;
 		}
 	}
 
@@ -1425,7 +1425,7 @@ out_alloc:
 	return xfs_dialloc_ag(tp, agbp, parent, inop);
 out_error:
 	xfs_perag_put(pag);
-	return XFS_ERROR(error);
+	return error;
 }
 
 STATIC int
@@ -1682,7 +1682,7 @@ xfs_difree(
 		xfs_warn(mp, "%s: agno >= mp->m_sb.sb_agcount (%d >= %d).",
 			__func__, agno, mp->m_sb.sb_agcount);
 		ASSERT(0);
-		return XFS_ERROR(EINVAL);
+		return -EINVAL;
 	}
 	agino = XFS_INO_TO_AGINO(mp, inode);
 	if (inode != XFS_AGINO_TO_INO(mp, agno, agino))  {
@@ -1690,14 +1690,14 @@ xfs_difree(
 			__func__, (unsigned long long)inode,
 			(unsigned long long)XFS_AGINO_TO_INO(mp, agno, agino));
 		ASSERT(0);
-		return XFS_ERROR(EINVAL);
+		return -EINVAL;
 	}
 	agbno = XFS_AGINO_TO_AGBNO(mp, agino);
 	if (agbno >= mp->m_sb.sb_agblocks)  {
 		xfs_warn(mp, "%s: agbno >= mp->m_sb.sb_agblocks (%d >= %d).",
 			__func__, agbno, mp->m_sb.sb_agblocks);
 		ASSERT(0);
-		return XFS_ERROR(EINVAL);
+		return -EINVAL;
 	}
 	/*
 	 * Get the allocation group header.
@@ -1769,7 +1769,7 @@ xfs_imap_lookup(
 		if (i)
 			error = xfs_inobt_get_rec(cur, &rec, &i);
 		if (!error && i == 0)
-			error = EINVAL;
+			error = -EINVAL;
 	}
 
 	xfs_trans_brelse(tp, agbp);
@@ -1780,12 +1780,12 @@ xfs_imap_lookup(
 	/* check that the returned record contains the required inode */
 	if (rec.ir_startino > agino ||
 	    rec.ir_startino + mp->m_ialloc_inos <= agino)
-		return EINVAL;
+		return -EINVAL;
 
 	/* for untrusted inodes check it is allocated first */
 	if ((flags & XFS_IGET_UNTRUSTED) &&
 	    (rec.ir_free & XFS_INOBT_MASK(agino - rec.ir_startino)))
-		return EINVAL;
+		return -EINVAL;
 
 	*chunk_agbno = XFS_AGINO_TO_AGBNO(mp, rec.ir_startino);
 	*offset_agbno = agbno - *chunk_agbno;
@@ -1829,7 +1829,7 @@ xfs_imap(
 		 * as they can be invalid without implying corruption.
 		 */
 		if (flags & XFS_IGET_UNTRUSTED)
-			return XFS_ERROR(EINVAL);
+			return -EINVAL;
 		if (agno >= mp->m_sb.sb_agcount) {
 			xfs_alert(mp,
 				"%s: agno (%d) >= mp->m_sb.sb_agcount (%d)",
@@ -1849,7 +1849,7 @@ xfs_imap(
 		}
 		xfs_stack_trace();
 #endif /* DEBUG */
-		return XFS_ERROR(EINVAL);
+		return -EINVAL;
 	}
 
 	blks_per_cluster = xfs_icluster_size_fsb(mp);
@@ -1922,7 +1922,7 @@ out_map:
 			__func__, (unsigned long long) imap->im_blkno,
 			(unsigned long long) imap->im_len,
 			XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
-		return XFS_ERROR(EINVAL);
+		return -EINVAL;
 	}
 	return 0;
 }
@@ -2072,11 +2072,11 @@ xfs_agi_read_verify(
 
 	if (xfs_sb_version_hascrc(&mp->m_sb) &&
 	    !xfs_buf_verify_cksum(bp, XFS_AGI_CRC_OFF))
-		xfs_buf_ioerror(bp, EFSBADCRC);
+		xfs_buf_ioerror(bp, -EFSBADCRC);
 	else if (XFS_TEST_ERROR(!xfs_agi_verify(bp), mp,
 				XFS_ERRTAG_IALLOC_READ_AGI,
 				XFS_RANDOM_IALLOC_READ_AGI))
-		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		xfs_buf_ioerror(bp, -EFSCORRUPTED);
 
 	if (bp->b_error)
 		xfs_verifier_error(bp);
@@ -2090,7 +2090,7 @@ xfs_agi_write_verify(
 	struct xfs_buf_log_item	*bip = bp->b_fspriv;
 
 	if (!xfs_agi_verify(bp)) {
-		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		xfs_buf_ioerror(bp, -EFSCORRUPTED);
 		xfs_verifier_error(bp);
 		return;
 	}
diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h
index 95ad1c002d60..95ad1c002d60 100644
--- a/fs/xfs/xfs_ialloc.h
+++ b/fs/xfs/libxfs/xfs_ialloc.h
diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index 726f83a681a5..c9b06f30fe86 100644
--- a/fs/xfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -272,9 +272,9 @@ xfs_inobt_read_verify(
 	struct xfs_buf	*bp)
 {
 	if (!xfs_btree_sblock_verify_crc(bp))
-		xfs_buf_ioerror(bp, EFSBADCRC);
+		xfs_buf_ioerror(bp, -EFSBADCRC);
 	else if (!xfs_inobt_verify(bp))
-		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		xfs_buf_ioerror(bp, -EFSCORRUPTED);
 
 	if (bp->b_error) {
 		trace_xfs_btree_corrupt(bp, _RET_IP_);
@@ -288,7 +288,7 @@ xfs_inobt_write_verify(
 {
 	if (!xfs_inobt_verify(bp)) {
 		trace_xfs_btree_corrupt(bp, _RET_IP_);
-		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		xfs_buf_ioerror(bp, -EFSCORRUPTED);
 		xfs_verifier_error(bp);
 		return;
 	}
diff --git a/fs/xfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h
index d7ebea72c2d0..d7ebea72c2d0 100644
--- a/fs/xfs/xfs_ialloc_btree.h
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.h
diff --git a/fs/xfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index cb35ae41d4a1..f18fd2da49f7 100644
--- a/fs/xfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -101,7 +101,7 @@ xfs_inode_buf_verify(
 				return;
 			}
 
-			xfs_buf_ioerror(bp, EFSCORRUPTED);
+			xfs_buf_ioerror(bp, -EFSCORRUPTED);
 			xfs_verifier_error(bp);
 #ifdef DEBUG
 			xfs_alert(mp,
@@ -174,14 +174,14 @@ xfs_imap_to_bp(
 				   (int)imap->im_len, buf_flags, &bp,
 				   &xfs_inode_buf_ops);
 	if (error) {
-		if (error == EAGAIN) {
+		if (error == -EAGAIN) {
 			ASSERT(buf_flags & XBF_TRYLOCK);
 			return error;
 		}
 
-		if (error == EFSCORRUPTED &&
+		if (error == -EFSCORRUPTED &&
 		    (iget_flags & XFS_IGET_UNTRUSTED))
-			return XFS_ERROR(EINVAL);
+			return -EINVAL;
 
 		xfs_warn(mp, "%s: xfs_trans_read_buf() returned error %d.",
 			__func__, error);
@@ -390,7 +390,7 @@ xfs_iread(
 				__func__, ip->i_ino);
 
 		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, dip);
-		error = XFS_ERROR(EFSCORRUPTED);
+		error = -EFSCORRUPTED;
 		goto out_brelse;
 	}
 
diff --git a/fs/xfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h
index 9308c47f2a52..9308c47f2a52 100644
--- a/fs/xfs/xfs_inode_buf.h
+++ b/fs/xfs/libxfs/xfs_inode_buf.h
diff --git a/fs/xfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index b031e8d0d928..6a00f7fed69d 100644
--- a/fs/xfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -102,7 +102,7 @@ xfs_iformat_fork(
 				be64_to_cpu(dip->di_nblocks));
 		XFS_CORRUPTION_ERROR("xfs_iformat(1)", XFS_ERRLEVEL_LOW,
 				     ip->i_mount, dip);
-		return XFS_ERROR(EFSCORRUPTED);
+		return -EFSCORRUPTED;
 	}
 
 	if (unlikely(dip->di_forkoff > ip->i_mount->m_sb.sb_inodesize)) {
@@ -111,7 +111,7 @@ xfs_iformat_fork(
 			dip->di_forkoff);
 		XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW,
 				     ip->i_mount, dip);
-		return XFS_ERROR(EFSCORRUPTED);
+		return -EFSCORRUPTED;
 	}
 
 	if (unlikely((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) &&
@@ -121,7 +121,7 @@ xfs_iformat_fork(
 			ip->i_ino);
 		XFS_CORRUPTION_ERROR("xfs_iformat(realtime)",
 				     XFS_ERRLEVEL_LOW, ip->i_mount, dip);
-		return XFS_ERROR(EFSCORRUPTED);
+		return -EFSCORRUPTED;
 	}
 
 	switch (ip->i_d.di_mode & S_IFMT) {
@@ -132,7 +132,7 @@ xfs_iformat_fork(
 		if (unlikely(dip->di_format != XFS_DINODE_FMT_DEV)) {
 			XFS_CORRUPTION_ERROR("xfs_iformat(3)", XFS_ERRLEVEL_LOW,
 					      ip->i_mount, dip);
-			return XFS_ERROR(EFSCORRUPTED);
+			return -EFSCORRUPTED;
 		}
 		ip->i_d.di_size = 0;
 		ip->i_df.if_u2.if_rdev = xfs_dinode_get_rdev(dip);
@@ -153,7 +153,7 @@ xfs_iformat_fork(
 				XFS_CORRUPTION_ERROR("xfs_iformat(4)",
 						     XFS_ERRLEVEL_LOW,
 						     ip->i_mount, dip);
-				return XFS_ERROR(EFSCORRUPTED);
+				return -EFSCORRUPTED;
 			}
 
 			di_size = be64_to_cpu(dip->di_size);
@@ -166,7 +166,7 @@ xfs_iformat_fork(
 				XFS_CORRUPTION_ERROR("xfs_iformat(5)",
 						     XFS_ERRLEVEL_LOW,
 						     ip->i_mount, dip);
-				return XFS_ERROR(EFSCORRUPTED);
+				return -EFSCORRUPTED;
 			}
 
 			size = (int)di_size;
@@ -181,13 +181,13 @@ xfs_iformat_fork(
 		default:
 			XFS_ERROR_REPORT("xfs_iformat(6)", XFS_ERRLEVEL_LOW,
 					 ip->i_mount);
-			return XFS_ERROR(EFSCORRUPTED);
+			return -EFSCORRUPTED;
 		}
 		break;
 
 	default:
 		XFS_ERROR_REPORT("xfs_iformat(7)", XFS_ERRLEVEL_LOW, ip->i_mount);
-		return XFS_ERROR(EFSCORRUPTED);
+		return -EFSCORRUPTED;
 	}
 	if (error) {
 		return error;
@@ -211,7 +211,7 @@ xfs_iformat_fork(
 			XFS_CORRUPTION_ERROR("xfs_iformat(8)",
 					     XFS_ERRLEVEL_LOW,
 					     ip->i_mount, dip);
-			return XFS_ERROR(EFSCORRUPTED);
+			return -EFSCORRUPTED;
 		}
 
 		error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size);
@@ -223,7 +223,7 @@ xfs_iformat_fork(
 		error = xfs_iformat_btree(ip, dip, XFS_ATTR_FORK);
 		break;
 	default:
-		error = XFS_ERROR(EFSCORRUPTED);
+		error = -EFSCORRUPTED;
 		break;
 	}
 	if (error) {
@@ -266,7 +266,7 @@ xfs_iformat_local(
 			XFS_DFORK_SIZE(dip, ip->i_mount, whichfork));
 		XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW,
 				     ip->i_mount, dip);
-		return XFS_ERROR(EFSCORRUPTED);
+		return -EFSCORRUPTED;
 	}
 	ifp = XFS_IFORK_PTR(ip, whichfork);
 	real_size = 0;
@@ -322,7 +322,7 @@ xfs_iformat_extents(
 			(unsigned long long) ip->i_ino, nex);
 		XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW,
 				     ip->i_mount, dip);
-		return XFS_ERROR(EFSCORRUPTED);
+		return -EFSCORRUPTED;
 	}
 
 	ifp->if_real_bytes = 0;
@@ -350,7 +350,7 @@ xfs_iformat_extents(
 					XFS_ERROR_REPORT("xfs_iformat_extents(2)",
 							 XFS_ERRLEVEL_LOW,
 							 ip->i_mount);
-					return XFS_ERROR(EFSCORRUPTED);
+					return -EFSCORRUPTED;
 				}
 	}
 	ifp->if_flags |= XFS_IFEXTENTS;
@@ -399,7 +399,7 @@ xfs_iformat_btree(
 					(unsigned long long) ip->i_ino);
 		XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW,
 					 mp, dip);
-		return XFS_ERROR(EFSCORRUPTED);
+		return -EFSCORRUPTED;
 	}
 
 	ifp->if_broot_bytes = size;
@@ -436,7 +436,7 @@ xfs_iread_extents(
 	if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) {
 		XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW,
 				 ip->i_mount);
-		return XFS_ERROR(EFSCORRUPTED);
+		return -EFSCORRUPTED;
 	}
 	nextents = XFS_IFORK_NEXTENTS(ip, whichfork);
 	ifp = XFS_IFORK_PTR(ip, whichfork);
@@ -528,7 +528,7 @@ xfs_iroot_realloc(
 		ifp->if_broot_bytes = (int)new_size;
 		ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
 			XFS_IFORK_SIZE(ip, whichfork));
-		memmove(np, op, cur_max * (uint)sizeof(xfs_dfsbno_t));
+		memmove(np, op, cur_max * (uint)sizeof(xfs_fsblock_t));
 		return;
 	}
 
@@ -575,7 +575,7 @@ xfs_iroot_realloc(
 						     ifp->if_broot_bytes);
 		np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, new_broot, 1,
 						     (int)new_size);
-		memcpy(np, op, new_max * (uint)sizeof(xfs_dfsbno_t));
+		memcpy(np, op, new_max * (uint)sizeof(xfs_fsblock_t));
 	}
 	kmem_free(ifp->if_broot);
 	ifp->if_broot = new_broot;
@@ -1692,7 +1692,7 @@ xfs_iext_idx_to_irec(
 	}
 	*idxp = page_idx;
 	*erp_idxp = erp_idx;
-	return(erp);
+	return erp;
 }
 
 /*
diff --git a/fs/xfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h
index 7d3b1ed6dcbe..7d3b1ed6dcbe 100644
--- a/fs/xfs/xfs_inode_fork.h
+++ b/fs/xfs/libxfs/xfs_inode_fork.h
diff --git a/fs/xfs/xfs_inum.h b/fs/xfs/libxfs/xfs_inum.h
index 90efdaf1706f..4ff2278e147a 100644
--- a/fs/xfs/xfs_inum.h
+++ b/fs/xfs/libxfs/xfs_inum.h
@@ -54,11 +54,7 @@ struct xfs_mount;
 #define	XFS_OFFBNO_TO_AGINO(mp,b,o)	\
 	((xfs_agino_t)(((b) << XFS_INO_OFFSET_BITS(mp)) | (o)))
 
-#if XFS_BIG_INUMS
 #define	XFS_MAXINUMBER		((xfs_ino_t)((1ULL << 56) - 1ULL))
-#else
-#define	XFS_MAXINUMBER		((xfs_ino_t)((1ULL << 32) - 1ULL))
-#endif
 #define	XFS_MAXINUMBER_32	((xfs_ino_t)((1ULL << 32) - 1ULL))
 
 #endif	/* __XFS_INUM_H__ */
diff --git a/fs/xfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index f0969c77bdbe..aff12f2d4428 100644
--- a/fs/xfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -380,7 +380,7 @@ typedef struct xfs_icdinode {
 	xfs_ictimestamp_t di_mtime;	/* time last modified */
 	xfs_ictimestamp_t di_ctime;	/* time created/inode modified */
 	xfs_fsize_t	di_size;	/* number of bytes in file */
-	xfs_drfsbno_t	di_nblocks;	/* # of direct & btree blocks used */
+	xfs_rfsblock_t	di_nblocks;	/* # of direct & btree blocks used */
 	xfs_extlen_t	di_extsize;	/* basic/minimum extent size for file */
 	xfs_extnum_t	di_nextents;	/* number of extents in data fork */
 	xfs_aextnum_t	di_anextents;	/* number of extents in attribute fork*/
@@ -516,7 +516,7 @@ xfs_blft_from_flags(struct xfs_buf_log_format *blf)
  * EFI/EFD log format definitions
  */
 typedef struct xfs_extent {
-	xfs_dfsbno_t	ext_start;
+	xfs_fsblock_t	ext_start;
 	xfs_extlen_t	ext_len;
 } xfs_extent_t;
 
diff --git a/fs/xfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h
index 1c55ccbb379d..1c55ccbb379d 100644
--- a/fs/xfs/xfs_log_recover.h
+++ b/fs/xfs/libxfs/xfs_log_recover.h
diff --git a/fs/xfs/xfs_log_rlimit.c b/fs/xfs/libxfs/xfs_log_rlimit.c
index ee7e0e80246b..ee7e0e80246b 100644
--- a/fs/xfs/xfs_log_rlimit.c
+++ b/fs/xfs/libxfs/xfs_log_rlimit.c
diff --git a/fs/xfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h
index 137e20937077..1b0a08379759 100644
--- a/fs/xfs/xfs_quota_defs.h
+++ b/fs/xfs/libxfs/xfs_quota_defs.h
@@ -98,8 +98,6 @@ typedef __uint16_t	xfs_qwarncnt_t;
 #define XFS_IS_QUOTA_ON(mp)	((mp)->m_qflags & (XFS_UQUOTA_ACTIVE | \
 						   XFS_GQUOTA_ACTIVE | \
 						   XFS_PQUOTA_ACTIVE))
-#define XFS_IS_OQUOTA_ON(mp)	((mp)->m_qflags & (XFS_GQUOTA_ACTIVE | \
-						   XFS_PQUOTA_ACTIVE))
 #define XFS_IS_UQUOTA_ON(mp)	((mp)->m_qflags & XFS_UQUOTA_ACTIVE)
 #define XFS_IS_GQUOTA_ON(mp)	((mp)->m_qflags & XFS_GQUOTA_ACTIVE)
 #define XFS_IS_PQUOTA_ON(mp)	((mp)->m_qflags & XFS_PQUOTA_ACTIVE)
diff --git a/fs/xfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index f4dd697cac08..f4dd697cac08 100644
--- a/fs/xfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
diff --git a/fs/xfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 7703fa6770ff..ad525a5623a4 100644
--- a/fs/xfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -186,13 +186,13 @@ xfs_mount_validate_sb(
 	 */
 	if (sbp->sb_magicnum != XFS_SB_MAGIC) {
 		xfs_warn(mp, "bad magic number");
-		return XFS_ERROR(EWRONGFS);
+		return -EWRONGFS;
 	}
 
 
 	if (!xfs_sb_good_version(sbp)) {
 		xfs_warn(mp, "bad version");
-		return XFS_ERROR(EWRONGFS);
+		return -EWRONGFS;
 	}
 
 	/*
@@ -220,7 +220,7 @@ xfs_mount_validate_sb(
 				xfs_warn(mp,
 "Attempted to mount read-only compatible filesystem read-write.\n"
 "Filesystem can only be safely mounted read only.");
-				return XFS_ERROR(EINVAL);
+				return -EINVAL;
 			}
 		}
 		if (xfs_sb_has_incompat_feature(sbp,
@@ -230,7 +230,7 @@ xfs_mount_validate_sb(
 "Filesystem can not be safely mounted by this kernel.",
 				(sbp->sb_features_incompat &
 						XFS_SB_FEAT_INCOMPAT_UNKNOWN));
-			return XFS_ERROR(EINVAL);
+			return -EINVAL;
 		}
 	}
 
@@ -238,13 +238,13 @@ xfs_mount_validate_sb(
 		if (sbp->sb_qflags & (XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD)) {
 			xfs_notice(mp,
 			   "Version 5 of Super block has XFS_OQUOTA bits.");
-			return XFS_ERROR(EFSCORRUPTED);
+			return -EFSCORRUPTED;
 		}
 	} else if (sbp->sb_qflags & (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD |
 				XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD)) {
 			xfs_notice(mp,
 "Superblock earlier than Version 5 has XFS_[PQ]UOTA_{ENFD|CHKD} bits.");
-			return XFS_ERROR(EFSCORRUPTED);
+			return -EFSCORRUPTED;
 	}
 
 	if (unlikely(
@@ -252,7 +252,7 @@ xfs_mount_validate_sb(
 		xfs_warn(mp,
 		"filesystem is marked as having an external log; "
 		"specify logdev on the mount command line.");
-		return XFS_ERROR(EINVAL);
+		return -EINVAL;
 	}
 
 	if (unlikely(
@@ -260,7 +260,7 @@ xfs_mount_validate_sb(
 		xfs_warn(mp,
 		"filesystem is marked as having an internal log; "
 		"do not specify logdev on the mount command line.");
-		return XFS_ERROR(EINVAL);
+		return -EINVAL;
 	}
 
 	/*
@@ -294,7 +294,7 @@ xfs_mount_validate_sb(
 	    sbp->sb_dblocks < XFS_MIN_DBLOCKS(sbp)			||
 	    sbp->sb_shared_vn != 0)) {
 		xfs_notice(mp, "SB sanity check failed");
-		return XFS_ERROR(EFSCORRUPTED);
+		return -EFSCORRUPTED;
 	}
 
 	/*
@@ -305,7 +305,7 @@ xfs_mount_validate_sb(
 		"File system with blocksize %d bytes. "
 		"Only pagesize (%ld) or less will currently work.",
 				sbp->sb_blocksize, PAGE_SIZE);
-		return XFS_ERROR(ENOSYS);
+		return -ENOSYS;
 	}
 
 	/*
@@ -320,19 +320,19 @@ xfs_mount_validate_sb(
 	default:
 		xfs_warn(mp, "inode size of %d bytes not supported",
 				sbp->sb_inodesize);
-		return XFS_ERROR(ENOSYS);
+		return -ENOSYS;
 	}
 
 	if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) ||
 	    xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) {
 		xfs_warn(mp,
 		"file system too large to be mounted on this system.");
-		return XFS_ERROR(EFBIG);
+		return -EFBIG;
 	}
 
 	if (check_inprogress && sbp->sb_inprogress) {
 		xfs_warn(mp, "Offline file system operation in progress!");
-		return XFS_ERROR(EFSCORRUPTED);
+		return -EFSCORRUPTED;
 	}
 	return 0;
 }
@@ -386,10 +386,11 @@ xfs_sb_quota_from_disk(struct xfs_sb *sbp)
 	}
 }
 
-void
-xfs_sb_from_disk(
+static void
+__xfs_sb_from_disk(
 	struct xfs_sb	*to,
-	xfs_dsb_t	*from)
+	xfs_dsb_t	*from,
+	bool		convert_xquota)
 {
 	to->sb_magicnum = be32_to_cpu(from->sb_magicnum);
 	to->sb_blocksize = be32_to_cpu(from->sb_blocksize);
@@ -445,6 +446,17 @@ xfs_sb_from_disk(
 	to->sb_pad = 0;
 	to->sb_pquotino = be64_to_cpu(from->sb_pquotino);
 	to->sb_lsn = be64_to_cpu(from->sb_lsn);
+	/* Convert on-disk flags to in-memory flags? */
+	if (convert_xquota)
+		xfs_sb_quota_from_disk(to);
+}
+
+void
+xfs_sb_from_disk(
+	struct xfs_sb	*to,
+	xfs_dsb_t	*from)
+{
+	__xfs_sb_from_disk(to, from, true);
 }
 
 static inline void
@@ -577,7 +589,11 @@ xfs_sb_verify(
 	struct xfs_mount *mp = bp->b_target->bt_mount;
 	struct xfs_sb	sb;
 
-	xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp));
+	/*
+	 * Use call variant which doesn't convert quota flags from disk 
+	 * format, because xfs_mount_validate_sb checks the on-disk flags.
+	 */
+	__xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp), false);
 
 	/*
 	 * Only check the in progress field for the primary superblock as
@@ -620,7 +636,7 @@ xfs_sb_read_verify(
 			/* Only fail bad secondaries on a known V5 filesystem */
 			if (bp->b_bn == XFS_SB_DADDR ||
 			    xfs_sb_version_hascrc(&mp->m_sb)) {
-				error = EFSBADCRC;
+				error = -EFSBADCRC;
 				goto out_error;
 			}
 		}
@@ -630,7 +646,7 @@ xfs_sb_read_verify(
 out_error:
 	if (error) {
 		xfs_buf_ioerror(bp, error);
-		if (error == EFSCORRUPTED || error == EFSBADCRC)
+		if (error == -EFSCORRUPTED || error == -EFSBADCRC)
 			xfs_verifier_error(bp);
 	}
 }
@@ -653,7 +669,7 @@ xfs_sb_quiet_read_verify(
 		return;
 	}
 	/* quietly fail */
-	xfs_buf_ioerror(bp, EWRONGFS);
+	xfs_buf_ioerror(bp, -EWRONGFS);
 }
 
 static void
diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h
index c43c2d609a24..2e739708afd3 100644
--- a/fs/xfs/xfs_sb.h
+++ b/fs/xfs/libxfs/xfs_sb.h
@@ -87,11 +87,11 @@ struct xfs_trans;
 typedef struct xfs_sb {
 	__uint32_t	sb_magicnum;	/* magic number == XFS_SB_MAGIC */
 	__uint32_t	sb_blocksize;	/* logical block size, bytes */
-	xfs_drfsbno_t	sb_dblocks;	/* number of data blocks */
-	xfs_drfsbno_t	sb_rblocks;	/* number of realtime blocks */
-	xfs_drtbno_t	sb_rextents;	/* number of realtime extents */
+	xfs_rfsblock_t	sb_dblocks;	/* number of data blocks */
+	xfs_rfsblock_t	sb_rblocks;	/* number of realtime blocks */
+	xfs_rtblock_t	sb_rextents;	/* number of realtime extents */
 	uuid_t		sb_uuid;	/* file system unique id */
-	xfs_dfsbno_t	sb_logstart;	/* starting block of log if internal */
+	xfs_fsblock_t	sb_logstart;	/* starting block of log if internal */
 	xfs_ino_t	sb_rootino;	/* root inode number */
 	xfs_ino_t	sb_rbmino;	/* bitmap inode for realtime extents */
 	xfs_ino_t	sb_rsumino;	/* summary inode for rt bitmap */
diff --git a/fs/xfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index 82404da2ca67..82404da2ca67 100644
--- a/fs/xfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
diff --git a/fs/xfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c
index 23c2f2577c8d..5782f037eab4 100644
--- a/fs/xfs/xfs_symlink_remote.c
+++ b/fs/xfs/libxfs/xfs_symlink_remote.c
@@ -133,9 +133,9 @@ xfs_symlink_read_verify(
 		return;
 
 	if (!xfs_buf_verify_cksum(bp, XFS_SYMLINK_CRC_OFF))
-		xfs_buf_ioerror(bp, EFSBADCRC);
+		xfs_buf_ioerror(bp, -EFSBADCRC);
 	else if (!xfs_symlink_verify(bp))
-		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		xfs_buf_ioerror(bp, -EFSCORRUPTED);
 
 	if (bp->b_error)
 		xfs_verifier_error(bp);
@@ -153,7 +153,7 @@ xfs_symlink_write_verify(
 		return;
 
 	if (!xfs_symlink_verify(bp)) {
-		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		xfs_buf_ioerror(bp, -EFSCORRUPTED);
 		xfs_verifier_error(bp);
 		return;
 	}
diff --git a/fs/xfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c
index f2bda7c76b8a..f2bda7c76b8a 100644
--- a/fs/xfs/xfs_trans_resv.c
+++ b/fs/xfs/libxfs/xfs_trans_resv.c
diff --git a/fs/xfs/xfs_trans_resv.h b/fs/xfs/libxfs/xfs_trans_resv.h
index 1097d14cd583..1097d14cd583 100644
--- a/fs/xfs/xfs_trans_resv.h
+++ b/fs/xfs/libxfs/xfs_trans_resv.h
diff --git a/fs/xfs/xfs_trans_space.h b/fs/xfs/libxfs/xfs_trans_space.h
index bf9c4579334d..bf9c4579334d 100644
--- a/fs/xfs/xfs_trans_space.h
+++ b/fs/xfs/libxfs/xfs_trans_space.h
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 6888ad886ff6..a65fa5dde6e9 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -152,7 +152,7 @@ xfs_get_acl(struct inode *inode, int type)
 	if (!xfs_acl)
 		return ERR_PTR(-ENOMEM);
 
-	error = -xfs_attr_get(ip, ea_name, (unsigned char *)xfs_acl,
+	error = xfs_attr_get(ip, ea_name, (unsigned char *)xfs_acl,
 							&len, ATTR_ROOT);
 	if (error) {
 		/*
@@ -210,7 +210,7 @@ __xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
 		len -= sizeof(struct xfs_acl_entry) *
 			 (XFS_ACL_MAX_ENTRIES(ip->i_mount) - acl->a_count);
 
-		error = -xfs_attr_set(ip, ea_name, (unsigned char *)xfs_acl,
+		error = xfs_attr_set(ip, ea_name, (unsigned char *)xfs_acl,
 				len, ATTR_ROOT);
 
 		kmem_free(xfs_acl);
@@ -218,7 +218,7 @@ __xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
 		/*
 		 * A NULL ACL argument means we want to remove the ACL.
 		 */
-		error = -xfs_attr_remove(ip, ea_name, ATTR_ROOT);
+		error = xfs_attr_remove(ip, ea_name, ATTR_ROOT);
 
 		/*
 		 * If the attribute didn't exist to start with that's fine.
@@ -244,7 +244,7 @@ xfs_set_mode(struct inode *inode, umode_t mode)
 		iattr.ia_mode = mode;
 		iattr.ia_ctime = current_fs_time(inode->i_sb);
 
-		error = -xfs_setattr_nonsize(XFS_I(inode), &iattr, XFS_ATTR_NOACL);
+		error = xfs_setattr_nonsize(XFS_I(inode), &iattr, XFS_ATTR_NOACL);
 	}
 
 	return error;
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index faaf716e2080..11e9b4caa54f 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -240,7 +240,7 @@ xfs_end_io(
 
 done:
 	if (error)
-		ioend->io_error = -error;
+		ioend->io_error = error;
 	xfs_destroy_ioend(ioend);
 }
 
@@ -308,14 +308,14 @@ xfs_map_blocks(
 	int			nimaps = 1;
 
 	if (XFS_FORCED_SHUTDOWN(mp))
-		return -XFS_ERROR(EIO);
+		return -EIO;
 
 	if (type == XFS_IO_UNWRITTEN)
 		bmapi_flags |= XFS_BMAPI_IGSTATE;
 
 	if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
 		if (nonblocking)
-			return -XFS_ERROR(EAGAIN);
+			return -EAGAIN;
 		xfs_ilock(ip, XFS_ILOCK_SHARED);
 	}
 
@@ -332,14 +332,14 @@ xfs_map_blocks(
 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
 
 	if (error)
-		return -XFS_ERROR(error);
+		return error;
 
 	if (type == XFS_IO_DELALLOC &&
 	    (!nimaps || isnullstartblock(imap->br_startblock))) {
 		error = xfs_iomap_write_allocate(ip, offset, imap);
 		if (!error)
 			trace_xfs_map_blocks_alloc(ip, offset, count, type, imap);
-		return -XFS_ERROR(error);
+		return error;
 	}
 
 #ifdef DEBUG
@@ -502,7 +502,7 @@ xfs_submit_ioend(
 		 * time.
 		 */
 		if (fail) {
-			ioend->io_error = -fail;
+			ioend->io_error = fail;
 			xfs_finish_ioend(ioend);
 			continue;
 		}
@@ -1253,7 +1253,7 @@ __xfs_get_blocks(
 	int			new = 0;
 
 	if (XFS_FORCED_SHUTDOWN(mp))
-		return -XFS_ERROR(EIO);
+		return -EIO;
 
 	offset = (xfs_off_t)iblock << inode->i_blkbits;
 	ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
@@ -1302,7 +1302,7 @@ __xfs_get_blocks(
 			error = xfs_iomap_write_direct(ip, offset, size,
 						       &imap, nimaps);
 			if (error)
-				return -error;
+				return error;
 			new = 1;
 		} else {
 			/*
@@ -1415,7 +1415,7 @@ __xfs_get_blocks(
 
 out_unlock:
 	xfs_iunlock(ip, lockmode);
-	return -error;
+	return error;
 }
 
 int
diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c
index 09480c57f069..aa2a8b1838a2 100644
--- a/fs/xfs/xfs_attr_inactive.c
+++ b/fs/xfs/xfs_attr_inactive.c
@@ -76,7 +76,7 @@ xfs_attr3_leaf_freextent(
 		error = xfs_bmapi_read(dp, (xfs_fileoff_t)tblkno, tblkcnt,
 				       &map, &nmap, XFS_BMAPI_ATTRFORK);
 		if (error) {
-			return(error);
+			return error;
 		}
 		ASSERT(nmap == 1);
 		ASSERT(map.br_startblock != DELAYSTARTBLOCK);
@@ -95,21 +95,21 @@ xfs_attr3_leaf_freextent(
 					dp->i_mount->m_ddev_targp,
 					dblkno, dblkcnt, 0);
 			if (!bp)
-				return ENOMEM;
+				return -ENOMEM;
 			xfs_trans_binval(*trans, bp);
 			/*
 			 * Roll to next transaction.
 			 */
 			error = xfs_trans_roll(trans, dp);
 			if (error)
-				return (error);
+				return error;
 		}
 
 		tblkno += map.br_blockcount;
 		tblkcnt -= map.br_blockcount;
 	}
 
-	return(0);
+	return 0;
 }
 
 /*
@@ -227,7 +227,7 @@ xfs_attr3_node_inactive(
 	 */
 	if (level > XFS_DA_NODE_MAXDEPTH) {
 		xfs_trans_brelse(*trans, bp);	/* no locks for later trans */
-		return XFS_ERROR(EIO);
+		return -EIO;
 	}
 
 	node = bp->b_addr;
@@ -256,7 +256,7 @@ xfs_attr3_node_inactive(
 		error = xfs_da3_node_read(*trans, dp, child_fsb, -2, &child_bp,
 						XFS_ATTR_FORK);
 		if (error)
-			return(error);
+			return error;
 		if (child_bp) {
 						/* save for re-read later */
 			child_blkno = XFS_BUF_ADDR(child_bp);
@@ -277,7 +277,7 @@ xfs_attr3_node_inactive(
 							child_bp);
 				break;
 			default:
-				error = XFS_ERROR(EIO);
+				error = -EIO;
 				xfs_trans_brelse(*trans, child_bp);
 				break;
 			}
@@ -360,7 +360,7 @@ xfs_attr3_root_inactive(
 		error = xfs_attr3_leaf_inactive(trans, dp, bp);
 		break;
 	default:
-		error = XFS_ERROR(EIO);
+		error = -EIO;
 		xfs_trans_brelse(*trans, bp);
 		break;
 	}
@@ -414,7 +414,7 @@ xfs_attr_inactive(xfs_inode_t *dp)
 	error = xfs_trans_reserve(trans, &M_RES(mp)->tr_attrinval, 0, 0);
 	if (error) {
 		xfs_trans_cancel(trans, 0);
-		return(error);
+		return error;
 	}
 	xfs_ilock(dp, XFS_ILOCK_EXCL);
 
@@ -443,10 +443,10 @@ xfs_attr_inactive(xfs_inode_t *dp)
 	error = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES);
 	xfs_iunlock(dp, XFS_ILOCK_EXCL);
 
-	return(error);
+	return error;
 
 out:
 	xfs_trans_cancel(trans, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
 	xfs_iunlock(dp, XFS_ILOCK_EXCL);
-	return(error);
+	return error;
 }
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index 90e2eeb21207..62db83ab6cbc 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -50,11 +50,11 @@ xfs_attr_shortform_compare(const void *a, const void *b)
 	sa = (xfs_attr_sf_sort_t *)a;
 	sb = (xfs_attr_sf_sort_t *)b;
 	if (sa->hash < sb->hash) {
-		return(-1);
+		return -1;
 	} else if (sa->hash > sb->hash) {
-		return(1);
+		return 1;
 	} else {
-		return(sa->entno - sb->entno);
+		return sa->entno - sb->entno;
 	}
 }
 
@@ -86,7 +86,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
 	sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data;
 	ASSERT(sf != NULL);
 	if (!sf->hdr.count)
-		return(0);
+		return 0;
 	cursor = context->cursor;
 	ASSERT(cursor != NULL);
 
@@ -124,7 +124,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
 			sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
 		}
 		trace_xfs_attr_list_sf_all(context);
-		return(0);
+		return 0;
 	}
 
 	/* do no more for a search callback */
@@ -150,7 +150,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
 					     XFS_ERRLEVEL_LOW,
 					     context->dp->i_mount, sfe);
 			kmem_free(sbuf);
-			return XFS_ERROR(EFSCORRUPTED);
+			return -EFSCORRUPTED;
 		}
 
 		sbp->entno = i;
@@ -188,7 +188,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
 	}
 	if (i == nsbuf) {
 		kmem_free(sbuf);
-		return(0);
+		return 0;
 	}
 
 	/*
@@ -213,7 +213,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
 	}
 
 	kmem_free(sbuf);
-	return(0);
+	return 0;
 }
 
 STATIC int
@@ -243,8 +243,8 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
 	if (cursor->blkno > 0) {
 		error = xfs_da3_node_read(NULL, dp, cursor->blkno, -1,
 					      &bp, XFS_ATTR_FORK);
-		if ((error != 0) && (error != EFSCORRUPTED))
-			return(error);
+		if ((error != 0) && (error != -EFSCORRUPTED))
+			return error;
 		if (bp) {
 			struct xfs_attr_leaf_entry *entries;
 
@@ -295,7 +295,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
 						      cursor->blkno, -1, &bp,
 						      XFS_ATTR_FORK);
 			if (error)
-				return(error);
+				return error;
 			node = bp->b_addr;
 			magic = be16_to_cpu(node->hdr.info.magic);
 			if (magic == XFS_ATTR_LEAF_MAGIC ||
@@ -308,7 +308,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
 						     context->dp->i_mount,
 						     node);
 				xfs_trans_brelse(NULL, bp);
-				return XFS_ERROR(EFSCORRUPTED);
+				return -EFSCORRUPTED;
 			}
 
 			dp->d_ops->node_hdr_from_disk(&nodehdr, node);
@@ -496,11 +496,11 @@ xfs_attr_leaf_list(xfs_attr_list_context_t *context)
 	context->cursor->blkno = 0;
 	error = xfs_attr3_leaf_read(NULL, context->dp, 0, -1, &bp);
 	if (error)
-		return XFS_ERROR(error);
+		return error;
 
 	error = xfs_attr3_leaf_list_int(bp, context);
 	xfs_trans_brelse(NULL, bp);
-	return XFS_ERROR(error);
+	return error;
 }
 
 int
@@ -514,7 +514,7 @@ xfs_attr_list_int(
 	XFS_STATS_INC(xs_attr_list);
 
 	if (XFS_FORCED_SHUTDOWN(dp->i_mount))
-		return EIO;
+		return -EIO;
 
 	/*
 	 * Decide on what work routines to call based on the inode size.
@@ -616,16 +616,16 @@ xfs_attr_list(
 	 * Validate the cursor.
 	 */
 	if (cursor->pad1 || cursor->pad2)
-		return(XFS_ERROR(EINVAL));
+		return -EINVAL;
 	if ((cursor->initted == 0) &&
 	    (cursor->hashval || cursor->blkno || cursor->offset))
-		return XFS_ERROR(EINVAL);
+		return -EINVAL;
 
 	/*
 	 * Check for a properly aligned buffer.
 	 */
 	if (((long)buffer) & (sizeof(int)-1))
-		return XFS_ERROR(EFAULT);
+		return -EFAULT;
 	if (flags & ATTR_KERNOVAL)
 		bufsize = 0;
 
@@ -648,6 +648,6 @@ xfs_attr_list(
 	alist->al_offset[0] = context.bufsize;
 
 	error = xfs_attr_list_int(&context);
-	ASSERT(error >= 0);
+	ASSERT(error <= 0);
 	return error;
 }
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 64731ef3324d..2f1e30d39a35 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -133,7 +133,7 @@ xfs_bmap_finish(
 			mp = ntp->t_mountp;
 			if (!XFS_FORCED_SHUTDOWN(mp))
 				xfs_force_shutdown(mp,
-						   (error == EFSCORRUPTED) ?
+						   (error == -EFSCORRUPTED) ?
 						   SHUTDOWN_CORRUPT_INCORE :
 						   SHUTDOWN_META_IO_ERROR);
 			return error;
@@ -365,7 +365,7 @@ xfs_bmap_count_tree(
 			xfs_trans_brelse(tp, bp);
 			XFS_ERROR_REPORT("xfs_bmap_count_tree(1)",
 					 XFS_ERRLEVEL_LOW, mp);
-			return XFS_ERROR(EFSCORRUPTED);
+			return -EFSCORRUPTED;
 		}
 		xfs_trans_brelse(tp, bp);
 	} else {
@@ -425,14 +425,14 @@ xfs_bmap_count_blocks(
 	ASSERT(level > 0);
 	pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
 	bno = be64_to_cpu(*pp);
-	ASSERT(bno != NULLDFSBNO);
+	ASSERT(bno != NULLFSBLOCK);
 	ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
 	ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
 
 	if (unlikely(xfs_bmap_count_tree(mp, tp, ifp, bno, level, count) < 0)) {
 		XFS_ERROR_REPORT("xfs_bmap_count_blocks(2)", XFS_ERRLEVEL_LOW,
 				 mp);
-		return XFS_ERROR(EFSCORRUPTED);
+		return -EFSCORRUPTED;
 	}
 
 	return 0;
@@ -524,13 +524,13 @@ xfs_getbmap(
 			if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS &&
 			    ip->i_d.di_aformat != XFS_DINODE_FMT_BTREE &&
 			    ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)
-				return XFS_ERROR(EINVAL);
+				return -EINVAL;
 		} else if (unlikely(
 			   ip->i_d.di_aformat != 0 &&
 			   ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS)) {
 			XFS_ERROR_REPORT("xfs_getbmap", XFS_ERRLEVEL_LOW,
 					 ip->i_mount);
-			return XFS_ERROR(EFSCORRUPTED);
+			return -EFSCORRUPTED;
 		}
 
 		prealloced = 0;
@@ -539,7 +539,7 @@ xfs_getbmap(
 		if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS &&
 		    ip->i_d.di_format != XFS_DINODE_FMT_BTREE &&
 		    ip->i_d.di_format != XFS_DINODE_FMT_LOCAL)
-			return XFS_ERROR(EINVAL);
+			return -EINVAL;
 
 		if (xfs_get_extsz_hint(ip) ||
 		    ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)){
@@ -559,26 +559,26 @@ xfs_getbmap(
 		bmv->bmv_entries = 0;
 		return 0;
 	} else if (bmv->bmv_length < 0) {
-		return XFS_ERROR(EINVAL);
+		return -EINVAL;
 	}
 
 	nex = bmv->bmv_count - 1;
 	if (nex <= 0)
-		return XFS_ERROR(EINVAL);
+		return -EINVAL;
 	bmvend = bmv->bmv_offset + bmv->bmv_length;
 
 
 	if (bmv->bmv_count > ULONG_MAX / sizeof(struct getbmapx))
-		return XFS_ERROR(ENOMEM);
+		return -ENOMEM;
 	out = kmem_zalloc_large(bmv->bmv_count * sizeof(struct getbmapx), 0);
 	if (!out)
-		return XFS_ERROR(ENOMEM);
+		return -ENOMEM;
 
 	xfs_ilock(ip, XFS_IOLOCK_SHARED);
 	if (whichfork == XFS_DATA_FORK) {
 		if (!(iflags & BMV_IF_DELALLOC) &&
 		    (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size)) {
-			error = -filemap_write_and_wait(VFS_I(ip)->i_mapping);
+			error = filemap_write_and_wait(VFS_I(ip)->i_mapping);
 			if (error)
 				goto out_unlock_iolock;
 
@@ -611,7 +611,7 @@ xfs_getbmap(
 	/*
 	 * Allocate enough space to handle "subnex" maps at a time.
 	 */
-	error = ENOMEM;
+	error = -ENOMEM;
 	subnex = 16;
 	map = kmem_alloc(subnex * sizeof(*map), KM_MAYFAIL | KM_NOFS);
 	if (!map)
@@ -809,7 +809,7 @@ xfs_can_free_eofblocks(struct xfs_inode *ip, bool force)
 	 * have speculative prealloc/delalloc blocks to remove.
 	 */
 	if (VFS_I(ip)->i_size == 0 &&
-	    VN_CACHED(VFS_I(ip)) == 0 &&
+	    VFS_I(ip)->i_mapping->nrpages == 0 &&
 	    ip->i_delayed_blks == 0)
 		return false;
 
@@ -882,7 +882,7 @@ xfs_free_eofblocks(
 		if (need_iolock) {
 			if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
 				xfs_trans_cancel(tp, 0);
-				return EAGAIN;
+				return -EAGAIN;
 			}
 		}
 
@@ -955,14 +955,14 @@ xfs_alloc_file_space(
 	trace_xfs_alloc_file_space(ip);
 
 	if (XFS_FORCED_SHUTDOWN(mp))
-		return XFS_ERROR(EIO);
+		return -EIO;
 
 	error = xfs_qm_dqattach(ip, 0);
 	if (error)
 		return error;
 
 	if (len <= 0)
-		return XFS_ERROR(EINVAL);
+		return -EINVAL;
 
 	rt = XFS_IS_REALTIME_INODE(ip);
 	extsz = xfs_get_extsz_hint(ip);
@@ -1028,7 +1028,7 @@ xfs_alloc_file_space(
 			/*
 			 * Free the transaction structure.
 			 */
-			ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
+			ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp));
 			xfs_trans_cancel(tp, 0);
 			break;
 		}
@@ -1065,7 +1065,7 @@ xfs_alloc_file_space(
 		allocated_fsb = imapp->br_blockcount;
 
 		if (nimaps == 0) {
-			error = XFS_ERROR(ENOSPC);
+			error = -ENOSPC;
 			break;
 		}
 
@@ -1126,7 +1126,7 @@ xfs_zero_remaining_bytes(
 					mp->m_rtdev_targp : mp->m_ddev_targp,
 				  BTOBB(mp->m_sb.sb_blocksize), 0);
 	if (!bp)
-		return XFS_ERROR(ENOMEM);
+		return -ENOMEM;
 
 	xfs_buf_unlock(bp);
 
@@ -1158,7 +1158,7 @@ xfs_zero_remaining_bytes(
 		XFS_BUF_SET_ADDR(bp, xfs_fsb_to_db(ip, imap.br_startblock));
 
 		if (XFS_FORCED_SHUTDOWN(mp)) {
-			error = XFS_ERROR(EIO);
+			error = -EIO;
 			break;
 		}
 		xfs_buf_iorequest(bp);
@@ -1176,7 +1176,7 @@ xfs_zero_remaining_bytes(
 		XFS_BUF_WRITE(bp);
 
 		if (XFS_FORCED_SHUTDOWN(mp)) {
-			error = XFS_ERROR(EIO);
+			error = -EIO;
 			break;
 		}
 		xfs_buf_iorequest(bp);
@@ -1234,7 +1234,7 @@ xfs_free_file_space(
 
 	rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
 	ioffset = offset & ~(rounding - 1);
-	error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
+	error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
 					      ioffset, -1);
 	if (error)
 		goto out;
@@ -1315,7 +1315,7 @@ xfs_free_file_space(
 			/*
 			 * Free the transaction structure.
 			 */
-			ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
+			ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp));
 			xfs_trans_cancel(tp, 0);
 			break;
 		}
@@ -1557,14 +1557,14 @@ xfs_swap_extents_check_format(
 	/* Should never get a local format */
 	if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL ||
 	    tip->i_d.di_format == XFS_DINODE_FMT_LOCAL)
-		return EINVAL;
+		return -EINVAL;
 
 	/*
 	 * if the target inode has less extents that then temporary inode then
 	 * why did userspace call us?
 	 */
 	if (ip->i_d.di_nextents < tip->i_d.di_nextents)
-		return EINVAL;
+		return -EINVAL;
 
 	/*
 	 * if the target inode is in extent form and the temp inode is in btree
@@ -1573,19 +1573,19 @@ xfs_swap_extents_check_format(
 	 */
 	if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
 	    tip->i_d.di_format == XFS_DINODE_FMT_BTREE)
-		return EINVAL;
+		return -EINVAL;
 
 	/* Check temp in extent form to max in target */
 	if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
 	    XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) >
 			XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
-		return EINVAL;
+		return -EINVAL;
 
 	/* Check target in extent form to max in temp */
 	if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
 	    XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) >
 			XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
-		return EINVAL;
+		return -EINVAL;
 
 	/*
 	 * If we are in a btree format, check that the temp root block will fit
@@ -1599,26 +1599,50 @@ xfs_swap_extents_check_format(
 	if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
 		if (XFS_IFORK_BOFF(ip) &&
 		    XFS_BMAP_BMDR_SPACE(tip->i_df.if_broot) > XFS_IFORK_BOFF(ip))
-			return EINVAL;
+			return -EINVAL;
 		if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <=
 		    XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
-			return EINVAL;
+			return -EINVAL;
 	}
 
 	/* Reciprocal target->temp btree format checks */
 	if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
 		if (XFS_IFORK_BOFF(tip) &&
 		    XFS_BMAP_BMDR_SPACE(ip->i_df.if_broot) > XFS_IFORK_BOFF(tip))
-			return EINVAL;
+			return -EINVAL;
 		if (XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <=
 		    XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
-			return EINVAL;
+			return -EINVAL;
 	}
 
 	return 0;
 }
 
 int
+xfs_swap_extent_flush(
+	struct xfs_inode	*ip)
+{
+	int	error;
+
+	error = filemap_write_and_wait(VFS_I(ip)->i_mapping);
+	if (error)
+		return error;
+	truncate_pagecache_range(VFS_I(ip), 0, -1);
+
+	/* Verify O_DIRECT for ftmp */
+	if (VFS_I(ip)->i_mapping->nrpages)
+		return -EINVAL;
+
+	/*
+	 * Don't try to swap extents on mmap()d files because we can't lock
+	 * out races against page faults safely.
+	 */
+	if (mapping_mapped(VFS_I(ip)->i_mapping))
+		return -EBUSY;
+	return 0;
+}
+
+int
 xfs_swap_extents(
 	xfs_inode_t	*ip,	/* target inode */
 	xfs_inode_t	*tip,	/* tmp inode */
@@ -1633,51 +1657,57 @@ xfs_swap_extents(
 	int		aforkblks = 0;
 	int		taforkblks = 0;
 	__uint64_t	tmp;
+	int		lock_flags;
 
 	tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL);
 	if (!tempifp) {
-		error = XFS_ERROR(ENOMEM);
+		error = -ENOMEM;
 		goto out;
 	}
 
 	/*
-	 * we have to do two separate lock calls here to keep lockdep
-	 * happy. If we try to get all the locks in one call, lock will
-	 * report false positives when we drop the ILOCK and regain them
-	 * below.
+	 * Lock up the inodes against other IO and truncate to begin with.
+	 * Then we can ensure the inodes are flushed and have no page cache
+	 * safely. Once we have done this we can take the ilocks and do the rest
+	 * of the checks.
 	 */
+	lock_flags = XFS_IOLOCK_EXCL;
 	xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL);
-	xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
 
 	/* Verify that both files have the same format */
 	if ((ip->i_d.di_mode & S_IFMT) != (tip->i_d.di_mode & S_IFMT)) {
-		error = XFS_ERROR(EINVAL);
+		error = -EINVAL;
 		goto out_unlock;
 	}
 
 	/* Verify both files are either real-time or non-realtime */
 	if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) {
-		error = XFS_ERROR(EINVAL);
+		error = -EINVAL;
 		goto out_unlock;
 	}
 
-	error = -filemap_write_and_wait(VFS_I(tip)->i_mapping);
+	error = xfs_swap_extent_flush(ip);
+	if (error)
+		goto out_unlock;
+	error = xfs_swap_extent_flush(tip);
 	if (error)
 		goto out_unlock;
-	truncate_pagecache_range(VFS_I(tip), 0, -1);
 
-	/* Verify O_DIRECT for ftmp */
-	if (VN_CACHED(VFS_I(tip)) != 0) {
-		error = XFS_ERROR(EINVAL);
+	tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT);
+	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
+	if (error) {
+		xfs_trans_cancel(tp, 0);
 		goto out_unlock;
 	}
+	xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
+	lock_flags |= XFS_ILOCK_EXCL;
 
 	/* Verify all data are being swapped */
 	if (sxp->sx_offset != 0 ||
 	    sxp->sx_length != ip->i_d.di_size ||
 	    sxp->sx_length != tip->i_d.di_size) {
-		error = XFS_ERROR(EFAULT);
-		goto out_unlock;
+		error = -EFAULT;
+		goto out_trans_cancel;
 	}
 
 	trace_xfs_swap_extent_before(ip, 0);
@@ -1689,7 +1719,7 @@ xfs_swap_extents(
 		xfs_notice(mp,
 		    "%s: inode 0x%llx format is incompatible for exchanging.",
 				__func__, ip->i_ino);
-		goto out_unlock;
+		goto out_trans_cancel;
 	}
 
 	/*
@@ -1703,43 +1733,9 @@ xfs_swap_extents(
 	    (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) ||
 	    (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) ||
 	    (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) {
-		error = XFS_ERROR(EBUSY);
-		goto out_unlock;
-	}
-
-	/* We need to fail if the file is memory mapped.  Once we have tossed
-	 * all existing pages, the page fault will have no option
-	 * but to go to the filesystem for pages. By making the page fault call
-	 * vop_read (or write in the case of autogrow) they block on the iolock
-	 * until we have switched the extents.
-	 */
-	if (VN_MAPPED(VFS_I(ip))) {
-		error = XFS_ERROR(EBUSY);
-		goto out_unlock;
-	}
-
-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
-	xfs_iunlock(tip, XFS_ILOCK_EXCL);
-
-	/*
-	 * There is a race condition here since we gave up the
-	 * ilock.  However, the data fork will not change since
-	 * we have the iolock (locked for truncation too) so we
-	 * are safe.  We don't really care if non-io related
-	 * fields change.
-	 */
-	truncate_pagecache_range(VFS_I(ip), 0, -1);
-
-	tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT);
-	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
-	if (error) {
-		xfs_iunlock(ip,  XFS_IOLOCK_EXCL);
-		xfs_iunlock(tip, XFS_IOLOCK_EXCL);
-		xfs_trans_cancel(tp, 0);
-		goto out;
+		error = -EBUSY;
+		goto out_trans_cancel;
 	}
-	xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
-
 	/*
 	 * Count the number of extended attribute blocks
 	 */
@@ -1757,8 +1753,8 @@ xfs_swap_extents(
 			goto out_trans_cancel;
 	}
 
-	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-	xfs_trans_ijoin(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+	xfs_trans_ijoin(tp, ip, lock_flags);
+	xfs_trans_ijoin(tp, tip, lock_flags);
 
 	/*
 	 * Before we've swapped the forks, lets set the owners of the forks
@@ -1887,8 +1883,8 @@ out:
 	return error;
 
 out_unlock:
-	xfs_iunlock(ip,  XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-	xfs_iunlock(tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+	xfs_iunlock(ip, lock_flags);
+	xfs_iunlock(tip, lock_flags);
 	goto out;
 
 out_trans_cancel:
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 7a34a1ae6552..cd7b8ca9b064 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -130,7 +130,7 @@ xfs_buf_get_maps(
 	bp->b_maps = kmem_zalloc(map_count * sizeof(struct xfs_buf_map),
 				KM_NOFS);
 	if (!bp->b_maps)
-		return ENOMEM;
+		return -ENOMEM;
 	return 0;
 }
 
@@ -344,7 +344,7 @@ retry:
 		if (unlikely(page == NULL)) {
 			if (flags & XBF_READ_AHEAD) {
 				bp->b_page_count = i;
-				error = ENOMEM;
+				error = -ENOMEM;
 				goto out_free_pages;
 			}
 
@@ -465,7 +465,7 @@ _xfs_buf_find(
 	eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks);
 	if (blkno >= eofs) {
 		/*
-		 * XXX (dgc): we should really be returning EFSCORRUPTED here,
+		 * XXX (dgc): we should really be returning -EFSCORRUPTED here,
 		 * but none of the higher level infrastructure supports
 		 * returning a specific error on buffer lookup failures.
 		 */
@@ -1052,8 +1052,8 @@ xfs_buf_ioerror(
 	xfs_buf_t		*bp,
 	int			error)
 {
-	ASSERT(error >= 0 && error <= 0xffff);
-	bp->b_error = (unsigned short)error;
+	ASSERT(error <= 0 && error >= -1000);
+	bp->b_error = error;
 	trace_xfs_buf_ioerror(bp, error, _RET_IP_);
 }
 
@@ -1064,7 +1064,7 @@ xfs_buf_ioerror_alert(
 {
 	xfs_alert(bp->b_target->bt_mount,
 "metadata I/O error: block 0x%llx (\"%s\") error %d numblks %d",
-		(__uint64_t)XFS_BUF_ADDR(bp), func, bp->b_error, bp->b_length);
+		(__uint64_t)XFS_BUF_ADDR(bp), func, -bp->b_error, bp->b_length);
 }
 
 /*
@@ -1083,7 +1083,7 @@ xfs_bioerror(
 	/*
 	 * No need to wait until the buffer is unpinned, we aren't flushing it.
 	 */
-	xfs_buf_ioerror(bp, EIO);
+	xfs_buf_ioerror(bp, -EIO);
 
 	/*
 	 * We're calling xfs_buf_ioend, so delete XBF_DONE flag.
@@ -1094,7 +1094,7 @@ xfs_bioerror(
 
 	xfs_buf_ioend(bp, 0);
 
-	return EIO;
+	return -EIO;
 }
 
 /*
@@ -1127,13 +1127,13 @@ xfs_bioerror_relse(
 		 * There's no reason to mark error for
 		 * ASYNC buffers.
 		 */
-		xfs_buf_ioerror(bp, EIO);
+		xfs_buf_ioerror(bp, -EIO);
 		complete(&bp->b_iowait);
 	} else {
 		xfs_buf_relse(bp);
 	}
 
-	return EIO;
+	return -EIO;
 }
 
 STATIC int
@@ -1199,7 +1199,7 @@ xfs_buf_bio_end_io(
 	 * buffers that require multiple bios to complete.
 	 */
 	if (!bp->b_error)
-		xfs_buf_ioerror(bp, -error);
+		xfs_buf_ioerror(bp, error);
 
 	if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
 		invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
@@ -1286,7 +1286,7 @@ next_chunk:
 		 * because the caller (xfs_buf_iorequest) holds a count itself.
 		 */
 		atomic_dec(&bp->b_io_remaining);
-		xfs_buf_ioerror(bp, EIO);
+		xfs_buf_ioerror(bp, -EIO);
 		bio_put(bio);
 	}
 
@@ -1330,6 +1330,20 @@ _xfs_buf_ioapply(
 						   SHUTDOWN_CORRUPT_INCORE);
 				return;
 			}
+		} else if (bp->b_bn != XFS_BUF_DADDR_NULL) {
+			struct xfs_mount *mp = bp->b_target->bt_mount;
+
+			/*
+			 * non-crc filesystems don't attach verifiers during
+			 * log recovery, so don't warn for such filesystems.
+			 */
+			if (xfs_sb_version_hascrc(&mp->m_sb)) {
+				xfs_warn(mp,
+					"%s: no ops on block 0x%llx/0x%x",
+					__func__, bp->b_bn, bp->b_length);
+				xfs_hex_dump(bp->b_addr, 64);
+				dump_stack();
+			}
 		}
 	} else if (bp->b_flags & XBF_READ_AHEAD) {
 		rw = READA;
@@ -1628,7 +1642,7 @@ xfs_setsize_buftarg(
 		xfs_warn(btp->bt_mount,
 			"Cannot set_blocksize to %u on device %s",
 			sectorsize, name);
-		return EINVAL;
+		return -EINVAL;
 	}
 
 	/* Set up device logical sector size mask */
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 3a7a5523d3dc..c753183900b3 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -178,7 +178,7 @@ typedef struct xfs_buf {
 	atomic_t		b_io_remaining;	/* #outstanding I/O requests */
 	unsigned int		b_page_count;	/* size of page array */
 	unsigned int		b_offset;	/* page offset in first page */
-	unsigned short		b_error;	/* error code on I/O */
+	int			b_error;	/* error code on I/O */
 	const struct xfs_buf_ops	*b_ops;
 
 #ifdef XFS_BUF_LOCK_TRACKING
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 4654338b03fc..76007deed31f 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -488,7 +488,7 @@ xfs_buf_item_unpin(
 		xfs_buf_lock(bp);
 		xfs_buf_hold(bp);
 		bp->b_flags |= XBF_ASYNC;
-		xfs_buf_ioerror(bp, EIO);
+		xfs_buf_ioerror(bp, -EIO);
 		XFS_BUF_UNDONE(bp);
 		xfs_buf_stale(bp);
 		xfs_buf_ioend(bp, 0);
@@ -725,7 +725,7 @@ xfs_buf_item_get_format(
 	bip->bli_formats = kmem_zalloc(count * sizeof(struct xfs_buf_log_format),
 				KM_SLEEP);
 	if (!bip->bli_formats)
-		return ENOMEM;
+		return -ENOMEM;
 	return 0;
 }
 
diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
index 48e99afb9cb0..f1b69edcdf31 100644
--- a/fs/xfs/xfs_dir2_readdir.c
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -95,7 +95,7 @@ xfs_dir2_sf_getdents(
 	 */
 	if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
 		ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
-		return XFS_ERROR(EIO);
+		return -EIO;
 	}
 
 	ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
@@ -677,7 +677,7 @@ xfs_readdir(
 	trace_xfs_readdir(dp);
 
 	if (XFS_FORCED_SHUTDOWN(dp->i_mount))
-		return XFS_ERROR(EIO);
+		return -EIO;
 
 	ASSERT(S_ISDIR(dp->i_d.di_mode));
 	XFS_STATS_INC(xs_dir_getdents);
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index 4f11ef011139..13d08a1b390e 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -124,7 +124,7 @@ xfs_trim_extents(
 		}
 
 		trace_xfs_discard_extent(mp, agno, fbno, flen);
-		error = -blkdev_issue_discard(bdev, dbno, dlen, GFP_NOFS, 0);
+		error = blkdev_issue_discard(bdev, dbno, dlen, GFP_NOFS, 0);
 		if (error)
 			goto out_del_cursor;
 		*blocks_trimmed += flen;
@@ -166,11 +166,11 @@ xfs_ioc_trim(
 	int			error, last_error = 0;
 
 	if (!capable(CAP_SYS_ADMIN))
-		return -XFS_ERROR(EPERM);
+		return -EPERM;
 	if (!blk_queue_discard(q))
-		return -XFS_ERROR(EOPNOTSUPP);
+		return -EOPNOTSUPP;
 	if (copy_from_user(&range, urange, sizeof(range)))
-		return -XFS_ERROR(EFAULT);
+		return -EFAULT;
 
 	/*
 	 * Truncating down the len isn't actually quite correct, but using
@@ -182,7 +182,7 @@ xfs_ioc_trim(
 	if (range.start >= XFS_FSB_TO_B(mp, mp->m_sb.sb_dblocks) ||
 	    range.minlen > XFS_FSB_TO_B(mp, XFS_ALLOC_AG_MAX_USABLE(mp)) ||
 	    range.len < mp->m_sb.sb_blocksize)
-		return -XFS_ERROR(EINVAL);
+		return -EINVAL;
 
 	start = BTOBB(range.start);
 	end = start + BTOBBT(range.len) - 1;
@@ -195,7 +195,7 @@ xfs_ioc_trim(
 	end_agno = xfs_daddr_to_agno(mp, end);
 
 	for (agno = start_agno; agno <= end_agno; agno++) {
-		error = -xfs_trim_extents(mp, agno, start, end, minlen,
+		error = xfs_trim_extents(mp, agno, start, end, minlen,
 					  &blocks_trimmed);
 		if (error)
 			last_error = error;
@@ -206,7 +206,7 @@ xfs_ioc_trim(
 
 	range.len = XFS_FSB_TO_B(mp, blocks_trimmed);
 	if (copy_to_user(urange, &range, sizeof(range)))
-		return -XFS_ERROR(EFAULT);
+		return -EFAULT;
 	return 0;
 }
 
@@ -222,11 +222,11 @@ xfs_discard_extents(
 		trace_xfs_discard_extent(mp, busyp->agno, busyp->bno,
 					 busyp->length);
 
-		error = -blkdev_issue_discard(mp->m_ddev_targp->bt_bdev,
+		error = blkdev_issue_discard(mp->m_ddev_targp->bt_bdev,
 				XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno),
 				XFS_FSB_TO_BB(mp, busyp->length),
 				GFP_NOFS, 0);
-		if (error && error != EOPNOTSUPP) {
+		if (error && error != -EOPNOTSUPP) {
 			xfs_info(mp,
 	 "discard failed for extent [0x%llu,%u], error %d",
 				 (unsigned long long)busyp->bno,
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 3ee0cd43edc0..63c2de49f61d 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -327,7 +327,7 @@ xfs_qm_dqalloc(
 	 */
 	if (!xfs_this_quota_on(dqp->q_mount, dqp->dq_flags)) {
 		xfs_iunlock(quotip, XFS_ILOCK_EXCL);
-		return (ESRCH);
+		return -ESRCH;
 	}
 
 	xfs_trans_ijoin(tp, quotip, XFS_ILOCK_EXCL);
@@ -354,7 +354,7 @@ xfs_qm_dqalloc(
 			       mp->m_quotainfo->qi_dqchunklen,
 			       0);
 	if (!bp) {
-		error = ENOMEM;
+		error = -ENOMEM;
 		goto error1;
 	}
 	bp->b_ops = &xfs_dquot_buf_ops;
@@ -400,7 +400,7 @@ xfs_qm_dqalloc(
       error0:
 	xfs_iunlock(quotip, XFS_ILOCK_EXCL);
 
-	return (error);
+	return error;
 }
 
 STATIC int
@@ -426,7 +426,7 @@ xfs_qm_dqrepair(
 
 	if (error) {
 		ASSERT(*bpp == NULL);
-		return XFS_ERROR(error);
+		return error;
 	}
 	(*bpp)->b_ops = &xfs_dquot_buf_ops;
 
@@ -442,7 +442,7 @@ xfs_qm_dqrepair(
 		if (error) {
 			/* repair failed, we're screwed */
 			xfs_trans_brelse(tp, *bpp);
-			return XFS_ERROR(EIO);
+			return -EIO;
 		}
 	}
 
@@ -480,7 +480,7 @@ xfs_qm_dqtobp(
 		 * didn't have the quota inode lock.
 		 */
 		xfs_iunlock(quotip, lock_mode);
-		return ESRCH;
+		return -ESRCH;
 	}
 
 	/*
@@ -508,7 +508,7 @@ xfs_qm_dqtobp(
 		 * We don't allocate unless we're asked to
 		 */
 		if (!(flags & XFS_QMOPT_DQALLOC))
-			return ENOENT;
+			return -ENOENT;
 
 		ASSERT(tp);
 		error = xfs_qm_dqalloc(tpp, mp, dqp, quotip,
@@ -530,7 +530,7 @@ xfs_qm_dqtobp(
 					   mp->m_quotainfo->qi_dqchunklen,
 					   0, &bp, &xfs_dquot_buf_ops);
 
-		if (error == EFSCORRUPTED && (flags & XFS_QMOPT_DQREPAIR)) {
+		if (error == -EFSCORRUPTED && (flags & XFS_QMOPT_DQREPAIR)) {
 			xfs_dqid_t firstid = (xfs_dqid_t)map.br_startoff *
 						mp->m_quotainfo->qi_dqperchunk;
 			ASSERT(bp == NULL);
@@ -539,7 +539,7 @@ xfs_qm_dqtobp(
 
 		if (error) {
 			ASSERT(bp == NULL);
-			return XFS_ERROR(error);
+			return error;
 		}
 	}
 
@@ -547,7 +547,7 @@ xfs_qm_dqtobp(
 	*O_bpp = bp;
 	*O_ddpp = bp->b_addr + dqp->q_bufoffset;
 
-	return (0);
+	return 0;
 }
 
 
@@ -715,7 +715,7 @@ xfs_qm_dqget(
 	if ((! XFS_IS_UQUOTA_ON(mp) && type == XFS_DQ_USER) ||
 	    (! XFS_IS_PQUOTA_ON(mp) && type == XFS_DQ_PROJ) ||
 	    (! XFS_IS_GQUOTA_ON(mp) && type == XFS_DQ_GROUP)) {
-		return (ESRCH);
+		return -ESRCH;
 	}
 
 #ifdef DEBUG
@@ -723,7 +723,7 @@ xfs_qm_dqget(
 		if ((xfs_dqerror_target == mp->m_ddev_targp) &&
 		    (xfs_dqreq_num++ % xfs_dqerror_mod) == 0) {
 			xfs_debug(mp, "Returning error in dqget");
-			return (EIO);
+			return -EIO;
 		}
 	}
 
@@ -796,14 +796,14 @@ restart:
 		} else {
 			/* inode stays locked on return */
 			xfs_qm_dqdestroy(dqp);
-			return XFS_ERROR(ESRCH);
+			return -ESRCH;
 		}
 	}
 
 	mutex_lock(&qi->qi_tree_lock);
-	error = -radix_tree_insert(tree, id, dqp);
+	error = radix_tree_insert(tree, id, dqp);
 	if (unlikely(error)) {
-		WARN_ON(error != EEXIST);
+		WARN_ON(error != -EEXIST);
 
 		/*
 		 * Duplicate found. Just throw away the new dquot and start
@@ -829,7 +829,7 @@ restart:
 	ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL));
 	trace_xfs_dqget_miss(dqp);
 	*O_dqpp = dqp;
-	return (0);
+	return 0;
 }
 
 /*
@@ -966,7 +966,7 @@ xfs_qm_dqflush(
 					     SHUTDOWN_CORRUPT_INCORE);
 		else
 			spin_unlock(&mp->m_ail->xa_lock);
-		error = XFS_ERROR(EIO);
+		error = -EIO;
 		goto out_unlock;
 	}
 
@@ -974,7 +974,8 @@ xfs_qm_dqflush(
 	 * Get the buffer containing the on-disk dquot
 	 */
 	error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno,
-				   mp->m_quotainfo->qi_dqchunklen, 0, &bp, NULL);
+				   mp->m_quotainfo->qi_dqchunklen, 0, &bp,
+				   &xfs_dquot_buf_ops);
 	if (error)
 		goto out_unlock;
 
@@ -992,7 +993,7 @@ xfs_qm_dqflush(
 		xfs_buf_relse(bp);
 		xfs_dqfunlock(dqp);
 		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
-		return XFS_ERROR(EIO);
+		return -EIO;
 	}
 
 	/* This is the only portion of data that needs to persist */
@@ -1045,7 +1046,7 @@ xfs_qm_dqflush(
 
 out_unlock:
 	xfs_dqfunlock(dqp);
-	return XFS_ERROR(EIO);
+	return -EIO;
 }
 
 /*
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
index 68a68f704837..c24c67e22a2a 100644
--- a/fs/xfs/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
@@ -139,6 +139,21 @@ static inline xfs_dquot_t *xfs_inode_dquot(struct xfs_inode *ip, int type)
 	}
 }
 
+/*
+ * Check whether a dquot is under low free space conditions. We assume the quota
+ * is enabled and enforced.
+ */
+static inline bool xfs_dquot_lowsp(struct xfs_dquot *dqp)
+{
+	int64_t freesp;
+
+	freesp = be64_to_cpu(dqp->q_core.d_blk_hardlimit) - dqp->q_res_bcount;
+	if (freesp < dqp->q_low_space[XFS_QLOWSP_1_PCNT])
+		return true;
+
+	return false;
+}
+
 #define XFS_DQ_IS_LOCKED(dqp)	(mutex_is_locked(&((dqp)->q_qlock)))
 #define XFS_DQ_IS_DIRTY(dqp)	((dqp)->dq_flags & XFS_DQ_DIRTY)
 #define XFS_QM_ISUDQ(dqp)	((dqp)->dq_flags & XFS_DQ_USER)
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index edac5b057d28..b92fd7bc49e3 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -27,29 +27,6 @@
 
 #ifdef DEBUG
 
-int	xfs_etrap[XFS_ERROR_NTRAP] = {
-	0,
-};
-
-int
-xfs_error_trap(int e)
-{
-	int i;
-
-	if (!e)
-		return 0;
-	for (i = 0; i < XFS_ERROR_NTRAP; i++) {
-		if (xfs_etrap[i] == 0)
-			break;
-		if (e != xfs_etrap[i])
-			continue;
-		xfs_notice(NULL, "%s: error %d", __func__, e);
-		BUG();
-		break;
-	}
-	return e;
-}
-
 int	xfs_etest[XFS_NUM_INJECT_ERROR];
 int64_t	xfs_etest_fsid[XFS_NUM_INJECT_ERROR];
 char *	xfs_etest_fsname[XFS_NUM_INJECT_ERROR];
@@ -190,7 +167,7 @@ xfs_verifier_error(
 	struct xfs_mount *mp = bp->b_target->bt_mount;
 
 	xfs_alert(mp, "Metadata %s detected at %pF, block 0x%llx",
-		  bp->b_error == EFSBADCRC ? "CRC error" : "corruption",
+		  bp->b_error == -EFSBADCRC ? "CRC error" : "corruption",
 		  __return_address, bp->b_bn);
 
 	xfs_alert(mp, "Unmount and run xfs_repair");
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index c1c57d4a4b5d..279a76e52791 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -18,15 +18,6 @@
 #ifndef	__XFS_ERROR_H__
 #define	__XFS_ERROR_H__
 
-#ifdef DEBUG
-#define	XFS_ERROR_NTRAP	10
-extern int	xfs_etrap[XFS_ERROR_NTRAP];
-extern int	xfs_error_trap(int);
-#define	XFS_ERROR(e)	xfs_error_trap(e)
-#else
-#define	XFS_ERROR(e)	(e)
-#endif
-
 struct xfs_mount;
 
 extern void xfs_error_report(const char *tag, int level, struct xfs_mount *mp,
@@ -56,7 +47,7 @@ extern void xfs_verifier_error(struct xfs_buf *bp);
 		if (unlikely(!fs_is_ok)) { \
 			XFS_ERROR_REPORT("XFS_WANT_CORRUPTED_GOTO", \
 					 XFS_ERRLEVEL_LOW, NULL); \
-			error = XFS_ERROR(EFSCORRUPTED); \
+			error = -EFSCORRUPTED; \
 			goto l; \
 		} \
 	}
@@ -68,7 +59,7 @@ extern void xfs_verifier_error(struct xfs_buf *bp);
 		if (unlikely(!fs_is_ok)) { \
 			XFS_ERROR_REPORT("XFS_WANT_CORRUPTED_RETURN", \
 					 XFS_ERRLEVEL_LOW, NULL); \
-			return XFS_ERROR(EFSCORRUPTED); \
+			return -EFSCORRUPTED; \
 		} \
 	}
 
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index 753e467aa1a5..5a6bd5d8779a 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -147,9 +147,9 @@ xfs_nfs_get_inode(
 		 * We don't use ESTALE directly down the chain to not
 		 * confuse applications using bulkstat that expect EINVAL.
 		 */
-		if (error == EINVAL || error == ENOENT)
-			error = ESTALE;
-		return ERR_PTR(-error);
+		if (error == -EINVAL || error == -ENOENT)
+			error = -ESTALE;
+		return ERR_PTR(error);
 	}
 
 	if (ip->i_d.di_gen != generation) {
@@ -217,7 +217,7 @@ xfs_fs_get_parent(
 
 	error = xfs_lookup(XFS_I(child->d_inode), &xfs_name_dotdot, &cip, NULL);
 	if (unlikely(error))
-		return ERR_PTR(-error);
+		return ERR_PTR(error);
 
 	return d_obtain_alias(VFS_I(cip));
 }
@@ -237,7 +237,7 @@ xfs_fs_nfs_commit_metadata(
 
 	if (!lsn)
 		return 0;
-	return -_xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
+	return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
 }
 
 const struct export_operations xfs_export_operations = {
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index fb7a4c1ce1c5..c4327419dc5c 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -298,7 +298,7 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt)
 		}
 		return 0;
 	}
-	return EFSCORRUPTED;
+	return -EFSCORRUPTED;
 }
 
 /*
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 1f66779d7a46..076b1708d134 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -38,6 +38,7 @@
 #include "xfs_trace.h"
 #include "xfs_log.h"
 #include "xfs_dinode.h"
+#include "xfs_icache.h"
 
 #include <linux/aio.h>
 #include <linux/dcache.h>
@@ -155,7 +156,7 @@ xfs_dir_fsync(
 
 	if (!lsn)
 		return 0;
-	return -_xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
+	return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
 }
 
 STATIC int
@@ -179,7 +180,7 @@ xfs_file_fsync(
 		return error;
 
 	if (XFS_FORCED_SHUTDOWN(mp))
-		return -XFS_ERROR(EIO);
+		return -EIO;
 
 	xfs_iflags_clear(ip, XFS_ITRUNCATED);
 
@@ -225,7 +226,7 @@ xfs_file_fsync(
 	    !log_flushed)
 		xfs_blkdev_issue_flush(mp->m_ddev_targp);
 
-	return -error;
+	return error;
 }
 
 STATIC ssize_t
@@ -246,11 +247,11 @@ xfs_file_read_iter(
 	XFS_STATS_INC(xs_read_calls);
 
 	if (unlikely(file->f_flags & O_DIRECT))
-		ioflags |= IO_ISDIRECT;
+		ioflags |= XFS_IO_ISDIRECT;
 	if (file->f_mode & FMODE_NOCMTIME)
-		ioflags |= IO_INVIS;
+		ioflags |= XFS_IO_INVIS;
 
-	if (unlikely(ioflags & IO_ISDIRECT)) {
+	if (unlikely(ioflags & XFS_IO_ISDIRECT)) {
 		xfs_buftarg_t	*target =
 			XFS_IS_REALTIME_INODE(ip) ?
 				mp->m_rtdev_targp : mp->m_ddev_targp;
@@ -258,7 +259,7 @@ xfs_file_read_iter(
 		if ((pos | size) & target->bt_logical_sectormask) {
 			if (pos == i_size_read(inode))
 				return 0;
-			return -XFS_ERROR(EINVAL);
+			return -EINVAL;
 		}
 	}
 
@@ -283,7 +284,7 @@ xfs_file_read_iter(
 	 * proceeed concurrently without serialisation.
 	 */
 	xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
-	if ((ioflags & IO_ISDIRECT) && inode->i_mapping->nrpages) {
+	if ((ioflags & XFS_IO_ISDIRECT) && inode->i_mapping->nrpages) {
 		xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
 		xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
 
@@ -325,7 +326,7 @@ xfs_file_splice_read(
 	XFS_STATS_INC(xs_read_calls);
 
 	if (infilp->f_mode & FMODE_NOCMTIME)
-		ioflags |= IO_INVIS;
+		ioflags |= XFS_IO_INVIS;
 
 	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
 		return -EIO;
@@ -524,7 +525,7 @@ restart:
 			xfs_rw_ilock(ip, *iolock);
 			goto restart;
 		}
-		error = -xfs_zero_eof(ip, *pos, i_size_read(inode));
+		error = xfs_zero_eof(ip, *pos, i_size_read(inode));
 		if (error)
 			return error;
 	}
@@ -594,7 +595,7 @@ xfs_file_dio_aio_write(
 
 	/* DIO must be aligned to device logical sector size */
 	if ((pos | count) & target->bt_logical_sectormask)
-		return -XFS_ERROR(EINVAL);
+		return -EINVAL;
 
 	/* "unaligned" here means not aligned to a filesystem block */
 	if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask))
@@ -689,14 +690,28 @@ write_retry:
 	ret = generic_perform_write(file, from, pos);
 	if (likely(ret >= 0))
 		iocb->ki_pos = pos + ret;
+
 	/*
-	 * If we just got an ENOSPC, try to write back all dirty inodes to
-	 * convert delalloc space to free up some of the excess reserved
-	 * metadata space.
+	 * If we hit a space limit, try to free up some lingering preallocated
+	 * space before returning an error. In the case of ENOSPC, first try to
+	 * write back all dirty inodes to free up some of the excess reserved
+	 * metadata space. This reduces the chances that the eofblocks scan
+	 * waits on dirty mappings. Since xfs_flush_inodes() is serialized, this
+	 * also behaves as a filter to prevent too many eofblocks scans from
+	 * running at the same time.
 	 */
-	if (ret == -ENOSPC && !enospc) {
+	if (ret == -EDQUOT && !enospc) {
+		enospc = xfs_inode_free_quota_eofblocks(ip);
+		if (enospc)
+			goto write_retry;
+	} else if (ret == -ENOSPC && !enospc) {
+		struct xfs_eofblocks eofb = {0};
+
 		enospc = 1;
 		xfs_flush_inodes(ip->i_mount);
+		eofb.eof_scan_owner = ip->i_ino; /* for locking */
+		eofb.eof_flags = XFS_EOF_FLAGS_SYNC;
+		xfs_icache_free_eofblocks(ip->i_mount, &eofb);
 		goto write_retry;
 	}
 
@@ -772,7 +787,7 @@ xfs_file_fallocate(
 		unsigned blksize_mask = (1 << inode->i_blkbits) - 1;
 
 		if (offset & blksize_mask || len & blksize_mask) {
-			error = EINVAL;
+			error = -EINVAL;
 			goto out_unlock;
 		}
 
@@ -781,7 +796,7 @@ xfs_file_fallocate(
 		 * in which case it is effectively a truncate operation
 		 */
 		if (offset + len >= i_size_read(inode)) {
-			error = EINVAL;
+			error = -EINVAL;
 			goto out_unlock;
 		}
 
@@ -794,7 +809,7 @@ xfs_file_fallocate(
 		if (!(mode & FALLOC_FL_KEEP_SIZE) &&
 		    offset + len > i_size_read(inode)) {
 			new_size = offset + len;
-			error = -inode_newsize_ok(inode, new_size);
+			error = inode_newsize_ok(inode, new_size);
 			if (error)
 				goto out_unlock;
 		}
@@ -844,7 +859,7 @@ xfs_file_fallocate(
 
 out_unlock:
 	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
-	return -error;
+	return error;
 }
 
 
@@ -889,7 +904,7 @@ xfs_file_release(
 	struct inode	*inode,
 	struct file	*filp)
 {
-	return -xfs_release(XFS_I(inode));
+	return xfs_release(XFS_I(inode));
 }
 
 STATIC int
@@ -918,7 +933,7 @@ xfs_file_readdir(
 
 	error = xfs_readdir(ip, ctx, bufsize);
 	if (error)
-		return -error;
+		return error;
 	return 0;
 }
 
@@ -1184,7 +1199,7 @@ xfs_seek_data(
 
 	isize = i_size_read(inode);
 	if (start >= isize) {
-		error = ENXIO;
+		error = -ENXIO;
 		goto out_unlock;
 	}
 
@@ -1206,7 +1221,7 @@ xfs_seek_data(
 
 		/* No extents at given offset, must be beyond EOF */
 		if (nmap == 0) {
-			error = ENXIO;
+			error = -ENXIO;
 			goto out_unlock;
 		}
 
@@ -1237,7 +1252,7 @@ xfs_seek_data(
 		 * we are reading after EOF if nothing in map[1].
 		 */
 		if (nmap == 1) {
-			error = ENXIO;
+			error = -ENXIO;
 			goto out_unlock;
 		}
 
@@ -1250,7 +1265,7 @@ xfs_seek_data(
 		fsbno = map[i - 1].br_startoff + map[i - 1].br_blockcount;
 		start = XFS_FSB_TO_B(mp, fsbno);
 		if (start >= isize) {
-			error = ENXIO;
+			error = -ENXIO;
 			goto out_unlock;
 		}
 	}
@@ -1262,7 +1277,7 @@ out_unlock:
 	xfs_iunlock(ip, lock);
 
 	if (error)
-		return -error;
+		return error;
 	return offset;
 }
 
@@ -1282,13 +1297,13 @@ xfs_seek_hole(
 	int			error;
 
 	if (XFS_FORCED_SHUTDOWN(mp))
-		return -XFS_ERROR(EIO);
+		return -EIO;
 
 	lock = xfs_ilock_data_map_shared(ip);
 
 	isize = i_size_read(inode);
 	if (start >= isize) {
-		error = ENXIO;
+		error = -ENXIO;
 		goto out_unlock;
 	}
 
@@ -1307,7 +1322,7 @@ xfs_seek_hole(
 
 		/* No extents at given offset, must be beyond EOF */
 		if (nmap == 0) {
-			error = ENXIO;
+			error = -ENXIO;
 			goto out_unlock;
 		}
 
@@ -1370,7 +1385,7 @@ out_unlock:
 	xfs_iunlock(ip, lock);
 
 	if (error)
-		return -error;
+		return error;
 	return offset;
 }
 
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index 8ec81bed7992..e92730c1d3ca 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -258,7 +258,7 @@ next_ag:
 	if (*agp == NULLAGNUMBER)
 		return 0;
 
-	err = ENOMEM;
+	err = -ENOMEM;
 	item = kmem_alloc(sizeof(*item), KM_MAYFAIL);
 	if (!item)
 		goto out_put_ag;
@@ -268,7 +268,7 @@ next_ag:
 
 	err = xfs_mru_cache_insert(mp->m_filestream, ip->i_ino, &item->mru);
 	if (err) {
-		if (err == EEXIST)
+		if (err == -EEXIST)
 			err = 0;
 		goto out_free_item;
 	}
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index d34703dbcb42..18dc721ca19f 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -255,8 +255,8 @@ typedef struct xfs_fsop_resblks {
 	((2 * 1024 * 1024 * 1024ULL) - XFS_MIN_LOG_BYTES)
 
 /* Used for sanity checks on superblock */
-#define XFS_MAX_DBLOCKS(s) ((xfs_drfsbno_t)(s)->sb_agcount * (s)->sb_agblocks)
-#define XFS_MIN_DBLOCKS(s) ((xfs_drfsbno_t)((s)->sb_agcount - 1) *	\
+#define XFS_MAX_DBLOCKS(s) ((xfs_rfsblock_t)(s)->sb_agcount * (s)->sb_agblocks)
+#define XFS_MIN_DBLOCKS(s) ((xfs_rfsblock_t)((s)->sb_agcount - 1) *	\
 			 (s)->sb_agblocks + XFS_MIN_AG_BLOCKS)
 
 /*
@@ -375,6 +375,9 @@ struct xfs_fs_eofblocks {
 #define XFS_EOF_FLAGS_GID		(1 << 2) /* filter by gid */
 #define XFS_EOF_FLAGS_PRID		(1 << 3) /* filter by project id */
 #define XFS_EOF_FLAGS_MINFILESIZE	(1 << 4) /* filter by min file size */
+#define XFS_EOF_FLAGS_UNION		(1 << 5) /* union filter algorithm;
+						  * kernel only, not included in
+						  * valid mask */
 #define XFS_EOF_FLAGS_VALID	\
 	(XFS_EOF_FLAGS_SYNC |	\
 	 XFS_EOF_FLAGS_UID |	\
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index d2295561570a..f91de1ef05e1 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -168,7 +168,7 @@ xfs_growfs_data_private(
 	nb = in->newblocks;
 	pct = in->imaxpct;
 	if (nb < mp->m_sb.sb_dblocks || pct < 0 || pct > 100)
-		return XFS_ERROR(EINVAL);
+		return -EINVAL;
 	if ((error = xfs_sb_validate_fsb_count(&mp->m_sb, nb)))
 		return error;
 	dpct = pct - mp->m_sb.sb_imax_pct;
@@ -176,7 +176,7 @@ xfs_growfs_data_private(
 				XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1),
 				XFS_FSS_TO_BB(mp, 1), 0, NULL);
 	if (!bp)
-		return EIO;
+		return -EIO;
 	if (bp->b_error) {
 		error = bp->b_error;
 		xfs_buf_relse(bp);
@@ -191,7 +191,7 @@ xfs_growfs_data_private(
 		nagcount--;
 		nb = (xfs_rfsblock_t)nagcount * mp->m_sb.sb_agblocks;
 		if (nb < mp->m_sb.sb_dblocks)
-			return XFS_ERROR(EINVAL);
+			return -EINVAL;
 	}
 	new = nb - mp->m_sb.sb_dblocks;
 	oagcount = mp->m_sb.sb_agcount;
@@ -229,7 +229,7 @@ xfs_growfs_data_private(
 				XFS_FSS_TO_BB(mp, 1), 0,
 				&xfs_agf_buf_ops);
 		if (!bp) {
-			error = ENOMEM;
+			error = -ENOMEM;
 			goto error0;
 		}
 
@@ -270,7 +270,7 @@ xfs_growfs_data_private(
 				XFS_FSS_TO_BB(mp, 1), 0,
 				&xfs_agfl_buf_ops);
 		if (!bp) {
-			error = ENOMEM;
+			error = -ENOMEM;
 			goto error0;
 		}
 
@@ -298,7 +298,7 @@ xfs_growfs_data_private(
 				XFS_FSS_TO_BB(mp, 1), 0,
 				&xfs_agi_buf_ops);
 		if (!bp) {
-			error = ENOMEM;
+			error = -ENOMEM;
 			goto error0;
 		}
 
@@ -336,7 +336,7 @@ xfs_growfs_data_private(
 				&xfs_allocbt_buf_ops);
 
 		if (!bp) {
-			error = ENOMEM;
+			error = -ENOMEM;
 			goto error0;
 		}
 
@@ -365,7 +365,7 @@ xfs_growfs_data_private(
 				BTOBB(mp->m_sb.sb_blocksize), 0,
 				&xfs_allocbt_buf_ops);
 		if (!bp) {
-			error = ENOMEM;
+			error = -ENOMEM;
 			goto error0;
 		}
 
@@ -395,7 +395,7 @@ xfs_growfs_data_private(
 				BTOBB(mp->m_sb.sb_blocksize), 0,
 				&xfs_inobt_buf_ops);
 		if (!bp) {
-			error = ENOMEM;
+			error = -ENOMEM;
 			goto error0;
 		}
 
@@ -420,7 +420,7 @@ xfs_growfs_data_private(
 				BTOBB(mp->m_sb.sb_blocksize), 0,
 				&xfs_inobt_buf_ops);
 			if (!bp) {
-				error = ENOMEM;
+				error = -ENOMEM;
 				goto error0;
 			}
 
@@ -531,7 +531,7 @@ xfs_growfs_data_private(
 				bp->b_ops = &xfs_sb_buf_ops;
 				xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
 			} else
-				error = ENOMEM;
+				error = -ENOMEM;
 		}
 
 		/*
@@ -576,17 +576,17 @@ xfs_growfs_log_private(
 
 	nb = in->newblocks;
 	if (nb < XFS_MIN_LOG_BLOCKS || nb < XFS_B_TO_FSB(mp, XFS_MIN_LOG_BYTES))
-		return XFS_ERROR(EINVAL);
+		return -EINVAL;
 	if (nb == mp->m_sb.sb_logblocks &&
 	    in->isint == (mp->m_sb.sb_logstart != 0))
-		return XFS_ERROR(EINVAL);
+		return -EINVAL;
 	/*
 	 * Moving the log is hard, need new interfaces to sync
 	 * the log first, hold off all activity while moving it.
 	 * Can have shorter or longer log in the same space,
 	 * or transform internal to external log or vice versa.
 	 */
-	return XFS_ERROR(ENOSYS);
+	return -ENOSYS;
 }
 
 /*
@@ -604,9 +604,9 @@ xfs_growfs_data(
 	int error;
 
 	if (!capable(CAP_SYS_ADMIN))
-		return XFS_ERROR(EPERM);
+		return -EPERM;
 	if (!mutex_trylock(&mp->m_growlock))
-		return XFS_ERROR(EWOULDBLOCK);
+		return -EWOULDBLOCK;
 	error = xfs_growfs_data_private(mp, in);
 	mutex_unlock(&mp->m_growlock);
 	return error;
@@ -620,9 +620,9 @@ xfs_growfs_log(
 	int error;
 
 	if (!capable(CAP_SYS_ADMIN))
-		return XFS_ERROR(EPERM);
+		return -EPERM;
 	if (!mutex_trylock(&mp->m_growlock))
-		return XFS_ERROR(EWOULDBLOCK);
+		return -EWOULDBLOCK;
 	error = xfs_growfs_log_private(mp, in);
 	mutex_unlock(&mp->m_growlock);
 	return error;
@@ -674,7 +674,7 @@ xfs_reserve_blocks(
 	/* If inval is null, report current values and return */
 	if (inval == (__uint64_t *)NULL) {
 		if (!outval)
-			return EINVAL;
+			return -EINVAL;
 		outval->resblks = mp->m_resblks;
 		outval->resblks_avail = mp->m_resblks_avail;
 		return 0;
@@ -757,7 +757,7 @@ out:
 		int error;
 		error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
 						 fdblks_delta, 0);
-		if (error == ENOSPC)
+		if (error == -ENOSPC)
 			goto retry;
 	}
 	return 0;
@@ -818,7 +818,7 @@ xfs_fs_goingdown(
 				SHUTDOWN_FORCE_UMOUNT | SHUTDOWN_LOG_IO_ERROR);
 		break;
 	default:
-		return XFS_ERROR(EINVAL);
+		return -EINVAL;
 	}
 
 	return 0;
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index c48df5f25b9f..981b2cf51985 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -33,6 +33,9 @@
 #include "xfs_trace.h"
 #include "xfs_icache.h"
 #include "xfs_bmap_util.h"
+#include "xfs_quota.h"
+#include "xfs_dquot_item.h"
+#include "xfs_dquot.h"
 
 #include <linux/kthread.h>
 #include <linux/freezer.h>
@@ -158,7 +161,7 @@ xfs_iget_cache_hit(
 	if (ip->i_ino != ino) {
 		trace_xfs_iget_skip(ip);
 		XFS_STATS_INC(xs_ig_frecycle);
-		error = EAGAIN;
+		error = -EAGAIN;
 		goto out_error;
 	}
 
@@ -176,7 +179,7 @@ xfs_iget_cache_hit(
 	if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) {
 		trace_xfs_iget_skip(ip);
 		XFS_STATS_INC(xs_ig_frecycle);
-		error = EAGAIN;
+		error = -EAGAIN;
 		goto out_error;
 	}
 
@@ -184,7 +187,7 @@ xfs_iget_cache_hit(
 	 * If lookup is racing with unlink return an error immediately.
 	 */
 	if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
-		error = ENOENT;
+		error = -ENOENT;
 		goto out_error;
 	}
 
@@ -206,7 +209,7 @@ xfs_iget_cache_hit(
 		spin_unlock(&ip->i_flags_lock);
 		rcu_read_unlock();
 
-		error = -inode_init_always(mp->m_super, inode);
+		error = inode_init_always(mp->m_super, inode);
 		if (error) {
 			/*
 			 * Re-initializing the inode failed, and we are in deep
@@ -243,7 +246,7 @@ xfs_iget_cache_hit(
 		/* If the VFS inode is being torn down, pause and try again. */
 		if (!igrab(inode)) {
 			trace_xfs_iget_skip(ip);
-			error = EAGAIN;
+			error = -EAGAIN;
 			goto out_error;
 		}
 
@@ -285,7 +288,7 @@ xfs_iget_cache_miss(
 
 	ip = xfs_inode_alloc(mp, ino);
 	if (!ip)
-		return ENOMEM;
+		return -ENOMEM;
 
 	error = xfs_iread(mp, tp, ip, flags);
 	if (error)
@@ -294,7 +297,7 @@ xfs_iget_cache_miss(
 	trace_xfs_iget_miss(ip);
 
 	if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
-		error = ENOENT;
+		error = -ENOENT;
 		goto out_destroy;
 	}
 
@@ -305,7 +308,7 @@ xfs_iget_cache_miss(
 	 * recurse into the file system.
 	 */
 	if (radix_tree_preload(GFP_NOFS)) {
-		error = EAGAIN;
+		error = -EAGAIN;
 		goto out_destroy;
 	}
 
@@ -341,7 +344,7 @@ xfs_iget_cache_miss(
 	if (unlikely(error)) {
 		WARN_ON(error != -EEXIST);
 		XFS_STATS_INC(xs_ig_dup);
-		error = EAGAIN;
+		error = -EAGAIN;
 		goto out_preload_end;
 	}
 	spin_unlock(&pag->pag_ici_lock);
@@ -408,7 +411,7 @@ xfs_iget(
 
 	/* reject inode numbers outside existing AGs */
 	if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
-		return EINVAL;
+		return -EINVAL;
 
 	/* get the perag structure and ensure that it's inode capable */
 	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
@@ -445,7 +448,7 @@ again:
 	return 0;
 
 out_error_or_again:
-	if (error == EAGAIN) {
+	if (error == -EAGAIN) {
 		delay(1);
 		goto again;
 	}
@@ -489,18 +492,18 @@ xfs_inode_ag_walk_grab(
 
 	/* nothing to sync during shutdown */
 	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
-		return EFSCORRUPTED;
+		return -EFSCORRUPTED;
 
 	/* If we can't grab the inode, it must on it's way to reclaim. */
 	if (!igrab(inode))
-		return ENOENT;
+		return -ENOENT;
 
 	/* inode is valid */
 	return 0;
 
 out_unlock_noent:
 	spin_unlock(&ip->i_flags_lock);
-	return ENOENT;
+	return -ENOENT;
 }
 
 STATIC int
@@ -583,16 +586,16 @@ restart:
 				continue;
 			error = execute(batch[i], flags, args);
 			IRELE(batch[i]);
-			if (error == EAGAIN) {
+			if (error == -EAGAIN) {
 				skipped++;
 				continue;
 			}
-			if (error && last_error != EFSCORRUPTED)
+			if (error && last_error != -EFSCORRUPTED)
 				last_error = error;
 		}
 
 		/* bail out if the filesystem is corrupted.  */
-		if (error == EFSCORRUPTED)
+		if (error == -EFSCORRUPTED)
 			break;
 
 		cond_resched();
@@ -652,11 +655,11 @@ xfs_inode_ag_iterator(
 		xfs_perag_put(pag);
 		if (error) {
 			last_error = error;
-			if (error == EFSCORRUPTED)
+			if (error == -EFSCORRUPTED)
 				break;
 		}
 	}
-	return XFS_ERROR(last_error);
+	return last_error;
 }
 
 int
@@ -680,11 +683,11 @@ xfs_inode_ag_iterator_tag(
 		xfs_perag_put(pag);
 		if (error) {
 			last_error = error;
-			if (error == EFSCORRUPTED)
+			if (error == -EFSCORRUPTED)
 				break;
 		}
 	}
-	return XFS_ERROR(last_error);
+	return last_error;
 }
 
 /*
@@ -944,7 +947,7 @@ restart:
 	 * see the stale flag set on the inode.
 	 */
 	error = xfs_iflush(ip, &bp);
-	if (error == EAGAIN) {
+	if (error == -EAGAIN) {
 		xfs_iunlock(ip, XFS_ILOCK_EXCL);
 		/* backoff longer than in xfs_ifree_cluster */
 		delay(2);
@@ -997,7 +1000,7 @@ out:
 	xfs_iflags_clear(ip, XFS_IRECLAIM);
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	/*
-	 * We could return EAGAIN here to make reclaim rescan the inode tree in
+	 * We could return -EAGAIN here to make reclaim rescan the inode tree in
 	 * a short while. However, this just burns CPU time scanning the tree
 	 * waiting for IO to complete and the reclaim work never goes back to
 	 * the idle state. Instead, return 0 to let the next scheduled
@@ -1100,7 +1103,7 @@ restart:
 				if (!batch[i])
 					continue;
 				error = xfs_reclaim_inode(batch[i], pag, flags);
-				if (error && last_error != EFSCORRUPTED)
+				if (error && last_error != -EFSCORRUPTED)
 					last_error = error;
 			}
 
@@ -1129,7 +1132,7 @@ restart:
 		trylock = 0;
 		goto restart;
 	}
-	return XFS_ERROR(last_error);
+	return last_error;
 }
 
 int
@@ -1203,6 +1206,30 @@ xfs_inode_match_id(
 	return 1;
 }
 
+/*
+ * A union-based inode filtering algorithm. Process the inode if any of the
+ * criteria match. This is for global/internal scans only.
+ */
+STATIC int
+xfs_inode_match_id_union(
+	struct xfs_inode	*ip,
+	struct xfs_eofblocks	*eofb)
+{
+	if ((eofb->eof_flags & XFS_EOF_FLAGS_UID) &&
+	    uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid))
+		return 1;
+
+	if ((eofb->eof_flags & XFS_EOF_FLAGS_GID) &&
+	    gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid))
+		return 1;
+
+	if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) &&
+	    xfs_get_projid(ip) == eofb->eof_prid)
+		return 1;
+
+	return 0;
+}
+
 STATIC int
 xfs_inode_free_eofblocks(
 	struct xfs_inode	*ip,
@@ -1211,6 +1238,10 @@ xfs_inode_free_eofblocks(
 {
 	int ret;
 	struct xfs_eofblocks *eofb = args;
+	bool need_iolock = true;
+	int match;
+
+	ASSERT(!eofb || (eofb && eofb->eof_scan_owner != 0));
 
 	if (!xfs_can_free_eofblocks(ip, false)) {
 		/* inode could be preallocated or append-only */
@@ -1228,19 +1259,31 @@ xfs_inode_free_eofblocks(
 		return 0;
 
 	if (eofb) {
-		if (!xfs_inode_match_id(ip, eofb))
+		if (eofb->eof_flags & XFS_EOF_FLAGS_UNION)
+			match = xfs_inode_match_id_union(ip, eofb);
+		else
+			match = xfs_inode_match_id(ip, eofb);
+		if (!match)
 			return 0;
 
 		/* skip the inode if the file size is too small */
 		if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE &&
 		    XFS_ISIZE(ip) < eofb->eof_min_file_size)
 			return 0;
+
+		/*
+		 * A scan owner implies we already hold the iolock. Skip it in
+		 * xfs_free_eofblocks() to avoid deadlock. This also eliminates
+		 * the possibility of EAGAIN being returned.
+		 */
+		if (eofb->eof_scan_owner == ip->i_ino)
+			need_iolock = false;
 	}
 
-	ret = xfs_free_eofblocks(ip->i_mount, ip, true);
+	ret = xfs_free_eofblocks(ip->i_mount, ip, need_iolock);
 
 	/* don't revisit the inode if we're not waiting */
-	if (ret == EAGAIN && !(flags & SYNC_WAIT))
+	if (ret == -EAGAIN && !(flags & SYNC_WAIT))
 		ret = 0;
 
 	return ret;
@@ -1260,6 +1303,55 @@ xfs_icache_free_eofblocks(
 					 eofb, XFS_ICI_EOFBLOCKS_TAG);
 }
 
+/*
+ * Run eofblocks scans on the quotas applicable to the inode. For inodes with
+ * multiple quotas, we don't know exactly which quota caused an allocation
+ * failure. We make a best effort by including each quota under low free space
+ * conditions (less than 1% free space) in the scan.
+ */
+int
+xfs_inode_free_quota_eofblocks(
+	struct xfs_inode *ip)
+{
+	int scan = 0;
+	struct xfs_eofblocks eofb = {0};
+	struct xfs_dquot *dq;
+
+	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
+
+	/*
+	 * Set the scan owner to avoid a potential livelock. Otherwise, the scan
+	 * can repeatedly trylock on the inode we're currently processing. We
+	 * run a sync scan to increase effectiveness and use the union filter to
+	 * cover all applicable quotas in a single scan.
+	 */
+	eofb.eof_scan_owner = ip->i_ino;
+	eofb.eof_flags = XFS_EOF_FLAGS_UNION|XFS_EOF_FLAGS_SYNC;
+
+	if (XFS_IS_UQUOTA_ENFORCED(ip->i_mount)) {
+		dq = xfs_inode_dquot(ip, XFS_DQ_USER);
+		if (dq && xfs_dquot_lowsp(dq)) {
+			eofb.eof_uid = VFS_I(ip)->i_uid;
+			eofb.eof_flags |= XFS_EOF_FLAGS_UID;
+			scan = 1;
+		}
+	}
+
+	if (XFS_IS_GQUOTA_ENFORCED(ip->i_mount)) {
+		dq = xfs_inode_dquot(ip, XFS_DQ_GROUP);
+		if (dq && xfs_dquot_lowsp(dq)) {
+			eofb.eof_gid = VFS_I(ip)->i_gid;
+			eofb.eof_flags |= XFS_EOF_FLAGS_GID;
+			scan = 1;
+		}
+	}
+
+	if (scan)
+		xfs_icache_free_eofblocks(ip->i_mount, &eofb);
+
+	return scan;
+}
+
 void
 xfs_inode_set_eofblocks_tag(
 	xfs_inode_t	*ip)
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index 9cf017b899be..46748b86b12f 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -27,6 +27,7 @@ struct xfs_eofblocks {
 	kgid_t		eof_gid;
 	prid_t		eof_prid;
 	__u64		eof_min_file_size;
+	xfs_ino_t	eof_scan_owner;
 };
 
 #define SYNC_WAIT		0x0001	/* wait for i/o to complete */
@@ -57,6 +58,7 @@ void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
 void xfs_inode_set_eofblocks_tag(struct xfs_inode *ip);
 void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip);
 int xfs_icache_free_eofblocks(struct xfs_mount *, struct xfs_eofblocks *);
+int xfs_inode_free_quota_eofblocks(struct xfs_inode *ip);
 void xfs_eofblocks_worker(struct work_struct *);
 
 int xfs_inode_ag_iterator(struct xfs_mount *mp,
@@ -72,31 +74,32 @@ xfs_fs_eofblocks_from_user(
 	struct xfs_eofblocks		*dst)
 {
 	if (src->eof_version != XFS_EOFBLOCKS_VERSION)
-		return EINVAL;
+		return -EINVAL;
 
 	if (src->eof_flags & ~XFS_EOF_FLAGS_VALID)
-		return EINVAL;
+		return -EINVAL;
 
 	if (memchr_inv(&src->pad32, 0, sizeof(src->pad32)) ||
 	    memchr_inv(src->pad64, 0, sizeof(src->pad64)))
-		return EINVAL;
+		return -EINVAL;
 
 	dst->eof_flags = src->eof_flags;
 	dst->eof_prid = src->eof_prid;
 	dst->eof_min_file_size = src->eof_min_file_size;
+	dst->eof_scan_owner = NULLFSINO;
 
 	dst->eof_uid = INVALID_UID;
 	if (src->eof_flags & XFS_EOF_FLAGS_UID) {
 		dst->eof_uid = make_kuid(current_user_ns(), src->eof_uid);
 		if (!uid_valid(dst->eof_uid))
-			return EINVAL;
+			return -EINVAL;
 	}
 
 	dst->eof_gid = INVALID_GID;
 	if (src->eof_flags & XFS_EOF_FLAGS_GID) {
 		dst->eof_gid = make_kgid(current_user_ns(), src->eof_gid);
 		if (!gid_valid(dst->eof_gid))
-			return EINVAL;
+			return -EINVAL;
 	}
 	return 0;
 }
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index a6115fe1ac94..fea3c92fb3f0 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -583,7 +583,7 @@ xfs_lookup(
 	trace_xfs_lookup(dp, name);
 
 	if (XFS_FORCED_SHUTDOWN(dp->i_mount))
-		return XFS_ERROR(EIO);
+		return -EIO;
 
 	lock_mode = xfs_ilock_data_map_shared(dp);
 	error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name);
@@ -893,7 +893,7 @@ xfs_dir_ialloc(
 	}
 	if (!ialloc_context && !ip) {
 		*ipp = NULL;
-		return XFS_ERROR(ENOSPC);
+		return -ENOSPC;
 	}
 
 	/*
@@ -1088,7 +1088,7 @@ xfs_create(
 	trace_xfs_create(dp, name);
 
 	if (XFS_FORCED_SHUTDOWN(mp))
-		return XFS_ERROR(EIO);
+		return -EIO;
 
 	prid = xfs_get_initial_prid(dp);
 
@@ -1125,12 +1125,12 @@ xfs_create(
 	 */
 	tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
 	error = xfs_trans_reserve(tp, &tres, resblks, 0);
-	if (error == ENOSPC) {
+	if (error == -ENOSPC) {
 		/* flush outstanding delalloc blocks and retry */
 		xfs_flush_inodes(mp);
 		error = xfs_trans_reserve(tp, &tres, resblks, 0);
 	}
-	if (error == ENOSPC) {
+	if (error == -ENOSPC) {
 		/* No space at all so try a "no-allocation" reservation */
 		resblks = 0;
 		error = xfs_trans_reserve(tp, &tres, 0, 0);
@@ -1165,7 +1165,7 @@ xfs_create(
 	error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev,
 			       prid, resblks > 0, &ip, &committed);
 	if (error) {
-		if (error == ENOSPC)
+		if (error == -ENOSPC)
 			goto out_trans_cancel;
 		goto out_trans_abort;
 	}
@@ -1184,7 +1184,7 @@ xfs_create(
 					&first_block, &free_list, resblks ?
 					resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
 	if (error) {
-		ASSERT(error != ENOSPC);
+		ASSERT(error != -ENOSPC);
 		goto out_trans_abort;
 	}
 	xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
@@ -1274,7 +1274,7 @@ xfs_create_tmpfile(
 	uint			resblks;
 
 	if (XFS_FORCED_SHUTDOWN(mp))
-		return XFS_ERROR(EIO);
+		return -EIO;
 
 	prid = xfs_get_initial_prid(dp);
 
@@ -1293,7 +1293,7 @@ xfs_create_tmpfile(
 
 	tres = &M_RES(mp)->tr_create_tmpfile;
 	error = xfs_trans_reserve(tp, tres, resblks, 0);
-	if (error == ENOSPC) {
+	if (error == -ENOSPC) {
 		/* No space at all so try a "no-allocation" reservation */
 		resblks = 0;
 		error = xfs_trans_reserve(tp, tres, 0, 0);
@@ -1311,7 +1311,7 @@ xfs_create_tmpfile(
 	error = xfs_dir_ialloc(&tp, dp, mode, 1, 0,
 				prid, resblks > 0, &ip, NULL);
 	if (error) {
-		if (error == ENOSPC)
+		if (error == -ENOSPC)
 			goto out_trans_cancel;
 		goto out_trans_abort;
 	}
@@ -1382,7 +1382,7 @@ xfs_link(
 	ASSERT(!S_ISDIR(sip->i_d.di_mode));
 
 	if (XFS_FORCED_SHUTDOWN(mp))
-		return XFS_ERROR(EIO);
+		return -EIO;
 
 	error = xfs_qm_dqattach(sip, 0);
 	if (error)
@@ -1396,7 +1396,7 @@ xfs_link(
 	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
 	resblks = XFS_LINK_SPACE_RES(mp, target_name->len);
 	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, resblks, 0);
-	if (error == ENOSPC) {
+	if (error == -ENOSPC) {
 		resblks = 0;
 		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, 0, 0);
 	}
@@ -1417,7 +1417,7 @@ xfs_link(
 	 */
 	if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
 		     (xfs_get_projid(tdp) != xfs_get_projid(sip)))) {
-		error = XFS_ERROR(EXDEV);
+		error = -EXDEV;
 		goto error_return;
 	}
 
@@ -1635,8 +1635,8 @@ xfs_release(
 		truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
 		if (truncated) {
 			xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE);
-			if (VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0) {
-				error = -filemap_flush(VFS_I(ip)->i_mapping);
+			if (ip->i_delayed_blks > 0) {
+				error = filemap_flush(VFS_I(ip)->i_mapping);
 				if (error)
 					return error;
 			}
@@ -1673,7 +1673,7 @@ xfs_release(
 			return 0;
 
 		error = xfs_free_eofblocks(mp, ip, true);
-		if (error && error != EAGAIN)
+		if (error && error != -EAGAIN)
 			return error;
 
 		/* delalloc blocks after truncation means it really is dirty */
@@ -1772,7 +1772,7 @@ xfs_inactive_ifree(
 	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ifree,
 				  XFS_IFREE_SPACE_RES(mp), 0);
 	if (error) {
-		if (error == ENOSPC) {
+		if (error == -ENOSPC) {
 			xfs_warn_ratelimited(mp,
 			"Failed to remove inode(s) from unlinked list. "
 			"Please free space, unmount and run xfs_repair.");
@@ -2219,7 +2219,7 @@ xfs_ifree_cluster(
 					XBF_UNMAPPED);
 
 		if (!bp)
-			return ENOMEM;
+			return -ENOMEM;
 
 		/*
 		 * This buffer may not have been correctly initialised as we
@@ -2491,7 +2491,7 @@ xfs_remove(
 	trace_xfs_remove(dp, name);
 
 	if (XFS_FORCED_SHUTDOWN(mp))
-		return XFS_ERROR(EIO);
+		return -EIO;
 
 	error = xfs_qm_dqattach(dp, 0);
 	if (error)
@@ -2521,12 +2521,12 @@ xfs_remove(
 	 */
 	resblks = XFS_REMOVE_SPACE_RES(mp);
 	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_remove, resblks, 0);
-	if (error == ENOSPC) {
+	if (error == -ENOSPC) {
 		resblks = 0;
 		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_remove, 0, 0);
 	}
 	if (error) {
-		ASSERT(error != ENOSPC);
+		ASSERT(error != -ENOSPC);
 		cancel_flags = 0;
 		goto out_trans_cancel;
 	}
@@ -2543,11 +2543,11 @@ xfs_remove(
 	if (is_dir) {
 		ASSERT(ip->i_d.di_nlink >= 2);
 		if (ip->i_d.di_nlink != 2) {
-			error = XFS_ERROR(ENOTEMPTY);
+			error = -ENOTEMPTY;
 			goto out_trans_cancel;
 		}
 		if (!xfs_dir_isempty(ip)) {
-			error = XFS_ERROR(ENOTEMPTY);
+			error = -ENOTEMPTY;
 			goto out_trans_cancel;
 		}
 
@@ -2582,7 +2582,7 @@ xfs_remove(
 	error = xfs_dir_removename(tp, dp, name, ip->i_ino,
 					&first_block, &free_list, resblks);
 	if (error) {
-		ASSERT(error != ENOENT);
+		ASSERT(error != -ENOENT);
 		goto out_bmap_cancel;
 	}
 
@@ -2702,7 +2702,7 @@ xfs_rename(
 	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
 	spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len);
 	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, spaceres, 0);
-	if (error == ENOSPC) {
+	if (error == -ENOSPC) {
 		spaceres = 0;
 		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, 0, 0);
 	}
@@ -2747,7 +2747,7 @@ xfs_rename(
 	 */
 	if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
 		     (xfs_get_projid(target_dp) != xfs_get_projid(src_ip)))) {
-		error = XFS_ERROR(EXDEV);
+		error = -EXDEV;
 		goto error_return;
 	}
 
@@ -2770,7 +2770,7 @@ xfs_rename(
 		error = xfs_dir_createname(tp, target_dp, target_name,
 						src_ip->i_ino, &first_block,
 						&free_list, spaceres);
-		if (error == ENOSPC)
+		if (error == -ENOSPC)
 			goto error_return;
 		if (error)
 			goto abort_return;
@@ -2795,7 +2795,7 @@ xfs_rename(
 			 */
 			if (!(xfs_dir_isempty(target_ip)) ||
 			    (target_ip->i_d.di_nlink > 2)) {
-				error = XFS_ERROR(EEXIST);
+				error = -EEXIST;
 				goto error_return;
 			}
 		}
@@ -2847,7 +2847,7 @@ xfs_rename(
 		error = xfs_dir_replace(tp, src_ip, &xfs_name_dotdot,
 					target_dp->i_ino,
 					&first_block, &free_list, spaceres);
-		ASSERT(error != EEXIST);
+		ASSERT(error != -EEXIST);
 		if (error)
 			goto abort_return;
 	}
@@ -3055,7 +3055,7 @@ cluster_corrupt_out:
 		if (bp->b_iodone) {
 			XFS_BUF_UNDONE(bp);
 			xfs_buf_stale(bp);
-			xfs_buf_ioerror(bp, EIO);
+			xfs_buf_ioerror(bp, -EIO);
 			xfs_buf_ioend(bp, 0);
 		} else {
 			xfs_buf_stale(bp);
@@ -3069,7 +3069,7 @@ cluster_corrupt_out:
 	xfs_iflush_abort(iq, false);
 	kmem_free(ilist);
 	xfs_perag_put(pag);
-	return XFS_ERROR(EFSCORRUPTED);
+	return -EFSCORRUPTED;
 }
 
 /*
@@ -3124,7 +3124,7 @@ xfs_iflush(
 	 * as we wait for an empty AIL as part of the unmount process.
 	 */
 	if (XFS_FORCED_SHUTDOWN(mp)) {
-		error = XFS_ERROR(EIO);
+		error = -EIO;
 		goto abort_out;
 	}
 
@@ -3167,7 +3167,7 @@ corrupt_out:
 	xfs_buf_relse(bp);
 	xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
 cluster_corrupt_out:
-	error = XFS_ERROR(EFSCORRUPTED);
+	error = -EFSCORRUPTED;
 abort_out:
 	/*
 	 * Unlocks the flush lock
@@ -3331,5 +3331,5 @@ xfs_iflush_int(
 	return 0;
 
 corrupt_out:
-	return XFS_ERROR(EFSCORRUPTED);
+	return -EFSCORRUPTED;
 }
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index f72bffa67266..c10e3fadd9af 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -398,4 +398,14 @@ do { \
 
 extern struct kmem_zone	*xfs_inode_zone;
 
+/*
+ * Flags for read/write calls
+ */
+#define XFS_IO_ISDIRECT	0x00001		/* bypass page cache */
+#define XFS_IO_INVIS	0x00002		/* don't update inode timestamps */
+
+#define XFS_IO_FLAGS \
+	{ XFS_IO_ISDIRECT,	"DIRECT" }, \
+	{ XFS_IO_INVIS,		"INVIS"}
+
 #endif	/* __XFS_INODE_H__ */
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index a640137b3573..de5a7be36e60 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -788,5 +788,5 @@ xfs_inode_item_format_convert(
 		in_f->ilf_boffset = in_f64->ilf_boffset;
 		return 0;
 	}
-	return EFSCORRUPTED;
+	return -EFSCORRUPTED;
 }
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 8bc1bbce7451..3799695b9249 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -207,7 +207,7 @@ xfs_open_by_handle(
 	struct path		path;
 
 	if (!capable(CAP_SYS_ADMIN))
-		return -XFS_ERROR(EPERM);
+		return -EPERM;
 
 	dentry = xfs_handlereq_to_dentry(parfilp, hreq);
 	if (IS_ERR(dentry))
@@ -216,7 +216,7 @@ xfs_open_by_handle(
 
 	/* Restrict xfs_open_by_handle to directories & regular files. */
 	if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) {
-		error = -XFS_ERROR(EPERM);
+		error = -EPERM;
 		goto out_dput;
 	}
 
@@ -228,18 +228,18 @@ xfs_open_by_handle(
 	fmode = OPEN_FMODE(permflag);
 	if ((!(permflag & O_APPEND) || (permflag & O_TRUNC)) &&
 	    (fmode & FMODE_WRITE) && IS_APPEND(inode)) {
-		error = -XFS_ERROR(EPERM);
+		error = -EPERM;
 		goto out_dput;
 	}
 
 	if ((fmode & FMODE_WRITE) && IS_IMMUTABLE(inode)) {
-		error = -XFS_ERROR(EACCES);
+		error = -EACCES;
 		goto out_dput;
 	}
 
 	/* Can't write directories. */
 	if (S_ISDIR(inode->i_mode) && (fmode & FMODE_WRITE)) {
-		error = -XFS_ERROR(EISDIR);
+		error = -EISDIR;
 		goto out_dput;
 	}
 
@@ -282,7 +282,7 @@ xfs_readlink_by_handle(
 	int			error;
 
 	if (!capable(CAP_SYS_ADMIN))
-		return -XFS_ERROR(EPERM);
+		return -EPERM;
 
 	dentry = xfs_handlereq_to_dentry(parfilp, hreq);
 	if (IS_ERR(dentry))
@@ -290,22 +290,22 @@ xfs_readlink_by_handle(
 
 	/* Restrict this handle operation to symlinks only. */
 	if (!S_ISLNK(dentry->d_inode->i_mode)) {
-		error = -XFS_ERROR(EINVAL);
+		error = -EINVAL;
 		goto out_dput;
 	}
 
 	if (copy_from_user(&olen, hreq->ohandlen, sizeof(__u32))) {
-		error = -XFS_ERROR(EFAULT);
+		error = -EFAULT;
 		goto out_dput;
 	}
 
 	link = kmalloc(MAXPATHLEN+1, GFP_KERNEL);
 	if (!link) {
-		error = -XFS_ERROR(ENOMEM);
+		error = -ENOMEM;
 		goto out_dput;
 	}
 
-	error = -xfs_readlink(XFS_I(dentry->d_inode), link);
+	error = xfs_readlink(XFS_I(dentry->d_inode), link);
 	if (error)
 		goto out_kfree;
 	error = readlink_copy(hreq->ohandle, olen, link);
@@ -330,10 +330,10 @@ xfs_set_dmattrs(
 	int		error;
 
 	if (!capable(CAP_SYS_ADMIN))
-		return XFS_ERROR(EPERM);
+		return -EPERM;
 
 	if (XFS_FORCED_SHUTDOWN(mp))
-		return XFS_ERROR(EIO);
+		return -EIO;
 
 	tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS);
 	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
@@ -364,9 +364,9 @@ xfs_fssetdm_by_handle(
 	struct dentry		*dentry;
 
 	if (!capable(CAP_MKNOD))
-		return -XFS_ERROR(EPERM);
+		return -EPERM;
 	if (copy_from_user(&dmhreq, arg, sizeof(xfs_fsop_setdm_handlereq_t)))
-		return -XFS_ERROR(EFAULT);
+		return -EFAULT;
 
 	error = mnt_want_write_file(parfilp);
 	if (error)
@@ -379,16 +379,16 @@ xfs_fssetdm_by_handle(
 	}
 
 	if (IS_IMMUTABLE(dentry->d_inode) || IS_APPEND(dentry->d_inode)) {
-		error = -XFS_ERROR(EPERM);
+		error = -EPERM;
 		goto out;
 	}
 
 	if (copy_from_user(&fsd, dmhreq.data, sizeof(fsd))) {
-		error = -XFS_ERROR(EFAULT);
+		error = -EFAULT;
 		goto out;
 	}
 
-	error = -xfs_set_dmattrs(XFS_I(dentry->d_inode), fsd.fsd_dmevmask,
+	error = xfs_set_dmattrs(XFS_I(dentry->d_inode), fsd.fsd_dmevmask,
 				 fsd.fsd_dmstate);
 
  out:
@@ -409,18 +409,18 @@ xfs_attrlist_by_handle(
 	char			*kbuf;
 
 	if (!capable(CAP_SYS_ADMIN))
-		return -XFS_ERROR(EPERM);
+		return -EPERM;
 	if (copy_from_user(&al_hreq, arg, sizeof(xfs_fsop_attrlist_handlereq_t)))
-		return -XFS_ERROR(EFAULT);
+		return -EFAULT;
 	if (al_hreq.buflen < sizeof(struct attrlist) ||
 	    al_hreq.buflen > XATTR_LIST_MAX)
-		return -XFS_ERROR(EINVAL);
+		return -EINVAL;
 
 	/*
 	 * Reject flags, only allow namespaces.
 	 */
 	if (al_hreq.flags & ~(ATTR_ROOT | ATTR_SECURE))
-		return -XFS_ERROR(EINVAL);
+		return -EINVAL;
 
 	dentry = xfs_handlereq_to_dentry(parfilp, &al_hreq.hreq);
 	if (IS_ERR(dentry))
@@ -431,7 +431,7 @@ xfs_attrlist_by_handle(
 		goto out_dput;
 
 	cursor = (attrlist_cursor_kern_t *)&al_hreq.pos;
-	error = -xfs_attr_list(XFS_I(dentry->d_inode), kbuf, al_hreq.buflen,
+	error = xfs_attr_list(XFS_I(dentry->d_inode), kbuf, al_hreq.buflen,
 					al_hreq.flags, cursor);
 	if (error)
 		goto out_kfree;
@@ -455,20 +455,20 @@ xfs_attrmulti_attr_get(
 	__uint32_t		flags)
 {
 	unsigned char		*kbuf;
-	int			error = EFAULT;
+	int			error = -EFAULT;
 
 	if (*len > XATTR_SIZE_MAX)
-		return EINVAL;
+		return -EINVAL;
 	kbuf = kmem_zalloc_large(*len, KM_SLEEP);
 	if (!kbuf)
-		return ENOMEM;
+		return -ENOMEM;
 
 	error = xfs_attr_get(XFS_I(inode), name, kbuf, (int *)len, flags);
 	if (error)
 		goto out_kfree;
 
 	if (copy_to_user(ubuf, kbuf, *len))
-		error = EFAULT;
+		error = -EFAULT;
 
 out_kfree:
 	kmem_free(kbuf);
@@ -484,20 +484,17 @@ xfs_attrmulti_attr_set(
 	__uint32_t		flags)
 {
 	unsigned char		*kbuf;
-	int			error = EFAULT;
 
 	if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
-		return EPERM;
+		return -EPERM;
 	if (len > XATTR_SIZE_MAX)
-		return EINVAL;
+		return -EINVAL;
 
 	kbuf = memdup_user(ubuf, len);
 	if (IS_ERR(kbuf))
 		return PTR_ERR(kbuf);
 
-	error = xfs_attr_set(XFS_I(inode), name, kbuf, len, flags);
-
-	return error;
+	return xfs_attr_set(XFS_I(inode), name, kbuf, len, flags);
 }
 
 int
@@ -507,7 +504,7 @@ xfs_attrmulti_attr_remove(
 	__uint32_t		flags)
 {
 	if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
-		return EPERM;
+		return -EPERM;
 	return xfs_attr_remove(XFS_I(inode), name, flags);
 }
 
@@ -524,9 +521,9 @@ xfs_attrmulti_by_handle(
 	unsigned char		*attr_name;
 
 	if (!capable(CAP_SYS_ADMIN))
-		return -XFS_ERROR(EPERM);
+		return -EPERM;
 	if (copy_from_user(&am_hreq, arg, sizeof(xfs_fsop_attrmulti_handlereq_t)))
-		return -XFS_ERROR(EFAULT);
+		return -EFAULT;
 
 	/* overflow check */
 	if (am_hreq.opcount >= INT_MAX / sizeof(xfs_attr_multiop_t))
@@ -536,18 +533,18 @@ xfs_attrmulti_by_handle(
 	if (IS_ERR(dentry))
 		return PTR_ERR(dentry);
 
-	error = E2BIG;
+	error = -E2BIG;
 	size = am_hreq.opcount * sizeof(xfs_attr_multiop_t);
 	if (!size || size > 16 * PAGE_SIZE)
 		goto out_dput;
 
 	ops = memdup_user(am_hreq.ops, size);
 	if (IS_ERR(ops)) {
-		error = -PTR_ERR(ops);
+		error = PTR_ERR(ops);
 		goto out_dput;
 	}
 
-	error = ENOMEM;
+	error = -ENOMEM;
 	attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL);
 	if (!attr_name)
 		goto out_kfree_ops;
@@ -557,7 +554,7 @@ xfs_attrmulti_by_handle(
 		ops[i].am_error = strncpy_from_user((char *)attr_name,
 				ops[i].am_attrname, MAXNAMELEN);
 		if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN)
-			error = ERANGE;
+			error = -ERANGE;
 		if (ops[i].am_error < 0)
 			break;
 
@@ -588,19 +585,19 @@ xfs_attrmulti_by_handle(
 			mnt_drop_write_file(parfilp);
 			break;
 		default:
-			ops[i].am_error = EINVAL;
+			ops[i].am_error = -EINVAL;
 		}
 	}
 
 	if (copy_to_user(am_hreq.ops, ops, size))
-		error = XFS_ERROR(EFAULT);
+		error = -EFAULT;
 
 	kfree(attr_name);
  out_kfree_ops:
 	kfree(ops);
  out_dput:
 	dput(dentry);
-	return -error;
+	return error;
 }
 
 int
@@ -625,16 +622,16 @@ xfs_ioc_space(
 	 */
 	if (!xfs_sb_version_hasextflgbit(&ip->i_mount->m_sb) &&
 	    !capable(CAP_SYS_ADMIN))
-		return -XFS_ERROR(EPERM);
+		return -EPERM;
 
 	if (inode->i_flags & (S_IMMUTABLE|S_APPEND))
-		return -XFS_ERROR(EPERM);
+		return -EPERM;
 
 	if (!(filp->f_mode & FMODE_WRITE))
-		return -XFS_ERROR(EBADF);
+		return -EBADF;
 
 	if (!S_ISREG(inode->i_mode))
-		return -XFS_ERROR(EINVAL);
+		return -EINVAL;
 
 	error = mnt_want_write_file(filp);
 	if (error)
@@ -652,7 +649,7 @@ xfs_ioc_space(
 		bf->l_start += XFS_ISIZE(ip);
 		break;
 	default:
-		error = XFS_ERROR(EINVAL);
+		error = -EINVAL;
 		goto out_unlock;
 	}
 
@@ -669,7 +666,7 @@ xfs_ioc_space(
 	case XFS_IOC_UNRESVSP:
 	case XFS_IOC_UNRESVSP64:
 		if (bf->l_len <= 0) {
-			error = XFS_ERROR(EINVAL);
+			error = -EINVAL;
 			goto out_unlock;
 		}
 		break;
@@ -682,7 +679,7 @@ xfs_ioc_space(
 	    bf->l_start > mp->m_super->s_maxbytes ||
 	    bf->l_start + bf->l_len < 0 ||
 	    bf->l_start + bf->l_len >= mp->m_super->s_maxbytes) {
-		error = XFS_ERROR(EINVAL);
+		error = -EINVAL;
 		goto out_unlock;
 	}
 
@@ -723,7 +720,7 @@ xfs_ioc_space(
 		break;
 	default:
 		ASSERT(0);
-		error = XFS_ERROR(EINVAL);
+		error = -EINVAL;
 	}
 
 	if (error)
@@ -739,7 +736,7 @@ xfs_ioc_space(
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
 	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
 
-	if (!(ioflags & IO_INVIS)) {
+	if (!(ioflags & XFS_IO_INVIS)) {
 		ip->i_d.di_mode &= ~S_ISUID;
 		if (ip->i_d.di_mode & S_IXGRP)
 			ip->i_d.di_mode &= ~S_ISGID;
@@ -759,7 +756,7 @@ xfs_ioc_space(
 out_unlock:
 	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
 	mnt_drop_write_file(filp);
-	return -error;
+	return error;
 }
 
 STATIC int
@@ -781,41 +778,41 @@ xfs_ioc_bulkstat(
 		return -EPERM;
 
 	if (XFS_FORCED_SHUTDOWN(mp))
-		return -XFS_ERROR(EIO);
+		return -EIO;
 
 	if (copy_from_user(&bulkreq, arg, sizeof(xfs_fsop_bulkreq_t)))
-		return -XFS_ERROR(EFAULT);
+		return -EFAULT;
 
 	if (copy_from_user(&inlast, bulkreq.lastip, sizeof(__s64)))
-		return -XFS_ERROR(EFAULT);
+		return -EFAULT;
 
 	if ((count = bulkreq.icount) <= 0)
-		return -XFS_ERROR(EINVAL);
+		return -EINVAL;
 
 	if (bulkreq.ubuffer == NULL)
-		return -XFS_ERROR(EINVAL);
+		return -EINVAL;
 
 	if (cmd == XFS_IOC_FSINUMBERS)
 		error = xfs_inumbers(mp, &inlast, &count,
 					bulkreq.ubuffer, xfs_inumbers_fmt);
 	else if (cmd == XFS_IOC_FSBULKSTAT_SINGLE)
-		error = xfs_bulkstat_single(mp, &inlast,
-						bulkreq.ubuffer, &done);
+		error = xfs_bulkstat_one(mp, inlast, bulkreq.ubuffer,
+					sizeof(xfs_bstat_t), NULL, &done);
 	else	/* XFS_IOC_FSBULKSTAT */
 		error = xfs_bulkstat(mp, &inlast, &count, xfs_bulkstat_one,
 				     sizeof(xfs_bstat_t), bulkreq.ubuffer,
 				     &done);
 
 	if (error)
-		return -error;
+		return error;
 
 	if (bulkreq.ocount != NULL) {
 		if (copy_to_user(bulkreq.lastip, &inlast,
 						sizeof(xfs_ino_t)))
-			return -XFS_ERROR(EFAULT);
+			return -EFAULT;
 
 		if (copy_to_user(bulkreq.ocount, &count, sizeof(count)))
-			return -XFS_ERROR(EFAULT);
+			return -EFAULT;
 	}
 
 	return 0;
@@ -831,7 +828,7 @@ xfs_ioc_fsgeometry_v1(
 
 	error = xfs_fs_geometry(mp, &fsgeo, 3);
 	if (error)
-		return -error;
+		return error;
 
 	/*
 	 * Caller should have passed an argument of type
@@ -839,7 +836,7 @@ xfs_ioc_fsgeometry_v1(
 	 * xfs_fsop_geom_t that xfs_fs_geometry() fills in.
 	 */
 	if (copy_to_user(arg, &fsgeo, sizeof(xfs_fsop_geom_v1_t)))
-		return -XFS_ERROR(EFAULT);
+		return -EFAULT;
 	return 0;
 }
 
@@ -853,10 +850,10 @@ xfs_ioc_fsgeometry(
 
 	error = xfs_fs_geometry(mp, &fsgeo, 4);
 	if (error)
-		return -error;
+		return error;
 
 	if (copy_to_user(arg, &fsgeo, sizeof(fsgeo)))
-		return -XFS_ERROR(EFAULT);
+		return -EFAULT;
 	return 0;
 }
 
@@ -1041,16 +1038,16 @@ xfs_ioctl_setattr(
 	trace_xfs_ioctl_setattr(ip);
 
 	if (mp->m_flags & XFS_MOUNT_RDONLY)
-		return XFS_ERROR(EROFS);
+		return -EROFS;
 	if (XFS_FORCED_SHUTDOWN(mp))
-		return XFS_ERROR(EIO);
+		return -EIO;
 
 	/*
 	 * Disallow 32bit project ids when projid32bit feature is not enabled.
 	 */
 	if ((mask & FSX_PROJID) && (fa->fsx_projid > (__uint16_t)-1) &&
 			!xfs_sb_version_hasprojid32bit(&ip->i_mount->m_sb))
-		return XFS_ERROR(EINVAL);
+		return -EINVAL;
 
 	/*
 	 * If disk quotas is on, we make sure that the dquots do exist on disk,
@@ -1088,7 +1085,7 @@ xfs_ioctl_setattr(
 	 * CAP_FSETID capability is applicable.
 	 */
 	if (!inode_owner_or_capable(VFS_I(ip))) {
-		code = XFS_ERROR(EPERM);
+		code = -EPERM;
 		goto error_return;
 	}
 
@@ -1099,7 +1096,7 @@ xfs_ioctl_setattr(
 	 */
 	if (mask & FSX_PROJID) {
 		if (current_user_ns() != &init_user_ns) {
-			code = XFS_ERROR(EINVAL);
+			code = -EINVAL;
 			goto error_return;
 		}
 
@@ -1122,7 +1119,7 @@ xfs_ioctl_setattr(
 		if (ip->i_d.di_nextents &&
 		    ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) !=
 		     fa->fsx_extsize)) {
-			code = XFS_ERROR(EINVAL);	/* EFBIG? */
+			code = -EINVAL;	/* EFBIG? */
 			goto error_return;
 		}
 
@@ -1141,7 +1138,7 @@ xfs_ioctl_setattr(
 
 			extsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_extsize);
 			if (extsize_fsb > MAXEXTLEN) {
-				code = XFS_ERROR(EINVAL);
+				code = -EINVAL;
 				goto error_return;
 			}
 
@@ -1153,13 +1150,13 @@ xfs_ioctl_setattr(
 			} else {
 				size = mp->m_sb.sb_blocksize;
 				if (extsize_fsb > mp->m_sb.sb_agblocks / 2) {
-					code = XFS_ERROR(EINVAL);
+					code = -EINVAL;
 					goto error_return;
 				}
 			}
 
 			if (fa->fsx_extsize % size) {
-				code = XFS_ERROR(EINVAL);
+				code = -EINVAL;
 				goto error_return;
 			}
 		}
@@ -1173,7 +1170,7 @@ xfs_ioctl_setattr(
 		if ((ip->i_d.di_nextents || ip->i_delayed_blks) &&
 		    (XFS_IS_REALTIME_INODE(ip)) !=
 		    (fa->fsx_xflags & XFS_XFLAG_REALTIME)) {
-			code = XFS_ERROR(EINVAL);	/* EFBIG? */
+			code = -EINVAL;	/* EFBIG? */
 			goto error_return;
 		}
 
@@ -1184,7 +1181,7 @@ xfs_ioctl_setattr(
 			if ((mp->m_sb.sb_rblocks == 0) ||
 			    (mp->m_sb.sb_rextsize == 0) ||
 			    (ip->i_d.di_extsize % mp->m_sb.sb_rextsize)) {
-				code = XFS_ERROR(EINVAL);
+				code = -EINVAL;
 				goto error_return;
 			}
 		}
@@ -1198,7 +1195,7 @@ xfs_ioctl_setattr(
 		     (fa->fsx_xflags &
 				(XFS_XFLAG_IMMUTABLE | XFS_XFLAG_APPEND))) &&
 		    !capable(CAP_LINUX_IMMUTABLE)) {
-			code = XFS_ERROR(EPERM);
+			code = -EPERM;
 			goto error_return;
 		}
 	}
@@ -1301,7 +1298,7 @@ xfs_ioc_fssetxattr(
 		return error;
 	error = xfs_ioctl_setattr(ip, &fa, mask);
 	mnt_drop_write_file(filp);
-	return -error;
+	return error;
 }
 
 STATIC int
@@ -1346,7 +1343,7 @@ xfs_ioc_setxflags(
 		return error;
 	error = xfs_ioctl_setattr(ip, &fa, mask);
 	mnt_drop_write_file(filp);
-	return -error;
+	return error;
 }
 
 STATIC int
@@ -1356,7 +1353,7 @@ xfs_getbmap_format(void **ap, struct getbmapx *bmv, int *full)
 
 	/* copy only getbmap portion (not getbmapx) */
 	if (copy_to_user(base, bmv, sizeof(struct getbmap)))
-		return XFS_ERROR(EFAULT);
+		return -EFAULT;
 
 	*ap += sizeof(struct getbmap);
 	return 0;
@@ -1373,23 +1370,23 @@ xfs_ioc_getbmap(
 	int			error;
 
 	if (copy_from_user(&bmx, arg, sizeof(struct getbmapx)))
-		return -XFS_ERROR(EFAULT);
+		return -EFAULT;
 
 	if (bmx.bmv_count < 2)
-		return -XFS_ERROR(EINVAL);
+		return -EINVAL;
 
 	bmx.bmv_iflags = (cmd == XFS_IOC_GETBMAPA ? BMV_IF_ATTRFORK : 0);
-	if (ioflags & IO_INVIS)
+	if (ioflags & XFS_IO_INVIS)
 		bmx.bmv_iflags |= BMV_IF_NO_DMAPI_READ;
 
 	error = xfs_getbmap(ip, &bmx, xfs_getbmap_format,
 			    (struct getbmap *)arg+1);
 	if (error)
-		return -error;
+		return error;
 
 	/* copy back header - only size of getbmap */
 	if (copy_to_user(arg, &bmx, sizeof(struct getbmap)))
-		return -XFS_ERROR(EFAULT);
+		return -EFAULT;
 	return 0;
 }
 
@@ -1399,7 +1396,7 @@ xfs_getbmapx_format(void **ap, struct getbmapx *bmv, int *full)
 	struct getbmapx __user	*base = *ap;
 
 	if (copy_to_user(base, bmv, sizeof(struct getbmapx)))
-		return XFS_ERROR(EFAULT);
+		return -EFAULT;
 
 	*ap += sizeof(struct getbmapx);
 	return 0;
@@ -1414,22 +1411,22 @@ xfs_ioc_getbmapx(
 	int			error;
 
 	if (copy_from_user(&bmx, arg, sizeof(bmx)))
-		return -XFS_ERROR(EFAULT);
+		return -EFAULT;
 
 	if (bmx.bmv_count < 2)
-		return -XFS_ERROR(EINVAL);
+		return -EINVAL;
 
 	if (bmx.bmv_iflags & (~BMV_IF_VALID))
-		return -XFS_ERROR(EINVAL);
+		return -EINVAL;
 
 	error = xfs_getbmap(ip, &bmx, xfs_getbmapx_format,
 			    (struct getbmapx *)arg+1);
 	if (error)
-		return -error;
+		return error;
 
 	/* copy back header */
 	if (copy_to_user(arg, &bmx, sizeof(struct getbmapx)))
-		return -XFS_ERROR(EFAULT);
+		return -EFAULT;
 
 	return 0;
 }
@@ -1445,33 +1442,33 @@ xfs_ioc_swapext(
 	/* Pull information for the target fd */
 	f = fdget((int)sxp->sx_fdtarget);
 	if (!f.file) {
-		error = XFS_ERROR(EINVAL);
+		error = -EINVAL;
 		goto out;
 	}
 
 	if (!(f.file->f_mode & FMODE_WRITE) ||
 	    !(f.file->f_mode & FMODE_READ) ||
 	    (f.file->f_flags & O_APPEND)) {
-		error = XFS_ERROR(EBADF);
+		error = -EBADF;
 		goto out_put_file;
 	}
 
 	tmp = fdget((int)sxp->sx_fdtmp);
 	if (!tmp.file) {
-		error = XFS_ERROR(EINVAL);
+		error = -EINVAL;
 		goto out_put_file;
 	}
 
 	if (!(tmp.file->f_mode & FMODE_WRITE) ||
 	    !(tmp.file->f_mode & FMODE_READ) ||
 	    (tmp.file->f_flags & O_APPEND)) {
-		error = XFS_ERROR(EBADF);
+		error = -EBADF;
 		goto out_put_tmp_file;
 	}
 
 	if (IS_SWAPFILE(file_inode(f.file)) ||
 	    IS_SWAPFILE(file_inode(tmp.file))) {
-		error = XFS_ERROR(EINVAL);
+		error = -EINVAL;
 		goto out_put_tmp_file;
 	}
 
@@ -1479,17 +1476,17 @@ xfs_ioc_swapext(
 	tip = XFS_I(file_inode(tmp.file));
 
 	if (ip->i_mount != tip->i_mount) {
-		error = XFS_ERROR(EINVAL);
+		error = -EINVAL;
 		goto out_put_tmp_file;
 	}
 
 	if (ip->i_ino == tip->i_ino) {
-		error = XFS_ERROR(EINVAL);
+		error = -EINVAL;
 		goto out_put_tmp_file;
 	}
 
 	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
-		error = XFS_ERROR(EIO);
+		error = -EIO;
 		goto out_put_tmp_file;
 	}
 
@@ -1523,7 +1520,7 @@ xfs_file_ioctl(
 	int			error;
 
 	if (filp->f_mode & FMODE_NOCMTIME)
-		ioflags |= IO_INVIS;
+		ioflags |= XFS_IO_INVIS;
 
 	trace_xfs_file_ioctl(ip);
 
@@ -1542,7 +1539,7 @@ xfs_file_ioctl(
 		xfs_flock64_t		bf;
 
 		if (copy_from_user(&bf, arg, sizeof(bf)))
-			return -XFS_ERROR(EFAULT);
+			return -EFAULT;
 		return xfs_ioc_space(ip, inode, filp, ioflags, cmd, &bf);
 	}
 	case XFS_IOC_DIOINFO: {
@@ -1555,7 +1552,7 @@ xfs_file_ioctl(
 		da.d_maxiosz = INT_MAX & ~(da.d_miniosz - 1);
 
 		if (copy_to_user(arg, &da, sizeof(da)))
-			return -XFS_ERROR(EFAULT);
+			return -EFAULT;
 		return 0;
 	}
 
@@ -1588,7 +1585,7 @@ xfs_file_ioctl(
 		struct fsdmidata	dmi;
 
 		if (copy_from_user(&dmi, arg, sizeof(dmi)))
-			return -XFS_ERROR(EFAULT);
+			return -EFAULT;
 
 		error = mnt_want_write_file(filp);
 		if (error)
@@ -1597,7 +1594,7 @@ xfs_file_ioctl(
 		error = xfs_set_dmattrs(ip, dmi.fsd_dmevmask,
 				dmi.fsd_dmstate);
 		mnt_drop_write_file(filp);
-		return -error;
+		return error;
 	}
 
 	case XFS_IOC_GETBMAP:
@@ -1613,14 +1610,14 @@ xfs_file_ioctl(
 		xfs_fsop_handlereq_t	hreq;
 
 		if (copy_from_user(&hreq, arg, sizeof(hreq)))
-			return -XFS_ERROR(EFAULT);
+			return -EFAULT;
 		return xfs_find_handle(cmd, &hreq);
 	}
 	case XFS_IOC_OPEN_BY_HANDLE: {
 		xfs_fsop_handlereq_t	hreq;
 
 		if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t)))
-			return -XFS_ERROR(EFAULT);
+			return -EFAULT;
 		return xfs_open_by_handle(filp, &hreq);
 	}
 	case XFS_IOC_FSSETDM_BY_HANDLE:
@@ -1630,7 +1627,7 @@ xfs_file_ioctl(
 		xfs_fsop_handlereq_t	hreq;
 
 		if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t)))
-			return -XFS_ERROR(EFAULT);
+			return -EFAULT;
 		return xfs_readlink_by_handle(filp, &hreq);
 	}
 	case XFS_IOC_ATTRLIST_BY_HANDLE:
@@ -1643,13 +1640,13 @@ xfs_file_ioctl(
 		struct xfs_swapext	sxp;
 
 		if (copy_from_user(&sxp, arg, sizeof(xfs_swapext_t)))
-			return -XFS_ERROR(EFAULT);
+			return -EFAULT;
 		error = mnt_want_write_file(filp);
 		if (error)
 			return error;
 		error = xfs_ioc_swapext(&sxp);
 		mnt_drop_write_file(filp);
-		return -error;
+		return error;
 	}
 
 	case XFS_IOC_FSCOUNTS: {
@@ -1657,10 +1654,10 @@ xfs_file_ioctl(
 
 		error = xfs_fs_counts(mp, &out);
 		if (error)
-			return -error;
+			return error;
 
 		if (copy_to_user(arg, &out, sizeof(out)))
-			return -XFS_ERROR(EFAULT);
+			return -EFAULT;
 		return 0;
 	}
 
@@ -1672,10 +1669,10 @@ xfs_file_ioctl(
 			return -EPERM;
 
 		if (mp->m_flags & XFS_MOUNT_RDONLY)
-			return -XFS_ERROR(EROFS);
+			return -EROFS;
 
 		if (copy_from_user(&inout, arg, sizeof(inout)))
-			return -XFS_ERROR(EFAULT);
+			return -EFAULT;
 
 		error = mnt_want_write_file(filp);
 		if (error)
@@ -1686,10 +1683,10 @@ xfs_file_ioctl(
 		error = xfs_reserve_blocks(mp, &in, &inout);
 		mnt_drop_write_file(filp);
 		if (error)
-			return -error;
+			return error;
 
 		if (copy_to_user(arg, &inout, sizeof(inout)))
-			return -XFS_ERROR(EFAULT);
+			return -EFAULT;
 		return 0;
 	}
 
@@ -1701,10 +1698,10 @@ xfs_file_ioctl(
 
 		error = xfs_reserve_blocks(mp, NULL, &out);
 		if (error)
-			return -error;
+			return error;
 
 		if (copy_to_user(arg, &out, sizeof(out)))
-			return -XFS_ERROR(EFAULT);
+			return -EFAULT;
 
 		return 0;
 	}
@@ -1713,42 +1710,42 @@ xfs_file_ioctl(
 		xfs_growfs_data_t in;
 
 		if (copy_from_user(&in, arg, sizeof(in)))
-			return -XFS_ERROR(EFAULT);
+			return -EFAULT;
 
 		error = mnt_want_write_file(filp);
 		if (error)
 			return error;
 		error = xfs_growfs_data(mp, &in);
 		mnt_drop_write_file(filp);
-		return -error;
+		return error;
 	}
 
 	case XFS_IOC_FSGROWFSLOG: {
 		xfs_growfs_log_t in;
 
 		if (copy_from_user(&in, arg, sizeof(in)))
-			return -XFS_ERROR(EFAULT);
+			return -EFAULT;
 
 		error = mnt_want_write_file(filp);
 		if (error)
 			return error;
 		error = xfs_growfs_log(mp, &in);
 		mnt_drop_write_file(filp);
-		return -error;
+		return error;
 	}
 
 	case XFS_IOC_FSGROWFSRT: {
 		xfs_growfs_rt_t in;
 
 		if (copy_from_user(&in, arg, sizeof(in)))
-			return -XFS_ERROR(EFAULT);
+			return -EFAULT;
 
 		error = mnt_want_write_file(filp);
 		if (error)
 			return error;
 		error = xfs_growfs_rt(mp, &in);
 		mnt_drop_write_file(filp);
-		return -error;
+		return error;
 	}
 
 	case XFS_IOC_GOINGDOWN: {
@@ -1758,10 +1755,9 @@ xfs_file_ioctl(
 			return -EPERM;
 
 		if (get_user(in, (__uint32_t __user *)arg))
-			return -XFS_ERROR(EFAULT);
+			return -EFAULT;
 
-		error = xfs_fs_goingdown(mp, in);
-		return -error;
+		return xfs_fs_goingdown(mp, in);
 	}
 
 	case XFS_IOC_ERROR_INJECTION: {
@@ -1771,18 +1767,16 @@ xfs_file_ioctl(
 			return -EPERM;
 
 		if (copy_from_user(&in, arg, sizeof(in)))
-			return -XFS_ERROR(EFAULT);
+			return -EFAULT;
 
-		error = xfs_errortag_add(in.errtag, mp);
-		return -error;
+		return xfs_errortag_add(in.errtag, mp);
 	}
 
 	case XFS_IOC_ERROR_CLEARALL:
 		if (!capable(CAP_SYS_ADMIN))
 			return -EPERM;
 
-		error = xfs_errortag_clearall(mp, 1);
-		return -error;
+		return xfs_errortag_clearall(mp, 1);
 
 	case XFS_IOC_FREE_EOFBLOCKS: {
 		struct xfs_fs_eofblocks eofb;
@@ -1792,16 +1786,16 @@ xfs_file_ioctl(
 			return -EPERM;
 
 		if (mp->m_flags & XFS_MOUNT_RDONLY)
-			return -XFS_ERROR(EROFS);
+			return -EROFS;
 
 		if (copy_from_user(&eofb, arg, sizeof(eofb)))
-			return -XFS_ERROR(EFAULT);
+			return -EFAULT;
 
 		error = xfs_fs_eofblocks_from_user(&eofb, &keofb);
 		if (error)
-			return -error;
+			return error;
 
-		return -xfs_icache_free_eofblocks(mp, &keofb);
+		return xfs_icache_free_eofblocks(mp, &keofb);
 	}
 
 	default:
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index 944d5baa710a..a554646ff141 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -28,7 +28,6 @@
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
-#include "xfs_vnode.h"
 #include "xfs_inode.h"
 #include "xfs_itable.h"
 #include "xfs_error.h"
@@ -56,7 +55,7 @@ xfs_compat_flock64_copyin(
 	    get_user(bf->l_sysid,	&arg32->l_sysid) ||
 	    get_user(bf->l_pid,		&arg32->l_pid) ||
 	    copy_from_user(bf->l_pad,	&arg32->l_pad,	4*sizeof(u32)))
-		return -XFS_ERROR(EFAULT);
+		return -EFAULT;
 	return 0;
 }
 
@@ -70,10 +69,10 @@ xfs_compat_ioc_fsgeometry_v1(
 
 	error = xfs_fs_geometry(mp, &fsgeo, 3);
 	if (error)
-		return -error;
+		return error;
 	/* The 32-bit variant simply has some padding at the end */
 	if (copy_to_user(arg32, &fsgeo, sizeof(struct compat_xfs_fsop_geom_v1)))
-		return -XFS_ERROR(EFAULT);
+		return -EFAULT;
 	return 0;
 }
 
@@ -84,7 +83,7 @@ xfs_compat_growfs_data_copyin(
 {
 	if (get_user(in->newblocks, &arg32->newblocks) ||
 	    get_user(in->imaxpct,   &arg32->imaxpct))
-		return -XFS_ERROR(EFAULT);
+		return -EFAULT;
 	return 0;
 }
 
@@ -95,14 +94,14 @@ xfs_compat_growfs_rt_copyin(
 {
 	if (get_user(in->newblocks, &arg32->newblocks) ||
 	    get_user(in->extsize,   &arg32->extsize))
-		return -XFS_ERROR(EFAULT);
+		return -EFAULT;
 	return 0;
 }
 
 STATIC int
 xfs_inumbers_fmt_compat(
 	void			__user *ubuffer,
-	const xfs_inogrp_t	*buffer,
+	const struct xfs_inogrp	*buffer,
 	long			count,
 	long			*written)
 {
@@ -113,7 +112,7 @@ xfs_inumbers_fmt_compat(
 		if (put_user(buffer[i].xi_startino,   &p32[i].xi_startino) ||
 		    put_user(buffer[i].xi_alloccount, &p32[i].xi_alloccount) ||
 		    put_user(buffer[i].xi_allocmask,  &p32[i].xi_allocmask))
-			return -XFS_ERROR(EFAULT);
+			return -EFAULT;
 	}
 	*written = count * sizeof(*p32);
 	return 0;
@@ -132,7 +131,7 @@ xfs_ioctl32_bstime_copyin(
 
 	if (get_user(sec32,		&bstime32->tv_sec)	||
 	    get_user(bstime->tv_nsec,	&bstime32->tv_nsec))
-		return -XFS_ERROR(EFAULT);
+		return -EFAULT;
 	bstime->tv_sec = sec32;
 	return 0;
 }
@@ -164,7 +163,7 @@ xfs_ioctl32_bstat_copyin(
 	    get_user(bstat->bs_dmevmask, &bstat32->bs_dmevmask)	||
 	    get_user(bstat->bs_dmstate,	&bstat32->bs_dmstate)	||
 	    get_user(bstat->bs_aextents, &bstat32->bs_aextents))
-		return -XFS_ERROR(EFAULT);
+		return -EFAULT;
 	return 0;
 }
 
@@ -180,7 +179,7 @@ xfs_bstime_store_compat(
 	sec32 = p->tv_sec;
 	if (put_user(sec32, &p32->tv_sec) ||
 	    put_user(p->tv_nsec, &p32->tv_nsec))
-		return -XFS_ERROR(EFAULT);
+		return -EFAULT;
 	return 0;
 }
 
@@ -195,7 +194,7 @@ xfs_bulkstat_one_fmt_compat(
 	compat_xfs_bstat_t	__user *p32 = ubuffer;
 
 	if (ubsize < sizeof(*p32))
-		return XFS_ERROR(ENOMEM);
+		return -ENOMEM;
 
 	if (put_user(buffer->bs_ino,	  &p32->bs_ino)		||
 	    put_user(buffer->bs_mode,	  &p32->bs_mode)	||
@@ -218,7 +217,7 @@ xfs_bulkstat_one_fmt_compat(
 	    put_user(buffer->bs_dmevmask, &p32->bs_dmevmask)	||
 	    put_user(buffer->bs_dmstate,  &p32->bs_dmstate)	||
 	    put_user(buffer->bs_aextents, &p32->bs_aextents))
-		return XFS_ERROR(EFAULT);
+		return -EFAULT;
 	if (ubused)
 		*ubused = sizeof(*p32);
 	return 0;
@@ -256,30 +255,30 @@ xfs_compat_ioc_bulkstat(
 	/* should be called again (unused here, but used in dmapi) */
 
 	if (!capable(CAP_SYS_ADMIN))
-		return -XFS_ERROR(EPERM);
+		return -EPERM;
 
 	if (XFS_FORCED_SHUTDOWN(mp))
-		return -XFS_ERROR(EIO);
+		return -EIO;
 
 	if (get_user(addr, &p32->lastip))
-		return -XFS_ERROR(EFAULT);
+		return -EFAULT;
 	bulkreq.lastip = compat_ptr(addr);
 	if (get_user(bulkreq.icount, &p32->icount) ||
 	    get_user(addr, &p32->ubuffer))
-		return -XFS_ERROR(EFAULT);
+		return -EFAULT;
 	bulkreq.ubuffer = compat_ptr(addr);
 	if (get_user(addr, &p32->ocount))
-		return -XFS_ERROR(EFAULT);
+		return -EFAULT;
 	bulkreq.ocount = compat_ptr(addr);
 
 	if (copy_from_user(&inlast, bulkreq.lastip, sizeof(__s64)))
-		return -XFS_ERROR(EFAULT);
+		return -EFAULT;
 
 	if ((count = bulkreq.icount) <= 0)
-		return -XFS_ERROR(EINVAL);
+		return -EINVAL;
 
 	if (bulkreq.ubuffer == NULL)
-		return -XFS_ERROR(EINVAL);
+		return -EINVAL;
 
 	if (cmd == XFS_IOC_FSINUMBERS_32) {
 		error = xfs_inumbers(mp, &inlast, &count,
@@ -294,17 +293,17 @@ xfs_compat_ioc_bulkstat(
 			xfs_bulkstat_one_compat, sizeof(compat_xfs_bstat_t),
 			bulkreq.ubuffer, &done);
 	} else
-		error = XFS_ERROR(EINVAL);
+		error = -EINVAL;
 	if (error)
-		return -error;
+		return error;
 
 	if (bulkreq.ocount != NULL) {
 		if (copy_to_user(bulkreq.lastip, &inlast,
 						sizeof(xfs_ino_t)))
-			return -XFS_ERROR(EFAULT);
+			return -EFAULT;
 
 		if (copy_to_user(bulkreq.ocount, &count, sizeof(count)))
-			return -XFS_ERROR(EFAULT);
+			return -EFAULT;
 	}
 
 	return 0;
@@ -318,7 +317,7 @@ xfs_compat_handlereq_copyin(
 	compat_xfs_fsop_handlereq_t	hreq32;
 
 	if (copy_from_user(&hreq32, arg32, sizeof(compat_xfs_fsop_handlereq_t)))
-		return -XFS_ERROR(EFAULT);
+		return -EFAULT;
 
 	hreq->fd = hreq32.fd;
 	hreq->path = compat_ptr(hreq32.path);
@@ -352,19 +351,19 @@ xfs_compat_attrlist_by_handle(
 	char			*kbuf;
 
 	if (!capable(CAP_SYS_ADMIN))
-		return -XFS_ERROR(EPERM);
+		return -EPERM;
 	if (copy_from_user(&al_hreq, arg,
 			   sizeof(compat_xfs_fsop_attrlist_handlereq_t)))
-		return -XFS_ERROR(EFAULT);
+		return -EFAULT;
 	if (al_hreq.buflen < sizeof(struct attrlist) ||
 	    al_hreq.buflen > XATTR_LIST_MAX)
-		return -XFS_ERROR(EINVAL);
+		return -EINVAL;
 
 	/*
 	 * Reject flags, only allow namespaces.
 	 */
 	if (al_hreq.flags & ~(ATTR_ROOT | ATTR_SECURE))
-		return -XFS_ERROR(EINVAL);
+		return -EINVAL;
 
 	dentry = xfs_compat_handlereq_to_dentry(parfilp, &al_hreq.hreq);
 	if (IS_ERR(dentry))
@@ -376,7 +375,7 @@ xfs_compat_attrlist_by_handle(
 		goto out_dput;
 
 	cursor = (attrlist_cursor_kern_t *)&al_hreq.pos;
-	error = -xfs_attr_list(XFS_I(dentry->d_inode), kbuf, al_hreq.buflen,
+	error = xfs_attr_list(XFS_I(dentry->d_inode), kbuf, al_hreq.buflen,
 					al_hreq.flags, cursor);
 	if (error)
 		goto out_kfree;
@@ -404,10 +403,10 @@ xfs_compat_attrmulti_by_handle(
 	unsigned char				*attr_name;
 
 	if (!capable(CAP_SYS_ADMIN))
-		return -XFS_ERROR(EPERM);
+		return -EPERM;
 	if (copy_from_user(&am_hreq, arg,
 			   sizeof(compat_xfs_fsop_attrmulti_handlereq_t)))
-		return -XFS_ERROR(EFAULT);
+		return -EFAULT;
 
 	/* overflow check */
 	if (am_hreq.opcount >= INT_MAX / sizeof(compat_xfs_attr_multiop_t))
@@ -417,7 +416,7 @@ xfs_compat_attrmulti_by_handle(
 	if (IS_ERR(dentry))
 		return PTR_ERR(dentry);
 
-	error = E2BIG;
+	error = -E2BIG;
 	size = am_hreq.opcount * sizeof(compat_xfs_attr_multiop_t);
 	if (!size || size > 16 * PAGE_SIZE)
 		goto out_dput;
@@ -428,7 +427,7 @@ xfs_compat_attrmulti_by_handle(
 		goto out_dput;
 	}
 
-	error = ENOMEM;
+	error = -ENOMEM;
 	attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL);
 	if (!attr_name)
 		goto out_kfree_ops;
@@ -439,7 +438,7 @@ xfs_compat_attrmulti_by_handle(
 				compat_ptr(ops[i].am_attrname),
 				MAXNAMELEN);
 		if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN)
-			error = ERANGE;
+			error = -ERANGE;
 		if (ops[i].am_error < 0)
 			break;
 
@@ -470,19 +469,19 @@ xfs_compat_attrmulti_by_handle(
 			mnt_drop_write_file(parfilp);
 			break;
 		default:
-			ops[i].am_error = EINVAL;
+			ops[i].am_error = -EINVAL;
 		}
 	}
 
 	if (copy_to_user(compat_ptr(am_hreq.ops), ops, size))
-		error = XFS_ERROR(EFAULT);
+		error = -EFAULT;
 
 	kfree(attr_name);
  out_kfree_ops:
 	kfree(ops);
  out_dput:
 	dput(dentry);
-	return -error;
+	return error;
 }
 
 STATIC int
@@ -496,26 +495,26 @@ xfs_compat_fssetdm_by_handle(
 	struct dentry		*dentry;
 
 	if (!capable(CAP_MKNOD))
-		return -XFS_ERROR(EPERM);
+		return -EPERM;
 	if (copy_from_user(&dmhreq, arg,
 			   sizeof(compat_xfs_fsop_setdm_handlereq_t)))
-		return -XFS_ERROR(EFAULT);
+		return -EFAULT;
 
 	dentry = xfs_compat_handlereq_to_dentry(parfilp, &dmhreq.hreq);
 	if (IS_ERR(dentry))
 		return PTR_ERR(dentry);
 
 	if (IS_IMMUTABLE(dentry->d_inode) || IS_APPEND(dentry->d_inode)) {
-		error = -XFS_ERROR(EPERM);
+		error = -EPERM;
 		goto out;
 	}
 
 	if (copy_from_user(&fsd, compat_ptr(dmhreq.data), sizeof(fsd))) {
-		error = -XFS_ERROR(EFAULT);
+		error = -EFAULT;
 		goto out;
 	}
 
-	error = -xfs_set_dmattrs(XFS_I(dentry->d_inode), fsd.fsd_dmevmask,
+	error = xfs_set_dmattrs(XFS_I(dentry->d_inode), fsd.fsd_dmevmask,
 				 fsd.fsd_dmstate);
 
 out:
@@ -537,7 +536,7 @@ xfs_file_compat_ioctl(
 	int			error;
 
 	if (filp->f_mode & FMODE_NOCMTIME)
-		ioflags |= IO_INVIS;
+		ioflags |= XFS_IO_INVIS;
 
 	trace_xfs_file_compat_ioctl(ip);
 
@@ -588,7 +587,7 @@ xfs_file_compat_ioctl(
 		struct xfs_flock64	bf;
 
 		if (xfs_compat_flock64_copyin(&bf, arg))
-			return -XFS_ERROR(EFAULT);
+			return -EFAULT;
 		cmd = _NATIVE_IOC(cmd, struct xfs_flock64);
 		return xfs_ioc_space(ip, inode, filp, ioflags, cmd, &bf);
 	}
@@ -598,25 +597,25 @@ xfs_file_compat_ioctl(
 		struct xfs_growfs_data	in;
 
 		if (xfs_compat_growfs_data_copyin(&in, arg))
-			return -XFS_ERROR(EFAULT);
+			return -EFAULT;
 		error = mnt_want_write_file(filp);
 		if (error)
 			return error;
 		error = xfs_growfs_data(mp, &in);
 		mnt_drop_write_file(filp);
-		return -error;
+		return error;
 	}
 	case XFS_IOC_FSGROWFSRT_32: {
 		struct xfs_growfs_rt	in;
 
 		if (xfs_compat_growfs_rt_copyin(&in, arg))
-			return -XFS_ERROR(EFAULT);
+			return -EFAULT;
 		error = mnt_want_write_file(filp);
 		if (error)
 			return error;
 		error = xfs_growfs_rt(mp, &in);
 		mnt_drop_write_file(filp);
-		return -error;
+		return error;
 	}
 #endif
 	/* long changes size, but xfs only copiese out 32 bits */
@@ -633,13 +632,13 @@ xfs_file_compat_ioctl(
 		if (copy_from_user(&sxp, sxu,
 				   offsetof(struct xfs_swapext, sx_stat)) ||
 		    xfs_ioctl32_bstat_copyin(&sxp.sx_stat, &sxu->sx_stat))
-			return -XFS_ERROR(EFAULT);
+			return -EFAULT;
 		error = mnt_want_write_file(filp);
 		if (error)
 			return error;
 		error = xfs_ioc_swapext(&sxp);
 		mnt_drop_write_file(filp);
-		return -error;
+		return error;
 	}
 	case XFS_IOC_FSBULKSTAT_32:
 	case XFS_IOC_FSBULKSTAT_SINGLE_32:
@@ -651,7 +650,7 @@ xfs_file_compat_ioctl(
 		struct xfs_fsop_handlereq	hreq;
 
 		if (xfs_compat_handlereq_copyin(&hreq, arg))
-			return -XFS_ERROR(EFAULT);
+			return -EFAULT;
 		cmd = _NATIVE_IOC(cmd, struct xfs_fsop_handlereq);
 		return xfs_find_handle(cmd, &hreq);
 	}
@@ -659,14 +658,14 @@ xfs_file_compat_ioctl(
 		struct xfs_fsop_handlereq	hreq;
 
 		if (xfs_compat_handlereq_copyin(&hreq, arg))
-			return -XFS_ERROR(EFAULT);
+			return -EFAULT;
 		return xfs_open_by_handle(filp, &hreq);
 	}
 	case XFS_IOC_READLINK_BY_HANDLE_32: {
 		struct xfs_fsop_handlereq	hreq;
 
 		if (xfs_compat_handlereq_copyin(&hreq, arg))
-			return -XFS_ERROR(EFAULT);
+			return -EFAULT;
 		return xfs_readlink_by_handle(filp, &hreq);
 	}
 	case XFS_IOC_ATTRLIST_BY_HANDLE_32:
@@ -676,6 +675,6 @@ xfs_file_compat_ioctl(
 	case XFS_IOC_FSSETDM_BY_HANDLE_32:
 		return xfs_compat_fssetdm_by_handle(filp, arg);
 	default:
-		return -XFS_ERROR(ENOIOCTLCMD);
+		return -ENOIOCTLCMD;
 	}
 }
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 6d3ec2b6ee29..e9c47b6f5e5a 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -110,7 +110,7 @@ xfs_alert_fsblock_zero(
 		(unsigned long long)imap->br_startoff,
 		(unsigned long long)imap->br_blockcount,
 		imap->br_state);
-	return EFSCORRUPTED;
+	return -EFSCORRUPTED;
 }
 
 int
@@ -138,7 +138,7 @@ xfs_iomap_write_direct(
 
 	error = xfs_qm_dqattach(ip, 0);
 	if (error)
-		return XFS_ERROR(error);
+		return error;
 
 	rt = XFS_IS_REALTIME_INODE(ip);
 	extsz = xfs_get_extsz_hint(ip);
@@ -148,7 +148,7 @@ xfs_iomap_write_direct(
 	if ((offset + count) > XFS_ISIZE(ip)) {
 		error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb);
 		if (error)
-			return XFS_ERROR(error);
+			return error;
 	} else {
 		if (nmaps && (imap->br_startblock == HOLESTARTBLOCK))
 			last_fsb = MIN(last_fsb, (xfs_fileoff_t)
@@ -188,7 +188,7 @@ xfs_iomap_write_direct(
 	 */
 	if (error) {
 		xfs_trans_cancel(tp, 0);
-		return XFS_ERROR(error);
+		return error;
 	}
 
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -225,7 +225,7 @@ xfs_iomap_write_direct(
 	 * Copy any maps to caller's array and return any error.
 	 */
 	if (nimaps == 0) {
-		error = XFS_ERROR(ENOSPC);
+		error = -ENOSPC;
 		goto out_unlock;
 	}
 
@@ -397,7 +397,8 @@ xfs_quota_calc_throttle(
 	struct xfs_inode *ip,
 	int type,
 	xfs_fsblock_t *qblocks,
-	int *qshift)
+	int *qshift,
+	int64_t	*qfreesp)
 {
 	int64_t freesp;
 	int shift = 0;
@@ -406,6 +407,7 @@ xfs_quota_calc_throttle(
 	/* over hi wmark, squash the prealloc completely */
 	if (dq->q_res_bcount >= dq->q_prealloc_hi_wmark) {
 		*qblocks = 0;
+		*qfreesp = 0;
 		return;
 	}
 
@@ -418,6 +420,9 @@ xfs_quota_calc_throttle(
 			shift += 2;
 	}
 
+	if (freesp < *qfreesp)
+		*qfreesp = freesp;
+
 	/* only overwrite the throttle values if we are more aggressive */
 	if ((freesp >> shift) < (*qblocks >> *qshift)) {
 		*qblocks = freesp;
@@ -476,15 +481,18 @@ xfs_iomap_prealloc_size(
 	}
 
 	/*
-	 * Check each quota to cap the prealloc size and provide a shift
-	 * value to throttle with.
+	 * Check each quota to cap the prealloc size, provide a shift value to
+	 * throttle with and adjust amount of available space.
 	 */
 	if (xfs_quota_need_throttle(ip, XFS_DQ_USER, alloc_blocks))
-		xfs_quota_calc_throttle(ip, XFS_DQ_USER, &qblocks, &qshift);
+		xfs_quota_calc_throttle(ip, XFS_DQ_USER, &qblocks, &qshift,
+					&freesp);
 	if (xfs_quota_need_throttle(ip, XFS_DQ_GROUP, alloc_blocks))
-		xfs_quota_calc_throttle(ip, XFS_DQ_GROUP, &qblocks, &qshift);
+		xfs_quota_calc_throttle(ip, XFS_DQ_GROUP, &qblocks, &qshift,
+					&freesp);
 	if (xfs_quota_need_throttle(ip, XFS_DQ_PROJ, alloc_blocks))
-		xfs_quota_calc_throttle(ip, XFS_DQ_PROJ, &qblocks, &qshift);
+		xfs_quota_calc_throttle(ip, XFS_DQ_PROJ, &qblocks, &qshift,
+					&freesp);
 
 	/*
 	 * The final prealloc size is set to the minimum of free space available
@@ -552,7 +560,7 @@ xfs_iomap_write_delay(
 	 */
 	error = xfs_qm_dqattach_locked(ip, 0);
 	if (error)
-		return XFS_ERROR(error);
+		return error;
 
 	extsz = xfs_get_extsz_hint(ip);
 	offset_fsb = XFS_B_TO_FSBT(mp, offset);
@@ -596,11 +604,11 @@ retry:
 				imap, &nimaps, XFS_BMAPI_ENTIRE);
 	switch (error) {
 	case 0:
-	case ENOSPC:
-	case EDQUOT:
+	case -ENOSPC:
+	case -EDQUOT:
 		break;
 	default:
-		return XFS_ERROR(error);
+		return error;
 	}
 
 	/*
@@ -614,7 +622,7 @@ retry:
 			error = 0;
 			goto retry;
 		}
-		return XFS_ERROR(error ? error : ENOSPC);
+		return error ? error : -ENOSPC;
 	}
 
 	if (!(imap[0].br_startblock || XFS_IS_REALTIME_INODE(ip)))
@@ -663,7 +671,7 @@ xfs_iomap_write_allocate(
 	 */
 	error = xfs_qm_dqattach(ip, 0);
 	if (error)
-		return XFS_ERROR(error);
+		return error;
 
 	offset_fsb = XFS_B_TO_FSBT(mp, offset);
 	count_fsb = imap->br_blockcount;
@@ -690,7 +698,7 @@ xfs_iomap_write_allocate(
 						  nres, 0);
 			if (error) {
 				xfs_trans_cancel(tp, 0);
-				return XFS_ERROR(error);
+				return error;
 			}
 			xfs_ilock(ip, XFS_ILOCK_EXCL);
 			xfs_trans_ijoin(tp, ip, 0);
@@ -739,7 +747,7 @@ xfs_iomap_write_allocate(
 			if ((map_start_fsb + count_fsb) > last_block) {
 				count_fsb = last_block - map_start_fsb;
 				if (count_fsb == 0) {
-					error = EAGAIN;
+					error = -EAGAIN;
 					goto trans_cancel;
 				}
 			}
@@ -793,7 +801,7 @@ trans_cancel:
 	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
 error0:
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
-	return XFS_ERROR(error);
+	return error;
 }
 
 int
@@ -853,7 +861,7 @@ xfs_iomap_write_unwritten(
 					  resblks, 0);
 		if (error) {
 			xfs_trans_cancel(tp, 0);
-			return XFS_ERROR(error);
+			return error;
 		}
 
 		xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -892,7 +900,7 @@ xfs_iomap_write_unwritten(
 		error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
 		xfs_iunlock(ip, XFS_ILOCK_EXCL);
 		if (error)
-			return XFS_ERROR(error);
+			return error;
 
 		if (!(imap.br_startblock || XFS_IS_REALTIME_INODE(ip)))
 			return xfs_alert_fsblock_zero(ip, &imap);
@@ -915,5 +923,5 @@ error_on_bmapi_transaction:
 	xfs_bmap_cancel(&free_list);
 	xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT));
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
-	return XFS_ERROR(error);
+	return error;
 }
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 205613a06068..72129493e9d3 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -72,7 +72,7 @@ xfs_initxattrs(
 	int			error = 0;
 
 	for (xattr = xattr_array; xattr->name != NULL; xattr++) {
-		error = -xfs_attr_set(ip, xattr->name, xattr->value,
+		error = xfs_attr_set(ip, xattr->name, xattr->value,
 				      xattr->value_len, ATTR_SECURE);
 		if (error < 0)
 			break;
@@ -93,7 +93,7 @@ xfs_init_security(
 	struct inode	*dir,
 	const struct qstr *qstr)
 {
-	return -security_inode_init_security(inode, dir, qstr,
+	return security_inode_init_security(inode, dir, qstr,
 					     &xfs_initxattrs, NULL);
 }
 
@@ -173,12 +173,12 @@ xfs_generic_create(
 
 #ifdef CONFIG_XFS_POSIX_ACL
 	if (default_acl) {
-		error = -xfs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
+		error = xfs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
 		if (error)
 			goto out_cleanup_inode;
 	}
 	if (acl) {
-		error = -xfs_set_acl(inode, acl, ACL_TYPE_ACCESS);
+		error = xfs_set_acl(inode, acl, ACL_TYPE_ACCESS);
 		if (error)
 			goto out_cleanup_inode;
 	}
@@ -194,7 +194,7 @@ xfs_generic_create(
 		posix_acl_release(default_acl);
 	if (acl)
 		posix_acl_release(acl);
-	return -error;
+	return error;
 
  out_cleanup_inode:
 	if (!tmpfile)
@@ -248,8 +248,8 @@ xfs_vn_lookup(
 	xfs_dentry_to_name(&name, dentry, 0);
 	error = xfs_lookup(XFS_I(dir), &name, &cip, NULL);
 	if (unlikely(error)) {
-		if (unlikely(error != ENOENT))
-			return ERR_PTR(-error);
+		if (unlikely(error != -ENOENT))
+			return ERR_PTR(error);
 		d_add(dentry, NULL);
 		return NULL;
 	}
@@ -275,8 +275,8 @@ xfs_vn_ci_lookup(
 	xfs_dentry_to_name(&xname, dentry, 0);
 	error = xfs_lookup(XFS_I(dir), &xname, &ip, &ci_name);
 	if (unlikely(error)) {
-		if (unlikely(error != ENOENT))
-			return ERR_PTR(-error);
+		if (unlikely(error != -ENOENT))
+			return ERR_PTR(error);
 		/*
 		 * call d_add(dentry, NULL) here when d_drop_negative_children
 		 * is called in xfs_vn_mknod (ie. allow negative dentries
@@ -311,7 +311,7 @@ xfs_vn_link(
 
 	error = xfs_link(XFS_I(dir), XFS_I(inode), &name);
 	if (unlikely(error))
-		return -error;
+		return error;
 
 	ihold(inode);
 	d_instantiate(dentry, inode);
@@ -328,7 +328,7 @@ xfs_vn_unlink(
 
 	xfs_dentry_to_name(&name, dentry, 0);
 
-	error = -xfs_remove(XFS_I(dir), &name, XFS_I(dentry->d_inode));
+	error = xfs_remove(XFS_I(dir), &name, XFS_I(dentry->d_inode));
 	if (error)
 		return error;
 
@@ -375,7 +375,7 @@ xfs_vn_symlink(
 	xfs_cleanup_inode(dir, inode, dentry);
 	iput(inode);
  out:
-	return -error;
+	return error;
 }
 
 STATIC int
@@ -392,8 +392,8 @@ xfs_vn_rename(
 	xfs_dentry_to_name(&oname, odentry, 0);
 	xfs_dentry_to_name(&nname, ndentry, odentry->d_inode->i_mode);
 
-	return -xfs_rename(XFS_I(odir), &oname, XFS_I(odentry->d_inode),
-			   XFS_I(ndir), &nname, new_inode ?
+	return xfs_rename(XFS_I(odir), &oname, XFS_I(odentry->d_inode),
+			  XFS_I(ndir), &nname, new_inode ?
 						XFS_I(new_inode) : NULL);
 }
 
@@ -414,7 +414,7 @@ xfs_vn_follow_link(
 	if (!link)
 		goto out_err;
 
-	error = -xfs_readlink(XFS_I(dentry->d_inode), link);
+	error = xfs_readlink(XFS_I(dentry->d_inode), link);
 	if (unlikely(error))
 		goto out_kfree;
 
@@ -441,7 +441,7 @@ xfs_vn_getattr(
 	trace_xfs_getattr(ip);
 
 	if (XFS_FORCED_SHUTDOWN(mp))
-		return -XFS_ERROR(EIO);
+		return -EIO;
 
 	stat->size = XFS_ISIZE(ip);
 	stat->dev = inode->i_sb->s_dev;
@@ -546,14 +546,14 @@ xfs_setattr_nonsize(
 	/* If acls are being inherited, we already have this checked */
 	if (!(flags & XFS_ATTR_NOACL)) {
 		if (mp->m_flags & XFS_MOUNT_RDONLY)
-			return XFS_ERROR(EROFS);
+			return -EROFS;
 
 		if (XFS_FORCED_SHUTDOWN(mp))
-			return XFS_ERROR(EIO);
+			return -EIO;
 
-		error = -inode_change_ok(inode, iattr);
+		error = inode_change_ok(inode, iattr);
 		if (error)
-			return XFS_ERROR(error);
+			return error;
 	}
 
 	ASSERT((mask & ATTR_SIZE) == 0);
@@ -703,7 +703,7 @@ xfs_setattr_nonsize(
 	xfs_qm_dqrele(gdqp);
 
 	if (error)
-		return XFS_ERROR(error);
+		return error;
 
 	/*
 	 * XXX(hch): Updating the ACL entries is not atomic vs the i_mode
@@ -713,9 +713,9 @@ xfs_setattr_nonsize(
 	 * 	     Posix ACL code seems to care about this issue either.
 	 */
 	if ((mask & ATTR_MODE) && !(flags & XFS_ATTR_NOACL)) {
-		error = -posix_acl_chmod(inode, inode->i_mode);
+		error = posix_acl_chmod(inode, inode->i_mode);
 		if (error)
-			return XFS_ERROR(error);
+			return error;
 	}
 
 	return 0;
@@ -748,14 +748,14 @@ xfs_setattr_size(
 	trace_xfs_setattr(ip);
 
 	if (mp->m_flags & XFS_MOUNT_RDONLY)
-		return XFS_ERROR(EROFS);
+		return -EROFS;
 
 	if (XFS_FORCED_SHUTDOWN(mp))
-		return XFS_ERROR(EIO);
+		return -EIO;
 
-	error = -inode_change_ok(inode, iattr);
+	error = inode_change_ok(inode, iattr);
 	if (error)
-		return XFS_ERROR(error);
+		return error;
 
 	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
 	ASSERT(S_ISREG(ip->i_d.di_mode));
@@ -818,7 +818,7 @@ xfs_setattr_size(
 	 * care about here.
 	 */
 	if (oldsize != ip->i_d.di_size && newsize > ip->i_d.di_size) {
-		error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
+		error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
 						      ip->i_d.di_size, newsize);
 		if (error)
 			return error;
@@ -844,7 +844,7 @@ xfs_setattr_size(
 	 * much we can do about this, except to hope that the caller sees ENOMEM
 	 * and retries the truncate operation.
 	 */
-	error = -block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks);
+	error = block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks);
 	if (error)
 		return error;
 	truncate_setsize(inode, newsize);
@@ -950,7 +950,7 @@ xfs_vn_setattr(
 		error = xfs_setattr_nonsize(ip, iattr, 0);
 	}
 
-	return -error;
+	return error;
 }
 
 STATIC int
@@ -970,7 +970,7 @@ xfs_vn_update_time(
 	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
 	if (error) {
 		xfs_trans_cancel(tp, 0);
-		return -error;
+		return error;
 	}
 
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -991,7 +991,7 @@ xfs_vn_update_time(
 	}
 	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_TIMESTAMP);
-	return -xfs_trans_commit(tp, 0);
+	return xfs_trans_commit(tp, 0);
 }
 
 #define XFS_FIEMAP_FLAGS	(FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
@@ -1036,7 +1036,7 @@ xfs_fiemap_format(
 		*full = 1;	/* user array now full */
 	}
 
-	return -error;
+	return error;
 }
 
 STATIC int
@@ -1055,12 +1055,12 @@ xfs_vn_fiemap(
 		return error;
 
 	/* Set up bmap header for xfs internal routine */
-	bm.bmv_offset = BTOBB(start);
+	bm.bmv_offset = BTOBBT(start);
 	/* Special case for whole file */
 	if (length == FIEMAP_MAX_OFFSET)
 		bm.bmv_length = -1LL;
 	else
-		bm.bmv_length = BTOBB(length);
+		bm.bmv_length = BTOBB(start + length) - bm.bmv_offset;
 
 	/* We add one because in getbmap world count includes the header */
 	bm.bmv_count = !fieinfo->fi_extents_max ? MAXEXTNUM :
@@ -1075,7 +1075,7 @@ xfs_vn_fiemap(
 
 	error = xfs_getbmap(ip, &bm, xfs_fiemap_format, fieinfo);
 	if (error)
-		return -error;
+		return error;
 
 	return 0;
 }
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index cb64f222d607..f71be9c68017 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -67,19 +67,17 @@ xfs_bulkstat_one_int(
 	*stat = BULKSTAT_RV_NOTHING;
 
 	if (!buffer || xfs_internal_inum(mp, ino))
-		return XFS_ERROR(EINVAL);
+		return -EINVAL;
 
 	buf = kmem_alloc(sizeof(*buf), KM_SLEEP | KM_MAYFAIL);
 	if (!buf)
-		return XFS_ERROR(ENOMEM);
+		return -ENOMEM;
 
 	error = xfs_iget(mp, NULL, ino,
 			 (XFS_IGET_DONTCACHE | XFS_IGET_UNTRUSTED),
 			 XFS_ILOCK_SHARED, &ip);
-	if (error) {
-		*stat = BULKSTAT_RV_NOTHING;
+	if (error)
 		goto out_free;
-	}
 
 	ASSERT(ip != NULL);
 	ASSERT(ip->i_imap.im_blkno != 0);
@@ -136,7 +134,6 @@ xfs_bulkstat_one_int(
 	IRELE(ip);
 
 	error = formatter(buffer, ubsize, ubused, buf);
-
 	if (!error)
 		*stat = BULKSTAT_RV_DIDONE;
 
@@ -154,9 +151,9 @@ xfs_bulkstat_one_fmt(
 	const xfs_bstat_t	*buffer)
 {
 	if (ubsize < sizeof(*buffer))
-		return XFS_ERROR(ENOMEM);
+		return -ENOMEM;
 	if (copy_to_user(ubuffer, buffer, sizeof(*buffer)))
-		return XFS_ERROR(EFAULT);
+		return -EFAULT;
 	if (ubused)
 		*ubused = sizeof(*buffer);
 	return 0;
@@ -175,9 +172,170 @@ xfs_bulkstat_one(
 				    xfs_bulkstat_one_fmt, ubused, stat);
 }
 
+/*
+ * Loop over all clusters in a chunk for a given incore inode allocation btree
+ * record.  Do a readahead if there are any allocated inodes in that cluster.
+ */
+STATIC void
+xfs_bulkstat_ichunk_ra(
+	struct xfs_mount		*mp,
+	xfs_agnumber_t			agno,
+	struct xfs_inobt_rec_incore	*irec)
+{
+	xfs_agblock_t			agbno;
+	struct blk_plug			plug;
+	int				blks_per_cluster;
+	int				inodes_per_cluster;
+	int				i;	/* inode chunk index */
+
+	agbno = XFS_AGINO_TO_AGBNO(mp, irec->ir_startino);
+	blks_per_cluster = xfs_icluster_size_fsb(mp);
+	inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog;
+
+	blk_start_plug(&plug);
+	for (i = 0; i < XFS_INODES_PER_CHUNK;
+	     i += inodes_per_cluster, agbno += blks_per_cluster) {
+		if (xfs_inobt_maskn(i, inodes_per_cluster) & ~irec->ir_free) {
+			xfs_btree_reada_bufs(mp, agno, agbno, blks_per_cluster,
+					     &xfs_inode_buf_ops);
+		}
+	}
+	blk_finish_plug(&plug);
+}
+
+/*
+ * Lookup the inode chunk that the given inode lives in and then get the record
+ * if we found the chunk.  If the inode was not the last in the chunk and there
+ * are some left allocated, update the data for the pointed-to record as well as
+ * return the count of grabbed inodes.
+ */
+STATIC int
+xfs_bulkstat_grab_ichunk(
+	struct xfs_btree_cur		*cur,	/* btree cursor */
+	xfs_agino_t			agino,	/* starting inode of chunk */
+	int				*icount,/* return # of inodes grabbed */
+	struct xfs_inobt_rec_incore	*irec)	/* btree record */
+{
+	int				idx;	/* index into inode chunk */
+	int				stat;
+	int				error = 0;
+
+	/* Lookup the inode chunk that this inode lives in */
+	error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &stat);
+	if (error)
+		return error;
+	if (!stat) {
+		*icount = 0;
+		return error;
+	}
+
+	/* Get the record, should always work */
+	error = xfs_inobt_get_rec(cur, irec, &stat);
+	if (error)
+		return error;
+	XFS_WANT_CORRUPTED_RETURN(stat == 1);
+
+	/* Check if the record contains the inode in request */
+	if (irec->ir_startino + XFS_INODES_PER_CHUNK <= agino)
+		return -EINVAL;
+
+	idx = agino - irec->ir_startino + 1;
+	if (idx < XFS_INODES_PER_CHUNK &&
+	    (xfs_inobt_maskn(idx, XFS_INODES_PER_CHUNK - idx) & ~irec->ir_free)) {
+		int	i;
+
+		/* We got a right chunk with some left inodes allocated at it.
+		 * Grab the chunk record.  Mark all the uninteresting inodes
+		 * free -- because they're before our start point.
+		 */
+		for (i = 0; i < idx; i++) {
+			if (XFS_INOBT_MASK(i) & ~irec->ir_free)
+				irec->ir_freecount++;
+		}
+
+		irec->ir_free |= xfs_inobt_maskn(0, idx);
+		*icount = XFS_INODES_PER_CHUNK - irec->ir_freecount;
+	}
+
+	return 0;
+}
+
 #define XFS_BULKSTAT_UBLEFT(ubleft)	((ubleft) >= statstruct_size)
 
 /*
+ * Process inodes in chunk with a pointer to a formatter function
+ * that will iget the inode and fill in the appropriate structure.
+ */
+int
+xfs_bulkstat_ag_ichunk(
+	struct xfs_mount		*mp,
+	xfs_agnumber_t			agno,
+	struct xfs_inobt_rec_incore	*irbp,
+	bulkstat_one_pf			formatter,
+	size_t				statstruct_size,
+	struct xfs_bulkstat_agichunk	*acp)
+{
+	xfs_ino_t			lastino = acp->ac_lastino;
+	char				__user **ubufp = acp->ac_ubuffer;
+	int				ubleft = acp->ac_ubleft;
+	int				ubelem = acp->ac_ubelem;
+	int				chunkidx, clustidx;
+	int				error = 0;
+	xfs_agino_t			agino;
+
+	for (agino = irbp->ir_startino, chunkidx = clustidx = 0;
+	     XFS_BULKSTAT_UBLEFT(ubleft) &&
+	     irbp->ir_freecount < XFS_INODES_PER_CHUNK;
+	     chunkidx++, clustidx++, agino++) {
+		int		fmterror;	/* bulkstat formatter result */
+		int		ubused;
+		xfs_ino_t	ino = XFS_AGINO_TO_INO(mp, agno, agino);
+
+		ASSERT(chunkidx < XFS_INODES_PER_CHUNK);
+
+		/* Skip if this inode is free */
+		if (XFS_INOBT_MASK(chunkidx) & irbp->ir_free) {
+			lastino = ino;
+			continue;
+		}
+
+		/*
+		 * Count used inodes as free so we can tell when the
+		 * chunk is used up.
+		 */
+		irbp->ir_freecount++;
+
+		/* Get the inode and fill in a single buffer */
+		ubused = statstruct_size;
+		error = formatter(mp, ino, *ubufp, ubleft, &ubused, &fmterror);
+		if (fmterror == BULKSTAT_RV_NOTHING) {
+			if (error && error != -ENOENT && error != -EINVAL) {
+				ubleft = 0;
+				break;
+			}
+			lastino = ino;
+			continue;
+		}
+		if (fmterror == BULKSTAT_RV_GIVEUP) {
+			ubleft = 0;
+			ASSERT(error);
+			break;
+		}
+		if (*ubufp)
+			*ubufp += ubused;
+		ubleft -= ubused;
+		ubelem++;
+		lastino = ino;
+	}
+
+	acp->ac_lastino = lastino;
+	acp->ac_ubleft = ubleft;
+	acp->ac_ubelem = ubelem;
+
+	return error;
+}
+
+/*
  * Return stat information in bulk (by-inode) for the filesystem.
  */
 int					/* error status */
@@ -190,13 +348,10 @@ xfs_bulkstat(
 	char			__user *ubuffer, /* buffer with inode stats */
 	int			*done)	/* 1 if there are more stats to get */
 {
-	xfs_agblock_t		agbno=0;/* allocation group block number */
 	xfs_buf_t		*agbp;	/* agi header buffer */
 	xfs_agi_t		*agi;	/* agi header data */
 	xfs_agino_t		agino;	/* inode # in allocation group */
 	xfs_agnumber_t		agno;	/* allocation group number */
-	int			chunkidx; /* current index into inode chunk */
-	int			clustidx; /* current index into inode cluster */
 	xfs_btree_cur_t		*cur;	/* btree cursor for ialloc btree */
 	int			end_of_ag; /* set if we've seen the ag end */
 	int			error;	/* error code */
@@ -209,8 +364,6 @@ xfs_bulkstat(
 	xfs_inobt_rec_incore_t	*irbuf;	/* start of irec buffer */
 	xfs_inobt_rec_incore_t	*irbufend; /* end of good irec buffer entries */
 	xfs_ino_t		lastino; /* last inode number returned */
-	int			blks_per_cluster; /* # of blocks per cluster */
-	int			inodes_per_cluster;/* # of inodes per cluster */
 	int			nirbuf;	/* size of irbuf */
 	int			rval;	/* return value error code */
 	int			tmp;	/* result value from btree calls */
@@ -218,7 +371,6 @@ xfs_bulkstat(
 	int			ubleft;	/* bytes left in user's buffer */
 	char			__user *ubufp;	/* pointer into user's buffer */
 	int			ubelem;	/* spaces used in user's buffer */
-	int			ubused;	/* bytes used by formatter */
 
 	/*
 	 * Get the last inode value, see if there's nothing to do.
@@ -233,20 +385,16 @@ xfs_bulkstat(
 		*ubcountp = 0;
 		return 0;
 	}
-	if (!ubcountp || *ubcountp <= 0) {
-		return EINVAL;
-	}
+
 	ubcount = *ubcountp; /* statstruct's */
 	ubleft = ubcount * statstruct_size; /* bytes */
 	*ubcountp = ubelem = 0;
 	*done = 0;
 	fmterror = 0;
 	ubufp = ubuffer;
-	blks_per_cluster = xfs_icluster_size_fsb(mp);
-	inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog;
 	irbuf = kmem_zalloc_greedy(&irbsize, PAGE_SIZE, PAGE_SIZE * 4);
 	if (!irbuf)
-		return ENOMEM;
+		return -ENOMEM;
 
 	nirbuf = irbsize / sizeof(*irbuf);
 
@@ -258,14 +406,8 @@ xfs_bulkstat(
 	while (XFS_BULKSTAT_UBLEFT(ubleft) && agno < mp->m_sb.sb_agcount) {
 		cond_resched();
 		error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp);
-		if (error) {
-			/*
-			 * Skip this allocation group and go to the next one.
-			 */
-			agno++;
-			agino = 0;
-			continue;
-		}
+		if (error)
+			break;
 		agi = XFS_BUF_TO_AGI(agbp);
 		/*
 		 * Allocate and initialize a btree cursor for ialloc btree.
@@ -275,96 +417,39 @@ xfs_bulkstat(
 		irbp = irbuf;
 		irbufend = irbuf + nirbuf;
 		end_of_ag = 0;
-		/*
-		 * If we're returning in the middle of an allocation group,
-		 * we need to get the remainder of the chunk we're in.
-		 */
+		icount = 0;
 		if (agino > 0) {
-			xfs_inobt_rec_incore_t r;
-
 			/*
-			 * Lookup the inode chunk that this inode lives in.
+			 * In the middle of an allocation group, we need to get
+			 * the remainder of the chunk we're in.
 			 */
-			error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE,
-						 &tmp);
-			if (!error &&	/* no I/O error */
-			    tmp &&	/* lookup succeeded */
-					/* got the record, should always work */
-			    !(error = xfs_inobt_get_rec(cur, &r, &i)) &&
-			    i == 1 &&
-					/* this is the right chunk */
-			    agino < r.ir_startino + XFS_INODES_PER_CHUNK &&
-					/* lastino was not last in chunk */
-			    (chunkidx = agino - r.ir_startino + 1) <
-				    XFS_INODES_PER_CHUNK &&
-					/* there are some left allocated */
-			    xfs_inobt_maskn(chunkidx,
-				    XFS_INODES_PER_CHUNK - chunkidx) &
-				    ~r.ir_free) {
-				/*
-				 * Grab the chunk record.  Mark all the
-				 * uninteresting inodes (because they're
-				 * before our start point) free.
-				 */
-				for (i = 0; i < chunkidx; i++) {
-					if (XFS_INOBT_MASK(i) & ~r.ir_free)
-						r.ir_freecount++;
-				}
-				r.ir_free |= xfs_inobt_maskn(0, chunkidx);
+			struct xfs_inobt_rec_incore	r;
+
+			error = xfs_bulkstat_grab_ichunk(cur, agino, &icount, &r);
+			if (error)
+				break;
+			if (icount) {
 				irbp->ir_startino = r.ir_startino;
 				irbp->ir_freecount = r.ir_freecount;
 				irbp->ir_free = r.ir_free;
 				irbp++;
 				agino = r.ir_startino + XFS_INODES_PER_CHUNK;
-				icount = XFS_INODES_PER_CHUNK - r.ir_freecount;
-			} else {
-				/*
-				 * If any of those tests failed, bump the
-				 * inode number (just in case).
-				 */
-				agino++;
-				icount = 0;
 			}
-			/*
-			 * In any case, increment to the next record.
-			 */
-			if (!error)
-				error = xfs_btree_increment(cur, 0, &tmp);
+			/* Increment to the next record */
+			error = xfs_btree_increment(cur, 0, &tmp);
 		} else {
-			/*
-			 * Start of ag.  Lookup the first inode chunk.
-			 */
+			/* Start of ag.  Lookup the first inode chunk */
 			error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &tmp);
-			icount = 0;
 		}
+		if (error)
+			break;
+
 		/*
 		 * Loop through inode btree records in this ag,
 		 * until we run out of inodes or space in the buffer.
 		 */
 		while (irbp < irbufend && icount < ubcount) {
-			xfs_inobt_rec_incore_t r;
-
-			/*
-			 * Loop as long as we're unable to read the
-			 * inode btree.
-			 */
-			while (error) {
-				agino += XFS_INODES_PER_CHUNK;
-				if (XFS_AGINO_TO_AGBNO(mp, agino) >=
-						be32_to_cpu(agi->agi_length))
-					break;
-				error = xfs_inobt_lookup(cur, agino,
-							 XFS_LOOKUP_GE, &tmp);
-				cond_resched();
-			}
-			/*
-			 * If ran off the end of the ag either with an error,
-			 * or the normal way, set end and stop collecting.
-			 */
-			if (error) {
-				end_of_ag = 1;
-				break;
-			}
+			struct xfs_inobt_rec_incore	r;
 
 			error = xfs_inobt_get_rec(cur, &r, &i);
 			if (error || i == 0) {
@@ -377,25 +462,7 @@ xfs_bulkstat(
 			 * Also start read-ahead now for this chunk.
 			 */
 			if (r.ir_freecount < XFS_INODES_PER_CHUNK) {
-				struct blk_plug	plug;
-				/*
-				 * Loop over all clusters in the next chunk.
-				 * Do a readahead if there are any allocated
-				 * inodes in that cluster.
-				 */
-				blk_start_plug(&plug);
-				agbno = XFS_AGINO_TO_AGBNO(mp, r.ir_startino);
-				for (chunkidx = 0;
-				     chunkidx < XFS_INODES_PER_CHUNK;
-				     chunkidx += inodes_per_cluster,
-				     agbno += blks_per_cluster) {
-					if (xfs_inobt_maskn(chunkidx,
-					    inodes_per_cluster) & ~r.ir_free)
-						xfs_btree_reada_bufs(mp, agno,
-							agbno, blks_per_cluster,
-							&xfs_inode_buf_ops);
-				}
-				blk_finish_plug(&plug);
+				xfs_bulkstat_ichunk_ra(mp, agno, &r);
 				irbp->ir_startino = r.ir_startino;
 				irbp->ir_freecount = r.ir_freecount;
 				irbp->ir_free = r.ir_free;
@@ -422,57 +489,20 @@ xfs_bulkstat(
 		irbufend = irbp;
 		for (irbp = irbuf;
 		     irbp < irbufend && XFS_BULKSTAT_UBLEFT(ubleft); irbp++) {
-			/*
-			 * Now process this chunk of inodes.
-			 */
-			for (agino = irbp->ir_startino, chunkidx = clustidx = 0;
-			     XFS_BULKSTAT_UBLEFT(ubleft) &&
-				irbp->ir_freecount < XFS_INODES_PER_CHUNK;
-			     chunkidx++, clustidx++, agino++) {
-				ASSERT(chunkidx < XFS_INODES_PER_CHUNK);
-
-				ino = XFS_AGINO_TO_INO(mp, agno, agino);
-				/*
-				 * Skip if this inode is free.
-				 */
-				if (XFS_INOBT_MASK(chunkidx) & irbp->ir_free) {
-					lastino = ino;
-					continue;
-				}
-				/*
-				 * Count used inodes as free so we can tell
-				 * when the chunk is used up.
-				 */
-				irbp->ir_freecount++;
-
-				/*
-				 * Get the inode and fill in a single buffer.
-				 */
-				ubused = statstruct_size;
-				error = formatter(mp, ino, ubufp, ubleft,
-						  &ubused, &fmterror);
-				if (fmterror == BULKSTAT_RV_NOTHING) {
-					if (error && error != ENOENT &&
-						error != EINVAL) {
-						ubleft = 0;
-						rval = error;
-						break;
-					}
-					lastino = ino;
-					continue;
-				}
-				if (fmterror == BULKSTAT_RV_GIVEUP) {
-					ubleft = 0;
-					ASSERT(error);
-					rval = error;
-					break;
-				}
-				if (ubufp)
-					ubufp += ubused;
-				ubleft -= ubused;
-				ubelem++;
-				lastino = ino;
-			}
+			struct xfs_bulkstat_agichunk ac;
+
+			ac.ac_lastino = lastino;
+			ac.ac_ubuffer = &ubuffer;
+			ac.ac_ubleft = ubleft;
+			ac.ac_ubelem = ubelem;
+			error = xfs_bulkstat_ag_ichunk(mp, agno, irbp,
+					formatter, statstruct_size, &ac);
+			if (error)
+				rval = error;
+
+			lastino = ac.ac_lastino;
+			ubleft = ac.ac_ubleft;
+			ubelem = ac.ac_ubelem;
 
 			cond_resched();
 		}
@@ -512,58 +542,10 @@ xfs_bulkstat(
 	return rval;
 }
 
-/*
- * Return stat information in bulk (by-inode) for the filesystem.
- * Special case for non-sequential one inode bulkstat.
- */
-int					/* error status */
-xfs_bulkstat_single(
-	xfs_mount_t		*mp,	/* mount point for filesystem */
-	xfs_ino_t		*lastinop, /* inode to return */
-	char			__user *buffer, /* buffer with inode stats */
-	int			*done)	/* 1 if there are more stats to get */
-{
-	int			count;	/* count value for bulkstat call */
-	int			error;	/* return value */
-	xfs_ino_t		ino;	/* filesystem inode number */
-	int			res;	/* result from bs1 */
-
-	/*
-	 * note that requesting valid inode numbers which are not allocated
-	 * to inodes will most likely cause xfs_imap_to_bp to generate warning
-	 * messages about bad magic numbers. This is ok. The fact that
-	 * the inode isn't actually an inode is handled by the
-	 * error check below. Done this way to make the usual case faster
-	 * at the expense of the error case.
-	 */
-
-	ino = *lastinop;
-	error = xfs_bulkstat_one(mp, ino, buffer, sizeof(xfs_bstat_t),
-				 NULL, &res);
-	if (error) {
-		/*
-		 * Special case way failed, do it the "long" way
-		 * to see if that works.
-		 */
-		(*lastinop)--;
-		count = 1;
-		if (xfs_bulkstat(mp, lastinop, &count, xfs_bulkstat_one,
-				sizeof(xfs_bstat_t), buffer, done))
-			return error;
-		if (count == 0 || (xfs_ino_t)*lastinop != ino)
-			return error == EFSCORRUPTED ?
-				XFS_ERROR(EINVAL) : error;
-		else
-			return 0;
-	}
-	*done = 0;
-	return 0;
-}
-
 int
 xfs_inumbers_fmt(
 	void			__user *ubuffer, /* buffer to write to */
-	const xfs_inogrp_t	*buffer,	/* buffer to read from */
+	const struct xfs_inogrp	*buffer,	/* buffer to read from */
 	long			count,		/* # of elements to read */
 	long			*written)	/* # of bytes written */
 {
@@ -578,127 +560,104 @@ xfs_inumbers_fmt(
  */
 int					/* error status */
 xfs_inumbers(
-	xfs_mount_t	*mp,		/* mount point for filesystem */
-	xfs_ino_t	*lastino,	/* last inode returned */
-	int		*count,		/* size of buffer/count returned */
-	void		__user *ubuffer,/* buffer with inode descriptions */
-	inumbers_fmt_pf	formatter)
+	struct xfs_mount	*mp,/* mount point for filesystem */
+	xfs_ino_t		*lastino,/* last inode returned */
+	int			*count,/* size of buffer/count returned */
+	void			__user *ubuffer,/* buffer with inode descriptions */
+	inumbers_fmt_pf		formatter)
 {
-	xfs_buf_t	*agbp;
-	xfs_agino_t	agino;
-	xfs_agnumber_t	agno;
-	int		bcount;
-	xfs_inogrp_t	*buffer;
-	int		bufidx;
-	xfs_btree_cur_t	*cur;
-	int		error;
-	xfs_inobt_rec_incore_t r;
-	int		i;
-	xfs_ino_t	ino;
-	int		left;
-	int		tmp;
-
-	ino = (xfs_ino_t)*lastino;
-	agno = XFS_INO_TO_AGNO(mp, ino);
-	agino = XFS_INO_TO_AGINO(mp, ino);
-	left = *count;
+	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, *lastino);
+	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, *lastino);
+	struct xfs_btree_cur	*cur = NULL;
+	struct xfs_buf		*agbp = NULL;
+	struct xfs_inogrp	*buffer;
+	int			bcount;
+	int			left = *count;
+	int			bufidx = 0;
+	int			error = 0;
+
 	*count = 0;
+	if (agno >= mp->m_sb.sb_agcount ||
+	    *lastino != XFS_AGINO_TO_INO(mp, agno, agino))
+		return error;
+
 	bcount = MIN(left, (int)(PAGE_SIZE / sizeof(*buffer)));
 	buffer = kmem_alloc(bcount * sizeof(*buffer), KM_SLEEP);
-	error = bufidx = 0;
-	cur = NULL;
-	agbp = NULL;
-	while (left > 0 && agno < mp->m_sb.sb_agcount) {
-		if (agbp == NULL) {
+	do {
+		struct xfs_inobt_rec_incore	r;
+		int				stat;
+
+		if (!agbp) {
 			error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp);
-			if (error) {
-				/*
-				 * If we can't read the AGI of this ag,
-				 * then just skip to the next one.
-				 */
-				ASSERT(cur == NULL);
-				agbp = NULL;
-				agno++;
-				agino = 0;
-				continue;
-			}
+			if (error)
+				break;
+
 			cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno,
 						    XFS_BTNUM_INO);
 			error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_GE,
-						 &tmp);
-			if (error) {
-				xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
-				cur = NULL;
-				xfs_buf_relse(agbp);
-				agbp = NULL;
-				/*
-				 * Move up the last inode in the current
-				 * chunk.  The lookup_ge will always get
-				 * us the first inode in the next chunk.
-				 */
-				agino += XFS_INODES_PER_CHUNK - 1;
-				continue;
-			}
-		}
-		error = xfs_inobt_get_rec(cur, &r, &i);
-		if (error || i == 0) {
-			xfs_buf_relse(agbp);
-			agbp = NULL;
-			xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
-			cur = NULL;
-			agno++;
-			agino = 0;
-			continue;
+						 &stat);
+			if (error)
+				break;
+			if (!stat)
+				goto next_ag;
 		}
+
+		error = xfs_inobt_get_rec(cur, &r, &stat);
+		if (error)
+			break;
+		if (!stat)
+			goto next_ag;
+
 		agino = r.ir_startino + XFS_INODES_PER_CHUNK - 1;
 		buffer[bufidx].xi_startino =
 			XFS_AGINO_TO_INO(mp, agno, r.ir_startino);
 		buffer[bufidx].xi_alloccount =
 			XFS_INODES_PER_CHUNK - r.ir_freecount;
 		buffer[bufidx].xi_allocmask = ~r.ir_free;
-		bufidx++;
-		left--;
-		if (bufidx == bcount) {
-			long written;
-			if (formatter(ubuffer, buffer, bufidx, &written)) {
-				error = XFS_ERROR(EFAULT);
+		if (++bufidx == bcount) {
+			long	written;
+
+			error = formatter(ubuffer, buffer, bufidx, &written);
+			if (error)
 				break;
-			}
 			ubuffer += written;
 			*count += bufidx;
 			bufidx = 0;
 		}
-		if (left) {
-			error = xfs_btree_increment(cur, 0, &tmp);
-			if (error) {
-				xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
-				cur = NULL;
-				xfs_buf_relse(agbp);
-				agbp = NULL;
-				/*
-				 * The agino value has already been bumped.
-				 * Just try to skip up to it.
-				 */
-				agino += XFS_INODES_PER_CHUNK;
-				continue;
-			}
-		}
-	}
+		if (!--left)
+			break;
+
+		error = xfs_btree_increment(cur, 0, &stat);
+		if (error)
+			break;
+		if (stat)
+			continue;
+
+next_ag:
+		xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+		cur = NULL;
+		xfs_buf_relse(agbp);
+		agbp = NULL;
+		agino = 0;
+	} while (++agno < mp->m_sb.sb_agcount);
+
 	if (!error) {
 		if (bufidx) {
-			long written;
-			if (formatter(ubuffer, buffer, bufidx, &written))
-				error = XFS_ERROR(EFAULT);
-			else
+			long	written;
+
+			error = formatter(ubuffer, buffer, bufidx, &written);
+			if (!error)
 				*count += bufidx;
 		}
 		*lastino = XFS_AGINO_TO_INO(mp, agno, agino);
 	}
+
 	kmem_free(buffer);
 	if (cur)
 		xfs_btree_del_cursor(cur, (error ? XFS_BTREE_ERROR :
 					   XFS_BTREE_NOERROR));
 	if (agbp)
 		xfs_buf_relse(agbp);
+
 	return error;
 }
diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h
index 97295d91d170..aaed08022eb9 100644
--- a/fs/xfs/xfs_itable.h
+++ b/fs/xfs/xfs_itable.h
@@ -30,6 +30,22 @@ typedef int (*bulkstat_one_pf)(struct xfs_mount	*mp,
 			       int		*ubused,
 			       int		*stat);
 
+struct xfs_bulkstat_agichunk {
+	xfs_ino_t	ac_lastino;	/* last inode returned */
+	char		__user **ac_ubuffer;/* pointer into user's buffer */
+	int		ac_ubleft;	/* bytes left in user's buffer */
+	int		ac_ubelem;	/* spaces used in user's buffer */
+};
+
+int
+xfs_bulkstat_ag_ichunk(
+	struct xfs_mount		*mp,
+	xfs_agnumber_t			agno,
+	struct xfs_inobt_rec_incore	*irbp,
+	bulkstat_one_pf			formatter,
+	size_t				statstruct_size,
+	struct xfs_bulkstat_agichunk	*acp);
+
 /*
  * Values for stat return value.
  */
@@ -50,13 +66,6 @@ xfs_bulkstat(
 	char		__user *ubuffer,/* buffer with inode stats */
 	int		*done);		/* 1 if there are more stats to get */
 
-int
-xfs_bulkstat_single(
-	xfs_mount_t		*mp,
-	xfs_ino_t		*lastinop,
-	char			__user *buffer,
-	int			*done);
-
 typedef int (*bulkstat_one_fmt_pf)(  /* used size in bytes or negative error */
 	void			__user *ubuffer, /* buffer to write to */
 	int			ubsize,		 /* remaining user buffer sz */
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index 825249d2dfc1..d10dc8f397c9 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -21,18 +21,6 @@
 #include <linux/types.h>
 
 /*
- * XFS_BIG_BLKNOS needs block layer disk addresses to be 64 bits.
- * XFS_BIG_INUMS requires XFS_BIG_BLKNOS to be set.
- */
-#if defined(CONFIG_LBDAF) || (BITS_PER_LONG == 64)
-# define XFS_BIG_BLKNOS	1
-# define XFS_BIG_INUMS	1
-#else
-# define XFS_BIG_BLKNOS	0
-# define XFS_BIG_INUMS	0
-#endif
-
-/*
  * Kernel specific type declarations for XFS
  */
 typedef signed char		__int8_t;
@@ -113,7 +101,7 @@ typedef __uint64_t __psunsigned_t;
 #include <asm/byteorder.h>
 #include <asm/unaligned.h>
 
-#include "xfs_vnode.h"
+#include "xfs_fs.h"
 #include "xfs_stats.h"
 #include "xfs_sysctl.h"
 #include "xfs_iops.h"
@@ -191,6 +179,17 @@ typedef __uint64_t __psunsigned_t;
 #define MAX(a,b)	(max(a,b))
 #define howmany(x, y)	(((x)+((y)-1))/(y))
 
+/*
+ * XFS wrapper structure for sysfs support. It depends on external data
+ * structures and is embedded in various internal data structures to implement
+ * the XFS sysfs object heirarchy. Define it here for broad access throughout
+ * the codebase.
+ */
+struct xfs_kobj {
+	struct kobject		kobject;
+	struct completion	complete;
+};
+
 /* Kernel uid/gid conversion. These are used to convert to/from the on disk
  * uid_t/gid_t types to the kuid_t/kgid_t types that the kernel uses internally.
  * The conversion here is type only, the value will remain the same since we
@@ -331,7 +330,7 @@ static inline __uint64_t roundup_64(__uint64_t x, __uint32_t y)
 {
 	x += y - 1;
 	do_div(x, y);
-	return(x * y);
+	return x * y;
 }
 
 static inline __uint64_t howmany_64(__uint64_t x, __uint32_t y)
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 292308dede6d..ca4fd5bd8522 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -34,6 +34,7 @@
 #include "xfs_trace.h"
 #include "xfs_fsops.h"
 #include "xfs_cksum.h"
+#include "xfs_sysfs.h"
 
 kmem_zone_t	*xfs_log_ticket_zone;
 
@@ -283,7 +284,7 @@ xlog_grant_head_wait(
 	return 0;
 shutdown:
 	list_del_init(&tic->t_queue);
-	return XFS_ERROR(EIO);
+	return -EIO;
 }
 
 /*
@@ -377,7 +378,7 @@ xfs_log_regrant(
 	int			error = 0;
 
 	if (XLOG_FORCED_SHUTDOWN(log))
-		return XFS_ERROR(EIO);
+		return -EIO;
 
 	XFS_STATS_INC(xs_try_logspace);
 
@@ -446,7 +447,7 @@ xfs_log_reserve(
 	ASSERT(client == XFS_TRANSACTION || client == XFS_LOG);
 
 	if (XLOG_FORCED_SHUTDOWN(log))
-		return XFS_ERROR(EIO);
+		return -EIO;
 
 	XFS_STATS_INC(xs_try_logspace);
 
@@ -454,7 +455,7 @@ xfs_log_reserve(
 	tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent,
 				KM_SLEEP | KM_MAYFAIL);
 	if (!tic)
-		return XFS_ERROR(ENOMEM);
+		return -ENOMEM;
 
 	tic->t_trans_type = t_type;
 	*ticp = tic;
@@ -590,7 +591,7 @@ xfs_log_release_iclog(
 {
 	if (xlog_state_release_iclog(mp->m_log, iclog)) {
 		xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
-		return EIO;
+		return -EIO;
 	}
 
 	return 0;
@@ -628,7 +629,7 @@ xfs_log_mount(
 
 	mp->m_log = xlog_alloc_log(mp, log_target, blk_offset, num_bblks);
 	if (IS_ERR(mp->m_log)) {
-		error = -PTR_ERR(mp->m_log);
+		error = PTR_ERR(mp->m_log);
 		goto out;
 	}
 
@@ -652,18 +653,18 @@ xfs_log_mount(
 		xfs_warn(mp,
 		"Log size %d blocks too small, minimum size is %d blocks",
 			 mp->m_sb.sb_logblocks, min_logfsbs);
-		error = EINVAL;
+		error = -EINVAL;
 	} else if (mp->m_sb.sb_logblocks > XFS_MAX_LOG_BLOCKS) {
 		xfs_warn(mp,
 		"Log size %d blocks too large, maximum size is %lld blocks",
 			 mp->m_sb.sb_logblocks, XFS_MAX_LOG_BLOCKS);
-		error = EINVAL;
+		error = -EINVAL;
 	} else if (XFS_FSB_TO_B(mp, mp->m_sb.sb_logblocks) > XFS_MAX_LOG_BYTES) {
 		xfs_warn(mp,
 		"log size %lld bytes too large, maximum size is %lld bytes",
 			 XFS_FSB_TO_B(mp, mp->m_sb.sb_logblocks),
 			 XFS_MAX_LOG_BYTES);
-		error = EINVAL;
+		error = -EINVAL;
 	}
 	if (error) {
 		if (xfs_sb_version_hascrc(&mp->m_sb)) {
@@ -707,6 +708,11 @@ xfs_log_mount(
 		}
 	}
 
+	error = xfs_sysfs_init(&mp->m_log->l_kobj, &xfs_log_ktype, &mp->m_kobj,
+			       "log");
+	if (error)
+		goto out_destroy_ail;
+
 	/* Normal transactions can now occur */
 	mp->m_log->l_flags &= ~XLOG_ACTIVE_RECOVERY;
 
@@ -947,6 +953,9 @@ xfs_log_unmount(
 	xfs_log_quiesce(mp);
 
 	xfs_trans_ail_destroy(mp);
+
+	xfs_sysfs_del(&mp->m_log->l_kobj);
+
 	xlog_dealloc_log(mp->m_log);
 }
 
@@ -1313,7 +1322,7 @@ xlog_alloc_log(
 	xlog_in_core_t		*iclog, *prev_iclog=NULL;
 	xfs_buf_t		*bp;
 	int			i;
-	int			error = ENOMEM;
+	int			error = -ENOMEM;
 	uint			log2_size = 0;
 
 	log = kmem_zalloc(sizeof(struct xlog), KM_MAYFAIL);
@@ -1340,7 +1349,7 @@ xlog_alloc_log(
 	xlog_grant_head_init(&log->l_reserve_head);
 	xlog_grant_head_init(&log->l_write_head);
 
-	error = EFSCORRUPTED;
+	error = -EFSCORRUPTED;
 	if (xfs_sb_version_hassector(&mp->m_sb)) {
 	        log2_size = mp->m_sb.sb_logsectlog;
 		if (log2_size < BBSHIFT) {
@@ -1369,8 +1378,14 @@ xlog_alloc_log(
 
 	xlog_get_iclog_buffer_size(mp, log);
 
-	error = ENOMEM;
-	bp = xfs_buf_alloc(mp->m_logdev_targp, 0, BTOBB(log->l_iclog_size), 0);
+	/*
+	 * Use a NULL block for the extra log buffer used during splits so that
+	 * it will trigger errors if we ever try to do IO on it without first
+	 * having set it up properly.
+	 */
+	error = -ENOMEM;
+	bp = xfs_buf_alloc(mp->m_logdev_targp, XFS_BUF_DADDR_NULL,
+			   BTOBB(log->l_iclog_size), 0);
 	if (!bp)
 		goto out_free_log;
 
@@ -1463,7 +1478,7 @@ out_free_iclog:
 out_free_log:
 	kmem_free(log);
 out:
-	return ERR_PTR(-error);
+	return ERR_PTR(error);
 }	/* xlog_alloc_log */
 
 
@@ -1661,7 +1676,7 @@ xlog_bdstrat(
 
 	xfs_buf_lock(bp);
 	if (iclog->ic_state & XLOG_STATE_IOERROR) {
-		xfs_buf_ioerror(bp, EIO);
+		xfs_buf_ioerror(bp, -EIO);
 		xfs_buf_stale(bp);
 		xfs_buf_ioend(bp, 0);
 		/*
@@ -2360,7 +2375,7 @@ xlog_write(
 
 			ophdr = xlog_write_setup_ophdr(log, ptr, ticket, flags);
 			if (!ophdr)
-				return XFS_ERROR(EIO);
+				return -EIO;
 
 			xlog_write_adv_cnt(&ptr, &len, &log_offset,
 					   sizeof(struct xlog_op_header));
@@ -2859,7 +2874,7 @@ restart:
 	spin_lock(&log->l_icloglock);
 	if (XLOG_FORCED_SHUTDOWN(log)) {
 		spin_unlock(&log->l_icloglock);
-		return XFS_ERROR(EIO);
+		return -EIO;
 	}
 
 	iclog = log->l_iclog;
@@ -3047,7 +3062,7 @@ xlog_state_release_iclog(
 	int		sync = 0;	/* do we sync? */
 
 	if (iclog->ic_state & XLOG_STATE_IOERROR)
-		return XFS_ERROR(EIO);
+		return -EIO;
 
 	ASSERT(atomic_read(&iclog->ic_refcnt) > 0);
 	if (!atomic_dec_and_lock(&iclog->ic_refcnt, &log->l_icloglock))
@@ -3055,7 +3070,7 @@ xlog_state_release_iclog(
 
 	if (iclog->ic_state & XLOG_STATE_IOERROR) {
 		spin_unlock(&log->l_icloglock);
-		return XFS_ERROR(EIO);
+		return -EIO;
 	}
 	ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE ||
 	       iclog->ic_state == XLOG_STATE_WANT_SYNC);
@@ -3172,7 +3187,7 @@ _xfs_log_force(
 	iclog = log->l_iclog;
 	if (iclog->ic_state & XLOG_STATE_IOERROR) {
 		spin_unlock(&log->l_icloglock);
-		return XFS_ERROR(EIO);
+		return -EIO;
 	}
 
 	/* If the head iclog is not active nor dirty, we just attach
@@ -3210,7 +3225,7 @@ _xfs_log_force(
 				spin_unlock(&log->l_icloglock);
 
 				if (xlog_state_release_iclog(log, iclog))
-					return XFS_ERROR(EIO);
+					return -EIO;
 
 				if (log_flushed)
 					*log_flushed = 1;
@@ -3246,7 +3261,7 @@ maybe_sleep:
 		 */
 		if (iclog->ic_state & XLOG_STATE_IOERROR) {
 			spin_unlock(&log->l_icloglock);
-			return XFS_ERROR(EIO);
+			return -EIO;
 		}
 		XFS_STATS_INC(xs_log_force_sleep);
 		xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
@@ -3256,7 +3271,7 @@ maybe_sleep:
 		 * and the memory read should be atomic.
 		 */
 		if (iclog->ic_state & XLOG_STATE_IOERROR)
-			return XFS_ERROR(EIO);
+			return -EIO;
 		if (log_flushed)
 			*log_flushed = 1;
 	} else {
@@ -3324,7 +3339,7 @@ try_again:
 	iclog = log->l_iclog;
 	if (iclog->ic_state & XLOG_STATE_IOERROR) {
 		spin_unlock(&log->l_icloglock);
-		return XFS_ERROR(EIO);
+		return -EIO;
 	}
 
 	do {
@@ -3375,7 +3390,7 @@ try_again:
 			xlog_state_switch_iclogs(log, iclog, 0);
 			spin_unlock(&log->l_icloglock);
 			if (xlog_state_release_iclog(log, iclog))
-				return XFS_ERROR(EIO);
+				return -EIO;
 			if (log_flushed)
 				*log_flushed = 1;
 			spin_lock(&log->l_icloglock);
@@ -3390,7 +3405,7 @@ try_again:
 			 */
 			if (iclog->ic_state & XLOG_STATE_IOERROR) {
 				spin_unlock(&log->l_icloglock);
-				return XFS_ERROR(EIO);
+				return -EIO;
 			}
 			XFS_STATS_INC(xs_log_force_sleep);
 			xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
@@ -3400,7 +3415,7 @@ try_again:
 			 * and the memory read should be atomic.
 			 */
 			if (iclog->ic_state & XLOG_STATE_IOERROR)
-				return XFS_ERROR(EIO);
+				return -EIO;
 
 			if (log_flushed)
 				*log_flushed = 1;
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index b3425b34e3d5..f6b79e5325dd 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -78,8 +78,6 @@ xlog_cil_init_post_recovery(
 {
 	log->l_cilp->xc_ctx->ticket = xlog_cil_ticket_alloc(log);
 	log->l_cilp->xc_ctx->sequence = 1;
-	log->l_cilp->xc_ctx->commit_lsn = xlog_assign_lsn(log->l_curr_cycle,
-								log->l_curr_block);
 }
 
 /*
@@ -634,7 +632,7 @@ out_abort_free_ticket:
 	xfs_log_ticket_put(tic);
 out_abort:
 	xlog_cil_committed(ctx, XFS_LI_ABORTED);
-	return XFS_ERROR(EIO);
+	return -EIO;
 }
 
 static void
@@ -928,12 +926,12 @@ xlog_cil_init(
 
 	cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL);
 	if (!cil)
-		return ENOMEM;
+		return -ENOMEM;
 
 	ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP|KM_MAYFAIL);
 	if (!ctx) {
 		kmem_free(cil);
-		return ENOMEM;
+		return -ENOMEM;
 	}
 
 	INIT_WORK(&cil->xc_push_work, xlog_cil_push_work);
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 9bc403a9e54f..db7cbdeb2b42 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -405,6 +405,8 @@ struct xlog {
 	struct xlog_grant_head	l_reserve_head;
 	struct xlog_grant_head	l_write_head;
 
+	struct xfs_kobj		l_kobj;
+
 	/* The following field are used for debugging; need to hold icloglock */
 #ifdef DEBUG
 	char			*l_iclog_bak[XLOG_MAX_ICLOGS];
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 981af0f6504b..1fd5787add99 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -179,7 +179,7 @@ xlog_bread_noalign(
 		xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
 			nbblks);
 		XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
-		return EFSCORRUPTED;
+		return -EFSCORRUPTED;
 	}
 
 	blk_no = round_down(blk_no, log->l_sectBBsize);
@@ -194,7 +194,7 @@ xlog_bread_noalign(
 	bp->b_error = 0;
 
 	if (XFS_FORCED_SHUTDOWN(log->l_mp))
-		return XFS_ERROR(EIO);
+		return -EIO;
 
 	xfs_buf_iorequest(bp);
 	error = xfs_buf_iowait(bp);
@@ -268,7 +268,7 @@ xlog_bwrite(
 		xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
 			nbblks);
 		XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
-		return EFSCORRUPTED;
+		return -EFSCORRUPTED;
 	}
 
 	blk_no = round_down(blk_no, log->l_sectBBsize);
@@ -330,14 +330,14 @@ xlog_header_check_recover(
 		xlog_header_check_dump(mp, head);
 		XFS_ERROR_REPORT("xlog_header_check_recover(1)",
 				 XFS_ERRLEVEL_HIGH, mp);
-		return XFS_ERROR(EFSCORRUPTED);
+		return -EFSCORRUPTED;
 	} else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) {
 		xfs_warn(mp,
 	"dirty log entry has mismatched uuid - can't recover");
 		xlog_header_check_dump(mp, head);
 		XFS_ERROR_REPORT("xlog_header_check_recover(2)",
 				 XFS_ERRLEVEL_HIGH, mp);
-		return XFS_ERROR(EFSCORRUPTED);
+		return -EFSCORRUPTED;
 	}
 	return 0;
 }
@@ -364,7 +364,7 @@ xlog_header_check_mount(
 		xlog_header_check_dump(mp, head);
 		XFS_ERROR_REPORT("xlog_header_check_mount",
 				 XFS_ERRLEVEL_HIGH, mp);
-		return XFS_ERROR(EFSCORRUPTED);
+		return -EFSCORRUPTED;
 	}
 	return 0;
 }
@@ -462,7 +462,7 @@ xlog_find_verify_cycle(
 	while (!(bp = xlog_get_bp(log, bufblks))) {
 		bufblks >>= 1;
 		if (bufblks < log->l_sectBBsize)
-			return ENOMEM;
+			return -ENOMEM;
 	}
 
 	for (i = start_blk; i < start_blk + nbblks; i += bufblks) {
@@ -524,7 +524,7 @@ xlog_find_verify_log_record(
 
 	if (!(bp = xlog_get_bp(log, num_blks))) {
 		if (!(bp = xlog_get_bp(log, 1)))
-			return ENOMEM;
+			return -ENOMEM;
 		smallmem = 1;
 	} else {
 		error = xlog_bread(log, start_blk, num_blks, bp, &offset);
@@ -539,7 +539,7 @@ xlog_find_verify_log_record(
 			xfs_warn(log->l_mp,
 		"Log inconsistent (didn't find previous header)");
 			ASSERT(0);
-			error = XFS_ERROR(EIO);
+			error = -EIO;
 			goto out;
 		}
 
@@ -564,7 +564,7 @@ xlog_find_verify_log_record(
 	 * will be called again for the end of the physical log.
 	 */
 	if (i == -1) {
-		error = -1;
+		error = 1;
 		goto out;
 	}
 
@@ -628,7 +628,12 @@ xlog_find_head(
 	int		error, log_bbnum = log->l_logBBsize;
 
 	/* Is the end of the log device zeroed? */
-	if ((error = xlog_find_zeroed(log, &first_blk)) == -1) {
+	error = xlog_find_zeroed(log, &first_blk);
+	if (error < 0) {
+		xfs_warn(log->l_mp, "empty log check failed");
+		return error;
+	}
+	if (error == 1) {
 		*return_head_blk = first_blk;
 
 		/* Is the whole lot zeroed? */
@@ -641,15 +646,12 @@ xlog_find_head(
 		}
 
 		return 0;
-	} else if (error) {
-		xfs_warn(log->l_mp, "empty log check failed");
-		return error;
 	}
 
 	first_blk = 0;			/* get cycle # of 1st block */
 	bp = xlog_get_bp(log, 1);
 	if (!bp)
-		return ENOMEM;
+		return -ENOMEM;
 
 	error = xlog_bread(log, 0, 1, bp, &offset);
 	if (error)
@@ -818,29 +820,29 @@ validate_head:
 		start_blk = head_blk - num_scan_bblks; /* don't read head_blk */
 
 		/* start ptr at last block ptr before head_blk */
-		if ((error = xlog_find_verify_log_record(log, start_blk,
-							&head_blk, 0)) == -1) {
-			error = XFS_ERROR(EIO);
-			goto bp_err;
-		} else if (error)
+		error = xlog_find_verify_log_record(log, start_blk, &head_blk, 0);
+		if (error == 1)
+			error = -EIO;
+		if (error)
 			goto bp_err;
 	} else {
 		start_blk = 0;
 		ASSERT(head_blk <= INT_MAX);
-		if ((error = xlog_find_verify_log_record(log, start_blk,
-							&head_blk, 0)) == -1) {
+		error = xlog_find_verify_log_record(log, start_blk, &head_blk, 0);
+		if (error < 0)
+			goto bp_err;
+		if (error == 1) {
 			/* We hit the beginning of the log during our search */
 			start_blk = log_bbnum - (num_scan_bblks - head_blk);
 			new_blk = log_bbnum;
 			ASSERT(start_blk <= INT_MAX &&
 				(xfs_daddr_t) log_bbnum-start_blk >= 0);
 			ASSERT(head_blk <= INT_MAX);
-			if ((error = xlog_find_verify_log_record(log,
-							start_blk, &new_blk,
-							(int)head_blk)) == -1) {
-				error = XFS_ERROR(EIO);
-				goto bp_err;
-			} else if (error)
+			error = xlog_find_verify_log_record(log, start_blk,
+							&new_blk, (int)head_blk);
+			if (error == 1)
+				error = -EIO;
+			if (error)
 				goto bp_err;
 			if (new_blk != log_bbnum)
 				head_blk = new_blk;
@@ -911,7 +913,7 @@ xlog_find_tail(
 
 	bp = xlog_get_bp(log, 1);
 	if (!bp)
-		return ENOMEM;
+		return -ENOMEM;
 	if (*head_blk == 0) {				/* special case */
 		error = xlog_bread(log, 0, 1, bp, &offset);
 		if (error)
@@ -961,7 +963,7 @@ xlog_find_tail(
 		xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
 		xlog_put_bp(bp);
 		ASSERT(0);
-		return XFS_ERROR(EIO);
+		return -EIO;
 	}
 
 	/* find blk_no of tail of log */
@@ -1092,8 +1094,8 @@ done:
  *
  * Return:
  *	0  => the log is completely written to
- *	-1 => use *blk_no as the first block of the log
- *	>0 => error has occurred
+ *	1 => use *blk_no as the first block of the log
+ *	<0 => error has occurred
  */
 STATIC int
 xlog_find_zeroed(
@@ -1112,7 +1114,7 @@ xlog_find_zeroed(
 	/* check totally zeroed log */
 	bp = xlog_get_bp(log, 1);
 	if (!bp)
-		return ENOMEM;
+		return -ENOMEM;
 	error = xlog_bread(log, 0, 1, bp, &offset);
 	if (error)
 		goto bp_err;
@@ -1121,7 +1123,7 @@ xlog_find_zeroed(
 	if (first_cycle == 0) {		/* completely zeroed log */
 		*blk_no = 0;
 		xlog_put_bp(bp);
-		return -1;
+		return 1;
 	}
 
 	/* check partially zeroed log */
@@ -1141,7 +1143,7 @@ xlog_find_zeroed(
 		 */
 		xfs_warn(log->l_mp,
 			"Log inconsistent or not a log (last==0, first!=1)");
-		error = XFS_ERROR(EINVAL);
+		error = -EINVAL;
 		goto bp_err;
 	}
 
@@ -1179,19 +1181,18 @@ xlog_find_zeroed(
 	 * Potentially backup over partial log record write.  We don't need
 	 * to search the end of the log because we know it is zero.
 	 */
-	if ((error = xlog_find_verify_log_record(log, start_blk,
-				&last_blk, 0)) == -1) {
-	    error = XFS_ERROR(EIO);
-	    goto bp_err;
-	} else if (error)
-	    goto bp_err;
+	error = xlog_find_verify_log_record(log, start_blk, &last_blk, 0);
+	if (error == 1)
+		error = -EIO;
+	if (error)
+		goto bp_err;
 
 	*blk_no = last_blk;
 bp_err:
 	xlog_put_bp(bp);
 	if (error)
 		return error;
-	return -1;
+	return 1;
 }
 
 /*
@@ -1251,7 +1252,7 @@ xlog_write_log_records(
 	while (!(bp = xlog_get_bp(log, bufblks))) {
 		bufblks >>= 1;
 		if (bufblks < sectbb)
-			return ENOMEM;
+			return -ENOMEM;
 	}
 
 	/* We may need to do a read at the start to fill in part of
@@ -1354,7 +1355,7 @@ xlog_clear_stale_blocks(
 		if (unlikely(head_block < tail_block || head_block >= log->l_logBBsize)) {
 			XFS_ERROR_REPORT("xlog_clear_stale_blocks(1)",
 					 XFS_ERRLEVEL_LOW, log->l_mp);
-			return XFS_ERROR(EFSCORRUPTED);
+			return -EFSCORRUPTED;
 		}
 		tail_distance = tail_block + (log->l_logBBsize - head_block);
 	} else {
@@ -1366,7 +1367,7 @@ xlog_clear_stale_blocks(
 		if (unlikely(head_block >= tail_block || head_cycle != (tail_cycle + 1))){
 			XFS_ERROR_REPORT("xlog_clear_stale_blocks(2)",
 					 XFS_ERRLEVEL_LOW, log->l_mp);
-			return XFS_ERROR(EFSCORRUPTED);
+			return -EFSCORRUPTED;
 		}
 		tail_distance = tail_block - head_block;
 	}
@@ -1551,7 +1552,7 @@ xlog_recover_add_to_trans(
 			xfs_warn(log->l_mp, "%s: bad header magic number",
 				__func__);
 			ASSERT(0);
-			return XFS_ERROR(EIO);
+			return -EIO;
 		}
 		if (len == sizeof(xfs_trans_header_t))
 			xlog_recover_add_item(&trans->r_itemq);
@@ -1581,7 +1582,7 @@ xlog_recover_add_to_trans(
 				  in_f->ilf_size);
 			ASSERT(0);
 			kmem_free(ptr);
-			return XFS_ERROR(EIO);
+			return -EIO;
 		}
 
 		item->ri_total = in_f->ilf_size;
@@ -1702,7 +1703,7 @@ xlog_recover_reorder_trans(
 			 */
 			if (!list_empty(&sort_list))
 				list_splice_init(&sort_list, &trans->r_itemq);
-			error = XFS_ERROR(EIO);
+			error = -EIO;
 			goto out;
 		}
 	}
@@ -1943,7 +1944,7 @@ xlog_recover_do_inode_buffer(
 				item, bp);
 			XFS_ERROR_REPORT("xlog_recover_do_inode_buf",
 					 XFS_ERRLEVEL_LOW, mp);
-			return XFS_ERROR(EFSCORRUPTED);
+			return -EFSCORRUPTED;
 		}
 
 		buffer_nextp = (xfs_agino_t *)xfs_buf_offset(bp,
@@ -2125,6 +2126,17 @@ xlog_recover_validate_buf_type(
 	__uint16_t		magic16;
 	__uint16_t		magicda;
 
+	/*
+	 * We can only do post recovery validation on items on CRC enabled
+	 * fielsystems as we need to know when the buffer was written to be able
+	 * to determine if we should have replayed the item. If we replay old
+	 * metadata over a newer buffer, then it will enter a temporarily
+	 * inconsistent state resulting in verification failures. Hence for now
+	 * just avoid the verification stage for non-crc filesystems
+	 */
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return;
+
 	magic32 = be32_to_cpu(*(__be32 *)bp->b_addr);
 	magic16 = be16_to_cpu(*(__be16*)bp->b_addr);
 	magicda = be16_to_cpu(info->magic);
@@ -2162,8 +2174,6 @@ xlog_recover_validate_buf_type(
 		bp->b_ops = &xfs_agf_buf_ops;
 		break;
 	case XFS_BLFT_AGFL_BUF:
-		if (!xfs_sb_version_hascrc(&mp->m_sb))
-			break;
 		if (magic32 != XFS_AGFL_MAGIC) {
 			xfs_warn(mp, "Bad AGFL block magic!");
 			ASSERT(0);
@@ -2196,10 +2206,6 @@ xlog_recover_validate_buf_type(
 #endif
 		break;
 	case XFS_BLFT_DINO_BUF:
-		/*
-		 * we get here with inode allocation buffers, not buffers that
-		 * track unlinked list changes.
-		 */
 		if (magic16 != XFS_DINODE_MAGIC) {
 			xfs_warn(mp, "Bad INODE block magic!");
 			ASSERT(0);
@@ -2279,8 +2285,6 @@ xlog_recover_validate_buf_type(
 		bp->b_ops = &xfs_attr3_leaf_buf_ops;
 		break;
 	case XFS_BLFT_ATTR_RMT_BUF:
-		if (!xfs_sb_version_hascrc(&mp->m_sb))
-			break;
 		if (magic32 != XFS_ATTR3_RMT_MAGIC) {
 			xfs_warn(mp, "Bad attr remote magic!");
 			ASSERT(0);
@@ -2387,16 +2391,7 @@ xlog_recover_do_reg_buffer(
 	/* Shouldn't be any more regions */
 	ASSERT(i == item->ri_total);
 
-	/*
-	 * We can only do post recovery validation on items on CRC enabled
-	 * fielsystems as we need to know when the buffer was written to be able
-	 * to determine if we should have replayed the item. If we replay old
-	 * metadata over a newer buffer, then it will enter a temporarily
-	 * inconsistent state resulting in verification failures. Hence for now
-	 * just avoid the verification stage for non-crc filesystems
-	 */
-	if (xfs_sb_version_hascrc(&mp->m_sb))
-		xlog_recover_validate_buf_type(mp, bp, buf_f);
+	xlog_recover_validate_buf_type(mp, bp, buf_f);
 }
 
 /*
@@ -2404,8 +2399,11 @@ xlog_recover_do_reg_buffer(
  * Simple algorithm: if we have found a QUOTAOFF log item of the same type
  * (ie. USR or GRP), then just toss this buffer away; don't recover it.
  * Else, treat it as a regular buffer and do recovery.
+ *
+ * Return false if the buffer was tossed and true if we recovered the buffer to
+ * indicate to the caller if the buffer needs writing.
  */
-STATIC void
+STATIC bool
 xlog_recover_do_dquot_buffer(
 	struct xfs_mount		*mp,
 	struct xlog			*log,
@@ -2420,9 +2418,8 @@ xlog_recover_do_dquot_buffer(
 	/*
 	 * Filesystems are required to send in quota flags at mount time.
 	 */
-	if (mp->m_qflags == 0) {
-		return;
-	}
+	if (!mp->m_qflags)
+		return false;
 
 	type = 0;
 	if (buf_f->blf_flags & XFS_BLF_UDQUOT_BUF)
@@ -2435,9 +2432,10 @@ xlog_recover_do_dquot_buffer(
 	 * This type of quotas was turned off, so ignore this buffer
 	 */
 	if (log->l_quotaoffs_flag & type)
-		return;
+		return false;
 
 	xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
+	return true;
 }
 
 /*
@@ -2496,7 +2494,7 @@ xlog_recover_buffer_pass2(
 	bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len,
 			  buf_flags, NULL);
 	if (!bp)
-		return XFS_ERROR(ENOMEM);
+		return -ENOMEM;
 	error = bp->b_error;
 	if (error) {
 		xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#1)");
@@ -2504,23 +2502,44 @@ xlog_recover_buffer_pass2(
 	}
 
 	/*
-	 * recover the buffer only if we get an LSN from it and it's less than
+	 * Recover the buffer only if we get an LSN from it and it's less than
 	 * the lsn of the transaction we are replaying.
+	 *
+	 * Note that we have to be extremely careful of readahead here.
+	 * Readahead does not attach verfiers to the buffers so if we don't
+	 * actually do any replay after readahead because of the LSN we found
+	 * in the buffer if more recent than that current transaction then we
+	 * need to attach the verifier directly. Failure to do so can lead to
+	 * future recovery actions (e.g. EFI and unlinked list recovery) can
+	 * operate on the buffers and they won't get the verifier attached. This
+	 * can lead to blocks on disk having the correct content but a stale
+	 * CRC.
+	 *
+	 * It is safe to assume these clean buffers are currently up to date.
+	 * If the buffer is dirtied by a later transaction being replayed, then
+	 * the verifier will be reset to match whatever recover turns that
+	 * buffer into.
 	 */
 	lsn = xlog_recover_get_buf_lsn(mp, bp);
-	if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0)
+	if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
+		xlog_recover_validate_buf_type(mp, bp, buf_f);
 		goto out_release;
+	}
 
 	if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
 		error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
+		if (error)
+			goto out_release;
 	} else if (buf_f->blf_flags &
 		  (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
-		xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
+		bool	dirty;
+
+		dirty = xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
+		if (!dirty)
+			goto out_release;
 	} else {
 		xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
 	}
-	if (error)
-		goto out_release;
 
 	/*
 	 * Perform delayed write on the buffer.  Asynchronous writes will be
@@ -2598,7 +2617,7 @@ xfs_recover_inode_owner_change(
 
 	ip = xfs_inode_alloc(mp, in_f->ilf_ino);
 	if (!ip)
-		return ENOMEM;
+		return -ENOMEM;
 
 	/* instantiate the inode */
 	xfs_dinode_from_disk(&ip->i_d, dip);
@@ -2676,7 +2695,7 @@ xlog_recover_inode_pass2(
 	bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0,
 			  &xfs_inode_buf_ops);
 	if (!bp) {
-		error = ENOMEM;
+		error = -ENOMEM;
 		goto error;
 	}
 	error = bp->b_error;
@@ -2697,7 +2716,7 @@ xlog_recover_inode_pass2(
 			__func__, dip, bp, in_f->ilf_ino);
 		XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)",
 				 XFS_ERRLEVEL_LOW, mp);
-		error = EFSCORRUPTED;
+		error = -EFSCORRUPTED;
 		goto out_release;
 	}
 	dicp = item->ri_buf[1].i_addr;
@@ -2707,7 +2726,7 @@ xlog_recover_inode_pass2(
 			__func__, item, in_f->ilf_ino);
 		XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)",
 				 XFS_ERRLEVEL_LOW, mp);
-		error = EFSCORRUPTED;
+		error = -EFSCORRUPTED;
 		goto out_release;
 	}
 
@@ -2764,7 +2783,7 @@ xlog_recover_inode_pass2(
 		"%s: Bad regular inode log record, rec ptr 0x%p, "
 		"ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
 				__func__, item, dip, bp, in_f->ilf_ino);
-			error = EFSCORRUPTED;
+			error = -EFSCORRUPTED;
 			goto out_release;
 		}
 	} else if (unlikely(S_ISDIR(dicp->di_mode))) {
@@ -2777,7 +2796,7 @@ xlog_recover_inode_pass2(
 		"%s: Bad dir inode log record, rec ptr 0x%p, "
 		"ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
 				__func__, item, dip, bp, in_f->ilf_ino);
-			error = EFSCORRUPTED;
+			error = -EFSCORRUPTED;
 			goto out_release;
 		}
 	}
@@ -2790,7 +2809,7 @@ xlog_recover_inode_pass2(
 			__func__, item, dip, bp, in_f->ilf_ino,
 			dicp->di_nextents + dicp->di_anextents,
 			dicp->di_nblocks);
-		error = EFSCORRUPTED;
+		error = -EFSCORRUPTED;
 		goto out_release;
 	}
 	if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) {
@@ -2800,7 +2819,7 @@ xlog_recover_inode_pass2(
 	"%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, "
 	"dino bp 0x%p, ino %Ld, forkoff 0x%x", __func__,
 			item, dip, bp, in_f->ilf_ino, dicp->di_forkoff);
-		error = EFSCORRUPTED;
+		error = -EFSCORRUPTED;
 		goto out_release;
 	}
 	isize = xfs_icdinode_size(dicp->di_version);
@@ -2810,7 +2829,7 @@ xlog_recover_inode_pass2(
 		xfs_alert(mp,
 			"%s: Bad inode log record length %d, rec ptr 0x%p",
 			__func__, item->ri_buf[1].i_len, item);
-		error = EFSCORRUPTED;
+		error = -EFSCORRUPTED;
 		goto out_release;
 	}
 
@@ -2898,7 +2917,7 @@ xlog_recover_inode_pass2(
 		default:
 			xfs_warn(log->l_mp, "%s: Invalid flag", __func__);
 			ASSERT(0);
-			error = EIO;
+			error = -EIO;
 			goto out_release;
 		}
 	}
@@ -2919,7 +2938,7 @@ out_release:
 error:
 	if (need_free)
 		kmem_free(in_f);
-	return XFS_ERROR(error);
+	return error;
 }
 
 /*
@@ -2946,7 +2965,7 @@ xlog_recover_quotaoff_pass1(
 	if (qoff_f->qf_flags & XFS_GQUOTA_ACCT)
 		log->l_quotaoffs_flag |= XFS_DQ_GROUP;
 
-	return (0);
+	return 0;
 }
 
 /*
@@ -2971,17 +2990,17 @@ xlog_recover_dquot_pass2(
 	 * Filesystems are required to send in quota flags at mount time.
 	 */
 	if (mp->m_qflags == 0)
-		return (0);
+		return 0;
 
 	recddq = item->ri_buf[1].i_addr;
 	if (recddq == NULL) {
 		xfs_alert(log->l_mp, "NULL dquot in %s.", __func__);
-		return XFS_ERROR(EIO);
+		return -EIO;
 	}
 	if (item->ri_buf[1].i_len < sizeof(xfs_disk_dquot_t)) {
 		xfs_alert(log->l_mp, "dquot too small (%d) in %s.",
 			item->ri_buf[1].i_len, __func__);
-		return XFS_ERROR(EIO);
+		return -EIO;
 	}
 
 	/*
@@ -2990,7 +3009,7 @@ xlog_recover_dquot_pass2(
 	type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP);
 	ASSERT(type);
 	if (log->l_quotaoffs_flag & type)
-		return (0);
+		return 0;
 
 	/*
 	 * At this point we know that quota was _not_ turned off.
@@ -3007,12 +3026,19 @@ xlog_recover_dquot_pass2(
 	error = xfs_dqcheck(mp, recddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
 			   "xlog_recover_dquot_pass2 (log copy)");
 	if (error)
-		return XFS_ERROR(EIO);
+		return -EIO;
 	ASSERT(dq_f->qlf_len == 1);
 
+	/*
+	 * At this point we are assuming that the dquots have been allocated
+	 * and hence the buffer has valid dquots stamped in it. It should,
+	 * therefore, pass verifier validation. If the dquot is bad, then the
+	 * we'll return an error here, so we don't need to specifically check
+	 * the dquot in the buffer after the verifier has run.
+	 */
 	error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dq_f->qlf_blkno,
 				   XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp,
-				   NULL);
+				   &xfs_dquot_buf_ops);
 	if (error)
 		return error;
 
@@ -3020,18 +3046,6 @@ xlog_recover_dquot_pass2(
 	ddq = (xfs_disk_dquot_t *)xfs_buf_offset(bp, dq_f->qlf_boffset);
 
 	/*
-	 * At least the magic num portion should be on disk because this
-	 * was among a chunk of dquots created earlier, and we did some
-	 * minimal initialization then.
-	 */
-	error = xfs_dqcheck(mp, ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
-			   "xlog_recover_dquot_pass2");
-	if (error) {
-		xfs_buf_relse(bp);
-		return XFS_ERROR(EIO);
-	}
-
-	/*
 	 * If the dquot has an LSN in it, recover the dquot only if it's less
 	 * than the lsn of the transaction we are replaying.
 	 */
@@ -3178,38 +3192,38 @@ xlog_recover_do_icreate_pass2(
 	icl = (struct xfs_icreate_log *)item->ri_buf[0].i_addr;
 	if (icl->icl_type != XFS_LI_ICREATE) {
 		xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad type");
-		return EINVAL;
+		return -EINVAL;
 	}
 
 	if (icl->icl_size != 1) {
 		xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad icl size");
-		return EINVAL;
+		return -EINVAL;
 	}
 
 	agno = be32_to_cpu(icl->icl_ag);
 	if (agno >= mp->m_sb.sb_agcount) {
 		xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agno");
-		return EINVAL;
+		return -EINVAL;
 	}
 	agbno = be32_to_cpu(icl->icl_agbno);
 	if (!agbno || agbno == NULLAGBLOCK || agbno >= mp->m_sb.sb_agblocks) {
 		xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agbno");
-		return EINVAL;
+		return -EINVAL;
 	}
 	isize = be32_to_cpu(icl->icl_isize);
 	if (isize != mp->m_sb.sb_inodesize) {
 		xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad isize");
-		return EINVAL;
+		return -EINVAL;
 	}
 	count = be32_to_cpu(icl->icl_count);
 	if (!count) {
 		xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count");
-		return EINVAL;
+		return -EINVAL;
 	}
 	length = be32_to_cpu(icl->icl_length);
 	if (!length || length >= mp->m_sb.sb_agblocks) {
 		xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad length");
-		return EINVAL;
+		return -EINVAL;
 	}
 
 	/* existing allocation is fixed value */
@@ -3218,7 +3232,7 @@ xlog_recover_do_icreate_pass2(
 	if (count != mp->m_ialloc_inos ||
 	     length != mp->m_ialloc_blks) {
 		xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count 2");
-		return EINVAL;
+		return -EINVAL;
 	}
 
 	/*
@@ -3389,7 +3403,7 @@ xlog_recover_commit_pass1(
 		xfs_warn(log->l_mp, "%s: invalid item type (%d)",
 			__func__, ITEM_TYPE(item));
 		ASSERT(0);
-		return XFS_ERROR(EIO);
+		return -EIO;
 	}
 }
 
@@ -3425,7 +3439,7 @@ xlog_recover_commit_pass2(
 		xfs_warn(log->l_mp, "%s: invalid item type (%d)",
 			__func__, ITEM_TYPE(item));
 		ASSERT(0);
-		return XFS_ERROR(EIO);
+		return -EIO;
 	}
 }
 
@@ -3560,7 +3574,7 @@ xlog_recover_process_data(
 
 	/* check the log format matches our own - else we can't recover */
 	if (xlog_header_check_recover(log->l_mp, rhead))
-		return (XFS_ERROR(EIO));
+		return -EIO;
 
 	while ((dp < lp) && num_logops) {
 		ASSERT(dp + sizeof(xlog_op_header_t) <= lp);
@@ -3571,7 +3585,7 @@ xlog_recover_process_data(
 			xfs_warn(log->l_mp, "%s: bad clientid 0x%x",
 					__func__, ohead->oh_clientid);
 			ASSERT(0);
-			return (XFS_ERROR(EIO));
+			return -EIO;
 		}
 		tid = be32_to_cpu(ohead->oh_tid);
 		hash = XLOG_RHASH(tid);
@@ -3585,7 +3599,7 @@ xlog_recover_process_data(
 				xfs_warn(log->l_mp, "%s: bad length 0x%x",
 					__func__, be32_to_cpu(ohead->oh_len));
 				WARN_ON(1);
-				return (XFS_ERROR(EIO));
+				return -EIO;
 			}
 			flags = ohead->oh_flags & ~XLOG_END_TRANS;
 			if (flags & XLOG_WAS_CONT_TRANS)
@@ -3607,7 +3621,7 @@ xlog_recover_process_data(
 				xfs_warn(log->l_mp, "%s: bad transaction",
 					__func__);
 				ASSERT(0);
-				error = XFS_ERROR(EIO);
+				error = -EIO;
 				break;
 			case 0:
 			case XLOG_CONTINUE_TRANS:
@@ -3618,7 +3632,7 @@ xlog_recover_process_data(
 				xfs_warn(log->l_mp, "%s: bad flag 0x%x",
 					__func__, flags);
 				ASSERT(0);
-				error = XFS_ERROR(EIO);
+				error = -EIO;
 				break;
 			}
 			if (error) {
@@ -3669,7 +3683,7 @@ xlog_recover_process_efi(
 			 */
 			set_bit(XFS_EFI_RECOVERED, &efip->efi_flags);
 			xfs_efi_release(efip, efip->efi_format.efi_nextents);
-			return XFS_ERROR(EIO);
+			return -EIO;
 		}
 	}
 
@@ -3969,7 +3983,7 @@ xlog_unpack_data_crc(
 		 * CRC protection by punting an error back up the stack.
 		 */
 		if (xfs_sb_version_hascrc(&log->l_mp->m_sb))
-			return EFSCORRUPTED;
+			return -EFSCORRUPTED;
 	}
 
 	return 0;
@@ -4018,14 +4032,14 @@ xlog_valid_rec_header(
 	if (unlikely(rhead->h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM))) {
 		XFS_ERROR_REPORT("xlog_valid_rec_header(1)",
 				XFS_ERRLEVEL_LOW, log->l_mp);
-		return XFS_ERROR(EFSCORRUPTED);
+		return -EFSCORRUPTED;
 	}
 	if (unlikely(
 	    (!rhead->h_version ||
 	    (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) {
 		xfs_warn(log->l_mp, "%s: unrecognised log version (%d).",
 			__func__, be32_to_cpu(rhead->h_version));
-		return XFS_ERROR(EIO);
+		return -EIO;
 	}
 
 	/* LR body must have data or it wouldn't have been written */
@@ -4033,12 +4047,12 @@ xlog_valid_rec_header(
 	if (unlikely( hlen <= 0 || hlen > INT_MAX )) {
 		XFS_ERROR_REPORT("xlog_valid_rec_header(2)",
 				XFS_ERRLEVEL_LOW, log->l_mp);
-		return XFS_ERROR(EFSCORRUPTED);
+		return -EFSCORRUPTED;
 	}
 	if (unlikely( blkno > log->l_logBBsize || blkno > INT_MAX )) {
 		XFS_ERROR_REPORT("xlog_valid_rec_header(3)",
 				XFS_ERRLEVEL_LOW, log->l_mp);
-		return XFS_ERROR(EFSCORRUPTED);
+		return -EFSCORRUPTED;
 	}
 	return 0;
 }
@@ -4081,7 +4095,7 @@ xlog_do_recovery_pass(
 		 */
 		hbp = xlog_get_bp(log, 1);
 		if (!hbp)
-			return ENOMEM;
+			return -ENOMEM;
 
 		error = xlog_bread(log, tail_blk, 1, hbp, &offset);
 		if (error)
@@ -4110,11 +4124,11 @@ xlog_do_recovery_pass(
 	}
 
 	if (!hbp)
-		return ENOMEM;
+		return -ENOMEM;
 	dbp = xlog_get_bp(log, BTOBB(h_size));
 	if (!dbp) {
 		xlog_put_bp(hbp);
-		return ENOMEM;
+		return -ENOMEM;
 	}
 
 	memset(rhash, 0, sizeof(rhash));
@@ -4388,7 +4402,7 @@ xlog_do_recover(
 	 * If IO errors happened during recovery, bail out.
 	 */
 	if (XFS_FORCED_SHUTDOWN(log->l_mp)) {
-		return (EIO);
+		return -EIO;
 	}
 
 	/*
@@ -4415,7 +4429,7 @@ xlog_do_recover(
 
 	if (XFS_FORCED_SHUTDOWN(log->l_mp)) {
 		xfs_buf_relse(bp);
-		return XFS_ERROR(EIO);
+		return -EIO;
 	}
 
 	xfs_buf_iorequest(bp);
@@ -4492,7 +4506,7 @@ xlog_recover(
 "Please recover the log on a kernel that supports the unknown features.",
 				(log->l_mp->m_sb.sb_features_log_incompat &
 					XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN));
-			return EINVAL;
+			return -EINVAL;
 		}
 
 		xfs_notice(log->l_mp, "Starting recovery (logdev: %s)",
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 3507cd0ec400..fbf0384a466f 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -42,6 +42,7 @@
 #include "xfs_trace.h"
 #include "xfs_icache.h"
 #include "xfs_dinode.h"
+#include "xfs_sysfs.h"
 
 
 #ifdef HAVE_PERCPU_SB
@@ -60,6 +61,8 @@ static DEFINE_MUTEX(xfs_uuid_table_mutex);
 static int xfs_uuid_table_size;
 static uuid_t *xfs_uuid_table;
 
+extern struct kset *xfs_kset;
+
 /*
  * See if the UUID is unique among mounted XFS filesystems.
  * Mount fails if UUID is nil or a FS with the same UUID is already mounted.
@@ -76,7 +79,7 @@ xfs_uuid_mount(
 
 	if (uuid_is_nil(uuid)) {
 		xfs_warn(mp, "Filesystem has nil UUID - can't mount");
-		return XFS_ERROR(EINVAL);
+		return -EINVAL;
 	}
 
 	mutex_lock(&xfs_uuid_table_mutex);
@@ -104,7 +107,7 @@ xfs_uuid_mount(
  out_duplicate:
 	mutex_unlock(&xfs_uuid_table_mutex);
 	xfs_warn(mp, "Filesystem has duplicate UUID %pU - can't mount", uuid);
-	return XFS_ERROR(EINVAL);
+	return -EINVAL;
 }
 
 STATIC void
@@ -173,13 +176,9 @@ xfs_sb_validate_fsb_count(
 	ASSERT(PAGE_SHIFT >= sbp->sb_blocklog);
 	ASSERT(sbp->sb_blocklog >= BBSHIFT);
 
-#if XFS_BIG_BLKNOS     /* Limited by ULONG_MAX of page cache index */
+	/* Limited by ULONG_MAX of page cache index */
 	if (nblocks >> (PAGE_CACHE_SHIFT - sbp->sb_blocklog) > ULONG_MAX)
-		return EFBIG;
-#else                  /* Limited by UINT_MAX of sectors */
-	if (nblocks << (sbp->sb_blocklog - BBSHIFT) > UINT_MAX)
-		return EFBIG;
-#endif
+		return -EFBIG;
 	return 0;
 }
 
@@ -250,9 +249,9 @@ xfs_initialize_perag(
 		mp->m_flags &= ~XFS_MOUNT_32BITINODES;
 
 	if (mp->m_flags & XFS_MOUNT_32BITINODES)
-		index = xfs_set_inode32(mp);
+		index = xfs_set_inode32(mp, agcount);
 	else
-		index = xfs_set_inode64(mp);
+		index = xfs_set_inode64(mp, agcount);
 
 	if (maxagi)
 		*maxagi = index;
@@ -308,15 +307,15 @@ reread:
 	if (!bp) {
 		if (loud)
 			xfs_warn(mp, "SB buffer read failed");
-		return EIO;
+		return -EIO;
 	}
 	if (bp->b_error) {
 		error = bp->b_error;
 		if (loud)
 			xfs_warn(mp, "SB validate failed with error %d.", error);
 		/* bad CRC means corrupted metadata */
-		if (error == EFSBADCRC)
-			error = EFSCORRUPTED;
+		if (error == -EFSBADCRC)
+			error = -EFSCORRUPTED;
 		goto release_buf;
 	}
 
@@ -324,7 +323,6 @@ reread:
 	 * Initialize the mount structure from the superblock.
 	 */
 	xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp));
-	xfs_sb_quota_from_disk(sbp);
 
 	/*
 	 * If we haven't validated the superblock, do so now before we try
@@ -333,7 +331,7 @@ reread:
 	if (sbp->sb_magicnum != XFS_SB_MAGIC) {
 		if (loud)
 			xfs_warn(mp, "Invalid superblock magic number");
-		error = EINVAL;
+		error = -EINVAL;
 		goto release_buf;
 	}
 
@@ -344,7 +342,7 @@ reread:
 		if (loud)
 			xfs_warn(mp, "device supports %u byte sectors (not %u)",
 				sector_size, sbp->sb_sectsize);
-		error = ENOSYS;
+		error = -ENOSYS;
 		goto release_buf;
 	}
 
@@ -392,7 +390,7 @@ xfs_update_alignment(xfs_mount_t *mp)
 			xfs_warn(mp,
 		"alignment check failed: sunit/swidth vs. blocksize(%d)",
 				sbp->sb_blocksize);
-			return XFS_ERROR(EINVAL);
+			return -EINVAL;
 		} else {
 			/*
 			 * Convert the stripe unit and width to FSBs.
@@ -402,14 +400,14 @@ xfs_update_alignment(xfs_mount_t *mp)
 				xfs_warn(mp,
 			"alignment check failed: sunit/swidth vs. agsize(%d)",
 					 sbp->sb_agblocks);
-				return XFS_ERROR(EINVAL);
+				return -EINVAL;
 			} else if (mp->m_dalign) {
 				mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth);
 			} else {
 				xfs_warn(mp,
 			"alignment check failed: sunit(%d) less than bsize(%d)",
 					 mp->m_dalign, sbp->sb_blocksize);
-				return XFS_ERROR(EINVAL);
+				return -EINVAL;
 			}
 		}
 
@@ -429,7 +427,7 @@ xfs_update_alignment(xfs_mount_t *mp)
 		} else {
 			xfs_warn(mp,
 	"cannot change alignment: superblock does not support data alignment");
-			return XFS_ERROR(EINVAL);
+			return -EINVAL;
 		}
 	} else if ((mp->m_flags & XFS_MOUNT_NOALIGN) != XFS_MOUNT_NOALIGN &&
 		    xfs_sb_version_hasdalign(&mp->m_sb)) {
@@ -556,14 +554,14 @@ xfs_check_sizes(xfs_mount_t *mp)
 	d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
 	if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) {
 		xfs_warn(mp, "filesystem size mismatch detected");
-		return XFS_ERROR(EFBIG);
+		return -EFBIG;
 	}
 	bp = xfs_buf_read_uncached(mp->m_ddev_targp,
 					d - XFS_FSS_TO_BB(mp, 1),
 					XFS_FSS_TO_BB(mp, 1), 0, NULL);
 	if (!bp) {
 		xfs_warn(mp, "last sector read failed");
-		return EIO;
+		return -EIO;
 	}
 	xfs_buf_relse(bp);
 
@@ -571,14 +569,14 @@ xfs_check_sizes(xfs_mount_t *mp)
 		d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks);
 		if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) {
 			xfs_warn(mp, "log size mismatch detected");
-			return XFS_ERROR(EFBIG);
+			return -EFBIG;
 		}
 		bp = xfs_buf_read_uncached(mp->m_logdev_targp,
 					d - XFS_FSB_TO_BB(mp, 1),
 					XFS_FSB_TO_BB(mp, 1), 0, NULL);
 		if (!bp) {
 			xfs_warn(mp, "log device read failed");
-			return EIO;
+			return -EIO;
 		}
 		xfs_buf_relse(bp);
 	}
@@ -731,10 +729,15 @@ xfs_mountfs(
 
 	xfs_set_maxicount(mp);
 
-	error = xfs_uuid_mount(mp);
+	mp->m_kobj.kobject.kset = xfs_kset;
+	error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype, NULL, mp->m_fsname);
 	if (error)
 		goto out;
 
+	error = xfs_uuid_mount(mp);
+	if (error)
+		goto out_remove_sysfs;
+
 	/*
 	 * Set the minimum read and write sizes
 	 */
@@ -816,7 +819,7 @@ xfs_mountfs(
 	if (!sbp->sb_logblocks) {
 		xfs_warn(mp, "no log defined");
 		XFS_ERROR_REPORT("xfs_mountfs", XFS_ERRLEVEL_LOW, mp);
-		error = XFS_ERROR(EFSCORRUPTED);
+		error = -EFSCORRUPTED;
 		goto out_free_perag;
 	}
 
@@ -855,7 +858,7 @@ xfs_mountfs(
 	     !mp->m_sb.sb_inprogress) {
 		error = xfs_initialize_perag_data(mp, sbp->sb_agcount);
 		if (error)
-			goto out_fail_wait;
+			goto out_log_dealloc;
 	}
 
 	/*
@@ -876,7 +879,7 @@ xfs_mountfs(
 		xfs_iunlock(rip, XFS_ILOCK_EXCL);
 		XFS_ERROR_REPORT("xfs_mountfs_int(2)", XFS_ERRLEVEL_LOW,
 				 mp);
-		error = XFS_ERROR(EFSCORRUPTED);
+		error = -EFSCORRUPTED;
 		goto out_rele_rip;
 	}
 	mp->m_rootip = rip;	/* save it */
@@ -927,7 +930,7 @@ xfs_mountfs(
 			xfs_notice(mp, "resetting quota flags");
 			error = xfs_mount_reset_sbqflags(mp);
 			if (error)
-				return error;
+				goto out_rtunmount;
 		}
 	}
 
@@ -989,6 +992,8 @@ xfs_mountfs(
 	xfs_da_unmount(mp);
  out_remove_uuid:
 	xfs_uuid_unmount(mp);
+ out_remove_sysfs:
+	xfs_sysfs_del(&mp->m_kobj);
  out:
 	return error;
 }
@@ -1071,6 +1076,8 @@ xfs_unmountfs(
 	xfs_errortag_clearall(mp, 0);
 #endif
 	xfs_free_perag(mp);
+
+	xfs_sysfs_del(&mp->m_kobj);
 }
 
 int
@@ -1152,7 +1159,7 @@ xfs_mod_incore_sb_unlocked(
 		lcounter += delta;
 		if (lcounter < 0) {
 			ASSERT(0);
-			return XFS_ERROR(EINVAL);
+			return -EINVAL;
 		}
 		mp->m_sb.sb_icount = lcounter;
 		return 0;
@@ -1161,7 +1168,7 @@ xfs_mod_incore_sb_unlocked(
 		lcounter += delta;
 		if (lcounter < 0) {
 			ASSERT(0);
-			return XFS_ERROR(EINVAL);
+			return -EINVAL;
 		}
 		mp->m_sb.sb_ifree = lcounter;
 		return 0;
@@ -1191,7 +1198,7 @@ xfs_mod_incore_sb_unlocked(
 			 * blocks if were allowed to.
 			 */
 			if (!rsvd)
-				return XFS_ERROR(ENOSPC);
+				return -ENOSPC;
 
 			lcounter = (long long)mp->m_resblks_avail + delta;
 			if (lcounter >= 0) {
@@ -1202,7 +1209,7 @@ xfs_mod_incore_sb_unlocked(
 				"Filesystem \"%s\": reserve blocks depleted! "
 				"Consider increasing reserve pool size.",
 				mp->m_fsname);
-			return XFS_ERROR(ENOSPC);
+			return -ENOSPC;
 		}
 
 		mp->m_sb.sb_fdblocks = lcounter + XFS_ALLOC_SET_ASIDE(mp);
@@ -1211,7 +1218,7 @@ xfs_mod_incore_sb_unlocked(
 		lcounter = (long long)mp->m_sb.sb_frextents;
 		lcounter += delta;
 		if (lcounter < 0) {
-			return XFS_ERROR(ENOSPC);
+			return -ENOSPC;
 		}
 		mp->m_sb.sb_frextents = lcounter;
 		return 0;
@@ -1220,7 +1227,7 @@ xfs_mod_incore_sb_unlocked(
 		lcounter += delta;
 		if (lcounter < 0) {
 			ASSERT(0);
-			return XFS_ERROR(EINVAL);
+			return -EINVAL;
 		}
 		mp->m_sb.sb_dblocks = lcounter;
 		return 0;
@@ -1229,7 +1236,7 @@ xfs_mod_incore_sb_unlocked(
 		scounter += delta;
 		if (scounter < 0) {
 			ASSERT(0);
-			return XFS_ERROR(EINVAL);
+			return -EINVAL;
 		}
 		mp->m_sb.sb_agcount = scounter;
 		return 0;
@@ -1238,7 +1245,7 @@ xfs_mod_incore_sb_unlocked(
 		scounter += delta;
 		if (scounter < 0) {
 			ASSERT(0);
-			return XFS_ERROR(EINVAL);
+			return -EINVAL;
 		}
 		mp->m_sb.sb_imax_pct = scounter;
 		return 0;
@@ -1247,7 +1254,7 @@ xfs_mod_incore_sb_unlocked(
 		scounter += delta;
 		if (scounter < 0) {
 			ASSERT(0);
-			return XFS_ERROR(EINVAL);
+			return -EINVAL;
 		}
 		mp->m_sb.sb_rextsize = scounter;
 		return 0;
@@ -1256,7 +1263,7 @@ xfs_mod_incore_sb_unlocked(
 		scounter += delta;
 		if (scounter < 0) {
 			ASSERT(0);
-			return XFS_ERROR(EINVAL);
+			return -EINVAL;
 		}
 		mp->m_sb.sb_rbmblocks = scounter;
 		return 0;
@@ -1265,7 +1272,7 @@ xfs_mod_incore_sb_unlocked(
 		lcounter += delta;
 		if (lcounter < 0) {
 			ASSERT(0);
-			return XFS_ERROR(EINVAL);
+			return -EINVAL;
 		}
 		mp->m_sb.sb_rblocks = lcounter;
 		return 0;
@@ -1274,7 +1281,7 @@ xfs_mod_incore_sb_unlocked(
 		lcounter += delta;
 		if (lcounter < 0) {
 			ASSERT(0);
-			return XFS_ERROR(EINVAL);
+			return -EINVAL;
 		}
 		mp->m_sb.sb_rextents = lcounter;
 		return 0;
@@ -1283,13 +1290,13 @@ xfs_mod_incore_sb_unlocked(
 		scounter += delta;
 		if (scounter < 0) {
 			ASSERT(0);
-			return XFS_ERROR(EINVAL);
+			return -EINVAL;
 		}
 		mp->m_sb.sb_rextslog = scounter;
 		return 0;
 	default:
 		ASSERT(0);
-		return XFS_ERROR(EINVAL);
+		return -EINVAL;
 	}
 }
 
@@ -1452,7 +1459,7 @@ xfs_dev_is_read_only(
 	    (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) {
 		xfs_notice(mp, "%s required on read-only device.", message);
 		xfs_notice(mp, "write access unavailable, cannot proceed.");
-		return EROFS;
+		return -EROFS;
 	}
 	return 0;
 }
@@ -1995,7 +2002,7 @@ slow_path:
 	 * (e.g. lots of space just got freed). After that
 	 * we are done.
 	 */
-	if (ret != ENOSPC)
+	if (ret != -ENOSPC)
 		xfs_icsb_balance_counter(mp, field, 0);
 	xfs_icsb_unlock(mp);
 	return ret;
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 7295a0b7c343..b0447c86e7e2 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -166,6 +166,7 @@ typedef struct xfs_mount {
 						   on the next remount,rw */
 	int64_t			m_low_space[XFS_LOWSP_MAX];
 						/* low free space thresholds */
+	struct xfs_kobj		m_kobj;
 
 	struct workqueue_struct	*m_data_workqueue;
 	struct workqueue_struct	*m_unwritten_workqueue;
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index f99b4933dc22..1eb6f3df698c 100644
--- a/fs/xfs/xfs_mru_cache.c
+++ b/fs/xfs/xfs_mru_cache.c
@@ -337,20 +337,20 @@ xfs_mru_cache_create(
 		*mrup = NULL;
 
 	if (!mrup || !grp_count || !lifetime_ms || !free_func)
-		return EINVAL;
+		return -EINVAL;
 
 	if (!(grp_time = msecs_to_jiffies(lifetime_ms) / grp_count))
-		return EINVAL;
+		return -EINVAL;
 
 	if (!(mru = kmem_zalloc(sizeof(*mru), KM_SLEEP)))
-		return ENOMEM;
+		return -ENOMEM;
 
 	/* An extra list is needed to avoid reaping up to a grp_time early. */
 	mru->grp_count = grp_count + 1;
 	mru->lists = kmem_zalloc(mru->grp_count * sizeof(*mru->lists), KM_SLEEP);
 
 	if (!mru->lists) {
-		err = ENOMEM;
+		err = -ENOMEM;
 		goto exit;
 	}
 
@@ -434,16 +434,16 @@ xfs_mru_cache_insert(
 
 	ASSERT(mru && mru->lists);
 	if (!mru || !mru->lists)
-		return EINVAL;
+		return -EINVAL;
 
 	if (radix_tree_preload(GFP_KERNEL))
-		return ENOMEM;
+		return -ENOMEM;
 
 	INIT_LIST_HEAD(&elem->list_node);
 	elem->key = key;
 
 	spin_lock(&mru->lock);
-	error = -radix_tree_insert(&mru->store, key, elem);
+	error = radix_tree_insert(&mru->store, key, elem);
 	radix_tree_preload_end();
 	if (!error)
 		_xfs_mru_cache_list_insert(mru, elem);
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 6d26759c779a..10232102b4a6 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -98,18 +98,18 @@ restart:
 			next_index = be32_to_cpu(dqp->q_core.d_id) + 1;
 
 			error = execute(batch[i], data);
-			if (error == EAGAIN) {
+			if (error == -EAGAIN) {
 				skipped++;
 				continue;
 			}
-			if (error && last_error != EFSCORRUPTED)
+			if (error && last_error != -EFSCORRUPTED)
 				last_error = error;
 		}
 
 		mutex_unlock(&qi->qi_tree_lock);
 
 		/* bail out if the filesystem is corrupted.  */
-		if (last_error == EFSCORRUPTED) {
+		if (last_error == -EFSCORRUPTED) {
 			skipped = 0;
 			break;
 		}
@@ -138,7 +138,7 @@ xfs_qm_dqpurge(
 	xfs_dqlock(dqp);
 	if ((dqp->dq_flags & XFS_DQ_FREEING) || dqp->q_nrefs != 0) {
 		xfs_dqunlock(dqp);
-		return EAGAIN;
+		return -EAGAIN;
 	}
 
 	dqp->dq_flags |= XFS_DQ_FREEING;
@@ -221,100 +221,6 @@ xfs_qm_unmount(
 	}
 }
 
-
-/*
- * This is called from xfs_mountfs to start quotas and initialize all
- * necessary data structures like quotainfo.  This is also responsible for
- * running a quotacheck as necessary.  We are guaranteed that the superblock
- * is consistently read in at this point.
- *
- * If we fail here, the mount will continue with quota turned off. We don't
- * need to inidicate success or failure at all.
- */
-void
-xfs_qm_mount_quotas(
-	xfs_mount_t	*mp)
-{
-	int		error = 0;
-	uint		sbf;
-
-	/*
-	 * If quotas on realtime volumes is not supported, we disable
-	 * quotas immediately.
-	 */
-	if (mp->m_sb.sb_rextents) {
-		xfs_notice(mp, "Cannot turn on quotas for realtime filesystem");
-		mp->m_qflags = 0;
-		goto write_changes;
-	}
-
-	ASSERT(XFS_IS_QUOTA_RUNNING(mp));
-
-	/*
-	 * Allocate the quotainfo structure inside the mount struct, and
-	 * create quotainode(s), and change/rev superblock if necessary.
-	 */
-	error = xfs_qm_init_quotainfo(mp);
-	if (error) {
-		/*
-		 * We must turn off quotas.
-		 */
-		ASSERT(mp->m_quotainfo == NULL);
-		mp->m_qflags = 0;
-		goto write_changes;
-	}
-	/*
-	 * If any of the quotas are not consistent, do a quotacheck.
-	 */
-	if (XFS_QM_NEED_QUOTACHECK(mp)) {
-		error = xfs_qm_quotacheck(mp);
-		if (error) {
-			/* Quotacheck failed and disabled quotas. */
-			return;
-		}
-	}
-	/* 
-	 * If one type of quotas is off, then it will lose its
-	 * quotachecked status, since we won't be doing accounting for
-	 * that type anymore.
-	 */
-	if (!XFS_IS_UQUOTA_ON(mp))
-		mp->m_qflags &= ~XFS_UQUOTA_CHKD;
-	if (!XFS_IS_GQUOTA_ON(mp))
-		mp->m_qflags &= ~XFS_GQUOTA_CHKD;
-	if (!XFS_IS_PQUOTA_ON(mp))
-		mp->m_qflags &= ~XFS_PQUOTA_CHKD;
-
- write_changes:
-	/*
-	 * We actually don't have to acquire the m_sb_lock at all.
-	 * This can only be called from mount, and that's single threaded. XXX
-	 */
-	spin_lock(&mp->m_sb_lock);
-	sbf = mp->m_sb.sb_qflags;
-	mp->m_sb.sb_qflags = mp->m_qflags & XFS_MOUNT_QUOTA_ALL;
-	spin_unlock(&mp->m_sb_lock);
-
-	if (sbf != (mp->m_qflags & XFS_MOUNT_QUOTA_ALL)) {
-		if (xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS)) {
-			/*
-			 * We could only have been turning quotas off.
-			 * We aren't in very good shape actually because
-			 * the incore structures are convinced that quotas are
-			 * off, but the on disk superblock doesn't know that !
-			 */
-			ASSERT(!(XFS_IS_QUOTA_RUNNING(mp)));
-			xfs_alert(mp, "%s: Superblock update failed!",
-				__func__);
-		}
-	}
-
-	if (error) {
-		xfs_warn(mp, "Failed to initialize disk quotas.");
-		return;
-	}
-}
-
 /*
  * Called from the vfsops layer.
  */
@@ -671,7 +577,7 @@ xfs_qm_init_quotainfo(
 
 	qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP);
 
-	error = -list_lru_init(&qinf->qi_lru);
+	error = list_lru_init(&qinf->qi_lru);
 	if (error)
 		goto out_free_qinf;
 
@@ -995,7 +901,7 @@ xfs_qm_dqiter_bufs(
 		 * will leave a trace in the log indicating corruption has
 		 * been detected.
 		 */
-		if (error == EFSCORRUPTED) {
+		if (error == -EFSCORRUPTED) {
 			error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
 				      XFS_FSB_TO_DADDR(mp, bno),
 				      mp->m_quotainfo->qi_dqchunklen, 0, &bp,
@@ -1005,6 +911,12 @@ xfs_qm_dqiter_bufs(
 		if (error)
 			break;
 
+		/*
+		 * A corrupt buffer might not have a verifier attached, so
+		 * make sure we have the correct one attached before writeback
+		 * occurs.
+		 */
+		bp->b_ops = &xfs_dquot_buf_ops;
 		xfs_qm_reset_dqcounts(mp, bp, firstid, type);
 		xfs_buf_delwri_queue(bp, buffer_list);
 		xfs_buf_relse(bp);
@@ -1090,7 +1002,7 @@ xfs_qm_dqiterate(
 					xfs_buf_readahead(mp->m_ddev_targp,
 					       XFS_FSB_TO_DADDR(mp, rablkno),
 					       mp->m_quotainfo->qi_dqchunklen,
-					       NULL);
+					       &xfs_dquot_buf_ops);
 					rablkno++;
 				}
 			}
@@ -1138,8 +1050,8 @@ xfs_qm_quotacheck_dqadjust(
 		/*
 		 * Shouldn't be able to turn off quotas here.
 		 */
-		ASSERT(error != ESRCH);
-		ASSERT(error != ENOENT);
+		ASSERT(error != -ESRCH);
+		ASSERT(error != -ENOENT);
 		return error;
 	}
 
@@ -1226,7 +1138,7 @@ xfs_qm_dqusage_adjust(
 	 */
 	if (xfs_is_quota_inode(&mp->m_sb, ino)) {
 		*res = BULKSTAT_RV_NOTHING;
-		return XFS_ERROR(EINVAL);
+		return -EINVAL;
 	}
 
 	/*
@@ -1330,7 +1242,7 @@ out_unlock:
  * Walk thru all the filesystem inodes and construct a consistent view
  * of the disk quota world. If the quotacheck fails, disable quotas.
  */
-int
+STATIC int
 xfs_qm_quotacheck(
 	xfs_mount_t	*mp)
 {
@@ -1463,7 +1375,100 @@ xfs_qm_quotacheck(
 		}
 	} else
 		xfs_notice(mp, "Quotacheck: Done.");
-	return (error);
+	return error;
+}
+
+/*
+ * This is called from xfs_mountfs to start quotas and initialize all
+ * necessary data structures like quotainfo.  This is also responsible for
+ * running a quotacheck as necessary.  We are guaranteed that the superblock
+ * is consistently read in at this point.
+ *
+ * If we fail here, the mount will continue with quota turned off. We don't
+ * need to inidicate success or failure at all.
+ */
+void
+xfs_qm_mount_quotas(
+	struct xfs_mount	*mp)
+{
+	int			error = 0;
+	uint			sbf;
+
+	/*
+	 * If quotas on realtime volumes is not supported, we disable
+	 * quotas immediately.
+	 */
+	if (mp->m_sb.sb_rextents) {
+		xfs_notice(mp, "Cannot turn on quotas for realtime filesystem");
+		mp->m_qflags = 0;
+		goto write_changes;
+	}
+
+	ASSERT(XFS_IS_QUOTA_RUNNING(mp));
+
+	/*
+	 * Allocate the quotainfo structure inside the mount struct, and
+	 * create quotainode(s), and change/rev superblock if necessary.
+	 */
+	error = xfs_qm_init_quotainfo(mp);
+	if (error) {
+		/*
+		 * We must turn off quotas.
+		 */
+		ASSERT(mp->m_quotainfo == NULL);
+		mp->m_qflags = 0;
+		goto write_changes;
+	}
+	/*
+	 * If any of the quotas are not consistent, do a quotacheck.
+	 */
+	if (XFS_QM_NEED_QUOTACHECK(mp)) {
+		error = xfs_qm_quotacheck(mp);
+		if (error) {
+			/* Quotacheck failed and disabled quotas. */
+			return;
+		}
+	}
+	/*
+	 * If one type of quotas is off, then it will lose its
+	 * quotachecked status, since we won't be doing accounting for
+	 * that type anymore.
+	 */
+	if (!XFS_IS_UQUOTA_ON(mp))
+		mp->m_qflags &= ~XFS_UQUOTA_CHKD;
+	if (!XFS_IS_GQUOTA_ON(mp))
+		mp->m_qflags &= ~XFS_GQUOTA_CHKD;
+	if (!XFS_IS_PQUOTA_ON(mp))
+		mp->m_qflags &= ~XFS_PQUOTA_CHKD;
+
+ write_changes:
+	/*
+	 * We actually don't have to acquire the m_sb_lock at all.
+	 * This can only be called from mount, and that's single threaded. XXX
+	 */
+	spin_lock(&mp->m_sb_lock);
+	sbf = mp->m_sb.sb_qflags;
+	mp->m_sb.sb_qflags = mp->m_qflags & XFS_MOUNT_QUOTA_ALL;
+	spin_unlock(&mp->m_sb_lock);
+
+	if (sbf != (mp->m_qflags & XFS_MOUNT_QUOTA_ALL)) {
+		if (xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS)) {
+			/*
+			 * We could only have been turning quotas off.
+			 * We aren't in very good shape actually because
+			 * the incore structures are convinced that quotas are
+			 * off, but the on disk superblock doesn't know that !
+			 */
+			ASSERT(!(XFS_IS_QUOTA_RUNNING(mp)));
+			xfs_alert(mp, "%s: Superblock update failed!",
+				__func__);
+		}
+	}
+
+	if (error) {
+		xfs_warn(mp, "Failed to initialize disk quotas.");
+		return;
+	}
 }
 
 /*
@@ -1493,7 +1498,7 @@ xfs_qm_init_quotainos(
 			error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
 					     0, 0, &uip);
 			if (error)
-				return XFS_ERROR(error);
+				return error;
 		}
 		if (XFS_IS_GQUOTA_ON(mp) &&
 		    mp->m_sb.sb_gquotino != NULLFSINO) {
@@ -1563,7 +1568,7 @@ error_rele:
 		IRELE(gip);
 	if (pip)
 		IRELE(pip);
-	return XFS_ERROR(error);
+	return error;
 }
 
 STATIC void
@@ -1679,7 +1684,7 @@ xfs_qm_vop_dqalloc(
 						 XFS_QMOPT_DOWARN,
 						 &uq);
 			if (error) {
-				ASSERT(error != ENOENT);
+				ASSERT(error != -ENOENT);
 				return error;
 			}
 			/*
@@ -1706,7 +1711,7 @@ xfs_qm_vop_dqalloc(
 						 XFS_QMOPT_DOWARN,
 						 &gq);
 			if (error) {
-				ASSERT(error != ENOENT);
+				ASSERT(error != -ENOENT);
 				goto error_rele;
 			}
 			xfs_dqunlock(gq);
@@ -1726,7 +1731,7 @@ xfs_qm_vop_dqalloc(
 						 XFS_QMOPT_DOWARN,
 						 &pq);
 			if (error) {
-				ASSERT(error != ENOENT);
+				ASSERT(error != -ENOENT);
 				goto error_rele;
 			}
 			xfs_dqunlock(pq);
@@ -1895,7 +1900,7 @@ xfs_qm_vop_chown_reserve(
 				-((xfs_qcnt_t)delblks), 0, blkflags);
 	}
 
-	return (0);
+	return 0;
 }
 
 int
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
index 797fd4636273..3a07a937e232 100644
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -157,7 +157,6 @@ struct xfs_dquot_acct {
 #define XFS_QM_RTBWARNLIMIT	5
 
 extern void		xfs_qm_destroy_quotainfo(struct xfs_mount *);
-extern int		xfs_qm_quotacheck(struct xfs_mount *);
 extern int		xfs_qm_write_sb_changes(struct xfs_mount *, __int64_t);
 
 /* dquot stuff */
diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c
index e9be63abd8d2..2c61e61b0205 100644
--- a/fs/xfs/xfs_qm_bhv.c
+++ b/fs/xfs/xfs_qm_bhv.c
@@ -117,7 +117,7 @@ xfs_qm_newmount(
 			(uquotaondisk ? " usrquota" : ""),
 			(gquotaondisk ? " grpquota" : ""),
 			(pquotaondisk ? " prjquota" : ""));
-		return XFS_ERROR(EPERM);
+		return -EPERM;
 	}
 
 	if (XFS_IS_QUOTA_ON(mp) || quotaondisk) {
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index bbc813caba4c..80f2d77d929a 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -64,10 +64,10 @@ xfs_qm_scall_quotaoff(
 	/*
 	 * No file system can have quotas enabled on disk but not in core.
 	 * Note that quota utilities (like quotaoff) _expect_
-	 * errno == EEXIST here.
+	 * errno == -EEXIST here.
 	 */
 	if ((mp->m_qflags & flags) == 0)
-		return XFS_ERROR(EEXIST);
+		return -EEXIST;
 	error = 0;
 
 	flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD);
@@ -94,7 +94,7 @@ xfs_qm_scall_quotaoff(
 
 		/* XXX what to do if error ? Revert back to old vals incore ? */
 		error = xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS);
-		return (error);
+		return error;
 	}
 
 	dqtype = 0;
@@ -198,7 +198,7 @@ xfs_qm_scall_quotaoff(
 	if (mp->m_qflags == 0) {
 		mutex_unlock(&q->qi_quotaofflock);
 		xfs_qm_destroy_quotainfo(mp);
-		return (0);
+		return 0;
 	}
 
 	/*
@@ -278,13 +278,13 @@ xfs_qm_scall_trunc_qfiles(
 	xfs_mount_t	*mp,
 	uint		flags)
 {
-	int		error = EINVAL;
+	int		error = -EINVAL;
 
 	if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0 ||
 	    (flags & ~XFS_DQ_ALLTYPES)) {
 		xfs_debug(mp, "%s: flags=%x m_qflags=%x",
 			__func__, flags, mp->m_qflags);
-		return XFS_ERROR(EINVAL);
+		return -EINVAL;
 	}
 
 	if (flags & XFS_DQ_USER) {
@@ -328,7 +328,7 @@ xfs_qm_scall_quotaon(
 	if (flags == 0) {
 		xfs_debug(mp, "%s: zero flags, m_qflags=%x",
 			__func__, mp->m_qflags);
-		return XFS_ERROR(EINVAL);
+		return -EINVAL;
 	}
 
 	/* No fs can turn on quotas with a delayed effect */
@@ -351,13 +351,13 @@ xfs_qm_scall_quotaon(
 		xfs_debug(mp,
 			"%s: Can't enforce without acct, flags=%x sbflags=%x",
 			__func__, flags, mp->m_sb.sb_qflags);
-		return XFS_ERROR(EINVAL);
+		return -EINVAL;
 	}
 	/*
 	 * If everything's up to-date incore, then don't waste time.
 	 */
 	if ((mp->m_qflags & flags) == flags)
-		return XFS_ERROR(EEXIST);
+		return -EEXIST;
 
 	/*
 	 * Change sb_qflags on disk but not incore mp->qflags
@@ -372,11 +372,11 @@ xfs_qm_scall_quotaon(
 	 * There's nothing to change if it's the same.
 	 */
 	if ((qf & flags) == flags && sbflags == 0)
-		return XFS_ERROR(EEXIST);
+		return -EEXIST;
 	sbflags |= XFS_SB_QFLAGS;
 
 	if ((error = xfs_qm_write_sb_changes(mp, sbflags)))
-		return (error);
+		return error;
 	/*
 	 * If we aren't trying to switch on quota enforcement, we are done.
 	 */
@@ -387,10 +387,10 @@ xfs_qm_scall_quotaon(
 	     ((mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) !=
 	     (mp->m_qflags & XFS_GQUOTA_ACCT)) ||
 	    (flags & XFS_ALL_QUOTA_ENFD) == 0)
-		return (0);
+		return 0;
 
 	if (! XFS_IS_QUOTA_RUNNING(mp))
-		return XFS_ERROR(ESRCH);
+		return -ESRCH;
 
 	/*
 	 * Switch on quota enforcement in core.
@@ -399,7 +399,7 @@ xfs_qm_scall_quotaon(
 	mp->m_qflags |= (flags & XFS_ALL_QUOTA_ENFD);
 	mutex_unlock(&mp->m_quotainfo->qi_quotaofflock);
 
-	return (0);
+	return 0;
 }
 
 
@@ -426,7 +426,7 @@ xfs_qm_scall_getqstat(
 	if (!xfs_sb_version_hasquota(&mp->m_sb)) {
 		out->qs_uquota.qfs_ino = NULLFSINO;
 		out->qs_gquota.qfs_ino = NULLFSINO;
-		return (0);
+		return 0;
 	}
 
 	out->qs_flags = (__uint16_t) xfs_qm_export_flags(mp->m_qflags &
@@ -514,7 +514,7 @@ xfs_qm_scall_getqstatv(
 		out->qs_uquota.qfs_ino = NULLFSINO;
 		out->qs_gquota.qfs_ino = NULLFSINO;
 		out->qs_pquota.qfs_ino = NULLFSINO;
-		return (0);
+		return 0;
 	}
 
 	out->qs_flags = (__uint16_t) xfs_qm_export_flags(mp->m_qflags &
@@ -595,7 +595,7 @@ xfs_qm_scall_setqlim(
 	xfs_qcnt_t		hard, soft;
 
 	if (newlim->d_fieldmask & ~XFS_DQ_MASK)
-		return EINVAL;
+		return -EINVAL;
 	if ((newlim->d_fieldmask & XFS_DQ_MASK) == 0)
 		return 0;
 
@@ -615,7 +615,7 @@ xfs_qm_scall_setqlim(
 	 */
 	error = xfs_qm_dqget(mp, NULL, id, type, XFS_QMOPT_DQALLOC, &dqp);
 	if (error) {
-		ASSERT(error != ENOENT);
+		ASSERT(error != -ENOENT);
 		goto out_unlock;
 	}
 	xfs_dqunlock(dqp);
@@ -758,7 +758,7 @@ xfs_qm_log_quotaoff_end(
 	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_equotaoff, 0, 0);
 	if (error) {
 		xfs_trans_cancel(tp, 0);
-		return (error);
+		return error;
 	}
 
 	qoffi = xfs_trans_get_qoff_item(tp, startqoff,
@@ -772,7 +772,7 @@ xfs_qm_log_quotaoff_end(
 	 */
 	xfs_trans_set_sync(tp);
 	error = xfs_trans_commit(tp, 0);
-	return (error);
+	return error;
 }
 
 
@@ -822,7 +822,7 @@ error0:
 		spin_unlock(&mp->m_sb_lock);
 	}
 	*qoffstartp = qoffi;
-	return (error);
+	return error;
 }
 
 
@@ -850,7 +850,7 @@ xfs_qm_scall_getquota(
 	 * our utility programs are concerned.
 	 */
 	if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) {
-		error = XFS_ERROR(ENOENT);
+		error = -ENOENT;
 		goto out_put;
 	}
 
@@ -953,7 +953,7 @@ xfs_qm_export_flags(
 		uflags |= FS_QUOTA_GDQ_ENFD;
 	if (flags & XFS_PQUOTA_ENFD)
 		uflags |= FS_QUOTA_PDQ_ENFD;
-	return (uflags);
+	return uflags;
 }
 
 
diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c
index 2ad1b9822e92..b238027df987 100644
--- a/fs/xfs/xfs_quotaops.c
+++ b/fs/xfs/xfs_quotaops.c
@@ -51,7 +51,7 @@ xfs_fs_get_xstate(
 
 	if (!XFS_IS_QUOTA_RUNNING(mp))
 		return -ENOSYS;
-	return -xfs_qm_scall_getqstat(mp, fqs);
+	return xfs_qm_scall_getqstat(mp, fqs);
 }
 
 STATIC int
@@ -63,7 +63,7 @@ xfs_fs_get_xstatev(
 
 	if (!XFS_IS_QUOTA_RUNNING(mp))
 		return -ENOSYS;
-	return -xfs_qm_scall_getqstatv(mp, fqs);
+	return xfs_qm_scall_getqstatv(mp, fqs);
 }
 
 STATIC int
@@ -95,11 +95,11 @@ xfs_fs_set_xstate(
 
 	switch (op) {
 	case Q_XQUOTAON:
-		return -xfs_qm_scall_quotaon(mp, flags);
+		return xfs_qm_scall_quotaon(mp, flags);
 	case Q_XQUOTAOFF:
 		if (!XFS_IS_QUOTA_ON(mp))
 			return -EINVAL;
-		return -xfs_qm_scall_quotaoff(mp, flags);
+		return xfs_qm_scall_quotaoff(mp, flags);
 	}
 
 	return -EINVAL;
@@ -112,7 +112,7 @@ xfs_fs_rm_xquota(
 {
 	struct xfs_mount	*mp = XFS_M(sb);
 	unsigned int		flags = 0;
-	
+
 	if (sb->s_flags & MS_RDONLY)
 		return -EROFS;
 
@@ -123,11 +123,11 @@ xfs_fs_rm_xquota(
 		flags |= XFS_DQ_USER;
 	if (uflags & FS_GROUP_QUOTA)
 		flags |= XFS_DQ_GROUP;
-	if (uflags & FS_USER_QUOTA)
+	if (uflags & FS_PROJ_QUOTA)
 		flags |= XFS_DQ_PROJ;
 
-	return -xfs_qm_scall_trunc_qfiles(mp, flags);
-}	
+	return xfs_qm_scall_trunc_qfiles(mp, flags);
+}
 
 STATIC int
 xfs_fs_get_dqblk(
@@ -142,7 +142,7 @@ xfs_fs_get_dqblk(
 	if (!XFS_IS_QUOTA_ON(mp))
 		return -ESRCH;
 
-	return -xfs_qm_scall_getquota(mp, from_kqid(&init_user_ns, qid),
+	return xfs_qm_scall_getquota(mp, from_kqid(&init_user_ns, qid),
 				      xfs_quota_type(qid.type), fdq);
 }
 
@@ -161,7 +161,7 @@ xfs_fs_set_dqblk(
 	if (!XFS_IS_QUOTA_ON(mp))
 		return -ESRCH;
 
-	return -xfs_qm_scall_setqlim(mp, from_kqid(&init_user_ns, qid),
+	return xfs_qm_scall_setqlim(mp, from_kqid(&init_user_ns, qid),
 				     xfs_quota_type(qid.type), fdq);
 }
 
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index ec5ca65c6211..909e143b87ae 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -863,7 +863,7 @@ xfs_growfs_rt_alloc(
 					XFS_BMAPI_METADATA, &firstblock,
 					resblks, &map, &nmap, &flist);
 		if (!error && nmap < 1)
-			error = XFS_ERROR(ENOSPC);
+			error = -ENOSPC;
 		if (error)
 			goto error_cancel;
 		/*
@@ -903,7 +903,7 @@ xfs_growfs_rt_alloc(
 			bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
 				mp->m_bsize, 0);
 			if (bp == NULL) {
-				error = XFS_ERROR(EIO);
+				error = -EIO;
 error_cancel:
 				xfs_trans_cancel(tp, cancelflags);
 				goto error;
@@ -944,9 +944,9 @@ xfs_growfs_rt(
 	xfs_buf_t	*bp;		/* temporary buffer */
 	int		error;		/* error return value */
 	xfs_mount_t	*nmp;		/* new (fake) mount structure */
-	xfs_drfsbno_t	nrblocks;	/* new number of realtime blocks */
+	xfs_rfsblock_t	nrblocks;	/* new number of realtime blocks */
 	xfs_extlen_t	nrbmblocks;	/* new number of rt bitmap blocks */
-	xfs_drtbno_t	nrextents;	/* new number of realtime extents */
+	xfs_rtblock_t	nrextents;	/* new number of realtime extents */
 	uint8_t		nrextslog;	/* new log2 of sb_rextents */
 	xfs_extlen_t	nrsumblocks;	/* new number of summary blocks */
 	uint		nrsumlevels;	/* new rt summary levels */
@@ -962,11 +962,11 @@ xfs_growfs_rt(
 	 * Initial error checking.
 	 */
 	if (!capable(CAP_SYS_ADMIN))
-		return XFS_ERROR(EPERM);
+		return -EPERM;
 	if (mp->m_rtdev_targp == NULL || mp->m_rbmip == NULL ||
 	    (nrblocks = in->newblocks) <= sbp->sb_rblocks ||
 	    (sbp->sb_rblocks && (in->extsize != sbp->sb_rextsize)))
-		return XFS_ERROR(EINVAL);
+		return -EINVAL;
 	if ((error = xfs_sb_validate_fsb_count(sbp, nrblocks)))
 		return error;
 	/*
@@ -976,7 +976,7 @@ xfs_growfs_rt(
 				XFS_FSB_TO_BB(mp, nrblocks - 1),
 				XFS_FSB_TO_BB(mp, 1), 0, NULL);
 	if (!bp)
-		return EIO;
+		return -EIO;
 	if (bp->b_error) {
 		error = bp->b_error;
 		xfs_buf_relse(bp);
@@ -1001,7 +1001,7 @@ xfs_growfs_rt(
 	 * since we'll log basically the whole summary file at once.
 	 */
 	if (nrsumblocks > (mp->m_sb.sb_logblocks >> 1))
-		return XFS_ERROR(EINVAL);
+		return -EINVAL;
 	/*
 	 * Get the old block counts for bitmap and summary inodes.
 	 * These can't change since other growfs callers are locked out.
@@ -1208,7 +1208,7 @@ xfs_rtallocate_extent(
 				len, &sumbp, &sb, prod, &r);
 		break;
 	default:
-		error = EIO;
+		error = -EIO;
 		ASSERT(0);
 	}
 	if (error)
@@ -1247,7 +1247,7 @@ xfs_rtmount_init(
 	if (mp->m_rtdev_targp == NULL) {
 		xfs_warn(mp,
 	"Filesystem has a realtime volume, use rtdev=device option");
-		return XFS_ERROR(ENODEV);
+		return -ENODEV;
 	}
 	mp->m_rsumlevels = sbp->sb_rextslog + 1;
 	mp->m_rsumsize =
@@ -1263,7 +1263,7 @@ xfs_rtmount_init(
 		xfs_warn(mp, "realtime mount -- %llu != %llu",
 			(unsigned long long) XFS_BB_TO_FSB(mp, d),
 			(unsigned long long) mp->m_sb.sb_rblocks);
-		return XFS_ERROR(EFBIG);
+		return -EFBIG;
 	}
 	bp = xfs_buf_read_uncached(mp->m_rtdev_targp,
 					d - XFS_FSB_TO_BB(mp, 1),
@@ -1272,7 +1272,7 @@ xfs_rtmount_init(
 		xfs_warn(mp, "realtime device size check failed");
 		if (bp)
 			xfs_buf_relse(bp);
-		return EIO;
+		return -EIO;
 	}
 	xfs_buf_relse(bp);
 	return 0;
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index 752b63d10300..c642795324af 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -132,7 +132,7 @@ xfs_rtmount_init(
 		return 0;
 
 	xfs_warn(mp, "Not built with CONFIG_XFS_RT");
-	return ENOSYS;
+	return -ENOSYS;
 }
 # define xfs_rtmount_inodes(m)  (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS))
 # define xfs_rtunmount_inodes(m)
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 8f0333b3f7a0..b194652033cd 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -61,6 +61,7 @@
 static const struct super_operations xfs_super_operations;
 static kmem_zone_t *xfs_ioend_zone;
 mempool_t *xfs_ioend_pool;
+struct kset *xfs_kset;
 
 #define MNTOPT_LOGBUFS	"logbufs"	/* number of XFS log buffers */
 #define MNTOPT_LOGBSIZE	"logbsize"	/* size of XFS log buffers */
@@ -185,7 +186,7 @@ xfs_parseargs(
 	 */
 	mp->m_fsname = kstrndup(sb->s_id, MAXNAMELEN, GFP_KERNEL);
 	if (!mp->m_fsname)
-		return ENOMEM;
+		return -ENOMEM;
 	mp->m_fsname_len = strlen(mp->m_fsname) + 1;
 
 	/*
@@ -204,9 +205,6 @@ xfs_parseargs(
 	 */
 	mp->m_flags |= XFS_MOUNT_BARRIER;
 	mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE;
-#if !XFS_BIG_INUMS
-	mp->m_flags |= XFS_MOUNT_SMALL_INUMS;
-#endif
 
 	/*
 	 * These can be overridden by the mount option parsing.
@@ -227,57 +225,57 @@ xfs_parseargs(
 			if (!value || !*value) {
 				xfs_warn(mp, "%s option requires an argument",
 					this_char);
-				return EINVAL;
+				return -EINVAL;
 			}
 			if (kstrtoint(value, 10, &mp->m_logbufs))
-				return EINVAL;
+				return -EINVAL;
 		} else if (!strcmp(this_char, MNTOPT_LOGBSIZE)) {
 			if (!value || !*value) {
 				xfs_warn(mp, "%s option requires an argument",
 					this_char);
-				return EINVAL;
+				return -EINVAL;
 			}
 			if (suffix_kstrtoint(value, 10, &mp->m_logbsize))
-				return EINVAL;
+				return -EINVAL;
 		} else if (!strcmp(this_char, MNTOPT_LOGDEV)) {
 			if (!value || !*value) {
 				xfs_warn(mp, "%s option requires an argument",
 					this_char);
-				return EINVAL;
+				return -EINVAL;
 			}
 			mp->m_logname = kstrndup(value, MAXNAMELEN, GFP_KERNEL);
 			if (!mp->m_logname)
-				return ENOMEM;
+				return -ENOMEM;
 		} else if (!strcmp(this_char, MNTOPT_MTPT)) {
 			xfs_warn(mp, "%s option not allowed on this system",
 				this_char);
-			return EINVAL;
+			return -EINVAL;
 		} else if (!strcmp(this_char, MNTOPT_RTDEV)) {
 			if (!value || !*value) {
 				xfs_warn(mp, "%s option requires an argument",
 					this_char);
-				return EINVAL;
+				return -EINVAL;
 			}
 			mp->m_rtname = kstrndup(value, MAXNAMELEN, GFP_KERNEL);
 			if (!mp->m_rtname)
-				return ENOMEM;
+				return -ENOMEM;
 		} else if (!strcmp(this_char, MNTOPT_BIOSIZE)) {
 			if (!value || !*value) {
 				xfs_warn(mp, "%s option requires an argument",
 					this_char);
-				return EINVAL;
+				return -EINVAL;
 			}
 			if (kstrtoint(value, 10, &iosize))
-				return EINVAL;
+				return -EINVAL;
 			iosizelog = ffs(iosize) - 1;
 		} else if (!strcmp(this_char, MNTOPT_ALLOCSIZE)) {
 			if (!value || !*value) {
 				xfs_warn(mp, "%s option requires an argument",
 					this_char);
-				return EINVAL;
+				return -EINVAL;
 			}
 			if (suffix_kstrtoint(value, 10, &iosize))
-				return EINVAL;
+				return -EINVAL;
 			iosizelog = ffs(iosize) - 1;
 		} else if (!strcmp(this_char, MNTOPT_GRPID) ||
 			   !strcmp(this_char, MNTOPT_BSDGROUPS)) {
@@ -297,27 +295,22 @@ xfs_parseargs(
 			if (!value || !*value) {
 				xfs_warn(mp, "%s option requires an argument",
 					this_char);
-				return EINVAL;
+				return -EINVAL;
 			}
 			if (kstrtoint(value, 10, &dsunit))
-				return EINVAL;
+				return -EINVAL;
 		} else if (!strcmp(this_char, MNTOPT_SWIDTH)) {
 			if (!value || !*value) {
 				xfs_warn(mp, "%s option requires an argument",
 					this_char);
-				return EINVAL;
+				return -EINVAL;
 			}
 			if (kstrtoint(value, 10, &dswidth))
-				return EINVAL;
+				return -EINVAL;
 		} else if (!strcmp(this_char, MNTOPT_32BITINODE)) {
 			mp->m_flags |= XFS_MOUNT_SMALL_INUMS;
 		} else if (!strcmp(this_char, MNTOPT_64BITINODE)) {
 			mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS;
-#if !XFS_BIG_INUMS
-			xfs_warn(mp, "%s option not allowed on this system",
-				this_char);
-			return EINVAL;
-#endif
 		} else if (!strcmp(this_char, MNTOPT_NOUUID)) {
 			mp->m_flags |= XFS_MOUNT_NOUUID;
 		} else if (!strcmp(this_char, MNTOPT_BARRIER)) {
@@ -390,7 +383,7 @@ xfs_parseargs(
 	"irixsgid is now a sysctl(2) variable, option is deprecated.");
 		} else {
 			xfs_warn(mp, "unknown mount option [%s].", this_char);
-			return EINVAL;
+			return -EINVAL;
 		}
 	}
 
@@ -400,32 +393,32 @@ xfs_parseargs(
 	if ((mp->m_flags & XFS_MOUNT_NORECOVERY) &&
 	    !(mp->m_flags & XFS_MOUNT_RDONLY)) {
 		xfs_warn(mp, "no-recovery mounts must be read-only.");
-		return EINVAL;
+		return -EINVAL;
 	}
 
 	if ((mp->m_flags & XFS_MOUNT_NOALIGN) && (dsunit || dswidth)) {
 		xfs_warn(mp,
 	"sunit and swidth options incompatible with the noalign option");
-		return EINVAL;
+		return -EINVAL;
 	}
 
 #ifndef CONFIG_XFS_QUOTA
 	if (XFS_IS_QUOTA_RUNNING(mp)) {
 		xfs_warn(mp, "quota support not available in this kernel.");
-		return EINVAL;
+		return -EINVAL;
 	}
 #endif
 
 	if ((dsunit && !dswidth) || (!dsunit && dswidth)) {
 		xfs_warn(mp, "sunit and swidth must be specified together");
-		return EINVAL;
+		return -EINVAL;
 	}
 
 	if (dsunit && (dswidth % dsunit != 0)) {
 		xfs_warn(mp,
 	"stripe width (%d) must be a multiple of the stripe unit (%d)",
 			dswidth, dsunit);
-		return EINVAL;
+		return -EINVAL;
 	}
 
 done:
@@ -446,7 +439,7 @@ done:
 	     mp->m_logbufs > XLOG_MAX_ICLOGS)) {
 		xfs_warn(mp, "invalid logbufs value: %d [not %d-%d]",
 			mp->m_logbufs, XLOG_MIN_ICLOGS, XLOG_MAX_ICLOGS);
-		return XFS_ERROR(EINVAL);
+		return -EINVAL;
 	}
 	if (mp->m_logbsize != -1 &&
 	    mp->m_logbsize !=  0 &&
@@ -456,7 +449,7 @@ done:
 		xfs_warn(mp,
 			"invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]",
 			mp->m_logbsize);
-		return XFS_ERROR(EINVAL);
+		return -EINVAL;
 	}
 
 	if (iosizelog) {
@@ -465,7 +458,7 @@ done:
 			xfs_warn(mp, "invalid log iosize: %d [not %d-%d]",
 				iosizelog, XFS_MIN_IO_LOG,
 				XFS_MAX_IO_LOG);
-			return XFS_ERROR(EINVAL);
+			return -EINVAL;
 		}
 
 		mp->m_flags |= XFS_MOUNT_DFLT_IOSIZE;
@@ -597,15 +590,20 @@ xfs_max_file_offset(
 	return (((__uint64_t)pagefactor) << bitshift) - 1;
 }
 
+/*
+ * xfs_set_inode32() and xfs_set_inode64() are passed an agcount
+ * because in the growfs case, mp->m_sb.sb_agcount is not updated
+ * yet to the potentially higher ag count.
+ */
 xfs_agnumber_t
-xfs_set_inode32(struct xfs_mount *mp)
+xfs_set_inode32(struct xfs_mount *mp, xfs_agnumber_t agcount)
 {
 	xfs_agnumber_t	index = 0;
 	xfs_agnumber_t	maxagi = 0;
 	xfs_sb_t	*sbp = &mp->m_sb;
 	xfs_agnumber_t	max_metadata;
-	xfs_agino_t	agino =	XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks -1, 0);
-	xfs_ino_t	ino = XFS_AGINO_TO_INO(mp, sbp->sb_agcount -1, agino);
+	xfs_agino_t	agino;
+	xfs_ino_t	ino;
 	xfs_perag_t	*pag;
 
 	/* Calculate how much should be reserved for inodes to meet
@@ -620,10 +618,12 @@ xfs_set_inode32(struct xfs_mount *mp)
 		do_div(icount, sbp->sb_agblocks);
 		max_metadata = icount;
 	} else {
-		max_metadata = sbp->sb_agcount;
+		max_metadata = agcount;
 	}
 
-	for (index = 0; index < sbp->sb_agcount; index++) {
+	agino =	XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0);
+
+	for (index = 0; index < agcount; index++) {
 		ino = XFS_AGINO_TO_INO(mp, index, agino);
 
 		if (ino > XFS_MAXINUMBER_32) {
@@ -648,11 +648,11 @@ xfs_set_inode32(struct xfs_mount *mp)
 }
 
 xfs_agnumber_t
-xfs_set_inode64(struct xfs_mount *mp)
+xfs_set_inode64(struct xfs_mount *mp, xfs_agnumber_t agcount)
 {
 	xfs_agnumber_t index = 0;
 
-	for (index = 0; index < mp->m_sb.sb_agcount; index++) {
+	for (index = 0; index < agcount; index++) {
 		struct xfs_perag	*pag;
 
 		pag = xfs_perag_get(mp, index);
@@ -686,7 +686,7 @@ xfs_blkdev_get(
 		xfs_warn(mp, "Invalid device [%s], error=%d\n", name, error);
 	}
 
-	return -error;
+	return error;
 }
 
 STATIC void
@@ -756,7 +756,7 @@ xfs_open_devices(
 		if (rtdev == ddev || rtdev == logdev) {
 			xfs_warn(mp,
 	"Cannot mount filesystem with identical rtdev and ddev/logdev.");
-			error = EINVAL;
+			error = -EINVAL;
 			goto out_close_rtdev;
 		}
 	}
@@ -764,7 +764,7 @@ xfs_open_devices(
 	/*
 	 * Setup xfs_mount buffer target pointers
 	 */
-	error = ENOMEM;
+	error = -ENOMEM;
 	mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev);
 	if (!mp->m_ddev_targp)
 		goto out_close_rtdev;
@@ -1188,6 +1188,7 @@ xfs_fs_remount(
 	char			*options)
 {
 	struct xfs_mount	*mp = XFS_M(sb);
+	xfs_sb_t		*sbp = &mp->m_sb;
 	substring_t		args[MAX_OPT_ARGS];
 	char			*p;
 	int			error;
@@ -1208,10 +1209,10 @@ xfs_fs_remount(
 			mp->m_flags &= ~XFS_MOUNT_BARRIER;
 			break;
 		case Opt_inode64:
-			mp->m_maxagi = xfs_set_inode64(mp);
+			mp->m_maxagi = xfs_set_inode64(mp, sbp->sb_agcount);
 			break;
 		case Opt_inode32:
-			mp->m_maxagi = xfs_set_inode32(mp);
+			mp->m_maxagi = xfs_set_inode32(mp, sbp->sb_agcount);
 			break;
 		default:
 			/*
@@ -1295,7 +1296,7 @@ xfs_fs_freeze(
 
 	xfs_save_resvblks(mp);
 	xfs_quiesce_attr(mp);
-	return -xfs_fs_log_dummy(mp);
+	return xfs_fs_log_dummy(mp);
 }
 
 STATIC int
@@ -1314,7 +1315,7 @@ xfs_fs_show_options(
 	struct seq_file		*m,
 	struct dentry		*root)
 {
-	return -xfs_showargs(XFS_M(root->d_sb), m);
+	return xfs_showargs(XFS_M(root->d_sb), m);
 }
 
 /*
@@ -1336,14 +1337,14 @@ xfs_finish_flags(
 			   mp->m_logbsize < mp->m_sb.sb_logsunit) {
 			xfs_warn(mp,
 		"logbuf size must be greater than or equal to log stripe size");
-			return XFS_ERROR(EINVAL);
+			return -EINVAL;
 		}
 	} else {
 		/* Fail a mount if the logbuf is larger than 32K */
 		if (mp->m_logbsize > XLOG_BIG_RECORD_BSIZE) {
 			xfs_warn(mp,
 		"logbuf size for version 1 logs must be 16K or 32K");
-			return XFS_ERROR(EINVAL);
+			return -EINVAL;
 		}
 	}
 
@@ -1355,7 +1356,7 @@ xfs_finish_flags(
 		xfs_warn(mp,
 "Cannot mount a V5 filesystem as %s. %s is always enabled for V5 filesystems.",
 			MNTOPT_NOATTR2, MNTOPT_ATTR2);
-		return XFS_ERROR(EINVAL);
+		return -EINVAL;
 	}
 
 	/*
@@ -1372,7 +1373,7 @@ xfs_finish_flags(
 	if ((mp->m_sb.sb_flags & XFS_SBF_READONLY) && !ronly) {
 		xfs_warn(mp,
 			"cannot mount a read-only filesystem as read-write");
-		return XFS_ERROR(EROFS);
+		return -EROFS;
 	}
 
 	if ((mp->m_qflags & (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE)) &&
@@ -1380,7 +1381,7 @@ xfs_finish_flags(
 	    !xfs_sb_version_has_pquotino(&mp->m_sb)) {
 		xfs_warn(mp,
 		  "Super block does not support project and group quota together");
-		return XFS_ERROR(EINVAL);
+		return -EINVAL;
 	}
 
 	return 0;
@@ -1394,7 +1395,7 @@ xfs_fs_fill_super(
 {
 	struct inode		*root;
 	struct xfs_mount	*mp = NULL;
-	int			flags = 0, error = ENOMEM;
+	int			flags = 0, error = -ENOMEM;
 
 	mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL);
 	if (!mp)
@@ -1428,11 +1429,11 @@ xfs_fs_fill_super(
 	if (error)
 		goto out_free_fsname;
 
-	error = -xfs_init_mount_workqueues(mp);
+	error = xfs_init_mount_workqueues(mp);
 	if (error)
 		goto out_close_devices;
 
-	error = -xfs_icsb_init_counters(mp);
+	error = xfs_icsb_init_counters(mp);
 	if (error)
 		goto out_destroy_workqueues;
 
@@ -1474,12 +1475,12 @@ xfs_fs_fill_super(
 
 	root = igrab(VFS_I(mp->m_rootip));
 	if (!root) {
-		error = ENOENT;
+		error = -ENOENT;
 		goto out_unmount;
 	}
 	sb->s_root = d_make_root(root);
 	if (!sb->s_root) {
-		error = ENOMEM;
+		error = -ENOMEM;
 		goto out_unmount;
 	}
 
@@ -1499,7 +1500,7 @@ out_destroy_workqueues:
 	xfs_free_fsname(mp);
 	kfree(mp);
  out:
-	return -error;
+	return error;
 
  out_unmount:
 	xfs_filestream_unmount(mp);
@@ -1761,9 +1762,15 @@ init_xfs_fs(void)
 	if (error)
 		goto out_cleanup_procfs;
 
+	xfs_kset = kset_create_and_add("xfs", NULL, fs_kobj);
+	if (!xfs_kset) {
+		error = -ENOMEM;
+		goto out_sysctl_unregister;;
+	}
+
 	error = xfs_qm_init();
 	if (error)
-		goto out_sysctl_unregister;
+		goto out_kset_unregister;
 
 	error = register_filesystem(&xfs_fs_type);
 	if (error)
@@ -1772,6 +1779,8 @@ init_xfs_fs(void)
 
  out_qm_exit:
 	xfs_qm_exit();
+ out_kset_unregister:
+	kset_unregister(xfs_kset);
  out_sysctl_unregister:
 	xfs_sysctl_unregister();
  out_cleanup_procfs:
@@ -1793,6 +1802,7 @@ exit_xfs_fs(void)
 {
 	xfs_qm_exit();
 	unregister_filesystem(&xfs_fs_type);
+	kset_unregister(xfs_kset);
 	xfs_sysctl_unregister();
 	xfs_cleanup_procfs();
 	xfs_buf_terminate();
diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h
index bbe3d15a7904..2b830c2f322e 100644
--- a/fs/xfs/xfs_super.h
+++ b/fs/xfs/xfs_super.h
@@ -44,16 +44,6 @@ extern void xfs_qm_exit(void);
 # define XFS_REALTIME_STRING
 #endif
 
-#if XFS_BIG_BLKNOS
-# if XFS_BIG_INUMS
-#  define XFS_BIGFS_STRING	"large block/inode numbers, "
-# else
-#  define XFS_BIGFS_STRING	"large block numbers, "
-# endif
-#else
-# define XFS_BIGFS_STRING
-#endif
-
 #ifdef DEBUG
 # define XFS_DBG_STRING		"debug"
 #else
@@ -64,7 +54,6 @@ extern void xfs_qm_exit(void);
 #define XFS_BUILD_OPTIONS	XFS_ACL_STRING \
 				XFS_SECURITY_STRING \
 				XFS_REALTIME_STRING \
-				XFS_BIGFS_STRING \
 				XFS_DBG_STRING /* DBG must be last */
 
 struct xfs_inode;
@@ -76,8 +65,8 @@ extern __uint64_t xfs_max_file_offset(unsigned int);
 
 extern void xfs_flush_inodes(struct xfs_mount *mp);
 extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
-extern xfs_agnumber_t xfs_set_inode32(struct xfs_mount *);
-extern xfs_agnumber_t xfs_set_inode64(struct xfs_mount *);
+extern xfs_agnumber_t xfs_set_inode32(struct xfs_mount *, xfs_agnumber_t agcount);
+extern xfs_agnumber_t xfs_set_inode64(struct xfs_mount *, xfs_agnumber_t agcount);
 
 extern const struct export_operations xfs_export_operations;
 extern const struct xattr_handler *xfs_xattr_handlers[];
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index d69363c833e1..6a944a2cd36f 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -76,15 +76,15 @@ xfs_readlink_bmap(
 		bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0,
 				  &xfs_symlink_buf_ops);
 		if (!bp)
-			return XFS_ERROR(ENOMEM);
+			return -ENOMEM;
 		error = bp->b_error;
 		if (error) {
 			xfs_buf_ioerror_alert(bp, __func__);
 			xfs_buf_relse(bp);
 
 			/* bad CRC means corrupted metadata */
-			if (error == EFSBADCRC)
-				error = EFSCORRUPTED;
+			if (error == -EFSBADCRC)
+				error = -EFSCORRUPTED;
 			goto out;
 		}
 		byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt);
@@ -95,7 +95,7 @@ xfs_readlink_bmap(
 		if (xfs_sb_version_hascrc(&mp->m_sb)) {
 			if (!xfs_symlink_hdr_ok(ip->i_ino, offset,
 							byte_cnt, bp)) {
-				error = EFSCORRUPTED;
+				error = -EFSCORRUPTED;
 				xfs_alert(mp,
 "symlink header does not match required off/len/owner (0x%x/Ox%x,0x%llx)",
 					offset, byte_cnt, ip->i_ino);
@@ -135,7 +135,7 @@ xfs_readlink(
 	trace_xfs_readlink(ip);
 
 	if (XFS_FORCED_SHUTDOWN(mp))
-		return XFS_ERROR(EIO);
+		return -EIO;
 
 	xfs_ilock(ip, XFS_ILOCK_SHARED);
 
@@ -148,7 +148,7 @@ xfs_readlink(
 			 __func__, (unsigned long long) ip->i_ino,
 			 (long long) pathlen);
 		ASSERT(0);
-		error = XFS_ERROR(EFSCORRUPTED);
+		error = -EFSCORRUPTED;
 		goto out;
 	}
 
@@ -203,14 +203,14 @@ xfs_symlink(
 	trace_xfs_symlink(dp, link_name);
 
 	if (XFS_FORCED_SHUTDOWN(mp))
-		return XFS_ERROR(EIO);
+		return -EIO;
 
 	/*
 	 * Check component lengths of the target path name.
 	 */
 	pathlen = strlen(target_path);
 	if (pathlen >= MAXPATHLEN)      /* total string too long */
-		return XFS_ERROR(ENAMETOOLONG);
+		return -ENAMETOOLONG;
 
 	udqp = gdqp = NULL;
 	prid = xfs_get_initial_prid(dp);
@@ -238,7 +238,7 @@ xfs_symlink(
 		fs_blocks = xfs_symlink_blocks(mp, pathlen);
 	resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks);
 	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_symlink, resblks, 0);
-	if (error == ENOSPC && fs_blocks == 0) {
+	if (error == -ENOSPC && fs_blocks == 0) {
 		resblks = 0;
 		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_symlink, 0, 0);
 	}
@@ -254,7 +254,7 @@ xfs_symlink(
 	 * Check whether the directory allows new symlinks or not.
 	 */
 	if (dp->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) {
-		error = XFS_ERROR(EPERM);
+		error = -EPERM;
 		goto error_return;
 	}
 
@@ -284,7 +284,7 @@ xfs_symlink(
 	error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0,
 			       prid, resblks > 0, &ip, NULL);
 	if (error) {
-		if (error == ENOSPC)
+		if (error == -ENOSPC)
 			goto error_return;
 		goto error1;
 	}
@@ -348,7 +348,7 @@ xfs_symlink(
 			bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
 					       BTOBB(byte_cnt), 0);
 			if (!bp) {
-				error = ENOMEM;
+				error = -ENOMEM;
 				goto error2;
 			}
 			bp->b_ops = &xfs_symlink_buf_ops;
@@ -489,7 +489,7 @@ xfs_inactive_symlink_rmt(
 			XFS_FSB_TO_DADDR(mp, mval[i].br_startblock),
 			XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0);
 		if (!bp) {
-			error = ENOMEM;
+			error = -ENOMEM;
 			goto error_bmap_cancel;
 		}
 		xfs_trans_binval(tp, bp);
@@ -562,7 +562,7 @@ xfs_inactive_symlink(
 	trace_xfs_inactive_symlink(ip);
 
 	if (XFS_FORCED_SHUTDOWN(mp))
-		return XFS_ERROR(EIO);
+		return -EIO;
 
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
 
@@ -580,7 +580,7 @@ xfs_inactive_symlink(
 			 __func__, (unsigned long long)ip->i_ino, pathlen);
 		xfs_iunlock(ip, XFS_ILOCK_EXCL);
 		ASSERT(0);
-		return XFS_ERROR(EFSCORRUPTED);
+		return -EFSCORRUPTED;
 	}
 
 	if (ip->i_df.if_flags & XFS_IFINLINE) {
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
new file mode 100644
index 000000000000..9835139ce1ec
--- /dev/null
+++ b/fs/xfs/xfs_sysfs.c
@@ -0,0 +1,165 @@
+/*
+ * Copyright (c) 2014 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#include "xfs.h"
+#include "xfs_sysfs.h"
+#include "xfs_log_format.h"
+#include "xfs_log.h"
+#include "xfs_log_priv.h"
+
+struct xfs_sysfs_attr {
+	struct attribute attr;
+	ssize_t (*show)(char *buf, void *data);
+	ssize_t (*store)(const char *buf, size_t count, void *data);
+};
+
+static inline struct xfs_sysfs_attr *
+to_attr(struct attribute *attr)
+{
+	return container_of(attr, struct xfs_sysfs_attr, attr);
+}
+
+#define XFS_SYSFS_ATTR_RW(name) \
+	static struct xfs_sysfs_attr xfs_sysfs_attr_##name = __ATTR_RW(name)
+#define XFS_SYSFS_ATTR_RO(name) \
+	static struct xfs_sysfs_attr xfs_sysfs_attr_##name = __ATTR_RO(name)
+
+#define ATTR_LIST(name) &xfs_sysfs_attr_##name.attr
+
+/*
+ * xfs_mount kobject. This currently has no attributes and thus no need for show
+ * and store helpers. The mp kobject serves as the per-mount parent object that
+ * is identified by the fsname under sysfs.
+ */
+
+struct kobj_type xfs_mp_ktype = {
+	.release = xfs_sysfs_release,
+};
+
+/* xlog */
+
+STATIC ssize_t
+log_head_lsn_show(
+	char	*buf,
+	void	*data)
+{
+	struct xlog *log = data;
+	int cycle;
+	int block;
+
+	spin_lock(&log->l_icloglock);
+	cycle = log->l_curr_cycle;
+	block = log->l_curr_block;
+	spin_unlock(&log->l_icloglock);
+
+	return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, block);
+}
+XFS_SYSFS_ATTR_RO(log_head_lsn);
+
+STATIC ssize_t
+log_tail_lsn_show(
+	char	*buf,
+	void	*data)
+{
+	struct xlog *log = data;
+	int cycle;
+	int block;
+
+	xlog_crack_atomic_lsn(&log->l_tail_lsn, &cycle, &block);
+	return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, block);
+}
+XFS_SYSFS_ATTR_RO(log_tail_lsn);
+
+STATIC ssize_t
+reserve_grant_head_show(
+	char	*buf,
+	void	*data)
+{
+	struct xlog *log = data;
+	int cycle;
+	int bytes;
+
+	xlog_crack_grant_head(&log->l_reserve_head.grant, &cycle, &bytes);
+	return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, bytes);
+}
+XFS_SYSFS_ATTR_RO(reserve_grant_head);
+
+STATIC ssize_t
+write_grant_head_show(
+	char	*buf,
+	void	*data)
+{
+	struct xlog *log = data;
+	int cycle;
+	int bytes;
+
+	xlog_crack_grant_head(&log->l_write_head.grant, &cycle, &bytes);
+	return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, bytes);
+}
+XFS_SYSFS_ATTR_RO(write_grant_head);
+
+static struct attribute *xfs_log_attrs[] = {
+	ATTR_LIST(log_head_lsn),
+	ATTR_LIST(log_tail_lsn),
+	ATTR_LIST(reserve_grant_head),
+	ATTR_LIST(write_grant_head),
+	NULL,
+};
+
+static inline struct xlog *
+to_xlog(struct kobject *kobject)
+{
+	struct xfs_kobj *kobj = to_kobj(kobject);
+	return container_of(kobj, struct xlog, l_kobj);
+}
+
+STATIC ssize_t
+xfs_log_show(
+	struct kobject		*kobject,
+	struct attribute	*attr,
+	char			*buf)
+{
+	struct xlog *log = to_xlog(kobject);
+	struct xfs_sysfs_attr *xfs_attr = to_attr(attr);
+
+	return xfs_attr->show ? xfs_attr->show(buf, log) : 0;
+}
+
+STATIC ssize_t
+xfs_log_store(
+	struct kobject		*kobject,
+	struct attribute	*attr,
+	const char		*buf,
+	size_t			count)
+{
+	struct xlog *log = to_xlog(kobject);
+	struct xfs_sysfs_attr *xfs_attr = to_attr(attr);
+
+	return xfs_attr->store ? xfs_attr->store(buf, count, log) : 0;
+}
+
+static struct sysfs_ops xfs_log_ops = {
+	.show = xfs_log_show,
+	.store = xfs_log_store,
+};
+
+struct kobj_type xfs_log_ktype = {
+	.release = xfs_sysfs_release,
+	.sysfs_ops = &xfs_log_ops,
+	.default_attrs = xfs_log_attrs,
+};
diff --git a/fs/xfs/xfs_sysfs.h b/fs/xfs/xfs_sysfs.h
new file mode 100644
index 000000000000..54a2091183c0
--- /dev/null
+++ b/fs/xfs/xfs_sysfs.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2014 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#ifndef __XFS_SYSFS_H__
+#define __XFS_SYSFS_H__
+
+extern struct kobj_type xfs_mp_ktype;	/* xfs_mount */
+extern struct kobj_type xfs_log_ktype;	/* xlog */
+
+static inline struct xfs_kobj *
+to_kobj(struct kobject *kobject)
+{
+	return container_of(kobject, struct xfs_kobj, kobject);
+}
+
+static inline void
+xfs_sysfs_release(struct kobject *kobject)
+{
+	struct xfs_kobj *kobj = to_kobj(kobject);
+	complete(&kobj->complete);
+}
+
+static inline int
+xfs_sysfs_init(
+	struct xfs_kobj		*kobj,
+	struct kobj_type	*ktype,
+	struct xfs_kobj		*parent_kobj,
+	const char		*name)
+{
+	init_completion(&kobj->complete);
+	return kobject_init_and_add(&kobj->kobject, ktype,
+				    &parent_kobj->kobject, "%s", name);
+}
+
+static inline void
+xfs_sysfs_del(
+	struct xfs_kobj	*kobj)
+{
+	kobject_del(&kobj->kobject);
+	kobject_put(&kobj->kobject);
+	wait_for_completion(&kobj->complete);
+}
+
+#endif	/* __XFS_SYSFS_H__ */
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index d03932564ccb..30e8e3410955 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -190,7 +190,7 @@ xfs_trans_reserve(
 					  -((int64_t)blocks), rsvd);
 		if (error != 0) {
 			current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
-			return (XFS_ERROR(ENOSPC));
+			return -ENOSPC;
 		}
 		tp->t_blk_res += blocks;
 	}
@@ -241,7 +241,7 @@ xfs_trans_reserve(
 		error = xfs_mod_incore_sb(tp->t_mountp, XFS_SBS_FREXTENTS,
 					  -((int64_t)rtextents), rsvd);
 		if (error) {
-			error = XFS_ERROR(ENOSPC);
+			error = -ENOSPC;
 			goto undo_log;
 		}
 		tp->t_rtx_res += rtextents;
@@ -874,7 +874,7 @@ xfs_trans_commit(
 		goto out_unreserve;
 
 	if (XFS_FORCED_SHUTDOWN(mp)) {
-		error = XFS_ERROR(EIO);
+		error = -EIO;
 		goto out_unreserve;
 	}
 
@@ -917,7 +917,7 @@ out_unreserve:
 	if (tp->t_ticket) {
 		commit_lsn = xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
 		if (commit_lsn == -1 && !error)
-			error = XFS_ERROR(EIO);
+			error = -EIO;
 	}
 	current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
 	xfs_trans_free_items(tp, NULLCOMMITLSN, error ? XFS_TRANS_ABORT : 0);
@@ -1024,7 +1024,7 @@ xfs_trans_roll(
 	 */
 	error = xfs_trans_commit(trans, 0);
 	if (error)
-		return (error);
+		return error;
 
 	trans = *tpp;
 
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index cb0f3a84cc68..859482f53b5a 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -762,7 +762,7 @@ xfs_trans_ail_init(
 
 	ailp = kmem_zalloc(sizeof(struct xfs_ail), KM_MAYFAIL);
 	if (!ailp)
-		return ENOMEM;
+		return -ENOMEM;
 
 	ailp->xa_mount = mp;
 	INIT_LIST_HEAD(&ailp->xa_ail);
@@ -781,7 +781,7 @@ xfs_trans_ail_init(
 
 out_free_ailp:
 	kmem_free(ailp);
-	return ENOMEM;
+	return -ENOMEM;
 }
 
 void
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index b8eef0549f3f..96c898e7ac9a 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -166,7 +166,7 @@ xfs_trans_get_buf_map(
 		ASSERT(atomic_read(&bip->bli_refcount) > 0);
 		bip->bli_recur++;
 		trace_xfs_trans_get_buf_recur(bip);
-		return (bp);
+		return bp;
 	}
 
 	bp = xfs_buf_get_map(target, map, nmaps, flags);
@@ -178,7 +178,7 @@ xfs_trans_get_buf_map(
 
 	_xfs_trans_bjoin(tp, bp, 1);
 	trace_xfs_trans_get_buf(bp->b_fspriv);
-	return (bp);
+	return bp;
 }
 
 /*
@@ -201,9 +201,8 @@ xfs_trans_getsb(xfs_trans_t	*tp,
 	 * Default to just trying to lock the superblock buffer
 	 * if tp is NULL.
 	 */
-	if (tp == NULL) {
-		return (xfs_getsb(mp, flags));
-	}
+	if (tp == NULL)
+		return xfs_getsb(mp, flags);
 
 	/*
 	 * If the superblock buffer already has this transaction
@@ -218,7 +217,7 @@ xfs_trans_getsb(xfs_trans_t	*tp,
 		ASSERT(atomic_read(&bip->bli_refcount) > 0);
 		bip->bli_recur++;
 		trace_xfs_trans_getsb_recur(bip);
-		return (bp);
+		return bp;
 	}
 
 	bp = xfs_getsb(mp, flags);
@@ -227,7 +226,7 @@ xfs_trans_getsb(xfs_trans_t	*tp,
 
 	_xfs_trans_bjoin(tp, bp, 1);
 	trace_xfs_trans_getsb(bp->b_fspriv);
-	return (bp);
+	return bp;
 }
 
 #ifdef DEBUG
@@ -267,7 +266,7 @@ xfs_trans_read_buf_map(
 		bp = xfs_buf_read_map(target, map, nmaps, flags, ops);
 		if (!bp)
 			return (flags & XBF_TRYLOCK) ?
-					EAGAIN : XFS_ERROR(ENOMEM);
+					-EAGAIN : -ENOMEM;
 
 		if (bp->b_error) {
 			error = bp->b_error;
@@ -277,8 +276,8 @@ xfs_trans_read_buf_map(
 			xfs_buf_relse(bp);
 
 			/* bad CRC means corrupted metadata */
-			if (error == EFSBADCRC)
-				error = EFSCORRUPTED;
+			if (error == -EFSBADCRC)
+				error = -EFSCORRUPTED;
 			return error;
 		}
 #ifdef DEBUG
@@ -287,7 +286,7 @@ xfs_trans_read_buf_map(
 				if (((xfs_req_num++) % xfs_error_mod) == 0) {
 					xfs_buf_relse(bp);
 					xfs_debug(mp, "Returning error!");
-					return XFS_ERROR(EIO);
+					return -EIO;
 				}
 			}
 		}
@@ -343,8 +342,8 @@ xfs_trans_read_buf_map(
 					xfs_force_shutdown(tp->t_mountp,
 							SHUTDOWN_META_IO_ERROR);
 				/* bad CRC means corrupted metadata */
-				if (error == EFSBADCRC)
-					error = EFSCORRUPTED;
+				if (error == -EFSBADCRC)
+					error = -EFSCORRUPTED;
 				return error;
 			}
 		}
@@ -355,7 +354,7 @@ xfs_trans_read_buf_map(
 		if (XFS_FORCED_SHUTDOWN(mp)) {
 			trace_xfs_trans_read_buf_shut(bp, _RET_IP_);
 			*bpp = NULL;
-			return XFS_ERROR(EIO);
+			return -EIO;
 		}
 
 
@@ -372,7 +371,7 @@ xfs_trans_read_buf_map(
 	if (bp == NULL) {
 		*bpp = NULL;
 		return (flags & XBF_TRYLOCK) ?
-					0 : XFS_ERROR(ENOMEM);
+					0 : -ENOMEM;
 	}
 	if (bp->b_error) {
 		error = bp->b_error;
@@ -384,8 +383,8 @@ xfs_trans_read_buf_map(
 		xfs_buf_relse(bp);
 
 		/* bad CRC means corrupted metadata */
-		if (error == EFSBADCRC)
-			error = EFSCORRUPTED;
+		if (error == -EFSBADCRC)
+			error = -EFSCORRUPTED;
 		return error;
 	}
 #ifdef DEBUG
@@ -396,7 +395,7 @@ xfs_trans_read_buf_map(
 						   SHUTDOWN_META_IO_ERROR);
 				xfs_buf_relse(bp);
 				xfs_debug(mp, "Returning trans error!");
-				return XFS_ERROR(EIO);
+				return -EIO;
 			}
 		}
 	}
@@ -414,7 +413,7 @@ shutdown_abort:
 	trace_xfs_trans_read_buf_shut(bp, _RET_IP_);
 	xfs_buf_relse(bp);
 	*bpp = NULL;
-	return XFS_ERROR(EIO);
+	return -EIO;
 }
 
 /*
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index 41172861e857..846e061c2e98 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -722,8 +722,8 @@ xfs_trans_dqresv(
 error_return:
 	xfs_dqunlock(dqp);
 	if (flags & XFS_QMOPT_ENOSPC)
-		return ENOSPC;
-	return EDQUOT;
+		return -ENOSPC;
+	return -EDQUOT;
 }
 
 
diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h
index 65c6e6650b1a..b79dc66b2ecd 100644
--- a/fs/xfs/xfs_types.h
+++ b/fs/xfs/xfs_types.h
@@ -38,43 +38,18 @@ typedef	__int32_t	xfs_tid_t;	/* transaction identifier */
 typedef	__uint32_t	xfs_dablk_t;	/* dir/attr block number (in file) */
 typedef	__uint32_t	xfs_dahash_t;	/* dir/attr hash value */
 
-/*
- * These types are 64 bits on disk but are either 32 or 64 bits in memory.
- * Disk based types:
- */
-typedef __uint64_t	xfs_dfsbno_t;	/* blockno in filesystem (agno|agbno) */
-typedef __uint64_t	xfs_drfsbno_t;	/* blockno in filesystem (raw) */
-typedef	__uint64_t	xfs_drtbno_t;	/* extent (block) in realtime area */
-typedef	__uint64_t	xfs_dfiloff_t;	/* block number in a file */
-typedef	__uint64_t	xfs_dfilblks_t;	/* number of blocks in a file */
-
-/*
- * Memory based types are conditional.
- */
-#if XFS_BIG_BLKNOS
 typedef	__uint64_t	xfs_fsblock_t;	/* blockno in filesystem (agno|agbno) */
 typedef __uint64_t	xfs_rfsblock_t;	/* blockno in filesystem (raw) */
 typedef __uint64_t	xfs_rtblock_t;	/* extent (block) in realtime area */
-typedef	__int64_t	xfs_srtblock_t;	/* signed version of xfs_rtblock_t */
-#else
-typedef	__uint32_t	xfs_fsblock_t;	/* blockno in filesystem (agno|agbno) */
-typedef __uint32_t	xfs_rfsblock_t;	/* blockno in filesystem (raw) */
-typedef __uint32_t	xfs_rtblock_t;	/* extent (block) in realtime area */
-typedef	__int32_t	xfs_srtblock_t;	/* signed version of xfs_rtblock_t */
-#endif
 typedef __uint64_t	xfs_fileoff_t;	/* block number in a file */
-typedef __int64_t	xfs_sfiloff_t;	/* signed block number in a file */
 typedef __uint64_t	xfs_filblks_t;	/* number of blocks in a file */
 
+typedef	__int64_t	xfs_srtblock_t;	/* signed version of xfs_rtblock_t */
+typedef __int64_t	xfs_sfiloff_t;	/* signed block number in a file */
 
 /*
  * Null values for the types.
  */
-#define	NULLDFSBNO	((xfs_dfsbno_t)-1)
-#define	NULLDRFSBNO	((xfs_drfsbno_t)-1)
-#define	NULLDRTBNO	((xfs_drtbno_t)-1)
-#define	NULLDFILOFF	((xfs_dfiloff_t)-1)
-
 #define	NULLFSBLOCK	((xfs_fsblock_t)-1)
 #define	NULLRFSBLOCK	((xfs_rfsblock_t)-1)
 #define	NULLRTBLOCK	((xfs_rtblock_t)-1)
diff --git a/fs/xfs/xfs_vnode.h b/fs/xfs/xfs_vnode.h
deleted file mode 100644
index e8a77383c0d5..000000000000
--- a/fs/xfs/xfs_vnode.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_VNODE_H__
-#define __XFS_VNODE_H__
-
-#include "xfs_fs.h"
-
-struct file;
-struct xfs_inode;
-struct attrlist_cursor_kern;
-
-/*
- * Flags for read/write calls - same values as IRIX
- */
-#define IO_ISDIRECT	0x00004		/* bypass page cache */
-#define IO_INVIS	0x00020		/* don't update inode timestamps */
-
-#define XFS_IO_FLAGS \
-	{ IO_ISDIRECT,	"DIRECT" }, \
-	{ IO_INVIS,	"INVIS"}
-
-/*
- * Some useful predicates.
- */
-#define VN_MAPPED(vp)	mapping_mapped(vp->i_mapping)
-#define VN_CACHED(vp)	(vp->i_mapping->nrpages)
-#define VN_DIRTY(vp)	mapping_tagged(vp->i_mapping, \
-					PAGECACHE_TAG_DIRTY)
-
-
-#endif	/* __XFS_VNODE_H__ */
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index 78ed92a46fdd..93455b998041 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -49,7 +49,7 @@ xfs_xattr_get(struct dentry *dentry, const char *name,
 		value = NULL;
 	}
 
-	error = -xfs_attr_get(ip, (unsigned char *)name, value, &asize, xflags);
+	error = xfs_attr_get(ip, (unsigned char *)name, value, &asize, xflags);
 	if (error)
 		return error;
 	return asize;
@@ -71,8 +71,8 @@ xfs_xattr_set(struct dentry *dentry, const char *name, const void *value,
 		xflags |= ATTR_REPLACE;
 
 	if (!value)
-		return -xfs_attr_remove(ip, (unsigned char *)name, xflags);
-	return -xfs_attr_set(ip, (unsigned char *)name,
+		return xfs_attr_remove(ip, (unsigned char *)name, xflags);
+	return xfs_attr_set(ip, (unsigned char *)name,
 				(void *)value, size, xflags);
 }
 
diff --git a/include/dt-bindings/dma/nbpfaxi.h b/include/dt-bindings/dma/nbpfaxi.h
new file mode 100644
index 000000000000..c1a5b9e0d6a4
--- /dev/null
+++ b/include/dt-bindings/dma/nbpfaxi.h
@@ -0,0 +1,20 @@
+/*
+ * Copyright (C) 2013-2014 Renesas Electronics Europe Ltd.
+ * Author: Guennadi Liakhovetski <g.liakhovetski@gmx.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef DT_BINDINGS_NBPFAXI_H
+#define DT_BINDINGS_NBPFAXI_H
+
+/**
+ * Use "#dma-cells = <2>;" with the second integer defining slave DMA flags:
+ */
+#define NBPF_SLAVE_RQ_HIGH	1
+#define NBPF_SLAVE_RQ_LOW	2
+#define NBPF_SLAVE_RQ_LEVEL	4
+
+#endif
diff --git a/include/linux/acct.h b/include/linux/acct.h
index 4a5b7cb56079..dccc2d4fe7de 100644
--- a/include/linux/acct.h
+++ b/include/linux/acct.h
@@ -24,14 +24,10 @@ struct super_block;
 struct pacct_struct;
 struct pid_namespace;
 extern int acct_parm[]; /* for sysctl */
-extern void acct_auto_close_mnt(struct vfsmount *m);
-extern void acct_auto_close(struct super_block *sb);
 extern void acct_collect(long exitcode, int group_dead);
 extern void acct_process(void);
 extern void acct_exit_ns(struct pid_namespace *);
 #else
-#define acct_auto_close_mnt(x)	do { } while (0)
-#define acct_auto_close(x)	do { } while (0)
 #define acct_collect(x,y)	do { } while (0)
 #define acct_process()		do { } while (0)
 #define acct_exit_ns(ns)	do { } while (0)
diff --git a/include/linux/bio.h b/include/linux/bio.h
index d2633ee099d9..b39e5000ff58 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -308,6 +308,7 @@ struct bio_integrity_payload {
 
 	unsigned short		bip_slab;	/* slab the bip came from */
 	unsigned short		bip_vcnt;	/* # of integrity bio_vecs */
+	unsigned short		bip_max_vcnt;	/* integrity bio_vec slots */
 	unsigned		bip_owns_buf:1;	/* should free bip_buf */
 
 	struct work_struct	bip_work;	/* I/O completion */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 8699bcf5f099..518b46555b80 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -21,6 +21,7 @@
 #include <linux/bsg.h>
 #include <linux/smp.h>
 #include <linux/rcupdate.h>
+#include <linux/percpu-refcount.h>
 
 #include <asm/scatterlist.h>
 
@@ -470,6 +471,7 @@ struct request_queue {
 	struct mutex		sysfs_lock;
 
 	int			bypass_depth;
+	int			mq_freeze_depth;
 
 #if defined(CONFIG_BLK_DEV_BSG)
 	bsg_job_fn		*bsg_job_fn;
@@ -483,7 +485,7 @@ struct request_queue {
 #endif
 	struct rcu_head		rcu_head;
 	wait_queue_head_t	mq_freeze_wq;
-	struct percpu_counter	mq_usage_counter;
+	struct percpu_ref	mq_usage_counter;
 	struct list_head	all_q_node;
 
 	struct blk_mq_tag_set	*tag_set;
diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index d21f2dba0731..40ae58e3e9db 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -285,19 +285,9 @@ extern void ceph_msg_data_add_bio(struct ceph_msg *msg, struct bio *bio,
 
 extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags,
 				     bool can_fail);
-extern void ceph_msg_kfree(struct ceph_msg *m);
 
-
-static inline struct ceph_msg *ceph_msg_get(struct ceph_msg *msg)
-{
-	kref_get(&msg->kref);
-	return msg;
-}
-extern void ceph_msg_last_put(struct kref *kref);
-static inline void ceph_msg_put(struct ceph_msg *msg)
-{
-	kref_put(&msg->kref, ceph_msg_last_put);
-}
+extern struct ceph_msg *ceph_msg_get(struct ceph_msg *msg);
+extern void ceph_msg_put(struct ceph_msg *msg);
 
 extern void ceph_msg_dump(struct ceph_msg *msg);
 
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 94ec69672164..03aeb27fcc69 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -117,7 +117,7 @@ struct ceph_osd_request {
 	struct list_head r_req_lru_item;
 	struct list_head r_osd_item;
 	struct list_head r_linger_item;
-	struct list_head r_linger_osd;
+	struct list_head r_linger_osd_item;
 	struct ceph_osd *r_osd;
 	struct ceph_pg   r_pgid;
 	int              r_pg_osds[CEPH_PG_MAX_SIZE];
@@ -325,22 +325,14 @@ extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
 
 extern void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc,
 					 struct ceph_osd_request *req);
-extern void ceph_osdc_unregister_linger_request(struct ceph_osd_client *osdc,
-						struct ceph_osd_request *req);
-
-static inline void ceph_osdc_get_request(struct ceph_osd_request *req)
-{
-	kref_get(&req->r_kref);
-}
-extern void ceph_osdc_release_request(struct kref *kref);
-static inline void ceph_osdc_put_request(struct ceph_osd_request *req)
-{
-	kref_put(&req->r_kref, ceph_osdc_release_request);
-}
+
+extern void ceph_osdc_get_request(struct ceph_osd_request *req);
+extern void ceph_osdc_put_request(struct ceph_osd_request *req);
 
 extern int ceph_osdc_start_request(struct ceph_osd_client *osdc,
 				   struct ceph_osd_request *req,
 				   bool nofail);
+extern void ceph_osdc_cancel_request(struct ceph_osd_request *req);
 extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
 				  struct ceph_osd_request *req);
 extern void ceph_osdc_sync(struct ceph_osd_client *osdc);
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 3c7ec327ebd2..e4ae2ad48d07 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -249,6 +249,7 @@ extern struct dentry * d_splice_alias(struct inode *, struct dentry *);
 extern struct dentry * d_add_ci(struct dentry *, struct inode *, struct qstr *);
 extern struct dentry *d_find_any_alias(struct inode *inode);
 extern struct dentry * d_obtain_alias(struct inode *);
+extern struct dentry * d_obtain_root(struct inode *);
 extern void shrink_dcache_sb(struct super_block *);
 extern void shrink_dcache_parent(struct dentry *);
 extern void shrink_dcache_for_umount(struct super_block *);
diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
index 3d1c2aa51530..1f9e642c66ad 100644
--- a/include/linux/dmaengine.h
+++ b/include/linux/dmaengine.h
@@ -37,7 +37,6 @@
  */
 typedef s32 dma_cookie_t;
 #define DMA_MIN_COOKIE	1
-#define DMA_MAX_COOKIE	INT_MAX
 
 static inline int dma_submit_error(dma_cookie_t cookie)
 {
@@ -671,7 +670,7 @@ struct dma_device {
 	struct dma_async_tx_descriptor *(*device_prep_dma_cyclic)(
 		struct dma_chan *chan, dma_addr_t buf_addr, size_t buf_len,
 		size_t period_len, enum dma_transfer_direction direction,
-		unsigned long flags, void *context);
+		unsigned long flags);
 	struct dma_async_tx_descriptor *(*device_prep_interleaved_dma)(
 		struct dma_chan *chan, struct dma_interleaved_template *xt,
 		unsigned long flags);
@@ -746,7 +745,7 @@ static inline struct dma_async_tx_descriptor *dmaengine_prep_dma_cyclic(
 		unsigned long flags)
 {
 	return chan->device->device_prep_dma_cyclic(chan, buf_addr, buf_len,
-						period_len, dir, flags, NULL);
+						period_len, dir, flags);
 }
 
 static inline struct dma_async_tx_descriptor *dmaengine_prep_interleaved_dma(
diff --git a/include/linux/drbd.h b/include/linux/drbd.h
index 3dbe9bd57a09..debb70d40547 100644
--- a/include/linux/drbd.h
+++ b/include/linux/drbd.h
@@ -52,7 +52,7 @@
 #endif
 
 extern const char *drbd_buildtag(void);
-#define REL_VERSION "8.4.3"
+#define REL_VERSION "8.4.5"
 #define API_VERSION 1
 #define PRO_VERSION_MIN 86
 #define PRO_VERSION_MAX 101
@@ -245,7 +245,7 @@ enum drbd_disk_state {
 	D_DISKLESS,
 	D_ATTACHING,      /* In the process of reading the meta-data */
 	D_FAILED,         /* Becomes D_DISKLESS as soon as we told it the peer */
-			/* when >= D_FAILED it is legal to access mdev->bc */
+			  /* when >= D_FAILED it is legal to access mdev->ldev */
 	D_NEGOTIATING,    /* Late attaching state, we need to talk to the peer */
 	D_INCONSISTENT,
 	D_OUTDATED,
diff --git a/include/linux/drbd_genl.h b/include/linux/drbd_genl.h
index 4193f5f2636c..7b131ed8f9c6 100644
--- a/include/linux/drbd_genl.h
+++ b/include/linux/drbd_genl.h
@@ -171,6 +171,10 @@ GENL_struct(DRBD_NLA_NET_CONF, 5, net_conf,
 	__flg_field(28, DRBD_GENLA_F_MANDATORY | DRBD_F_INVARIANT,	tentative)
 	__flg_field_def(29,	DRBD_GENLA_F_MANDATORY,	use_rle, DRBD_USE_RLE_DEF)
 	/* 9: __u32_field_def(30,	DRBD_GENLA_F_MANDATORY,	fencing_policy, DRBD_FENCING_DEF) */
+	/* 9: __str_field_def(31,     DRBD_GENLA_F_MANDATORY, name, SHARED_SECRET_MAX) */
+	/* 9: __u32_field(32,         DRBD_F_REQUIRED | DRBD_F_INVARIANT,     peer_node_id) */
+	__flg_field_def(33, 0 /* OPTIONAL */,	csums_after_crash_only, DRBD_CSUMS_AFTER_CRASH_ONLY_DEF)
+	__u32_field_def(34, 0 /* OPTIONAL */, sock_check_timeo, DRBD_SOCKET_CHECK_TIMEO_DEF)
 )
 
 GENL_struct(DRBD_NLA_SET_ROLE_PARMS, 6, set_role_parms,
diff --git a/include/linux/drbd_limits.h b/include/linux/drbd_limits.h
index 17e50bb00521..8ac8c5d9a3ad 100644
--- a/include/linux/drbd_limits.h
+++ b/include/linux/drbd_limits.h
@@ -214,6 +214,7 @@
 #define DRBD_ALLOW_TWO_PRIMARIES_DEF	0
 #define DRBD_ALWAYS_ASBP_DEF	0
 #define DRBD_USE_RLE_DEF	1
+#define DRBD_CSUMS_AFTER_CRASH_ONLY_DEF 0
 
 #define DRBD_AL_STRIPES_MIN     1
 #define DRBD_AL_STRIPES_MAX     1024
@@ -224,4 +225,9 @@
 #define DRBD_AL_STRIPE_SIZE_MAX   16777216
 #define DRBD_AL_STRIPE_SIZE_DEF   32
 #define DRBD_AL_STRIPE_SIZE_SCALE 'k' /* kilobytes */
+
+#define DRBD_SOCKET_CHECK_TIMEO_MIN 0
+#define DRBD_SOCKET_CHECK_TIMEO_MAX DRBD_PING_TIMEO_MAX
+#define DRBD_SOCKET_CHECK_TIMEO_DEF 0
+#define DRBD_SOCKET_CHECK_TIMEO_SCALE '1'
 #endif
diff --git a/include/linux/fs.h b/include/linux/fs.h
index f0890e4a7c25..94187721ad41 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1275,6 +1275,7 @@ struct super_block {
 
 	/* AIO completions deferred from interrupt context */
 	struct workqueue_struct *s_dio_done_wq;
+	struct hlist_head s_pins;
 
 	/*
 	 * Keep the lru lists last in the structure so they always sit on their
@@ -2360,6 +2361,7 @@ extern int do_pipe_flags(int *, int);
 
 extern int kernel_read(struct file *, loff_t, char *, unsigned long);
 extern ssize_t kernel_write(struct file *, const char *, size_t, loff_t);
+extern ssize_t __kernel_write(struct file *, const char *, size_t, loff_t *);
 extern struct file * open_exec(const char *);
  
 /* fs/dcache.c -- generic fs support functions */
diff --git a/include/linux/fs_pin.h b/include/linux/fs_pin.h
new file mode 100644
index 000000000000..f66525e72ccf
--- /dev/null
+++ b/include/linux/fs_pin.h
@@ -0,0 +1,17 @@
+#include <linux/fs.h>
+
+struct fs_pin {
+	atomic_long_t		count;
+	union {
+		struct {
+			struct hlist_node	s_list;
+			struct hlist_node	m_list;
+		};
+		struct rcu_head rcu;
+	};
+	void (*kill)(struct fs_pin *);
+};
+
+void pin_put(struct fs_pin *);
+void pin_remove(struct fs_pin *);
+void pin_insert(struct fs_pin *, struct vfsmount *);
diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h
index 4967916fe4ac..d69f0577a319 100644
--- a/include/linux/if_vlan.h
+++ b/include/linux/if_vlan.h
@@ -187,7 +187,6 @@ vlan_dev_get_egress_qos_mask(struct net_device *dev, u32 skprio)
 }
 
 extern bool vlan_do_receive(struct sk_buff **skb);
-extern struct sk_buff *vlan_untag(struct sk_buff *skb);
 
 extern int vlan_vid_add(struct net_device *dev, __be16 proto, u16 vid);
 extern void vlan_vid_del(struct net_device *dev, __be16 proto, u16 vid);
@@ -241,11 +240,6 @@ static inline bool vlan_do_receive(struct sk_buff **skb)
 	return false;
 }
 
-static inline struct sk_buff *vlan_untag(struct sk_buff *skb)
-{
-	return skb;
-}
-
 static inline int vlan_vid_add(struct net_device *dev, __be16 proto, u16 vid)
 {
 	return 0;
diff --git a/include/linux/mmc/dw_mmc.h b/include/linux/mmc/dw_mmc.h
index babaea93bca6..29ce014ab421 100644
--- a/include/linux/mmc/dw_mmc.h
+++ b/include/linux/mmc/dw_mmc.h
@@ -213,6 +213,8 @@ struct dw_mci_dma_ops {
 #define DW_MCI_QUIRK_HIGHSPEED			BIT(2)
 /* Unreliable card detection */
 #define DW_MCI_QUIRK_BROKEN_CARD_DETECTION	BIT(3)
+/* No write protect */
+#define DW_MCI_QUIRK_NO_WRITE_PROTECT		BIT(4)
 
 /* Slot level quirks */
 /* This slot has no write protect */
diff --git a/include/linux/mmc/sdhci.h b/include/linux/mmc/sdhci.h
index 08abe9941884..09ebe57d5ce9 100644
--- a/include/linux/mmc/sdhci.h
+++ b/include/linux/mmc/sdhci.h
@@ -104,9 +104,6 @@ struct sdhci_host {
 
 	const struct sdhci_ops *ops;	/* Low level hw interface */
 
-	struct regulator *vmmc;		/* Power regulator (vmmc) */
-	struct regulator *vqmmc;	/* Signaling regulator (vccq) */
-
 	/* Internal data */
 	struct mmc_host *mmc;	/* MMC structure */
 	u64 dma_mask;		/* custom DMA mask */
diff --git a/include/linux/mount.h b/include/linux/mount.h
index b0c1e6574e7f..9262e4bf0cc3 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -69,6 +69,7 @@ struct vfsmount {
 };
 
 struct file; /* forward dec */
+struct path;
 
 extern int mnt_want_write(struct vfsmount *mnt);
 extern int mnt_want_write_file(struct file *file);
@@ -77,8 +78,7 @@ extern void mnt_drop_write(struct vfsmount *mnt);
 extern void mnt_drop_write_file(struct file *file);
 extern void mntput(struct vfsmount *mnt);
 extern struct vfsmount *mntget(struct vfsmount *mnt);
-extern void mnt_pin(struct vfsmount *mnt);
-extern void mnt_unpin(struct vfsmount *mnt);
+extern struct vfsmount *mnt_clone_internal(struct path *path);
 extern int __mnt_is_readonly(struct vfsmount *mnt);
 
 struct file_system_type;
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index e30f6059ecd6..5180a7ededec 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -52,6 +52,7 @@ struct nfs_access_entry {
 	unsigned long		jiffies;
 	struct rpc_cred *	cred;
 	int			mask;
+	struct rcu_head		rcu_head;
 };
 
 struct nfs_lockowner {
@@ -352,6 +353,7 @@ extern int nfs_release(struct inode *, struct file *);
 extern int nfs_attribute_timeout(struct inode *inode);
 extern int nfs_attribute_cache_expired(struct inode *inode);
 extern int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode);
+extern int nfs_revalidate_inode_rcu(struct nfs_server *server, struct inode *inode);
 extern int __nfs_revalidate_inode(struct nfs_server *, struct inode *);
 extern int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping);
 extern int nfs_setattr(struct dentry *, struct iattr *);
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 1150ea41b626..922be2e050f5 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -45,6 +45,7 @@ struct nfs_client {
 	struct sockaddr_storage	cl_addr;	/* server identifier */
 	size_t			cl_addrlen;
 	char *			cl_hostname;	/* hostname of server */
+	char *			cl_acceptor;	/* GSSAPI acceptor name */
 	struct list_head	cl_share_link;	/* link in global client list */
 	struct list_head	cl_superblocks;	/* List of nfs_server structs */
 
diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h
index 7d9096d95d4a..6ad2bbcad405 100644
--- a/include/linux/nfs_page.h
+++ b/include/linux/nfs_page.h
@@ -26,7 +26,7 @@ enum {
 	PG_MAPPED,		/* page private set for buffered io */
 	PG_CLEAN,		/* write succeeded */
 	PG_COMMIT_TO_DS,	/* used by pnfs layouts */
-	PG_INODE_REF,		/* extra ref held by inode (head req only) */
+	PG_INODE_REF,		/* extra ref held by inode when in writeback */
 	PG_HEADLOCK,		/* page group lock of wb_head */
 	PG_TEARDOWN,		/* page group sync for destroy */
 	PG_UNLOCKPAGE,		/* page group sync bit in read path */
@@ -62,12 +62,13 @@ struct nfs_pageio_ops {
 
 struct nfs_rw_ops {
 	const fmode_t rw_mode;
-	struct nfs_rw_header *(*rw_alloc_header)(void);
-	void (*rw_free_header)(struct nfs_rw_header *);
-	void (*rw_release)(struct nfs_pgio_data *);
-	int  (*rw_done)(struct rpc_task *, struct nfs_pgio_data *, struct inode *);
-	void (*rw_result)(struct rpc_task *, struct nfs_pgio_data *);
-	void (*rw_initiate)(struct nfs_pgio_data *, struct rpc_message *,
+	struct nfs_pgio_header *(*rw_alloc_header)(void);
+	void (*rw_free_header)(struct nfs_pgio_header *);
+	void (*rw_release)(struct nfs_pgio_header *);
+	int  (*rw_done)(struct rpc_task *, struct nfs_pgio_header *,
+			struct inode *);
+	void (*rw_result)(struct rpc_task *, struct nfs_pgio_header *);
+	void (*rw_initiate)(struct nfs_pgio_header *, struct rpc_message *,
 			    struct rpc_task_setup *, int);
 };
 
@@ -111,6 +112,8 @@ extern	void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
 			     int how);
 extern	int nfs_pageio_add_request(struct nfs_pageio_descriptor *,
 				   struct nfs_page *);
+extern  int nfs_pageio_resend(struct nfs_pageio_descriptor *,
+			      struct nfs_pgio_header *);
 extern	void nfs_pageio_complete(struct nfs_pageio_descriptor *desc);
 extern	void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *, pgoff_t);
 extern size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc,
@@ -119,7 +122,7 @@ extern size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc,
 extern  int nfs_wait_on_request(struct nfs_page *);
 extern	void nfs_unlock_request(struct nfs_page *req);
 extern	void nfs_unlock_and_release_request(struct nfs_page *);
-extern void nfs_page_group_lock(struct nfs_page *);
+extern int nfs_page_group_lock(struct nfs_page *, bool);
 extern void nfs_page_group_unlock(struct nfs_page *);
 extern bool nfs_page_group_sync_on_bit(struct nfs_page *, unsigned int);
 
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 9a1396e70310..0040629894df 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -993,6 +993,7 @@ struct nfs4_setclientid {
 	unsigned int			sc_uaddr_len;
 	char				sc_uaddr[RPCBIND_MAXUADDRLEN + 1];
 	u32				sc_cb_ident;
+	struct rpc_cred			*sc_cred;
 };
 
 struct nfs4_setclientid_res {
@@ -1253,18 +1254,12 @@ enum {
 	NFS_IOHDR_ERROR = 0,
 	NFS_IOHDR_EOF,
 	NFS_IOHDR_REDO,
-	NFS_IOHDR_NEED_COMMIT,
-	NFS_IOHDR_NEED_RESCHED,
 };
 
-struct nfs_pgio_data;
-
 struct nfs_pgio_header {
 	struct inode		*inode;
 	struct rpc_cred		*cred;
 	struct list_head	pages;
-	struct nfs_pgio_data	*data;
-	atomic_t		refcnt;
 	struct nfs_page		*req;
 	struct nfs_writeverf	verf;		/* Used for writes */
 	struct pnfs_layout_segment *lseg;
@@ -1281,28 +1276,22 @@ struct nfs_pgio_header {
 	int			error;		/* merge with pnfs_error */
 	unsigned long		good_bytes;	/* boundary of good data */
 	unsigned long		flags;
-};
 
-struct nfs_pgio_data {
-	struct nfs_pgio_header	*header;
+	/*
+	 * rpc data
+	 */
 	struct rpc_task		task;
 	struct nfs_fattr	fattr;
-	struct nfs_writeverf	verf;		/* Used for writes */
 	struct nfs_pgio_args	args;		/* argument struct */
 	struct nfs_pgio_res	res;		/* result struct */
 	unsigned long		timestamp;	/* For lease renewal */
-	int (*pgio_done_cb) (struct rpc_task *task, struct nfs_pgio_data *data);
+	int (*pgio_done_cb)(struct rpc_task *, struct nfs_pgio_header *);
 	__u64			mds_offset;	/* Filelayout dense stripe */
-	struct nfs_page_array	pages;
+	struct nfs_page_array	page_array;
 	struct nfs_client	*ds_clp;	/* pNFS data server */
 	int			ds_idx;		/* ds index if ds_clp is set */
 };
 
-struct nfs_rw_header {
-	struct nfs_pgio_header	header;
-	struct nfs_pgio_data	rpc_data;
-};
-
 struct nfs_mds_commit_info {
 	atomic_t rpcs_out;
 	unsigned long		ncommit;
@@ -1432,11 +1421,12 @@ struct nfs_rpc_ops {
 			     struct nfs_pathconf *);
 	int	(*set_capabilities)(struct nfs_server *, struct nfs_fh *);
 	int	(*decode_dirent)(struct xdr_stream *, struct nfs_entry *, int);
-	int	(*pgio_rpc_prepare)(struct rpc_task *, struct nfs_pgio_data *);
-	void	(*read_setup)   (struct nfs_pgio_data *, struct rpc_message *);
-	int	(*read_done)  (struct rpc_task *, struct nfs_pgio_data *);
-	void	(*write_setup)  (struct nfs_pgio_data *, struct rpc_message *);
-	int	(*write_done)  (struct rpc_task *, struct nfs_pgio_data *);
+	int	(*pgio_rpc_prepare)(struct rpc_task *,
+				    struct nfs_pgio_header *);
+	void	(*read_setup)(struct nfs_pgio_header *, struct rpc_message *);
+	int	(*read_done)(struct rpc_task *, struct nfs_pgio_header *);
+	void	(*write_setup)(struct nfs_pgio_header *, struct rpc_message *);
+	int	(*write_done)(struct rpc_task *, struct nfs_pgio_header *);
 	void	(*commit_setup) (struct nfs_commit_data *, struct rpc_message *);
 	void	(*commit_rpc_prepare)(struct rpc_task *, struct nfs_commit_data *);
 	int	(*commit_done) (struct rpc_task *, struct nfs_commit_data *);
diff --git a/include/linux/of_dma.h b/include/linux/of_dma.h
index ae36298ba076..56bc026c143f 100644
--- a/include/linux/of_dma.h
+++ b/include/linux/of_dma.h
@@ -41,6 +41,8 @@ extern struct dma_chan *of_dma_request_slave_channel(struct device_node *np,
 						     const char *name);
 extern struct dma_chan *of_dma_simple_xlate(struct of_phandle_args *dma_spec,
 		struct of_dma *ofdma);
+extern struct dma_chan *of_dma_xlate_by_chan_id(struct of_phandle_args *dma_spec,
+		struct of_dma *ofdma);
 #else
 static inline int of_dma_controller_register(struct device_node *np,
 		struct dma_chan *(*of_dma_xlate)
@@ -66,6 +68,8 @@ static inline struct dma_chan *of_dma_simple_xlate(struct of_phandle_args *dma_s
 	return NULL;
 }
 
+#define of_dma_xlate_by_chan_id NULL
+
 #endif
 
 #endif /* __LINUX_OF_DMA_H */
diff --git a/include/linux/platform_data/dma-imx.h b/include/linux/platform_data/dma-imx.h
index d05542aafa3e..6a1357d31871 100644
--- a/include/linux/platform_data/dma-imx.h
+++ b/include/linux/platform_data/dma-imx.h
@@ -40,6 +40,7 @@ enum sdma_peripheral_type {
 	IMX_DMATYPE_ASRC,	/* ASRC */
 	IMX_DMATYPE_ESAI,	/* ESAI */
 	IMX_DMATYPE_SSI_DUAL,	/* SSI Dual FIFO */
+	IMX_DMATYPE_ASRC_SP,	/* Shared ASRC */
 };
 
 enum imx_dma_prio {
diff --git a/include/linux/platform_data/edma.h b/include/linux/platform_data/edma.h
index eb8d5627d080..bdb2710e2aab 100644
--- a/include/linux/platform_data/edma.h
+++ b/include/linux/platform_data/edma.h
@@ -150,6 +150,8 @@ void edma_clear_event(unsigned channel);
 void edma_pause(unsigned channel);
 void edma_resume(unsigned channel);
 
+void edma_assign_channel_eventq(unsigned channel, enum dma_event_q eventq_no);
+
 struct edma_rsv_info {
 
 	const s16	(*rsv_chans)[2];
diff --git a/include/linux/platform_data/mmc-omap.h b/include/linux/platform_data/mmc-omap.h
index 2bf1b30cb5dc..51e70cf25cbc 100644
--- a/include/linux/platform_data/mmc-omap.h
+++ b/include/linux/platform_data/mmc-omap.h
@@ -28,6 +28,7 @@
  */
 #define OMAP_HSMMC_SUPPORTS_DUAL_VOLT		BIT(0)
 #define OMAP_HSMMC_BROKEN_MULTIBLOCK_READ	BIT(1)
+#define OMAP_HSMMC_SWAKEUP_MISSING		BIT(2)
 
 struct mmc_card;
 
diff --git a/include/linux/quota.h b/include/linux/quota.h
index 0f3c5d38da1f..80d345a3524c 100644
--- a/include/linux/quota.h
+++ b/include/linux/quota.h
@@ -390,7 +390,6 @@ struct quota_info {
 	unsigned int flags;			/* Flags for diskquotas on this device */
 	struct mutex dqio_mutex;		/* lock device while I/O in progress */
 	struct mutex dqonoff_mutex;		/* Serialize quotaon & quotaoff */
-	struct rw_semaphore dqptr_sem;		/* serialize ops using quota_info struct, pointers from inode to dquots */
 	struct inode *files[MAXQUOTAS];		/* inodes of quotafiles */
 	struct mem_dqinfo info[MAXQUOTAS];	/* Information for each quota type */
 	const struct quota_format_ops *ops[MAXQUOTAS];	/* Operations for each type */
diff --git a/include/linux/sh_dma.h b/include/linux/sh_dma.h
index b7b43b82231e..56b97eed28a4 100644
--- a/include/linux/sh_dma.h
+++ b/include/linux/sh_dma.h
@@ -95,19 +95,21 @@ struct sh_dmae_pdata {
 };
 
 /* DMAOR definitions */
-#define DMAOR_AE	0x00000004
+#define DMAOR_AE	0x00000004	/* Address Error Flag */
 #define DMAOR_NMIF	0x00000002
-#define DMAOR_DME	0x00000001
+#define DMAOR_DME	0x00000001	/* DMA Master Enable */
 
 /* Definitions for the SuperH DMAC */
-#define DM_INC	0x00004000
-#define DM_DEC	0x00008000
-#define DM_FIX	0x0000c000
-#define SM_INC	0x00001000
-#define SM_DEC	0x00002000
-#define SM_FIX	0x00003000
-#define CHCR_DE	0x00000001
-#define CHCR_TE	0x00000002
-#define CHCR_IE	0x00000004
+#define DM_INC	0x00004000	/* Destination addresses are incremented */
+#define DM_DEC	0x00008000	/* Destination addresses are decremented */
+#define DM_FIX	0x0000c000	/* Destination address is fixed */
+#define SM_INC	0x00001000	/* Source addresses are incremented */
+#define SM_DEC	0x00002000	/* Source addresses are decremented */
+#define SM_FIX	0x00003000	/* Source address is fixed */
+#define RS_AUTO	0x00000400	/* Auto Request */
+#define RS_ERS	0x00000800	/* DMA extended resource selector */
+#define CHCR_DE	0x00000001	/* DMA Enable */
+#define CHCR_TE	0x00000002	/* Transfer End Flag */
+#define CHCR_IE	0x00000004	/* Interrupt Enable */
 
 #endif
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 11c270551d25..abde271c18ae 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2555,6 +2555,7 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen);
 void skb_scrub_packet(struct sk_buff *skb, bool xnet);
 unsigned int skb_gso_transport_seglen(const struct sk_buff *skb);
 struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features);
+struct sk_buff *skb_vlan_untag(struct sk_buff *skb);
 
 struct skb_checksum_ops {
 	__wsum (*update)(const void *mem, int len, __wsum wsum);
diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h
index 790be1472792..8e030075fe79 100644
--- a/include/linux/sunrpc/auth.h
+++ b/include/linux/sunrpc/auth.h
@@ -103,6 +103,7 @@ struct rpc_auth_create_args {
 
 /* Flags for rpcauth_lookupcred() */
 #define RPCAUTH_LOOKUP_NEW		0x01	/* Accept an uninitialised cred */
+#define RPCAUTH_LOOKUP_RCU		0x02	/* lock-less lookup */
 
 /*
  * Client authentication ops
@@ -140,6 +141,7 @@ struct rpc_credops {
 						void *, __be32 *, void *);
 	int			(*crkey_timeout)(struct rpc_cred *);
 	bool			(*crkey_to_expire)(struct rpc_cred *);
+	char *			(*crstringify_acceptor)(struct rpc_cred *);
 };
 
 extern const struct rpc_authops	authunix_ops;
@@ -153,6 +155,7 @@ void			rpc_destroy_generic_auth(void);
 void 			rpc_destroy_authunix(void);
 
 struct rpc_cred *	rpc_lookup_cred(void);
+struct rpc_cred *	rpc_lookup_cred_nonblock(void);
 struct rpc_cred *	rpc_lookup_machine_cred(const char *service_name);
 int			rpcauth_register(const struct rpc_authops *);
 int			rpcauth_unregister(const struct rpc_authops *);
@@ -182,6 +185,7 @@ void			rpcauth_clear_credcache(struct rpc_cred_cache *);
 int			rpcauth_key_timeout_notify(struct rpc_auth *,
 						struct rpc_cred *);
 bool			rpcauth_cred_key_to_expire(struct rpc_cred *);
+char *			rpcauth_stringify_acceptor(struct rpc_cred *);
 
 static inline
 struct rpc_cred *	get_rpccred(struct rpc_cred *cred)
diff --git a/include/linux/sunrpc/auth_gss.h b/include/linux/sunrpc/auth_gss.h
index f1cfd4c85cd0..36eebc451b41 100644
--- a/include/linux/sunrpc/auth_gss.h
+++ b/include/linux/sunrpc/auth_gss.h
@@ -69,8 +69,9 @@ struct gss_cl_ctx {
 	enum rpc_gss_proc	gc_proc;
 	u32			gc_seq;
 	spinlock_t		gc_seq_lock;
-	struct gss_ctx __rcu	*gc_gss_ctx;
+	struct gss_ctx		*gc_gss_ctx;
 	struct xdr_netobj	gc_wire_ctx;
+	struct xdr_netobj	gc_acceptor;
 	u32			gc_win;
 	unsigned long		gc_expiry;
 	struct rcu_head		gc_rcu;
diff --git a/include/linux/sunrpc/gss_krb5.h b/include/linux/sunrpc/gss_krb5.h
index 5af2931cf58d..df02a4188487 100644
--- a/include/linux/sunrpc/gss_krb5.h
+++ b/include/linux/sunrpc/gss_krb5.h
@@ -81,7 +81,7 @@ struct gss_krb5_enctype {
 		       struct xdr_netobj *in,
 		       struct xdr_netobj *out);	/* complete key generation */
 	u32 (*encrypt_v2) (struct krb5_ctx *kctx, u32 offset,
-			   struct xdr_buf *buf, int ec,
+			   struct xdr_buf *buf,
 			   struct page **pages); /* v2 encryption function */
 	u32 (*decrypt_v2) (struct krb5_ctx *kctx, u32 offset,
 			   struct xdr_buf *buf, u32 *headskip,
@@ -310,7 +310,7 @@ gss_krb5_aes_make_key(const struct gss_krb5_enctype *gk5e,
 
 u32
 gss_krb5_aes_encrypt(struct krb5_ctx *kctx, u32 offset,
-		     struct xdr_buf *buf, int ec,
+		     struct xdr_buf *buf,
 		     struct page **pages);
 
 u32
diff --git a/include/linux/sunrpc/xprtrdma.h b/include/linux/sunrpc/xprtrdma.h
index c2f04e1ae159..64a0a0a97b23 100644
--- a/include/linux/sunrpc/xprtrdma.h
+++ b/include/linux/sunrpc/xprtrdma.h
@@ -62,8 +62,6 @@
 #define RPCRDMA_INLINE_PAD_THRESH  (512)/* payload threshold to pad (bytes) */
 
 /* memory registration strategies */
-#define RPCRDMA_PERSISTENT_REGISTRATION (1)
-
 enum rpcrdma_memreg {
 	RPCRDMA_BOUNCEBUFFERS = 0,
 	RPCRDMA_REGISTER,
diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index f7e11c7ea7d9..0305cde21a74 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -158,6 +158,42 @@ struct thermal_attr {
 	char name[THERMAL_NAME_LENGTH];
 };
 
+/**
+ * struct thermal_zone_device - structure for a thermal zone
+ * @id:		unique id number for each thermal zone
+ * @type:	the thermal zone device type
+ * @device:	&struct device for this thermal zone
+ * @trip_temp_attrs:	attributes for trip points for sysfs: trip temperature
+ * @trip_type_attrs:	attributes for trip points for sysfs: trip type
+ * @trip_hyst_attrs:	attributes for trip points for sysfs: trip hysteresis
+ * @devdata:	private pointer for device private data
+ * @trips:	number of trip points the thermal zone supports
+ * @passive_delay:	number of milliseconds to wait between polls when
+ *			performing passive cooling.  Currenty only used by the
+ *			step-wise governor
+ * @polling_delay:	number of milliseconds to wait between polls when
+ *			checking whether trip points have been crossed (0 for
+ *			interrupt driven systems)
+ * @temperature:	current temperature.  This is only for core code,
+ *			drivers should use thermal_zone_get_temp() to get the
+ *			current temperature
+ * @last_temperature:	previous temperature read
+ * @emul_temperature:	emulated temperature when using CONFIG_THERMAL_EMULATION
+ * @passive:		1 if you've crossed a passive trip point, 0 otherwise.
+ *			Currenty only used by the step-wise governor.
+ * @forced_passive:	If > 0, temperature at which to switch on all ACPI
+ *			processor cooling devices.  Currently only used by the
+ *			step-wise governor.
+ * @ops:	operations this &thermal_zone_device supports
+ * @tzp:	thermal zone parameters
+ * @governor:	pointer to the governor for this thermal zone
+ * @thermal_instances:	list of &struct thermal_instance of this thermal zone
+ * @idr:	&struct idr to generate unique id for this zone's cooling
+ *		devices
+ * @lock:	lock to protect thermal_instances list
+ * @node:	node in thermal_tz_list (in thermal_core.c)
+ * @poll_queue:	delayed work for polling
+ */
 struct thermal_zone_device {
 	int id;
 	char type[THERMAL_NAME_LENGTH];
@@ -179,12 +215,18 @@ struct thermal_zone_device {
 	struct thermal_governor *governor;
 	struct list_head thermal_instances;
 	struct idr idr;
-	struct mutex lock; /* protect thermal_instances list */
+	struct mutex lock;
 	struct list_head node;
 	struct delayed_work poll_queue;
 };
 
-/* Structure that holds thermal governor information */
+/**
+ * struct thermal_governor - structure that holds thermal governor information
+ * @name:	name of the governor
+ * @throttle:	callback called for every trip point even if temperature is
+ *		below the trip point temperature
+ * @governor_list:	node in thermal_governor_list (in thermal_core.c)
+ */
 struct thermal_governor {
 	char name[THERMAL_NAME_LENGTH];
 	int (*throttle)(struct thermal_zone_device *tz, int trip);
diff --git a/include/linux/uio.h b/include/linux/uio.h
index 09a7cffc224e..48d64e6ab292 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -84,7 +84,7 @@ unsigned long iov_iter_alignment(const struct iov_iter *i);
 void iov_iter_init(struct iov_iter *i, int direction, const struct iovec *iov,
 			unsigned long nr_segs, size_t count);
 ssize_t iov_iter_get_pages(struct iov_iter *i, struct page **pages,
-			size_t maxsize, size_t *start);
+			unsigned maxpages, size_t *start);
 ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, struct page ***pages,
 			size_t maxsize, size_t *start);
 int iov_iter_npages(const struct iov_iter *i, int maxpages);
diff --git a/include/scsi/sg.h b/include/scsi/sg.h
index 9859355a7cf9..750e5db7c6bf 100644
--- a/include/scsi/sg.h
+++ b/include/scsi/sg.h
@@ -86,7 +86,9 @@ typedef struct sg_io_hdr
 #define SG_FLAG_MMAP_IO 4       /* request memory mapped IO */
 #define SG_FLAG_NO_DXFER 0x10000 /* no transfer of kernel buffers to/from */
 				/* user space (debug indirect IO) */
-#define SG_FLAG_Q_AT_TAIL 0x10  /* default is Q_AT_HEAD */
+/* defaults:: for sg driver: Q_AT_HEAD; for block layer: Q_AT_TAIL */
+#define SG_FLAG_Q_AT_TAIL 0x10
+#define SG_FLAG_Q_AT_HEAD 0x20
 
 /* following 'info' values are "or"-ed together */
 #define SG_INFO_OK_MASK 0x1
diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h
index c9c3c044b32f..981acf74b14f 100644
--- a/include/trace/events/bcache.h
+++ b/include/trace/events/bcache.h
@@ -148,11 +148,13 @@ TRACE_EVENT(bcache_read,
 );
 
 TRACE_EVENT(bcache_write,
-	TP_PROTO(struct bio *bio, bool writeback, bool bypass),
-	TP_ARGS(bio, writeback, bypass),
+	TP_PROTO(struct cache_set *c, u64 inode, struct bio *bio,
+		bool writeback, bool bypass),
+	TP_ARGS(c, inode, bio, writeback, bypass),
 
 	TP_STRUCT__entry(
-		__field(dev_t,		dev			)
+		__array(char,		uuid,	16		)
+		__field(u64,		inode			)
 		__field(sector_t,	sector			)
 		__field(unsigned int,	nr_sector		)
 		__array(char,		rwbs,	6		)
@@ -161,7 +163,8 @@ TRACE_EVENT(bcache_write,
 	),
 
 	TP_fast_assign(
-		__entry->dev		= bio->bi_bdev->bd_dev;
+		memcpy(__entry->uuid, c->sb.set_uuid, 16);
+		__entry->inode		= inode;
 		__entry->sector		= bio->bi_iter.bi_sector;
 		__entry->nr_sector	= bio->bi_iter.bi_size >> 9;
 		blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_iter.bi_size);
@@ -169,8 +172,8 @@ TRACE_EVENT(bcache_write,
 		__entry->bypass = bypass;
 	),
 
-	TP_printk("%d,%d  %s %llu + %u hit %u bypass %u",
-		  MAJOR(__entry->dev), MINOR(__entry->dev),
+	TP_printk("%pU inode %llu  %s %llu + %u hit %u bypass %u",
+		  __entry->uuid, __entry->inode,
 		  __entry->rwbs, (unsigned long long)__entry->sector,
 		  __entry->nr_sector, __entry->writeback, __entry->bypass)
 );
@@ -258,9 +261,9 @@ DEFINE_EVENT(btree_node, bcache_btree_node_alloc,
 	TP_ARGS(b)
 );
 
-DEFINE_EVENT(btree_node, bcache_btree_node_alloc_fail,
-	TP_PROTO(struct btree *b),
-	TP_ARGS(b)
+DEFINE_EVENT(cache_set, bcache_btree_node_alloc_fail,
+	TP_PROTO(struct cache_set *c),
+	TP_ARGS(c)
 );
 
 DEFINE_EVENT(btree_node, bcache_btree_node_free,
diff --git a/include/uapi/linux/bsg.h b/include/uapi/linux/bsg.h
index 7a12e1c0f371..02986cf8b6f1 100644
--- a/include/uapi/linux/bsg.h
+++ b/include/uapi/linux/bsg.h
@@ -10,12 +10,13 @@
 #define BSG_SUB_PROTOCOL_SCSI_TRANSPORT	2
 
 /*
- * For flags member below
- * sg.h sg_io_hdr also has bits defined for it's flags member. However
- * none of these bits are implemented/used by bsg. The bits below are
- * allocated to not conflict with sg.h ones anyway.
+ * For flag constants below:
+ * sg.h sg_io_hdr also has bits defined for it's flags member. These
+ * two flag values (0x10 and 0x20) have the same meaning in sg.h . For
+ * bsg the BSG_FLAG_Q_AT_HEAD flag is ignored since it is the deafult.
  */
-#define BSG_FLAG_Q_AT_TAIL 0x10 /* default, == 0 at this bit, is Q_AT_HEAD */
+#define BSG_FLAG_Q_AT_TAIL 0x10 /* default is Q_AT_HEAD */
+#define BSG_FLAG_Q_AT_HEAD 0x20
 
 struct sg_io_v4 {
 	__s32 guard;		/* [i] 'Q' to differentiate from v3 */
diff --git a/include/uapi/linux/virtio_blk.h b/include/uapi/linux/virtio_blk.h
index 6d8e61c48563..9ad67b267584 100644
--- a/include/uapi/linux/virtio_blk.h
+++ b/include/uapi/linux/virtio_blk.h
@@ -40,6 +40,7 @@
 #define VIRTIO_BLK_F_WCE	9	/* Writeback mode enabled after reset */
 #define VIRTIO_BLK_F_TOPOLOGY	10	/* Topology information is available */
 #define VIRTIO_BLK_F_CONFIG_WCE	11	/* Writeback mode available in config */
+#define VIRTIO_BLK_F_MQ		12	/* support more than one vq */
 
 #ifndef __KERNEL__
 /* Old (deprecated) name for VIRTIO_BLK_F_WCE. */
@@ -77,6 +78,10 @@ struct virtio_blk_config {
 
 	/* writeback mode (if VIRTIO_BLK_F_CONFIG_WCE) */
 	__u8 wce;
+	__u8 unused;
+
+	/* number of vqs, only available when VIRTIO_BLK_F_MQ is set */
+	__u16 num_queues;
 } __attribute__((packed));
 
 /*
diff --git a/kernel/acct.c b/kernel/acct.c
index 51793520566f..b4c667d22e79 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -59,6 +59,7 @@
 #include <asm/div64.h>
 #include <linux/blkdev.h> /* sector_div */
 #include <linux/pid_namespace.h>
+#include <linux/fs_pin.h>
 
 /*
  * These constants control the amount of freespace that suspend and
@@ -75,172 +76,190 @@ int acct_parm[3] = {4, 2, 30};
 /*
  * External references and all of the globals.
  */
-static void do_acct_process(struct bsd_acct_struct *acct,
-		struct pid_namespace *ns, struct file *);
+static void do_acct_process(struct bsd_acct_struct *acct);
 
-/*
- * This structure is used so that all the data protected by lock
- * can be placed in the same cache line as the lock.  This primes
- * the cache line to have the data after getting the lock.
- */
 struct bsd_acct_struct {
+	struct fs_pin		pin;
+	struct mutex		lock;
 	int			active;
 	unsigned long		needcheck;
 	struct file		*file;
 	struct pid_namespace	*ns;
-	struct list_head	list;
+	struct work_struct	work;
+	struct completion	done;
 };
 
-static DEFINE_SPINLOCK(acct_lock);
-static LIST_HEAD(acct_list);
-
 /*
  * Check the amount of free space and suspend/resume accordingly.
  */
-static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
+static int check_free_space(struct bsd_acct_struct *acct)
 {
 	struct kstatfs sbuf;
-	int res;
-	int act;
-	u64 resume;
-	u64 suspend;
-
-	spin_lock(&acct_lock);
-	res = acct->active;
-	if (!file || time_is_before_jiffies(acct->needcheck))
+
+	if (time_is_before_jiffies(acct->needcheck))
 		goto out;
-	spin_unlock(&acct_lock);
 
 	/* May block */
-	if (vfs_statfs(&file->f_path, &sbuf))
-		return res;
-	suspend = sbuf.f_blocks * SUSPEND;
-	resume = sbuf.f_blocks * RESUME;
-
-	do_div(suspend, 100);
-	do_div(resume, 100);
-
-	if (sbuf.f_bavail <= suspend)
-		act = -1;
-	else if (sbuf.f_bavail >= resume)
-		act = 1;
-	else
-		act = 0;
-
-	/*
-	 * If some joker switched acct->file under us we'ld better be
-	 * silent and _not_ touch anything.
-	 */
-	spin_lock(&acct_lock);
-	if (file != acct->file) {
-		if (act)
-			res = act > 0;
+	if (vfs_statfs(&acct->file->f_path, &sbuf))
 		goto out;
-	}
 
 	if (acct->active) {
-		if (act < 0) {
+		u64 suspend = sbuf.f_blocks * SUSPEND;
+		do_div(suspend, 100);
+		if (sbuf.f_bavail <= suspend) {
 			acct->active = 0;
 			pr_info("Process accounting paused\n");
 		}
 	} else {
-		if (act > 0) {
+		u64 resume = sbuf.f_blocks * RESUME;
+		do_div(resume, 100);
+		if (sbuf.f_bavail >= resume) {
 			acct->active = 1;
 			pr_info("Process accounting resumed\n");
 		}
 	}
 
 	acct->needcheck = jiffies + ACCT_TIMEOUT*HZ;
-	res = acct->active;
 out:
-	spin_unlock(&acct_lock);
+	return acct->active;
+}
+
+static struct bsd_acct_struct *acct_get(struct pid_namespace *ns)
+{
+	struct bsd_acct_struct *res;
+again:
+	smp_rmb();
+	rcu_read_lock();
+	res = ACCESS_ONCE(ns->bacct);
+	if (!res) {
+		rcu_read_unlock();
+		return NULL;
+	}
+	if (!atomic_long_inc_not_zero(&res->pin.count)) {
+		rcu_read_unlock();
+		cpu_relax();
+		goto again;
+	}
+	rcu_read_unlock();
+	mutex_lock(&res->lock);
+	if (!res->ns) {
+		mutex_unlock(&res->lock);
+		pin_put(&res->pin);
+		goto again;
+	}
 	return res;
 }
 
-/*
- * Close the old accounting file (if currently open) and then replace
- * it with file (if non-NULL).
- *
- * NOTE: acct_lock MUST be held on entry and exit.
- */
-static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file,
-		struct pid_namespace *ns)
+static void close_work(struct work_struct *work)
 {
-	struct file *old_acct = NULL;
-	struct pid_namespace *old_ns = NULL;
-
-	if (acct->file) {
-		old_acct = acct->file;
-		old_ns = acct->ns;
-		acct->active = 0;
-		acct->file = NULL;
+	struct bsd_acct_struct *acct = container_of(work, struct bsd_acct_struct, work);
+	struct file *file = acct->file;
+	if (file->f_op->flush)
+		file->f_op->flush(file, NULL);
+	__fput_sync(file);
+	complete(&acct->done);
+}
+
+static void acct_kill(struct bsd_acct_struct *acct,
+		      struct bsd_acct_struct *new)
+{
+	if (acct) {
+		struct pid_namespace *ns = acct->ns;
+		do_acct_process(acct);
+		INIT_WORK(&acct->work, close_work);
+		init_completion(&acct->done);
+		schedule_work(&acct->work);
+		wait_for_completion(&acct->done);
+		pin_remove(&acct->pin);
+		ns->bacct = new;
 		acct->ns = NULL;
-		list_del(&acct->list);
-	}
-	if (file) {
-		acct->file = file;
-		acct->ns = ns;
-		acct->needcheck = jiffies + ACCT_TIMEOUT*HZ;
-		acct->active = 1;
-		list_add(&acct->list, &acct_list);
+		atomic_long_dec(&acct->pin.count);
+		mutex_unlock(&acct->lock);
+		pin_put(&acct->pin);
 	}
-	if (old_acct) {
-		mnt_unpin(old_acct->f_path.mnt);
-		spin_unlock(&acct_lock);
-		do_acct_process(acct, old_ns, old_acct);
-		filp_close(old_acct, NULL);
-		spin_lock(&acct_lock);
+}
+
+static void acct_pin_kill(struct fs_pin *pin)
+{
+	struct bsd_acct_struct *acct;
+	acct = container_of(pin, struct bsd_acct_struct, pin);
+	mutex_lock(&acct->lock);
+	if (!acct->ns) {
+		mutex_unlock(&acct->lock);
+		pin_put(pin);
+		acct = NULL;
 	}
+	acct_kill(acct, NULL);
 }
 
 static int acct_on(struct filename *pathname)
 {
 	struct file *file;
-	struct vfsmount *mnt;
-	struct pid_namespace *ns;
-	struct bsd_acct_struct *acct = NULL;
+	struct vfsmount *mnt, *internal;
+	struct pid_namespace *ns = task_active_pid_ns(current);
+	struct bsd_acct_struct *acct, *old;
+	int err;
+
+	acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL);
+	if (!acct)
+		return -ENOMEM;
 
 	/* Difference from BSD - they don't do O_APPEND */
 	file = file_open_name(pathname, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
-	if (IS_ERR(file))
+	if (IS_ERR(file)) {
+		kfree(acct);
 		return PTR_ERR(file);
+	}
 
 	if (!S_ISREG(file_inode(file)->i_mode)) {
+		kfree(acct);
 		filp_close(file, NULL);
 		return -EACCES;
 	}
 
 	if (!file->f_op->write) {
+		kfree(acct);
 		filp_close(file, NULL);
 		return -EIO;
 	}
-
-	ns = task_active_pid_ns(current);
-	if (ns->bacct == NULL) {
-		acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL);
-		if (acct == NULL) {
-			filp_close(file, NULL);
-			return -ENOMEM;
-		}
+	internal = mnt_clone_internal(&file->f_path);
+	if (IS_ERR(internal)) {
+		kfree(acct);
+		filp_close(file, NULL);
+		return PTR_ERR(internal);
 	}
-
-	spin_lock(&acct_lock);
-	if (ns->bacct == NULL) {
-		ns->bacct = acct;
-		acct = NULL;
+	err = mnt_want_write(internal);
+	if (err) {
+		mntput(internal);
+		kfree(acct);
+		filp_close(file, NULL);
+		return err;
 	}
-
 	mnt = file->f_path.mnt;
-	mnt_pin(mnt);
-	acct_file_reopen(ns->bacct, file, ns);
-	spin_unlock(&acct_lock);
-
-	mntput(mnt); /* it's pinned, now give up active reference */
-	kfree(acct);
-
+	file->f_path.mnt = internal;
+
+	atomic_long_set(&acct->pin.count, 1);
+	acct->pin.kill = acct_pin_kill;
+	acct->file = file;
+	acct->needcheck = jiffies;
+	acct->ns = ns;
+	mutex_init(&acct->lock);
+	mutex_lock_nested(&acct->lock, 1);	/* nobody has seen it yet */
+	pin_insert(&acct->pin, mnt);
+
+	old = acct_get(ns);
+	if (old)
+		acct_kill(old, acct);
+	else
+		ns->bacct = acct;
+	mutex_unlock(&acct->lock);
+	mnt_drop_write(mnt);
+	mntput(mnt);
 	return 0;
 }
 
+static DEFINE_MUTEX(acct_on_mutex);
+
 /**
  * sys_acct - enable/disable process accounting
  * @name: file name for accounting records or NULL to shutdown accounting
@@ -264,78 +283,20 @@ SYSCALL_DEFINE1(acct, const char __user *, name)
 
 		if (IS_ERR(tmp))
 			return PTR_ERR(tmp);
+		mutex_lock(&acct_on_mutex);
 		error = acct_on(tmp);
+		mutex_unlock(&acct_on_mutex);
 		putname(tmp);
 	} else {
-		struct bsd_acct_struct *acct;
-
-		acct = task_active_pid_ns(current)->bacct;
-		if (acct == NULL)
-			return 0;
-
-		spin_lock(&acct_lock);
-		acct_file_reopen(acct, NULL, NULL);
-		spin_unlock(&acct_lock);
+		acct_kill(acct_get(task_active_pid_ns(current)), NULL);
 	}
 
 	return error;
 }
 
-/**
- * acct_auto_close - turn off a filesystem's accounting if it is on
- * @m: vfsmount being shut down
- *
- * If the accounting is turned on for a file in the subtree pointed to
- * to by m, turn accounting off.  Done when m is about to die.
- */
-void acct_auto_close_mnt(struct vfsmount *m)
-{
-	struct bsd_acct_struct *acct;
-
-	spin_lock(&acct_lock);
-restart:
-	list_for_each_entry(acct, &acct_list, list)
-		if (acct->file && acct->file->f_path.mnt == m) {
-			acct_file_reopen(acct, NULL, NULL);
-			goto restart;
-		}
-	spin_unlock(&acct_lock);
-}
-
-/**
- * acct_auto_close - turn off a filesystem's accounting if it is on
- * @sb: super block for the filesystem
- *
- * If the accounting is turned on for a file in the filesystem pointed
- * to by sb, turn accounting off.
- */
-void acct_auto_close(struct super_block *sb)
-{
-	struct bsd_acct_struct *acct;
-
-	spin_lock(&acct_lock);
-restart:
-	list_for_each_entry(acct, &acct_list, list)
-		if (acct->file && acct->file->f_path.dentry->d_sb == sb) {
-			acct_file_reopen(acct, NULL, NULL);
-			goto restart;
-		}
-	spin_unlock(&acct_lock);
-}
-
 void acct_exit_ns(struct pid_namespace *ns)
 {
-	struct bsd_acct_struct *acct = ns->bacct;
-
-	if (acct == NULL)
-		return;
-
-	spin_lock(&acct_lock);
-	if (acct->file != NULL)
-		acct_file_reopen(acct, NULL, NULL);
-	spin_unlock(&acct_lock);
-
-	kfree(acct);
+	acct_kill(acct_get(ns), NULL);
 }
 
 /*
@@ -450,38 +411,20 @@ static u32 encode_float(u64 value)
  *  do_exit() or when switching to a different output file.
  */
 
-/*
- *  do_acct_process does all actual work. Caller holds the reference to file.
- */
-static void do_acct_process(struct bsd_acct_struct *acct,
-		struct pid_namespace *ns, struct file *file)
+static void fill_ac(acct_t *ac)
 {
 	struct pacct_struct *pacct = &current->signal->pacct;
-	acct_t ac;
-	mm_segment_t fs;
-	unsigned long flim;
 	u64 elapsed, run_time;
 	struct tty_struct *tty;
-	const struct cred *orig_cred;
-
-	/* Perform file operations on behalf of whoever enabled accounting */
-	orig_cred = override_creds(file->f_cred);
-
-	/*
-	 * First check to see if there is enough free_space to continue
-	 * the process accounting system.
-	 */
-	if (!check_free_space(acct, file))
-		goto out;
 
 	/*
 	 * Fill the accounting struct with the needed info as recorded
 	 * by the different kernel functions.
 	 */
-	memset(&ac, 0, sizeof(acct_t));
+	memset(ac, 0, sizeof(acct_t));
 
-	ac.ac_version = ACCT_VERSION | ACCT_BYTEORDER;
-	strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm));
+	ac->ac_version = ACCT_VERSION | ACCT_BYTEORDER;
+	strlcpy(ac->ac_comm, current->comm, sizeof(ac->ac_comm));
 
 	/* calculate run_time in nsec*/
 	run_time = ktime_get_ns();
@@ -489,9 +432,9 @@ static void do_acct_process(struct bsd_acct_struct *acct,
 	/* convert nsec -> AHZ */
 	elapsed = nsec_to_AHZ(run_time);
 #if ACCT_VERSION == 3
-	ac.ac_etime = encode_float(elapsed);
+	ac->ac_etime = encode_float(elapsed);
 #else
-	ac.ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ?
+	ac->ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ?
 				(unsigned long) elapsed : (unsigned long) -1l);
 #endif
 #if ACCT_VERSION == 1 || ACCT_VERSION == 2
@@ -499,18 +442,58 @@ static void do_acct_process(struct bsd_acct_struct *acct,
 		/* new enlarged etime field */
 		comp2_t etime = encode_comp2_t(elapsed);
 
-		ac.ac_etime_hi = etime >> 16;
-		ac.ac_etime_lo = (u16) etime;
+		ac->ac_etime_hi = etime >> 16;
+		ac->ac_etime_lo = (u16) etime;
 	}
 #endif
 	do_div(elapsed, AHZ);
-	ac.ac_btime = get_seconds() - elapsed;
+	ac->ac_btime = get_seconds() - elapsed;
+#if ACCT_VERSION==2
+	ac->ac_ahz = AHZ;
+#endif
+
+	spin_lock_irq(&current->sighand->siglock);
+	tty = current->signal->tty;	/* Safe as we hold the siglock */
+	ac->ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0;
+	ac->ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime)));
+	ac->ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime)));
+	ac->ac_flag = pacct->ac_flag;
+	ac->ac_mem = encode_comp_t(pacct->ac_mem);
+	ac->ac_minflt = encode_comp_t(pacct->ac_minflt);
+	ac->ac_majflt = encode_comp_t(pacct->ac_majflt);
+	ac->ac_exitcode = pacct->ac_exitcode;
+	spin_unlock_irq(&current->sighand->siglock);
+}
+/*
+ *  do_acct_process does all actual work. Caller holds the reference to file.
+ */
+static void do_acct_process(struct bsd_acct_struct *acct)
+{
+	acct_t ac;
+	unsigned long flim;
+	const struct cred *orig_cred;
+	struct pid_namespace *ns = acct->ns;
+	struct file *file = acct->file;
+
+	/*
+	 * Accounting records are not subject to resource limits.
+	 */
+	flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
+	current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
+	/* Perform file operations on behalf of whoever enabled accounting */
+	orig_cred = override_creds(file->f_cred);
+
+	/*
+	 * First check to see if there is enough free_space to continue
+	 * the process accounting system.
+	 */
+	if (!check_free_space(acct))
+		goto out;
+
+	fill_ac(&ac);
 	/* we really need to bite the bullet and change layout */
 	ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid);
 	ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid);
-#if ACCT_VERSION == 2
-	ac.ac_ahz = AHZ;
-#endif
 #if ACCT_VERSION == 1 || ACCT_VERSION == 2
 	/* backward-compatible 16 bit fields */
 	ac.ac_uid16 = ac.ac_uid;
@@ -522,45 +505,18 @@ static void do_acct_process(struct bsd_acct_struct *acct,
 	ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns);
 	rcu_read_unlock();
 #endif
-
-	spin_lock_irq(&current->sighand->siglock);
-	tty = current->signal->tty;	/* Safe as we hold the siglock */
-	ac.ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0;
-	ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime)));
-	ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime)));
-	ac.ac_flag = pacct->ac_flag;
-	ac.ac_mem = encode_comp_t(pacct->ac_mem);
-	ac.ac_minflt = encode_comp_t(pacct->ac_minflt);
-	ac.ac_majflt = encode_comp_t(pacct->ac_majflt);
-	ac.ac_exitcode = pacct->ac_exitcode;
-	spin_unlock_irq(&current->sighand->siglock);
-	ac.ac_io = encode_comp_t(0 /* current->io_usage */);	/* %% */
-	ac.ac_rw = encode_comp_t(ac.ac_io / 1024);
-	ac.ac_swaps = encode_comp_t(0);
-
 	/*
 	 * Get freeze protection. If the fs is frozen, just skip the write
 	 * as we could deadlock the system otherwise.
 	 */
-	if (!file_start_write_trylock(file))
-		goto out;
-	/*
-	 * Kernel segment override to datasegment and write it
-	 * to the accounting file.
-	 */
-	fs = get_fs();
-	set_fs(KERNEL_DS);
-	/*
-	 * Accounting records are not subject to resource limits.
-	 */
-	flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
-	current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
-	file->f_op->write(file, (char *)&ac,
-			       sizeof(acct_t), &file->f_pos);
-	current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim;
-	set_fs(fs);
-	file_end_write(file);
+	if (file_start_write_trylock(file)) {
+		/* it's been opened O_APPEND, so position is irrelevant */
+		loff_t pos = 0;
+		__kernel_write(file, (char *)&ac, sizeof(acct_t), &pos);
+		file_end_write(file);
+	}
 out:
+	current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim;
 	revert_creds(orig_cred);
 }
 
@@ -609,34 +565,20 @@ void acct_collect(long exitcode, int group_dead)
 	spin_unlock_irq(&current->sighand->siglock);
 }
 
-static void acct_process_in_ns(struct pid_namespace *ns)
+static void slow_acct_process(struct pid_namespace *ns)
 {
-	struct file *file = NULL;
-	struct bsd_acct_struct *acct;
-
-	acct = ns->bacct;
-	/*
-	 * accelerate the common fastpath:
-	 */
-	if (!acct || !acct->file)
-		return;
-
-	spin_lock(&acct_lock);
-	file = acct->file;
-	if (unlikely(!file)) {
-		spin_unlock(&acct_lock);
-		return;
+	for ( ; ns; ns = ns->parent) {
+		struct bsd_acct_struct *acct = acct_get(ns);
+		if (acct) {
+			do_acct_process(acct);
+			mutex_unlock(&acct->lock);
+			pin_put(&acct->pin);
+		}
 	}
-	get_file(file);
-	spin_unlock(&acct_lock);
-
-	do_acct_process(acct, ns, file);
-	fput(file);
 }
 
 /**
- * acct_process - now just a wrapper around acct_process_in_ns,
- * which in turn is a wrapper around do_acct_process.
+ * acct_process
  *
  * handles process accounting for an exiting task
  */
@@ -649,6 +591,10 @@ void acct_process(void)
 	 * alive and holds its namespace, which in turn holds
 	 * its parent.
 	 */
-	for (ns = task_active_pid_ns(current); ns != NULL; ns = ns->parent)
-		acct_process_in_ns(ns);
+	for (ns = task_active_pid_ns(current); ns != NULL; ns = ns->parent) {
+		if (ns->bacct)
+			break;
+	}
+	if (unlikely(ns))
+		slow_acct_process(ns);
 }
diff --git a/lib/lru_cache.c b/lib/lru_cache.c
index 4a83ecd03650..852c81e3ba9a 100644
--- a/lib/lru_cache.c
+++ b/lib/lru_cache.c
@@ -169,7 +169,7 @@ out_fail:
 	return NULL;
 }
 
-void lc_free_by_index(struct lru_cache *lc, unsigned i)
+static void lc_free_by_index(struct lru_cache *lc, unsigned i)
 {
 	void *p = lc->lc_element[i];
 	WARN_ON(!p);
@@ -643,9 +643,10 @@ void lc_set(struct lru_cache *lc, unsigned int enr, int index)
  * lc_dump - Dump a complete LRU cache to seq in textual form.
  * @lc: the lru cache to operate on
  * @seq: the &struct seq_file pointer to seq_printf into
- * @utext: user supplied "heading" or other info
+ * @utext: user supplied additional "heading" or other info
  * @detail: function pointer the user may provide to dump further details
- * of the object the lc_element is embedded in.
+ * of the object the lc_element is embedded in. May be NULL.
+ * Note: a leading space ' ' and trailing newline '\n' is implied.
  */
 void lc_seq_dump_details(struct seq_file *seq, struct lru_cache *lc, char *utext,
 	     void (*detail) (struct seq_file *, struct lc_element *))
@@ -654,16 +655,18 @@ void lc_seq_dump_details(struct seq_file *seq, struct lru_cache *lc, char *utext
 	struct lc_element *e;
 	int i;
 
-	seq_printf(seq, "\tnn: lc_number refcnt %s\n ", utext);
+	seq_printf(seq, "\tnn: lc_number (new nr) refcnt %s\n ", utext);
 	for (i = 0; i < nr_elements; i++) {
 		e = lc_element_by_index(lc, i);
-		if (e->lc_number == LC_FREE) {
-			seq_printf(seq, "\t%2d: FREE\n", i);
-		} else {
-			seq_printf(seq, "\t%2d: %4u %4u    ", i,
-				   e->lc_number, e->refcnt);
+		if (e->lc_number != e->lc_new_number)
+			seq_printf(seq, "\t%5d: %6d %8d %6d ",
+				i, e->lc_number, e->lc_new_number, e->refcnt);
+		else
+			seq_printf(seq, "\t%5d: %6d %-8s %6d ",
+				i, e->lc_number, "-\"-", e->refcnt);
+		if (detail)
 			detail(seq, e);
-		}
+		seq_putc(seq, '\n');
 	}
 }
 
diff --git a/mm/filemap.c b/mm/filemap.c
index f501b56ec2c6..90effcdf948d 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2602,7 +2602,7 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 		 * that this differs from normal direct-io semantics, which
 		 * will return -EFOO even if some bytes were written.
 		 */
-		if (unlikely(status < 0) && !written) {
+		if (unlikely(status < 0)) {
 			err = status;
 			goto out;
 		}
diff --git a/mm/iov_iter.c b/mm/iov_iter.c
index 7b5dbd1517b5..ab88dc0ea1d3 100644
--- a/mm/iov_iter.c
+++ b/mm/iov_iter.c
@@ -310,7 +310,7 @@ void iov_iter_init(struct iov_iter *i, int direction,
 EXPORT_SYMBOL(iov_iter_init);
 
 static ssize_t get_pages_iovec(struct iov_iter *i,
-		   struct page **pages, size_t maxsize,
+		   struct page **pages, unsigned maxpages,
 		   size_t *start)
 {
 	size_t offset = i->iov_offset;
@@ -323,10 +323,10 @@ static ssize_t get_pages_iovec(struct iov_iter *i,
 	len = iov->iov_len - offset;
 	if (len > i->count)
 		len = i->count;
-	if (len > maxsize)
-		len = maxsize;
 	addr = (unsigned long)iov->iov_base + offset;
 	len += *start = addr & (PAGE_SIZE - 1);
+	if (len > maxpages * PAGE_SIZE)
+		len = maxpages * PAGE_SIZE;
 	addr &= ~(PAGE_SIZE - 1);
 	n = (len + PAGE_SIZE - 1) / PAGE_SIZE;
 	res = get_user_pages_fast(addr, n, (i->type & WRITE) != WRITE, pages);
@@ -588,15 +588,14 @@ static unsigned long alignment_bvec(const struct iov_iter *i)
 }
 
 static ssize_t get_pages_bvec(struct iov_iter *i,
-		   struct page **pages, size_t maxsize,
+		   struct page **pages, unsigned maxpages,
 		   size_t *start)
 {
 	const struct bio_vec *bvec = i->bvec;
 	size_t len = bvec->bv_len - i->iov_offset;
 	if (len > i->count)
 		len = i->count;
-	if (len > maxsize)
-		len = maxsize;
+	/* can't be more than PAGE_SIZE */
 	*start = bvec->bv_offset + i->iov_offset;
 
 	get_page(*pages = bvec->bv_page);
@@ -712,13 +711,13 @@ unsigned long iov_iter_alignment(const struct iov_iter *i)
 EXPORT_SYMBOL(iov_iter_alignment);
 
 ssize_t iov_iter_get_pages(struct iov_iter *i,
-		   struct page **pages, size_t maxsize,
+		   struct page **pages, unsigned maxpages,
 		   size_t *start)
 {
 	if (i->type & ITER_BVEC)
-		return get_pages_bvec(i, pages, maxsize, start);
+		return get_pages_bvec(i, pages, maxpages, start);
 	else
-		return get_pages_iovec(i, pages, maxsize, start);
+		return get_pages_iovec(i, pages, maxpages, start);
 }
 EXPORT_SYMBOL(iov_iter_get_pages);
 
diff --git a/mm/shmem.c b/mm/shmem.c
index a42add14331c..0e5fb225007c 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2323,17 +2323,45 @@ static int shmem_rmdir(struct inode *dir, struct dentry *dentry)
 	return shmem_unlink(dir, dentry);
 }
 
+static int shmem_exchange(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry)
+{
+	bool old_is_dir = S_ISDIR(old_dentry->d_inode->i_mode);
+	bool new_is_dir = S_ISDIR(new_dentry->d_inode->i_mode);
+
+	if (old_dir != new_dir && old_is_dir != new_is_dir) {
+		if (old_is_dir) {
+			drop_nlink(old_dir);
+			inc_nlink(new_dir);
+		} else {
+			drop_nlink(new_dir);
+			inc_nlink(old_dir);
+		}
+	}
+	old_dir->i_ctime = old_dir->i_mtime =
+	new_dir->i_ctime = new_dir->i_mtime =
+	old_dentry->d_inode->i_ctime =
+	new_dentry->d_inode->i_ctime = CURRENT_TIME;
+
+	return 0;
+}
+
 /*
  * The VFS layer already does all the dentry stuff for rename,
  * we just have to decrement the usage count for the target if
  * it exists so that the VFS layer correctly free's it when it
  * gets overwritten.
  */
-static int shmem_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry)
+static int shmem_rename2(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags)
 {
 	struct inode *inode = old_dentry->d_inode;
 	int they_are_dirs = S_ISDIR(inode->i_mode);
 
+	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
+		return -EINVAL;
+
+	if (flags & RENAME_EXCHANGE)
+		return shmem_exchange(old_dir, old_dentry, new_dir, new_dentry);
+
 	if (!simple_empty(new_dentry))
 		return -ENOTEMPTY;
 
@@ -3087,7 +3115,7 @@ static const struct inode_operations shmem_dir_inode_operations = {
 	.mkdir		= shmem_mkdir,
 	.rmdir		= shmem_rmdir,
 	.mknod		= shmem_mknod,
-	.rename		= shmem_rename,
+	.rename2	= shmem_rename2,
 	.tmpfile	= shmem_tmpfile,
 #endif
 #ifdef CONFIG_TMPFS_XATTR
diff --git a/net/6lowpan/Kconfig b/net/6lowpan/Kconfig
index 028a5c6d1f61..e4a02ef55102 100644
--- a/net/6lowpan/Kconfig
+++ b/net/6lowpan/Kconfig
@@ -1,5 +1,5 @@
 config 6LOWPAN
-	bool "6LoWPAN Support"
+	tristate "6LoWPAN Support"
 	depends on IPV6
 	---help---
 	  This enables IPv6 over Low power Wireless Personal Area Network -
diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c
index 75d427763992..90cc2bdd4064 100644
--- a/net/8021q/vlan_core.c
+++ b/net/8021q/vlan_core.c
@@ -112,59 +112,6 @@ __be16 vlan_dev_vlan_proto(const struct net_device *dev)
 }
 EXPORT_SYMBOL(vlan_dev_vlan_proto);
 
-static struct sk_buff *vlan_reorder_header(struct sk_buff *skb)
-{
-	if (skb_cow(skb, skb_headroom(skb)) < 0) {
-		kfree_skb(skb);
-		return NULL;
-	}
-
-	memmove(skb->data - ETH_HLEN, skb->data - VLAN_ETH_HLEN, 2 * ETH_ALEN);
-	skb->mac_header += VLAN_HLEN;
-	return skb;
-}
-
-struct sk_buff *vlan_untag(struct sk_buff *skb)
-{
-	struct vlan_hdr *vhdr;
-	u16 vlan_tci;
-
-	if (unlikely(vlan_tx_tag_present(skb))) {
-		/* vlan_tci is already set-up so leave this for another time */
-		return skb;
-	}
-
-	skb = skb_share_check(skb, GFP_ATOMIC);
-	if (unlikely(!skb))
-		goto err_free;
-
-	if (unlikely(!pskb_may_pull(skb, VLAN_HLEN)))
-		goto err_free;
-
-	vhdr = (struct vlan_hdr *) skb->data;
-	vlan_tci = ntohs(vhdr->h_vlan_TCI);
-	__vlan_hwaccel_put_tag(skb, skb->protocol, vlan_tci);
-
-	skb_pull_rcsum(skb, VLAN_HLEN);
-	vlan_set_encap_proto(skb, vhdr);
-
-	skb = vlan_reorder_header(skb);
-	if (unlikely(!skb))
-		goto err_free;
-
-	skb_reset_network_header(skb);
-	skb_reset_transport_header(skb);
-	skb_reset_mac_len(skb);
-
-	return skb;
-
-err_free:
-	kfree_skb(skb);
-	return NULL;
-}
-EXPORT_SYMBOL(vlan_untag);
-
-
 /*
  * vlan info and vid list
  */
diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c
index 96b66fd30f96..ab6bb2af1d45 100644
--- a/net/batman-adv/multicast.c
+++ b/net/batman-adv/multicast.c
@@ -20,7 +20,6 @@
 #include "originator.h"
 #include "hard-interface.h"
 #include "translation-table.h"
-#include "multicast.h"
 
 /**
  * batadv_mcast_mla_softif_get - get softif multicast listeners
diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c
index febb0f87fa37..e1bcd653899b 100644
--- a/net/bridge/br_vlan.c
+++ b/net/bridge/br_vlan.c
@@ -181,7 +181,7 @@ bool br_allowed_ingress(struct net_bridge *br, struct net_port_vlans *v,
 	 */
 	if (unlikely(!vlan_tx_tag_present(skb) &&
 		     skb->protocol == proto)) {
-		skb = vlan_untag(skb);
+		skb = skb_vlan_untag(skb);
 		if (unlikely(!skb))
 			return false;
 	}
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index 1059ed3bc255..6d69631b9f4d 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -327,10 +327,7 @@ find_inlist_lock_noload(struct list_head *head, const char *name, int *error,
 		char name[EBT_FUNCTION_MAXNAMELEN];
 	} *e;
 
-	*error = mutex_lock_interruptible(mutex);
-	if (*error != 0)
-		return NULL;
-
+	mutex_lock(mutex);
 	list_for_each_entry(e, head, list) {
 		if (strcmp(e->name, name) == 0)
 			return e;
@@ -1203,10 +1200,7 @@ ebt_register_table(struct net *net, const struct ebt_table *input_table)
 
 	table->private = newinfo;
 	rwlock_init(&table->lock);
-	ret = mutex_lock_interruptible(&ebt_mutex);
-	if (ret != 0)
-		goto free_chainstack;
-
+	mutex_lock(&ebt_mutex);
 	list_for_each_entry(t, &net->xt.tables[NFPROTO_BRIDGE], list) {
 		if (strcmp(t->name, table->name) == 0) {
 			ret = -EEXIST;
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 1948d592aa54..b2f571dd933d 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -174,6 +174,7 @@ static struct lock_class_key socket_class;
 #define SKIP_BUF_SIZE	1024
 
 static void queue_con(struct ceph_connection *con);
+static void cancel_con(struct ceph_connection *con);
 static void con_work(struct work_struct *);
 static void con_fault(struct ceph_connection *con);
 
@@ -680,7 +681,7 @@ void ceph_con_close(struct ceph_connection *con)
 
 	reset_connection(con);
 	con->peer_global_seq = 0;
-	cancel_delayed_work(&con->work);
+	cancel_con(con);
 	con_close_socket(con);
 	mutex_unlock(&con->mutex);
 }
@@ -900,7 +901,7 @@ static void ceph_msg_data_pages_cursor_init(struct ceph_msg_data_cursor *cursor,
 	BUG_ON(page_count > (int)USHRT_MAX);
 	cursor->page_count = (unsigned short)page_count;
 	BUG_ON(length > SIZE_MAX - cursor->page_offset);
-	cursor->last_piece = (size_t)cursor->page_offset + length <= PAGE_SIZE;
+	cursor->last_piece = cursor->page_offset + cursor->resid <= PAGE_SIZE;
 }
 
 static struct page *
@@ -2667,19 +2668,16 @@ static int queue_con_delay(struct ceph_connection *con, unsigned long delay)
 {
 	if (!con->ops->get(con)) {
 		dout("%s %p ref count 0\n", __func__, con);
-
 		return -ENOENT;
 	}
 
 	if (!queue_delayed_work(ceph_msgr_wq, &con->work, delay)) {
 		dout("%s %p - already queued\n", __func__, con);
 		con->ops->put(con);
-
 		return -EBUSY;
 	}
 
 	dout("%s %p %lu\n", __func__, con, delay);
-
 	return 0;
 }
 
@@ -2688,6 +2686,14 @@ static void queue_con(struct ceph_connection *con)
 	(void) queue_con_delay(con, 0);
 }
 
+static void cancel_con(struct ceph_connection *con)
+{
+	if (cancel_delayed_work(&con->work)) {
+		dout("%s %p\n", __func__, con);
+		con->ops->put(con);
+	}
+}
+
 static bool con_sock_closed(struct ceph_connection *con)
 {
 	if (!con_flag_test_and_clear(con, CON_FLAG_SOCK_CLOSED))
@@ -3269,24 +3275,21 @@ static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip)
 /*
  * Free a generically kmalloc'd message.
  */
-void ceph_msg_kfree(struct ceph_msg *m)
+static void ceph_msg_free(struct ceph_msg *m)
 {
-	dout("msg_kfree %p\n", m);
+	dout("%s %p\n", __func__, m);
 	ceph_kvfree(m->front.iov_base);
 	kmem_cache_free(ceph_msg_cache, m);
 }
 
-/*
- * Drop a msg ref.  Destroy as needed.
- */
-void ceph_msg_last_put(struct kref *kref)
+static void ceph_msg_release(struct kref *kref)
 {
 	struct ceph_msg *m = container_of(kref, struct ceph_msg, kref);
 	LIST_HEAD(data);
 	struct list_head *links;
 	struct list_head *next;
 
-	dout("ceph_msg_put last one on %p\n", m);
+	dout("%s %p\n", __func__, m);
 	WARN_ON(!list_empty(&m->list_head));
 
 	/* drop middle, data, if any */
@@ -3308,9 +3311,25 @@ void ceph_msg_last_put(struct kref *kref)
 	if (m->pool)
 		ceph_msgpool_put(m->pool, m);
 	else
-		ceph_msg_kfree(m);
+		ceph_msg_free(m);
+}
+
+struct ceph_msg *ceph_msg_get(struct ceph_msg *msg)
+{
+	dout("%s %p (was %d)\n", __func__, msg,
+	     atomic_read(&msg->kref.refcount));
+	kref_get(&msg->kref);
+	return msg;
+}
+EXPORT_SYMBOL(ceph_msg_get);
+
+void ceph_msg_put(struct ceph_msg *msg)
+{
+	dout("%s %p (was %d)\n", __func__, msg,
+	     atomic_read(&msg->kref.refcount));
+	kref_put(&msg->kref, ceph_msg_release);
 }
-EXPORT_SYMBOL(ceph_msg_last_put);
+EXPORT_SYMBOL(ceph_msg_put);
 
 void ceph_msg_dump(struct ceph_msg *msg)
 {
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 05be0c181695..30f6faf3584f 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -297,12 +297,21 @@ static void osd_req_op_data_release(struct ceph_osd_request *osd_req,
 /*
  * requests
  */
-void ceph_osdc_release_request(struct kref *kref)
+static void ceph_osdc_release_request(struct kref *kref)
 {
-	struct ceph_osd_request *req;
+	struct ceph_osd_request *req = container_of(kref,
+					    struct ceph_osd_request, r_kref);
 	unsigned int which;
 
-	req = container_of(kref, struct ceph_osd_request, r_kref);
+	dout("%s %p (r_request %p r_reply %p)\n", __func__, req,
+	     req->r_request, req->r_reply);
+	WARN_ON(!RB_EMPTY_NODE(&req->r_node));
+	WARN_ON(!list_empty(&req->r_req_lru_item));
+	WARN_ON(!list_empty(&req->r_osd_item));
+	WARN_ON(!list_empty(&req->r_linger_item));
+	WARN_ON(!list_empty(&req->r_linger_osd_item));
+	WARN_ON(req->r_osd);
+
 	if (req->r_request)
 		ceph_msg_put(req->r_request);
 	if (req->r_reply) {
@@ -320,7 +329,22 @@ void ceph_osdc_release_request(struct kref *kref)
 		kmem_cache_free(ceph_osd_request_cache, req);
 
 }
-EXPORT_SYMBOL(ceph_osdc_release_request);
+
+void ceph_osdc_get_request(struct ceph_osd_request *req)
+{
+	dout("%s %p (was %d)\n", __func__, req,
+	     atomic_read(&req->r_kref.refcount));
+	kref_get(&req->r_kref);
+}
+EXPORT_SYMBOL(ceph_osdc_get_request);
+
+void ceph_osdc_put_request(struct ceph_osd_request *req)
+{
+	dout("%s %p (was %d)\n", __func__, req,
+	     atomic_read(&req->r_kref.refcount));
+	kref_put(&req->r_kref, ceph_osdc_release_request);
+}
+EXPORT_SYMBOL(ceph_osdc_put_request);
 
 struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
 					       struct ceph_snap_context *snapc,
@@ -364,7 +388,7 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
 	RB_CLEAR_NODE(&req->r_node);
 	INIT_LIST_HEAD(&req->r_unsafe_item);
 	INIT_LIST_HEAD(&req->r_linger_item);
-	INIT_LIST_HEAD(&req->r_linger_osd);
+	INIT_LIST_HEAD(&req->r_linger_osd_item);
 	INIT_LIST_HEAD(&req->r_req_lru_item);
 	INIT_LIST_HEAD(&req->r_osd_item);
 
@@ -916,7 +940,7 @@ static void __kick_osd_requests(struct ceph_osd_client *osdc,
 	 * list at the end to keep things in tid order.
 	 */
 	list_for_each_entry_safe(req, nreq, &osd->o_linger_requests,
-				 r_linger_osd) {
+				 r_linger_osd_item) {
 		/*
 		 * reregister request prior to unregistering linger so
 		 * that r_osd is preserved.
@@ -1008,6 +1032,8 @@ static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
 {
 	dout("__remove_osd %p\n", osd);
 	BUG_ON(!list_empty(&osd->o_requests));
+	BUG_ON(!list_empty(&osd->o_linger_requests));
+
 	rb_erase(&osd->o_node, &osdc->osds);
 	list_del_init(&osd->o_osd_lru);
 	ceph_con_close(&osd->o_con);
@@ -1029,12 +1055,23 @@ static void remove_all_osds(struct ceph_osd_client *osdc)
 static void __move_osd_to_lru(struct ceph_osd_client *osdc,
 			      struct ceph_osd *osd)
 {
-	dout("__move_osd_to_lru %p\n", osd);
+	dout("%s %p\n", __func__, osd);
 	BUG_ON(!list_empty(&osd->o_osd_lru));
+
 	list_add_tail(&osd->o_osd_lru, &osdc->osd_lru);
 	osd->lru_ttl = jiffies + osdc->client->options->osd_idle_ttl * HZ;
 }
 
+static void maybe_move_osd_to_lru(struct ceph_osd_client *osdc,
+				  struct ceph_osd *osd)
+{
+	dout("%s %p\n", __func__, osd);
+
+	if (list_empty(&osd->o_requests) &&
+	    list_empty(&osd->o_linger_requests))
+		__move_osd_to_lru(osdc, osd);
+}
+
 static void __remove_osd_from_lru(struct ceph_osd *osd)
 {
 	dout("__remove_osd_from_lru %p\n", osd);
@@ -1175,6 +1212,7 @@ static void __unregister_request(struct ceph_osd_client *osdc,
 
 	dout("__unregister_request %p tid %lld\n", req, req->r_tid);
 	rb_erase(&req->r_node, &osdc->requests);
+	RB_CLEAR_NODE(&req->r_node);
 	osdc->num_requests--;
 
 	if (req->r_osd) {
@@ -1182,12 +1220,8 @@ static void __unregister_request(struct ceph_osd_client *osdc,
 		ceph_msg_revoke(req->r_request);
 
 		list_del_init(&req->r_osd_item);
-		if (list_empty(&req->r_osd->o_requests) &&
-		    list_empty(&req->r_osd->o_linger_requests)) {
-			dout("moving osd to %p lru\n", req->r_osd);
-			__move_osd_to_lru(osdc, req->r_osd);
-		}
-		if (list_empty(&req->r_linger_item))
+		maybe_move_osd_to_lru(osdc, req->r_osd);
+		if (list_empty(&req->r_linger_osd_item))
 			req->r_osd = NULL;
 	}
 
@@ -1214,45 +1248,39 @@ static void __cancel_request(struct ceph_osd_request *req)
 static void __register_linger_request(struct ceph_osd_client *osdc,
 				    struct ceph_osd_request *req)
 {
-	dout("__register_linger_request %p\n", req);
+	dout("%s %p tid %llu\n", __func__, req, req->r_tid);
+	WARN_ON(!req->r_linger);
+
 	ceph_osdc_get_request(req);
 	list_add_tail(&req->r_linger_item, &osdc->req_linger);
 	if (req->r_osd)
-		list_add_tail(&req->r_linger_osd,
+		list_add_tail(&req->r_linger_osd_item,
 			      &req->r_osd->o_linger_requests);
 }
 
 static void __unregister_linger_request(struct ceph_osd_client *osdc,
 					struct ceph_osd_request *req)
 {
-	dout("__unregister_linger_request %p\n", req);
+	WARN_ON(!req->r_linger);
+
+	if (list_empty(&req->r_linger_item)) {
+		dout("%s %p tid %llu not registered\n", __func__, req,
+		     req->r_tid);
+		return;
+	}
+
+	dout("%s %p tid %llu\n", __func__, req, req->r_tid);
 	list_del_init(&req->r_linger_item);
-	if (req->r_osd) {
-		list_del_init(&req->r_linger_osd);
 
-		if (list_empty(&req->r_osd->o_requests) &&
-		    list_empty(&req->r_osd->o_linger_requests)) {
-			dout("moving osd to %p lru\n", req->r_osd);
-			__move_osd_to_lru(osdc, req->r_osd);
-		}
+	if (req->r_osd) {
+		list_del_init(&req->r_linger_osd_item);
+		maybe_move_osd_to_lru(osdc, req->r_osd);
 		if (list_empty(&req->r_osd_item))
 			req->r_osd = NULL;
 	}
 	ceph_osdc_put_request(req);
 }
 
-void ceph_osdc_unregister_linger_request(struct ceph_osd_client *osdc,
-					 struct ceph_osd_request *req)
-{
-	mutex_lock(&osdc->request_mutex);
-	if (req->r_linger) {
-		req->r_linger = 0;
-		__unregister_linger_request(osdc, req);
-	}
-	mutex_unlock(&osdc->request_mutex);
-}
-EXPORT_SYMBOL(ceph_osdc_unregister_linger_request);
-
 void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc,
 				  struct ceph_osd_request *req)
 {
@@ -2430,6 +2458,25 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc,
 EXPORT_SYMBOL(ceph_osdc_start_request);
 
 /*
+ * Unregister a registered request.  The request is not completed (i.e.
+ * no callbacks or wakeups) - higher layers are supposed to know what
+ * they are canceling.
+ */
+void ceph_osdc_cancel_request(struct ceph_osd_request *req)
+{
+	struct ceph_osd_client *osdc = req->r_osdc;
+
+	mutex_lock(&osdc->request_mutex);
+	if (req->r_linger)
+		__unregister_linger_request(osdc, req);
+	__unregister_request(osdc, req);
+	mutex_unlock(&osdc->request_mutex);
+
+	dout("%s %p tid %llu canceled\n", __func__, req, req->r_tid);
+}
+EXPORT_SYMBOL(ceph_osdc_cancel_request);
+
+/*
  * wait for a request to complete
  */
 int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
@@ -2437,18 +2484,18 @@ int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
 {
 	int rc;
 
+	dout("%s %p tid %llu\n", __func__, req, req->r_tid);
+
 	rc = wait_for_completion_interruptible(&req->r_completion);
 	if (rc < 0) {
-		mutex_lock(&osdc->request_mutex);
-		__cancel_request(req);
-		__unregister_request(osdc, req);
-		mutex_unlock(&osdc->request_mutex);
+		dout("%s %p tid %llu interrupted\n", __func__, req, req->r_tid);
+		ceph_osdc_cancel_request(req);
 		complete_request(req);
-		dout("wait_request tid %llu canceled/timed out\n", req->r_tid);
 		return rc;
 	}
 
-	dout("wait_request tid %llu result %d\n", req->r_tid, req->r_result);
+	dout("%s %p tid %llu result %d\n", __func__, req, req->r_tid,
+	     req->r_result);
 	return req->r_result;
 }
 EXPORT_SYMBOL(ceph_osdc_wait_request);
diff --git a/net/core/dev.c b/net/core/dev.c
index 1c15b189c52b..b65a5051361f 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3602,7 +3602,7 @@ another_round:
 
 	if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
 	    skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
-		skb = vlan_untag(skb);
+		skb = skb_vlan_untag(skb);
 		if (unlikely(!skb))
 			goto unlock;
 	}
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 8d39071f32d7..f0493e3b7471 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -804,7 +804,8 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev,
 			(nla_total_size(sizeof(struct ifla_vf_mac)) +
 			 nla_total_size(sizeof(struct ifla_vf_vlan)) +
 			 nla_total_size(sizeof(struct ifla_vf_spoofchk)) +
-			 nla_total_size(sizeof(struct ifla_vf_rate)));
+			 nla_total_size(sizeof(struct ifla_vf_rate)) +
+			 nla_total_size(sizeof(struct ifla_vf_link_state)));
 		return size;
 	} else
 		return 0;
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 224506a6fa80..163b673f9e62 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -62,6 +62,7 @@
 #include <linux/scatterlist.h>
 #include <linux/errqueue.h>
 #include <linux/prefetch.h>
+#include <linux/if_vlan.h>
 
 #include <net/protocol.h>
 #include <net/dst.h>
@@ -3973,3 +3974,55 @@ unsigned int skb_gso_transport_seglen(const struct sk_buff *skb)
 	return shinfo->gso_size;
 }
 EXPORT_SYMBOL_GPL(skb_gso_transport_seglen);
+
+static struct sk_buff *skb_reorder_vlan_header(struct sk_buff *skb)
+{
+	if (skb_cow(skb, skb_headroom(skb)) < 0) {
+		kfree_skb(skb);
+		return NULL;
+	}
+
+	memmove(skb->data - ETH_HLEN, skb->data - VLAN_ETH_HLEN, 2 * ETH_ALEN);
+	skb->mac_header += VLAN_HLEN;
+	return skb;
+}
+
+struct sk_buff *skb_vlan_untag(struct sk_buff *skb)
+{
+	struct vlan_hdr *vhdr;
+	u16 vlan_tci;
+
+	if (unlikely(vlan_tx_tag_present(skb))) {
+		/* vlan_tci is already set-up so leave this for another time */
+		return skb;
+	}
+
+	skb = skb_share_check(skb, GFP_ATOMIC);
+	if (unlikely(!skb))
+		goto err_free;
+
+	if (unlikely(!pskb_may_pull(skb, VLAN_HLEN)))
+		goto err_free;
+
+	vhdr = (struct vlan_hdr *)skb->data;
+	vlan_tci = ntohs(vhdr->h_vlan_TCI);
+	__vlan_hwaccel_put_tag(skb, skb->protocol, vlan_tci);
+
+	skb_pull_rcsum(skb, VLAN_HLEN);
+	vlan_set_encap_proto(skb, vhdr);
+
+	skb = skb_reorder_vlan_header(skb);
+	if (unlikely(!skb))
+		goto err_free;
+
+	skb_reset_network_header(skb);
+	skb_reset_transport_header(skb);
+	skb_reset_mac_len(skb);
+
+	return skb;
+
+err_free:
+	kfree_skb(skb);
+	return NULL;
+}
+EXPORT_SYMBOL(skb_vlan_untag);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 190199851c9a..eaa4b000c7b4 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1798,8 +1798,6 @@ local_input:
 no_route:
 	RT_CACHE_STAT_INC(in_no_route);
 	res.type = RTN_UNREACHABLE;
-	if (err == -ESRCH)
-		err = -ENETUNREACH;
 	goto local_input;
 
 	/*
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 1fbab0cdd302..a93c97f106d4 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -35,11 +35,7 @@ EXPORT_SYMBOL_GPL(nf_ipv6_ops);
 
 int nf_register_afinfo(const struct nf_afinfo *afinfo)
 {
-	int err;
-
-	err = mutex_lock_interruptible(&afinfo_mutex);
-	if (err < 0)
-		return err;
+	mutex_lock(&afinfo_mutex);
 	RCU_INIT_POINTER(nf_afinfo[afinfo->family], afinfo);
 	mutex_unlock(&afinfo_mutex);
 	return 0;
@@ -68,11 +64,8 @@ static DEFINE_MUTEX(nf_hook_mutex);
 int nf_register_hook(struct nf_hook_ops *reg)
 {
 	struct nf_hook_ops *elem;
-	int err;
 
-	err = mutex_lock_interruptible(&nf_hook_mutex);
-	if (err < 0)
-		return err;
+	mutex_lock(&nf_hook_mutex);
 	list_for_each_entry(elem, &nf_hooks[reg->pf][reg->hooknum], list) {
 		if (reg->priority < elem->priority)
 			break;
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 8416307fdd1d..fd3f444a4f96 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -2271,10 +2271,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
 	    cmd == IP_VS_SO_SET_STOPDAEMON) {
 		struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
 
-		if (mutex_lock_interruptible(&ipvs->sync_mutex)) {
-			ret = -ERESTARTSYS;
-			goto out_dec;
-		}
+		mutex_lock(&ipvs->sync_mutex);
 		if (cmd == IP_VS_SO_SET_STARTDAEMON)
 			ret = start_sync_thread(net, dm->state, dm->mcast_ifn,
 						dm->syncid);
@@ -2284,11 +2281,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
 		goto out_dec;
 	}
 
-	if (mutex_lock_interruptible(&__ip_vs_mutex)) {
-		ret = -ERESTARTSYS;
-		goto out_dec;
-	}
-
+	mutex_lock(&__ip_vs_mutex);
 	if (cmd == IP_VS_SO_SET_FLUSH) {
 		/* Flush the virtual service */
 		ret = ip_vs_flush(net, false);
@@ -2573,9 +2566,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
 		struct ip_vs_daemon_user d[2];
 
 		memset(&d, 0, sizeof(d));
-		if (mutex_lock_interruptible(&ipvs->sync_mutex))
-			return -ERESTARTSYS;
-
+		mutex_lock(&ipvs->sync_mutex);
 		if (ipvs->sync_state & IP_VS_STATE_MASTER) {
 			d[0].state = IP_VS_STATE_MASTER;
 			strlcpy(d[0].mcast_ifn, ipvs->master_mcast_ifn,
@@ -2594,9 +2585,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
 		return ret;
 	}
 
-	if (mutex_lock_interruptible(&__ip_vs_mutex))
-		return -ERESTARTSYS;
-
+	mutex_lock(&__ip_vs_mutex);
 	switch (cmd) {
 	case IP_VS_SO_GET_VERSION:
 	{
diff --git a/net/netfilter/nf_sockopt.c b/net/netfilter/nf_sockopt.c
index f042ae521557..c68c1e58b362 100644
--- a/net/netfilter/nf_sockopt.c
+++ b/net/netfilter/nf_sockopt.c
@@ -26,9 +26,7 @@ int nf_register_sockopt(struct nf_sockopt_ops *reg)
 	struct nf_sockopt_ops *ops;
 	int ret = 0;
 
-	if (mutex_lock_interruptible(&nf_sockopt_mutex) != 0)
-		return -EINTR;
-
+	mutex_lock(&nf_sockopt_mutex);
 	list_for_each_entry(ops, &nf_sockopts, list) {
 		if (ops->pf == reg->pf
 		    && (overlap(ops->set_optmin, ops->set_optmax,
@@ -65,9 +63,7 @@ static struct nf_sockopt_ops *nf_sockopt_find(struct sock *sk, u_int8_t pf,
 {
 	struct nf_sockopt_ops *ops;
 
-	if (mutex_lock_interruptible(&nf_sockopt_mutex) != 0)
-		return ERR_PTR(-EINTR);
-
+	mutex_lock(&nf_sockopt_mutex);
 	list_for_each_entry(ops, &nf_sockopts, list) {
 		if (ops->pf == pf) {
 			if (!try_module_get(ops->owner))
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index b8035c2d6667..deeb95fb7028 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -899,6 +899,9 @@ static struct nft_stats __percpu *nft_stats_alloc(const struct nlattr *attr)
 static void nft_chain_stats_replace(struct nft_base_chain *chain,
 				    struct nft_stats __percpu *newstats)
 {
+	if (newstats == NULL)
+		return;
+
 	if (chain->stats) {
 		struct nft_stats __percpu *oldstats =
 				nft_dereference(chain->stats);
@@ -3134,16 +3137,13 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set,
 		goto err2;
 
 	trans = nft_trans_elem_alloc(ctx, NFT_MSG_DELSETELEM, set);
-	if (trans == NULL)
+	if (trans == NULL) {
+		err = -ENOMEM;
 		goto err2;
+	}
 
 	nft_trans_elem(trans) = elem;
 	list_add_tail(&trans->list, &ctx->net->nft.commit_list);
-
-	nft_data_uninit(&elem.key, NFT_DATA_VALUE);
-	if (set->flags & NFT_SET_MAP)
-		nft_data_uninit(&elem.data, set->dtype);
-
 	return 0;
 err2:
 	nft_data_uninit(&elem.key, desc.type);
@@ -3310,7 +3310,7 @@ static int nf_tables_commit(struct sk_buff *skb)
 {
 	struct net *net = sock_net(skb->sk);
 	struct nft_trans *trans, *next;
-	struct nft_set *set;
+	struct nft_trans_elem *te;
 
 	/* Bump generation counter, invalidate any dump in progress */
 	while (++net->nft.base_seq == 0);
@@ -3396,13 +3396,17 @@ static int nf_tables_commit(struct sk_buff *skb)
 			nft_trans_destroy(trans);
 			break;
 		case NFT_MSG_DELSETELEM:
-			nf_tables_setelem_notify(&trans->ctx,
-						 nft_trans_elem_set(trans),
-						 &nft_trans_elem(trans),
+			te = (struct nft_trans_elem *)trans->data;
+			nf_tables_setelem_notify(&trans->ctx, te->set,
+						 &te->elem,
 						 NFT_MSG_DELSETELEM, 0);
-			set = nft_trans_elem_set(trans);
-			set->ops->get(set, &nft_trans_elem(trans));
-			set->ops->remove(set, &nft_trans_elem(trans));
+			te->set->ops->get(te->set, &te->elem);
+			te->set->ops->remove(te->set, &te->elem);
+			nft_data_uninit(&te->elem.key, NFT_DATA_VALUE);
+			if (te->elem.flags & NFT_SET_MAP) {
+				nft_data_uninit(&te->elem.data,
+						te->set->dtype);
+			}
 			nft_trans_destroy(trans);
 			break;
 		}
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 47b978bc3100..272ae4d6fdf4 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -71,18 +71,14 @@ static const char *const xt_prefix[NFPROTO_NUMPROTO] = {
 static const unsigned int xt_jumpstack_multiplier = 2;
 
 /* Registration hooks for targets. */
-int
-xt_register_target(struct xt_target *target)
+int xt_register_target(struct xt_target *target)
 {
 	u_int8_t af = target->family;
-	int ret;
 
-	ret = mutex_lock_interruptible(&xt[af].mutex);
-	if (ret != 0)
-		return ret;
+	mutex_lock(&xt[af].mutex);
 	list_add(&target->list, &xt[af].target);
 	mutex_unlock(&xt[af].mutex);
-	return ret;
+	return 0;
 }
 EXPORT_SYMBOL(xt_register_target);
 
@@ -125,20 +121,14 @@ xt_unregister_targets(struct xt_target *target, unsigned int n)
 }
 EXPORT_SYMBOL(xt_unregister_targets);
 
-int
-xt_register_match(struct xt_match *match)
+int xt_register_match(struct xt_match *match)
 {
 	u_int8_t af = match->family;
-	int ret;
-
-	ret = mutex_lock_interruptible(&xt[af].mutex);
-	if (ret != 0)
-		return ret;
 
+	mutex_lock(&xt[af].mutex);
 	list_add(&match->list, &xt[af].match);
 	mutex_unlock(&xt[af].mutex);
-
-	return ret;
+	return 0;
 }
 EXPORT_SYMBOL(xt_register_match);
 
@@ -194,9 +184,7 @@ struct xt_match *xt_find_match(u8 af, const char *name, u8 revision)
 	struct xt_match *m;
 	int err = -ENOENT;
 
-	if (mutex_lock_interruptible(&xt[af].mutex) != 0)
-		return ERR_PTR(-EINTR);
-
+	mutex_lock(&xt[af].mutex);
 	list_for_each_entry(m, &xt[af].match, list) {
 		if (strcmp(m->name, name) == 0) {
 			if (m->revision == revision) {
@@ -239,9 +227,7 @@ struct xt_target *xt_find_target(u8 af, const char *name, u8 revision)
 	struct xt_target *t;
 	int err = -ENOENT;
 
-	if (mutex_lock_interruptible(&xt[af].mutex) != 0)
-		return ERR_PTR(-EINTR);
-
+	mutex_lock(&xt[af].mutex);
 	list_for_each_entry(t, &xt[af].target, list) {
 		if (strcmp(t->name, name) == 0) {
 			if (t->revision == revision) {
@@ -323,10 +309,7 @@ int xt_find_revision(u8 af, const char *name, u8 revision, int target,
 {
 	int have_rev, best = -1;
 
-	if (mutex_lock_interruptible(&xt[af].mutex) != 0) {
-		*err = -EINTR;
-		return 1;
-	}
+	mutex_lock(&xt[af].mutex);
 	if (target == 1)
 		have_rev = target_revfn(af, name, revision, &best);
 	else
@@ -732,9 +715,7 @@ struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af,
 {
 	struct xt_table *t;
 
-	if (mutex_lock_interruptible(&xt[af].mutex) != 0)
-		return ERR_PTR(-EINTR);
-
+	mutex_lock(&xt[af].mutex);
 	list_for_each_entry(t, &net->xt.tables[af], list)
 		if (strcmp(t->name, name) == 0 && try_module_get(t->me))
 			return t;
@@ -883,10 +864,7 @@ struct xt_table *xt_register_table(struct net *net,
 		goto out;
 	}
 
-	ret = mutex_lock_interruptible(&xt[table->af].mutex);
-	if (ret != 0)
-		goto out_free;
-
+	mutex_lock(&xt[table->af].mutex);
 	/* Don't autoload: we'd eat our tail... */
 	list_for_each_entry(t, &net->xt.tables[table->af], list) {
 		if (strcmp(t->name, table->name) == 0) {
@@ -911,9 +889,8 @@ struct xt_table *xt_register_table(struct net *net,
 	mutex_unlock(&xt[table->af].mutex);
 	return table;
 
- unlock:
+unlock:
 	mutex_unlock(&xt[table->af].mutex);
-out_free:
 	kfree(table);
 out:
 	return ERR_PTR(ret);
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index a324b4b34c90..2e152e5f2186 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -213,7 +213,7 @@ static int __netlink_deliver_tap_skb(struct sk_buff *skb,
 		nskb->protocol = htons((u16) sk->sk_protocol);
 		nskb->pkt_type = netlink_is_kernel(sk) ?
 				 PACKET_KERNEL : PACKET_USER;
-
+		skb_reset_network_header(nskb);
 		ret = dev_queue_xmit(nskb);
 		if (unlikely(ret > 0))
 			ret = net_xmit_errno(ret);
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 7ad3f029baae..7228ec3faf19 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -47,8 +47,6 @@
 #include <linux/openvswitch.h>
 #include <linux/rculist.h>
 #include <linux/dmi.h>
-#include <linux/genetlink.h>
-#include <net/genetlink.h>
 #include <net/genetlink.h>
 #include <net/net_namespace.h>
 #include <net/netns/generic.h>
diff --git a/net/sunrpc/addr.c b/net/sunrpc/addr.c
index a622ad64acd8..2e0a6f92e563 100644
--- a/net/sunrpc/addr.c
+++ b/net/sunrpc/addr.c
@@ -176,7 +176,7 @@ static int rpc_parse_scope_id(struct net *net, const char *buf,
 	len = (buf + buflen) - delim - 1;
 	p = kstrndup(delim + 1, len, GFP_KERNEL);
 	if (p) {
-		unsigned long scope_id = 0;
+		u32 scope_id = 0;
 		struct net_device *dev;
 
 		dev = dev_get_by_name(net, p);
@@ -184,7 +184,7 @@ static int rpc_parse_scope_id(struct net *net, const char *buf,
 			scope_id = dev->ifindex;
 			dev_put(dev);
 		} else {
-			if (strict_strtoul(p, 10, &scope_id) == 0) {
+			if (kstrtou32(p, 10, &scope_id) == 0) {
 				kfree(p);
 				return 0;
 			}
@@ -304,7 +304,7 @@ char *rpc_sockaddr2uaddr(const struct sockaddr *sap, gfp_t gfp_flags)
  * @sap: buffer into which to plant socket address
  * @salen: size of buffer
  *
- * @uaddr does not have to be '\0'-terminated, but strict_strtoul() and
+ * @uaddr does not have to be '\0'-terminated, but kstrtou8() and
  * rpc_pton() require proper string termination to be successful.
  *
  * Returns the size of the socket address if successful; otherwise
@@ -315,7 +315,7 @@ size_t rpc_uaddr2sockaddr(struct net *net, const char *uaddr,
 			  const size_t salen)
 {
 	char *c, buf[RPCBIND_MAXUADDRLEN + sizeof('\0')];
-	unsigned long portlo, porthi;
+	u8 portlo, porthi;
 	unsigned short port;
 
 	if (uaddr_len > RPCBIND_MAXUADDRLEN)
@@ -327,18 +327,14 @@ size_t rpc_uaddr2sockaddr(struct net *net, const char *uaddr,
 	c = strrchr(buf, '.');
 	if (unlikely(c == NULL))
 		return 0;
-	if (unlikely(strict_strtoul(c + 1, 10, &portlo) != 0))
-		return 0;
-	if (unlikely(portlo > 255))
+	if (unlikely(kstrtou8(c + 1, 10, &portlo) != 0))
 		return 0;
 
 	*c = '\0';
 	c = strrchr(buf, '.');
 	if (unlikely(c == NULL))
 		return 0;
-	if (unlikely(strict_strtoul(c + 1, 10, &porthi) != 0))
-		return 0;
-	if (unlikely(porthi > 255))
+	if (unlikely(kstrtou8(c + 1, 10, &porthi) != 0))
 		return 0;
 
 	port = (unsigned short)((porthi << 8) | portlo);
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index f77366717420..383eb919ac0b 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -48,7 +48,7 @@ static int param_set_hashtbl_sz(const char *val, const struct kernel_param *kp)
 
 	if (!val)
 		goto out_inval;
-	ret = strict_strtoul(val, 0, &num);
+	ret = kstrtoul(val, 0, &num);
 	if (ret == -EINVAL)
 		goto out_inval;
 	nbits = fls(num);
@@ -80,6 +80,10 @@ static struct kernel_param_ops param_ops_hashtbl_sz = {
 module_param_named(auth_hashtable_size, auth_hashbits, hashtbl_sz, 0644);
 MODULE_PARM_DESC(auth_hashtable_size, "RPC credential cache hashtable size");
 
+static unsigned long auth_max_cred_cachesize = ULONG_MAX;
+module_param(auth_max_cred_cachesize, ulong, 0644);
+MODULE_PARM_DESC(auth_max_cred_cachesize, "RPC credential maximum total cache size");
+
 static u32
 pseudoflavor_to_flavor(u32 flavor) {
 	if (flavor > RPC_AUTH_MAXFLAVOR)
@@ -363,6 +367,15 @@ rpcauth_cred_key_to_expire(struct rpc_cred *cred)
 }
 EXPORT_SYMBOL_GPL(rpcauth_cred_key_to_expire);
 
+char *
+rpcauth_stringify_acceptor(struct rpc_cred *cred)
+{
+	if (!cred->cr_ops->crstringify_acceptor)
+		return NULL;
+	return cred->cr_ops->crstringify_acceptor(cred);
+}
+EXPORT_SYMBOL_GPL(rpcauth_stringify_acceptor);
+
 /*
  * Destroy a list of credentials
  */
@@ -472,6 +485,20 @@ rpcauth_prune_expired(struct list_head *free, int nr_to_scan)
 	return freed;
 }
 
+static unsigned long
+rpcauth_cache_do_shrink(int nr_to_scan)
+{
+	LIST_HEAD(free);
+	unsigned long freed;
+
+	spin_lock(&rpc_credcache_lock);
+	freed = rpcauth_prune_expired(&free, nr_to_scan);
+	spin_unlock(&rpc_credcache_lock);
+	rpcauth_destroy_credlist(&free);
+
+	return freed;
+}
+
 /*
  * Run memory cache shrinker.
  */
@@ -479,9 +506,6 @@ static unsigned long
 rpcauth_cache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
 
 {
-	LIST_HEAD(free);
-	unsigned long freed;
-
 	if ((sc->gfp_mask & GFP_KERNEL) != GFP_KERNEL)
 		return SHRINK_STOP;
 
@@ -489,12 +513,7 @@ rpcauth_cache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
 	if (list_empty(&cred_unused))
 		return SHRINK_STOP;
 
-	spin_lock(&rpc_credcache_lock);
-	freed = rpcauth_prune_expired(&free, sc->nr_to_scan);
-	spin_unlock(&rpc_credcache_lock);
-	rpcauth_destroy_credlist(&free);
-
-	return freed;
+	return rpcauth_cache_do_shrink(sc->nr_to_scan);
 }
 
 static unsigned long
@@ -504,6 +523,21 @@ rpcauth_cache_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
 	return (number_cred_unused / 100) * sysctl_vfs_cache_pressure;
 }
 
+static void
+rpcauth_cache_enforce_limit(void)
+{
+	unsigned long diff;
+	unsigned int nr_to_scan;
+
+	if (number_cred_unused <= auth_max_cred_cachesize)
+		return;
+	diff = number_cred_unused - auth_max_cred_cachesize;
+	nr_to_scan = 100;
+	if (diff < nr_to_scan)
+		nr_to_scan = diff;
+	rpcauth_cache_do_shrink(nr_to_scan);
+}
+
 /*
  * Look up a process' credentials in the authentication cache
  */
@@ -523,6 +557,12 @@ rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred,
 	hlist_for_each_entry_rcu(entry, &cache->hashtable[nr], cr_hash) {
 		if (!entry->cr_ops->crmatch(acred, entry, flags))
 			continue;
+		if (flags & RPCAUTH_LOOKUP_RCU) {
+			if (test_bit(RPCAUTH_CRED_HASHED, &entry->cr_flags) &&
+			    !test_bit(RPCAUTH_CRED_NEW, &entry->cr_flags))
+				cred = entry;
+			break;
+		}
 		spin_lock(&cache->lock);
 		if (test_bit(RPCAUTH_CRED_HASHED, &entry->cr_flags) == 0) {
 			spin_unlock(&cache->lock);
@@ -537,6 +577,9 @@ rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred,
 	if (cred != NULL)
 		goto found;
 
+	if (flags & RPCAUTH_LOOKUP_RCU)
+		return ERR_PTR(-ECHILD);
+
 	new = auth->au_ops->crcreate(auth, acred, flags);
 	if (IS_ERR(new)) {
 		cred = new;
@@ -557,6 +600,7 @@ rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred,
 	} else
 		list_add_tail(&new->cr_lru, &free);
 	spin_unlock(&cache->lock);
+	rpcauth_cache_enforce_limit();
 found:
 	if (test_bit(RPCAUTH_CRED_NEW, &cred->cr_flags) &&
 	    cred->cr_ops->cr_init != NULL &&
@@ -586,10 +630,8 @@ rpcauth_lookupcred(struct rpc_auth *auth, int flags)
 	memset(&acred, 0, sizeof(acred));
 	acred.uid = cred->fsuid;
 	acred.gid = cred->fsgid;
-	acred.group_info = get_group_info(((struct cred *)cred)->group_info);
-
+	acred.group_info = cred->group_info;
 	ret = auth->au_ops->lookup_cred(auth, &acred, flags);
-	put_group_info(acred.group_info);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(rpcauth_lookupcred);
diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c
index ed04869b2d4f..6f6b829c9e8e 100644
--- a/net/sunrpc/auth_generic.c
+++ b/net/sunrpc/auth_generic.c
@@ -38,6 +38,12 @@ struct rpc_cred *rpc_lookup_cred(void)
 }
 EXPORT_SYMBOL_GPL(rpc_lookup_cred);
 
+struct rpc_cred *rpc_lookup_cred_nonblock(void)
+{
+	return rpcauth_lookupcred(&generic_auth, RPCAUTH_LOOKUP_RCU);
+}
+EXPORT_SYMBOL_GPL(rpc_lookup_cred_nonblock);
+
 /*
  * Public call interface for looking up machine creds.
  */
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index b6e440baccc3..afb292cd797d 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -183,8 +183,9 @@ gss_cred_get_ctx(struct rpc_cred *cred)
 	struct gss_cl_ctx *ctx = NULL;
 
 	rcu_read_lock();
-	if (gss_cred->gc_ctx)
-		ctx = gss_get_ctx(gss_cred->gc_ctx);
+	ctx = rcu_dereference(gss_cred->gc_ctx);
+	if (ctx)
+		gss_get_ctx(ctx);
 	rcu_read_unlock();
 	return ctx;
 }
@@ -262,9 +263,22 @@ gss_fill_context(const void *p, const void *end, struct gss_cl_ctx *ctx, struct
 		p = ERR_PTR(ret);
 		goto err;
 	}
-	dprintk("RPC:       %s Success. gc_expiry %lu now %lu timeout %u\n",
-		__func__, ctx->gc_expiry, now, timeout);
-	return q;
+
+	/* is there any trailing data? */
+	if (q == end) {
+		p = q;
+		goto done;
+	}
+
+	/* pull in acceptor name (if there is one) */
+	p = simple_get_netobj(q, end, &ctx->gc_acceptor);
+	if (IS_ERR(p))
+		goto err;
+done:
+	dprintk("RPC:       %s Success. gc_expiry %lu now %lu timeout %u acceptor %.*s\n",
+		__func__, ctx->gc_expiry, now, timeout, ctx->gc_acceptor.len,
+		ctx->gc_acceptor.data);
+	return p;
 err:
 	dprintk("RPC:       %s returns error %ld\n", __func__, -PTR_ERR(p));
 	return p;
@@ -1194,13 +1208,13 @@ gss_destroying_context(struct rpc_cred *cred)
 {
 	struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base);
 	struct gss_auth *gss_auth = container_of(cred->cr_auth, struct gss_auth, rpc_auth);
+	struct gss_cl_ctx *ctx = rcu_dereference_protected(gss_cred->gc_ctx, 1);
 	struct rpc_task *task;
 
-	if (gss_cred->gc_ctx == NULL ||
-	    test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) == 0)
+	if (test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) == 0)
 		return 0;
 
-	gss_cred->gc_ctx->gc_proc = RPC_GSS_PROC_DESTROY;
+	ctx->gc_proc = RPC_GSS_PROC_DESTROY;
 	cred->cr_ops = &gss_nullops;
 
 	/* Take a reference to ensure the cred will be destroyed either
@@ -1225,6 +1239,7 @@ gss_do_free_ctx(struct gss_cl_ctx *ctx)
 
 	gss_delete_sec_context(&ctx->gc_gss_ctx);
 	kfree(ctx->gc_wire_ctx.data);
+	kfree(ctx->gc_acceptor.data);
 	kfree(ctx);
 }
 
@@ -1260,7 +1275,7 @@ gss_destroy_nullcred(struct rpc_cred *cred)
 {
 	struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base);
 	struct gss_auth *gss_auth = container_of(cred->cr_auth, struct gss_auth, rpc_auth);
-	struct gss_cl_ctx *ctx = gss_cred->gc_ctx;
+	struct gss_cl_ctx *ctx = rcu_dereference_protected(gss_cred->gc_ctx, 1);
 
 	RCU_INIT_POINTER(gss_cred->gc_ctx, NULL);
 	call_rcu(&cred->cr_rcu, gss_free_cred_callback);
@@ -1332,6 +1347,36 @@ gss_cred_init(struct rpc_auth *auth, struct rpc_cred *cred)
 	return err;
 }
 
+static char *
+gss_stringify_acceptor(struct rpc_cred *cred)
+{
+	char *string = NULL;
+	struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base);
+	struct gss_cl_ctx *ctx;
+	struct xdr_netobj *acceptor;
+
+	rcu_read_lock();
+	ctx = rcu_dereference(gss_cred->gc_ctx);
+	if (!ctx)
+		goto out;
+
+	acceptor = &ctx->gc_acceptor;
+
+	/* no point if there's no string */
+	if (!acceptor->len)
+		goto out;
+
+	string = kmalloc(acceptor->len + 1, GFP_KERNEL);
+	if (!string)
+		goto out;
+
+	memcpy(string, acceptor->data, acceptor->len);
+	string[acceptor->len] = '\0';
+out:
+	rcu_read_unlock();
+	return string;
+}
+
 /*
  * Returns -EACCES if GSS context is NULL or will expire within the
  * timeout (miliseconds)
@@ -1340,15 +1385,16 @@ static int
 gss_key_timeout(struct rpc_cred *rc)
 {
 	struct gss_cred *gss_cred = container_of(rc, struct gss_cred, gc_base);
+	struct gss_cl_ctx *ctx;
 	unsigned long now = jiffies;
 	unsigned long expire;
 
-	if (gss_cred->gc_ctx == NULL)
-		return -EACCES;
-
-	expire = gss_cred->gc_ctx->gc_expiry - (gss_key_expire_timeo * HZ);
-
-	if (time_after(now, expire))
+	rcu_read_lock();
+	ctx = rcu_dereference(gss_cred->gc_ctx);
+	if (ctx)
+		expire = ctx->gc_expiry - (gss_key_expire_timeo * HZ);
+	rcu_read_unlock();
+	if (!ctx || time_after(now, expire))
 		return -EACCES;
 	return 0;
 }
@@ -1357,13 +1403,19 @@ static int
 gss_match(struct auth_cred *acred, struct rpc_cred *rc, int flags)
 {
 	struct gss_cred *gss_cred = container_of(rc, struct gss_cred, gc_base);
+	struct gss_cl_ctx *ctx;
 	int ret;
 
 	if (test_bit(RPCAUTH_CRED_NEW, &rc->cr_flags))
 		goto out;
 	/* Don't match with creds that have expired. */
-	if (time_after(jiffies, gss_cred->gc_ctx->gc_expiry))
+	rcu_read_lock();
+	ctx = rcu_dereference(gss_cred->gc_ctx);
+	if (!ctx || time_after(jiffies, ctx->gc_expiry)) {
+		rcu_read_unlock();
 		return 0;
+	}
+	rcu_read_unlock();
 	if (!test_bit(RPCAUTH_CRED_UPTODATE, &rc->cr_flags))
 		return 0;
 out:
@@ -1909,29 +1961,31 @@ static const struct rpc_authops authgss_ops = {
 };
 
 static const struct rpc_credops gss_credops = {
-	.cr_name	= "AUTH_GSS",
-	.crdestroy	= gss_destroy_cred,
-	.cr_init	= gss_cred_init,
-	.crbind		= rpcauth_generic_bind_cred,
-	.crmatch	= gss_match,
-	.crmarshal	= gss_marshal,
-	.crrefresh	= gss_refresh,
-	.crvalidate	= gss_validate,
-	.crwrap_req	= gss_wrap_req,
-	.crunwrap_resp	= gss_unwrap_resp,
-	.crkey_timeout	= gss_key_timeout,
+	.cr_name		= "AUTH_GSS",
+	.crdestroy		= gss_destroy_cred,
+	.cr_init		= gss_cred_init,
+	.crbind			= rpcauth_generic_bind_cred,
+	.crmatch		= gss_match,
+	.crmarshal		= gss_marshal,
+	.crrefresh		= gss_refresh,
+	.crvalidate		= gss_validate,
+	.crwrap_req		= gss_wrap_req,
+	.crunwrap_resp		= gss_unwrap_resp,
+	.crkey_timeout		= gss_key_timeout,
+	.crstringify_acceptor	= gss_stringify_acceptor,
 };
 
 static const struct rpc_credops gss_nullops = {
-	.cr_name	= "AUTH_GSS",
-	.crdestroy	= gss_destroy_nullcred,
-	.crbind		= rpcauth_generic_bind_cred,
-	.crmatch	= gss_match,
-	.crmarshal	= gss_marshal,
-	.crrefresh	= gss_refresh_null,
-	.crvalidate	= gss_validate,
-	.crwrap_req	= gss_wrap_req,
-	.crunwrap_resp	= gss_unwrap_resp,
+	.cr_name		= "AUTH_GSS",
+	.crdestroy		= gss_destroy_nullcred,
+	.crbind			= rpcauth_generic_bind_cred,
+	.crmatch		= gss_match,
+	.crmarshal		= gss_marshal,
+	.crrefresh		= gss_refresh_null,
+	.crvalidate		= gss_validate,
+	.crwrap_req		= gss_wrap_req,
+	.crunwrap_resp		= gss_unwrap_resp,
+	.crstringify_acceptor	= gss_stringify_acceptor,
 };
 
 static const struct rpc_pipe_ops gss_upcall_ops_v0 = {
diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c
index 0f43e894bc0a..f5ed9f6ece06 100644
--- a/net/sunrpc/auth_gss/gss_krb5_crypto.c
+++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c
@@ -641,7 +641,7 @@ out:
 
 u32
 gss_krb5_aes_encrypt(struct krb5_ctx *kctx, u32 offset,
-		     struct xdr_buf *buf, int ec, struct page **pages)
+		     struct xdr_buf *buf, struct page **pages)
 {
 	u32 err;
 	struct xdr_netobj hmac;
@@ -684,13 +684,8 @@ gss_krb5_aes_encrypt(struct krb5_ctx *kctx, u32 offset,
 		ecptr = buf->tail[0].iov_base;
 	}
 
-	memset(ecptr, 'X', ec);
-	buf->tail[0].iov_len += ec;
-	buf->len += ec;
-
 	/* copy plaintext gss token header after filler (if any) */
-	memcpy(ecptr + ec, buf->head[0].iov_base + offset,
-						GSS_KRB5_TOK_HDR_LEN);
+	memcpy(ecptr, buf->head[0].iov_base + offset, GSS_KRB5_TOK_HDR_LEN);
 	buf->tail[0].iov_len += GSS_KRB5_TOK_HDR_LEN;
 	buf->len += GSS_KRB5_TOK_HDR_LEN;
 
diff --git a/net/sunrpc/auth_gss/gss_krb5_seal.c b/net/sunrpc/auth_gss/gss_krb5_seal.c
index 62ae3273186c..42768e5c3994 100644
--- a/net/sunrpc/auth_gss/gss_krb5_seal.c
+++ b/net/sunrpc/auth_gss/gss_krb5_seal.c
@@ -70,31 +70,37 @@
 
 DEFINE_SPINLOCK(krb5_seq_lock);
 
-static char *
+static void *
 setup_token(struct krb5_ctx *ctx, struct xdr_netobj *token)
 {
-	__be16 *ptr, *krb5_hdr;
+	u16 *ptr;
+	void *krb5_hdr;
 	int body_size = GSS_KRB5_TOK_HDR_LEN + ctx->gk5e->cksumlength;
 
 	token->len = g_token_size(&ctx->mech_used, body_size);
 
-	ptr = (__be16 *)token->data;
+	ptr = (u16 *)token->data;
 	g_make_token_header(&ctx->mech_used, body_size, (unsigned char **)&ptr);
 
 	/* ptr now at start of header described in rfc 1964, section 1.2.1: */
 	krb5_hdr = ptr;
 	*ptr++ = KG_TOK_MIC_MSG;
-	*ptr++ = cpu_to_le16(ctx->gk5e->signalg);
+	/*
+	 * signalg is stored as if it were converted from LE to host endian, even
+	 * though it's an opaque pair of bytes according to the RFC.
+	 */
+	*ptr++ = (__force u16)cpu_to_le16(ctx->gk5e->signalg);
 	*ptr++ = SEAL_ALG_NONE;
-	*ptr++ = 0xffff;
+	*ptr = 0xffff;
 
-	return (char *)krb5_hdr;
+	return krb5_hdr;
 }
 
 static void *
 setup_token_v2(struct krb5_ctx *ctx, struct xdr_netobj *token)
 {
-	__be16 *ptr, *krb5_hdr;
+	u16 *ptr;
+	void *krb5_hdr;
 	u8 *p, flags = 0x00;
 
 	if ((ctx->flags & KRB5_CTX_FLAG_INITIATOR) == 0)
@@ -104,15 +110,15 @@ setup_token_v2(struct krb5_ctx *ctx, struct xdr_netobj *token)
 
 	/* Per rfc 4121, sec 4.2.6.1, there is no header,
 	 * just start the token */
-	krb5_hdr = ptr = (__be16 *)token->data;
+	krb5_hdr = ptr = (u16 *)token->data;
 
 	*ptr++ = KG2_TOK_MIC;
 	p = (u8 *)ptr;
 	*p++ = flags;
 	*p++ = 0xff;
-	ptr = (__be16 *)p;
-	*ptr++ = 0xffff;
+	ptr = (u16 *)p;
 	*ptr++ = 0xffff;
+	*ptr = 0xffff;
 
 	token->len = GSS_KRB5_TOK_HDR_LEN + ctx->gk5e->cksumlength;
 	return krb5_hdr;
@@ -181,7 +187,7 @@ gss_get_mic_v2(struct krb5_ctx *ctx, struct xdr_buf *text,
 	spin_lock(&krb5_seq_lock);
 	seq_send = ctx->seq_send64++;
 	spin_unlock(&krb5_seq_lock);
-	*((u64 *)(krb5_hdr + 8)) = cpu_to_be64(seq_send);
+	*((__be64 *)(krb5_hdr + 8)) = cpu_to_be64(seq_send);
 
 	if (ctx->initiate) {
 		cksumkey = ctx->initiator_sign;
diff --git a/net/sunrpc/auth_gss/gss_krb5_wrap.c b/net/sunrpc/auth_gss/gss_krb5_wrap.c
index 42560e55d978..4b614c604fe0 100644
--- a/net/sunrpc/auth_gss/gss_krb5_wrap.c
+++ b/net/sunrpc/auth_gss/gss_krb5_wrap.c
@@ -201,9 +201,15 @@ gss_wrap_kerberos_v1(struct krb5_ctx *kctx, int offset,
 
 	msg_start = ptr + GSS_KRB5_TOK_HDR_LEN + kctx->gk5e->cksumlength;
 
-	*(__be16 *)(ptr + 2) = cpu_to_le16(kctx->gk5e->signalg);
-	memset(ptr + 4, 0xff, 4);
-	*(__be16 *)(ptr + 4) = cpu_to_le16(kctx->gk5e->sealalg);
+	/*
+	 * signalg and sealalg are stored as if they were converted from LE
+	 * to host endian, even though they're opaque pairs of bytes according
+	 * to the RFC.
+	 */
+	*(__le16 *)(ptr + 2) = cpu_to_le16(kctx->gk5e->signalg);
+	*(__le16 *)(ptr + 4) = cpu_to_le16(kctx->gk5e->sealalg);
+	ptr[6] = 0xff;
+	ptr[7] = 0xff;
 
 	gss_krb5_make_confounder(msg_start, conflen);
 
@@ -438,7 +444,7 @@ gss_wrap_kerberos_v2(struct krb5_ctx *kctx, u32 offset,
 	u8		*ptr, *plainhdr;
 	s32		now;
 	u8		flags = 0x00;
-	__be16		*be16ptr, ec = 0;
+	__be16		*be16ptr;
 	__be64		*be64ptr;
 	u32		err;
 
@@ -468,16 +474,16 @@ gss_wrap_kerberos_v2(struct krb5_ctx *kctx, u32 offset,
 	be16ptr = (__be16 *)ptr;
 
 	blocksize = crypto_blkcipher_blocksize(kctx->acceptor_enc);
-	*be16ptr++ = cpu_to_be16(ec);
+	*be16ptr++ = 0;
 	/* "inner" token header always uses 0 for RRC */
-	*be16ptr++ = cpu_to_be16(0);
+	*be16ptr++ = 0;
 
 	be64ptr = (__be64 *)be16ptr;
 	spin_lock(&krb5_seq_lock);
 	*be64ptr = cpu_to_be64(kctx->seq_send64++);
 	spin_unlock(&krb5_seq_lock);
 
-	err = (*kctx->gk5e->encrypt_v2)(kctx, offset, buf, ec, pages);
+	err = (*kctx->gk5e->encrypt_v2)(kctx, offset, buf, pages);
 	if (err)
 		return err;
 
diff --git a/net/sunrpc/auth_null.c b/net/sunrpc/auth_null.c
index f0ebe07978a2..712c123e04e9 100644
--- a/net/sunrpc/auth_null.c
+++ b/net/sunrpc/auth_null.c
@@ -35,6 +35,8 @@ nul_destroy(struct rpc_auth *auth)
 static struct rpc_cred *
 nul_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
 {
+	if (flags & RPCAUTH_LOOKUP_RCU)
+		return &null_cred;
 	return get_rpccred(&null_cred);
 }
 
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 2e6ab10734f6..488ddeed9363 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1746,6 +1746,7 @@ call_bind_status(struct rpc_task *task)
 	case -EHOSTDOWN:
 	case -EHOSTUNREACH:
 	case -ENETUNREACH:
+	case -ENOBUFS:
 	case -EPIPE:
 		dprintk("RPC: %5u remote rpcbind unreachable: %d\n",
 				task->tk_pid, task->tk_status);
@@ -1812,6 +1813,8 @@ call_connect_status(struct rpc_task *task)
 	case -ECONNABORTED:
 	case -ENETUNREACH:
 	case -EHOSTUNREACH:
+	case -ENOBUFS:
+	case -EPIPE:
 		if (RPC_IS_SOFTCONN(task))
 			break;
 		/* retry with existing socket, after a delay */
@@ -1918,6 +1921,7 @@ call_transmit_status(struct rpc_task *task)
 	case -ECONNRESET:
 	case -ECONNABORTED:
 	case -ENOTCONN:
+	case -ENOBUFS:
 	case -EPIPE:
 		rpc_task_force_reencode(task);
 	}
@@ -2034,6 +2038,7 @@ call_status(struct rpc_task *task)
 	case -ECONNRESET:
 	case -ECONNABORTED:
 		rpc_force_rebind(clnt);
+	case -ENOBUFS:
 		rpc_delay(task, 3*HZ);
 	case -EPIPE:
 	case -ENOTCONN:
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index b18554898562..2d12b76b5a64 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -195,7 +195,7 @@ static struct inode *
 rpc_alloc_inode(struct super_block *sb)
 {
 	struct rpc_inode *rpci;
-	rpci = (struct rpc_inode *)kmem_cache_alloc(rpc_inode_cachep, GFP_KERNEL);
+	rpci = kmem_cache_alloc(rpc_inode_cachep, GFP_KERNEL);
 	if (!rpci)
 		return NULL;
 	return &rpci->vfs_inode;
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 51c63165073c..56e4e150e80e 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -744,6 +744,7 @@ static void xprt_connect_status(struct rpc_task *task)
 	case -ECONNABORTED:
 	case -ENETUNREACH:
 	case -EHOSTUNREACH:
+	case -EPIPE:
 	case -EAGAIN:
 		dprintk("RPC: %5u xprt_connect_status: retrying\n", task->tk_pid);
 		break;
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 693966d3f33b..6166c985fe24 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -53,14 +53,6 @@
 # define RPCDBG_FACILITY	RPCDBG_TRANS
 #endif
 
-enum rpcrdma_chunktype {
-	rpcrdma_noch = 0,
-	rpcrdma_readch,
-	rpcrdma_areadch,
-	rpcrdma_writech,
-	rpcrdma_replych
-};
-
 #ifdef RPC_DEBUG
 static const char transfertypes[][12] = {
 	"pure inline",	/* no chunks */
@@ -279,13 +271,37 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
 	return (unsigned char *)iptr - (unsigned char *)headerp;
 
 out:
-	for (pos = 0; nchunks--;)
-		pos += rpcrdma_deregister_external(
-				&req->rl_segments[pos], r_xprt);
+	if (r_xprt->rx_ia.ri_memreg_strategy != RPCRDMA_FRMR) {
+		for (pos = 0; nchunks--;)
+			pos += rpcrdma_deregister_external(
+					&req->rl_segments[pos], r_xprt);
+	}
 	return n;
 }
 
 /*
+ * Marshal chunks. This routine returns the header length
+ * consumed by marshaling.
+ *
+ * Returns positive RPC/RDMA header size, or negative errno.
+ */
+
+ssize_t
+rpcrdma_marshal_chunks(struct rpc_rqst *rqst, ssize_t result)
+{
+	struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
+	struct rpcrdma_msg *headerp = (struct rpcrdma_msg *)req->rl_base;
+
+	if (req->rl_rtype != rpcrdma_noch)
+		result = rpcrdma_create_chunks(rqst, &rqst->rq_snd_buf,
+					       headerp, req->rl_rtype);
+	else if (req->rl_wtype != rpcrdma_noch)
+		result = rpcrdma_create_chunks(rqst, &rqst->rq_rcv_buf,
+					       headerp, req->rl_wtype);
+	return result;
+}
+
+/*
  * Copy write data inline.
  * This function is used for "small" requests. Data which is passed
  * to RPC via iovecs (or page list) is copied directly into the
@@ -377,7 +393,6 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
 	char *base;
 	size_t rpclen, padlen;
 	ssize_t hdrlen;
-	enum rpcrdma_chunktype rtype, wtype;
 	struct rpcrdma_msg *headerp;
 
 	/*
@@ -415,13 +430,13 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
 	 * into pages; otherwise use reply chunks.
 	 */
 	if (rqst->rq_rcv_buf.buflen <= RPCRDMA_INLINE_READ_THRESHOLD(rqst))
-		wtype = rpcrdma_noch;
+		req->rl_wtype = rpcrdma_noch;
 	else if (rqst->rq_rcv_buf.page_len == 0)
-		wtype = rpcrdma_replych;
+		req->rl_wtype = rpcrdma_replych;
 	else if (rqst->rq_rcv_buf.flags & XDRBUF_READ)
-		wtype = rpcrdma_writech;
+		req->rl_wtype = rpcrdma_writech;
 	else
-		wtype = rpcrdma_replych;
+		req->rl_wtype = rpcrdma_replych;
 
 	/*
 	 * Chunks needed for arguments?
@@ -438,16 +453,16 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
 	 * TBD check NFSv4 setacl
 	 */
 	if (rqst->rq_snd_buf.len <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst))
-		rtype = rpcrdma_noch;
+		req->rl_rtype = rpcrdma_noch;
 	else if (rqst->rq_snd_buf.page_len == 0)
-		rtype = rpcrdma_areadch;
+		req->rl_rtype = rpcrdma_areadch;
 	else
-		rtype = rpcrdma_readch;
+		req->rl_rtype = rpcrdma_readch;
 
 	/* The following simplification is not true forever */
-	if (rtype != rpcrdma_noch && wtype == rpcrdma_replych)
-		wtype = rpcrdma_noch;
-	if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) {
+	if (req->rl_rtype != rpcrdma_noch && req->rl_wtype == rpcrdma_replych)
+		req->rl_wtype = rpcrdma_noch;
+	if (req->rl_rtype != rpcrdma_noch && req->rl_wtype != rpcrdma_noch) {
 		dprintk("RPC:       %s: cannot marshal multiple chunk lists\n",
 			__func__);
 		return -EIO;
@@ -461,7 +476,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
 	 * When padding is in use and applies to the transfer, insert
 	 * it and change the message type.
 	 */
-	if (rtype == rpcrdma_noch) {
+	if (req->rl_rtype == rpcrdma_noch) {
 
 		padlen = rpcrdma_inline_pullup(rqst,
 						RPCRDMA_INLINE_PAD_VALUE(rqst));
@@ -476,7 +491,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
 			headerp->rm_body.rm_padded.rm_pempty[1] = xdr_zero;
 			headerp->rm_body.rm_padded.rm_pempty[2] = xdr_zero;
 			hdrlen += 2 * sizeof(u32); /* extra words in padhdr */
-			if (wtype != rpcrdma_noch) {
+			if (req->rl_wtype != rpcrdma_noch) {
 				dprintk("RPC:       %s: invalid chunk list\n",
 					__func__);
 				return -EIO;
@@ -497,30 +512,18 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
 			 * on receive. Therefore, we request a reply chunk
 			 * for non-writes wherever feasible and efficient.
 			 */
-			if (wtype == rpcrdma_noch)
-				wtype = rpcrdma_replych;
+			if (req->rl_wtype == rpcrdma_noch)
+				req->rl_wtype = rpcrdma_replych;
 		}
 	}
 
-	/*
-	 * Marshal chunks. This routine will return the header length
-	 * consumed by marshaling.
-	 */
-	if (rtype != rpcrdma_noch) {
-		hdrlen = rpcrdma_create_chunks(rqst,
-					&rqst->rq_snd_buf, headerp, rtype);
-		wtype = rtype;	/* simplify dprintk */
-
-	} else if (wtype != rpcrdma_noch) {
-		hdrlen = rpcrdma_create_chunks(rqst,
-					&rqst->rq_rcv_buf, headerp, wtype);
-	}
+	hdrlen = rpcrdma_marshal_chunks(rqst, hdrlen);
 	if (hdrlen < 0)
 		return hdrlen;
 
 	dprintk("RPC:       %s: %s: hdrlen %zd rpclen %zd padlen %zd"
 		" headerp 0x%p base 0x%p lkey 0x%x\n",
-		__func__, transfertypes[wtype], hdrlen, rpclen, padlen,
+		__func__, transfertypes[req->rl_wtype], hdrlen, rpclen, padlen,
 		headerp, base, req->rl_iov.lkey);
 
 	/*
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 66f91f0d071a..2faac4940563 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -296,7 +296,6 @@ xprt_setup_rdma(struct xprt_create *args)
 
 	xprt->resvport = 0;		/* privileged port not needed */
 	xprt->tsh_size = 0;		/* RPC-RDMA handles framing */
-	xprt->max_payload = RPCRDMA_MAX_DATA_SEGS * PAGE_SIZE;
 	xprt->ops = &xprt_rdma_procs;
 
 	/*
@@ -382,6 +381,9 @@ xprt_setup_rdma(struct xprt_create *args)
 	new_ep->rep_xprt = xprt;
 
 	xprt_rdma_format_addresses(xprt);
+	xprt->max_payload = rpcrdma_max_payload(new_xprt);
+	dprintk("RPC:       %s: transport data payload maximum: %zu bytes\n",
+		__func__, xprt->max_payload);
 
 	if (!try_module_get(THIS_MODULE))
 		goto out4;
@@ -412,7 +414,7 @@ xprt_rdma_close(struct rpc_xprt *xprt)
 	if (r_xprt->rx_ep.rep_connected > 0)
 		xprt->reestablish_timeout = 0;
 	xprt_disconnect_done(xprt);
-	(void) rpcrdma_ep_disconnect(&r_xprt->rx_ep, &r_xprt->rx_ia);
+	rpcrdma_ep_disconnect(&r_xprt->rx_ep, &r_xprt->rx_ia);
 }
 
 static void
@@ -595,13 +597,14 @@ xprt_rdma_send_request(struct rpc_task *task)
 	struct rpc_xprt *xprt = rqst->rq_xprt;
 	struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
 	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
-	int rc;
+	int rc = 0;
 
-	if (req->rl_niovs == 0) {
+	if (req->rl_niovs == 0)
 		rc = rpcrdma_marshal_req(rqst);
-		if (rc < 0)
-			goto failed_marshal;
-	}
+	else if (r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_FRMR)
+		rc = rpcrdma_marshal_chunks(rqst, 0);
+	if (rc < 0)
+		goto failed_marshal;
 
 	if (req->rl_reply == NULL) 		/* e.g. reconnection */
 		rpcrdma_recv_buffer_get(req);
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 13dbd1c389ff..61c41298b4ea 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -61,6 +61,8 @@
 # define RPCDBG_FACILITY	RPCDBG_TRANS
 #endif
 
+static void rpcrdma_reset_frmrs(struct rpcrdma_ia *);
+
 /*
  * internal functions
  */
@@ -103,17 +105,6 @@ rpcrdma_run_tasklet(unsigned long data)
 
 static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL);
 
-static inline void
-rpcrdma_schedule_tasklet(struct rpcrdma_rep *rep)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
-	list_add_tail(&rep->rr_list, &rpcrdma_tasklets_g);
-	spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
-	tasklet_schedule(&rpcrdma_tasklet_g);
-}
-
 static void
 rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
 {
@@ -153,12 +144,7 @@ rpcrdma_sendcq_process_wc(struct ib_wc *wc)
 	if (wc->wr_id == 0ULL)
 		return;
 	if (wc->status != IB_WC_SUCCESS)
-		return;
-
-	if (wc->opcode == IB_WC_FAST_REG_MR)
-		frmr->r.frmr.state = FRMR_IS_VALID;
-	else if (wc->opcode == IB_WC_LOCAL_INV)
-		frmr->r.frmr.state = FRMR_IS_INVALID;
+		frmr->r.frmr.fr_state = FRMR_IS_STALE;
 }
 
 static int
@@ -217,7 +203,7 @@ rpcrdma_sendcq_upcall(struct ib_cq *cq, void *cq_context)
 }
 
 static void
-rpcrdma_recvcq_process_wc(struct ib_wc *wc)
+rpcrdma_recvcq_process_wc(struct ib_wc *wc, struct list_head *sched_list)
 {
 	struct rpcrdma_rep *rep =
 			(struct rpcrdma_rep *)(unsigned long)wc->wr_id;
@@ -248,28 +234,38 @@ rpcrdma_recvcq_process_wc(struct ib_wc *wc)
 	}
 
 out_schedule:
-	rpcrdma_schedule_tasklet(rep);
+	list_add_tail(&rep->rr_list, sched_list);
 }
 
 static int
 rpcrdma_recvcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
 {
+	struct list_head sched_list;
 	struct ib_wc *wcs;
 	int budget, count, rc;
+	unsigned long flags;
 
+	INIT_LIST_HEAD(&sched_list);
 	budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
 	do {
 		wcs = ep->rep_recv_wcs;
 
 		rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
 		if (rc <= 0)
-			return rc;
+			goto out_schedule;
 
 		count = rc;
 		while (count-- > 0)
-			rpcrdma_recvcq_process_wc(wcs++);
+			rpcrdma_recvcq_process_wc(wcs++, &sched_list);
 	} while (rc == RPCRDMA_POLLSIZE && --budget);
-	return 0;
+	rc = 0;
+
+out_schedule:
+	spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
+	list_splice_tail(&sched_list, &rpcrdma_tasklets_g);
+	spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
+	tasklet_schedule(&rpcrdma_tasklet_g);
+	return rc;
 }
 
 /*
@@ -310,6 +306,13 @@ rpcrdma_recvcq_upcall(struct ib_cq *cq, void *cq_context)
 	rpcrdma_recvcq_poll(cq, ep);
 }
 
+static void
+rpcrdma_flush_cqs(struct rpcrdma_ep *ep)
+{
+	rpcrdma_recvcq_upcall(ep->rep_attr.recv_cq, ep);
+	rpcrdma_sendcq_upcall(ep->rep_attr.send_cq, ep);
+}
+
 #ifdef RPC_DEBUG
 static const char * const conn[] = {
 	"address resolved",
@@ -323,8 +326,16 @@ static const char * const conn[] = {
 	"rejected",
 	"established",
 	"disconnected",
-	"device removal"
+	"device removal",
+	"multicast join",
+	"multicast error",
+	"address change",
+	"timewait exit",
 };
+
+#define CONNECTION_MSG(status)						\
+	((status) < ARRAY_SIZE(conn) ?					\
+		conn[(status)] : "unrecognized connection error")
 #endif
 
 static int
@@ -382,23 +393,18 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
 	case RDMA_CM_EVENT_DEVICE_REMOVAL:
 		connstate = -ENODEV;
 connected:
-		dprintk("RPC:       %s: %s: %pI4:%u (ep 0x%p event 0x%x)\n",
-			__func__,
-			(event->event <= 11) ? conn[event->event] :
-						"unknown connection error",
-			&addr->sin_addr.s_addr,
-			ntohs(addr->sin_port),
-			ep, event->event);
 		atomic_set(&rpcx_to_rdmax(ep->rep_xprt)->rx_buf.rb_credits, 1);
 		dprintk("RPC:       %s: %sconnected\n",
 					__func__, connstate > 0 ? "" : "dis");
 		ep->rep_connected = connstate;
 		ep->rep_func(ep);
 		wake_up_all(&ep->rep_connect_wait);
-		break;
+		/*FALLTHROUGH*/
 	default:
-		dprintk("RPC:       %s: unexpected CM event %d\n",
-			__func__, event->event);
+		dprintk("RPC:       %s: %pI4:%u (ep 0x%p): %s\n",
+			__func__, &addr->sin_addr.s_addr,
+			ntohs(addr->sin_port), ep,
+			CONNECTION_MSG(event->event));
 		break;
 	}
 
@@ -558,12 +564,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
 		if (!ia->ri_id->device->alloc_fmr) {
 			dprintk("RPC:       %s: MTHCAFMR registration "
 				"not supported by HCA\n", __func__);
-#if RPCRDMA_PERSISTENT_REGISTRATION
 			memreg = RPCRDMA_ALLPHYSICAL;
-#else
-			rc = -ENOMEM;
-			goto out2;
-#endif
 		}
 	}
 
@@ -578,20 +579,16 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
 	switch (memreg) {
 	case RPCRDMA_FRMR:
 		break;
-#if RPCRDMA_PERSISTENT_REGISTRATION
 	case RPCRDMA_ALLPHYSICAL:
 		mem_priv = IB_ACCESS_LOCAL_WRITE |
 				IB_ACCESS_REMOTE_WRITE |
 				IB_ACCESS_REMOTE_READ;
 		goto register_setup;
-#endif
 	case RPCRDMA_MTHCAFMR:
 		if (ia->ri_have_dma_lkey)
 			break;
 		mem_priv = IB_ACCESS_LOCAL_WRITE;
-#if RPCRDMA_PERSISTENT_REGISTRATION
 	register_setup:
-#endif
 		ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
 		if (IS_ERR(ia->ri_bind_mem)) {
 			printk(KERN_ALERT "%s: ib_get_dma_mr for "
@@ -613,6 +610,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
 	/* Else will do memory reg/dereg for each chunk */
 	ia->ri_memreg_strategy = memreg;
 
+	rwlock_init(&ia->ri_qplock);
 	return 0;
 out2:
 	rdma_destroy_id(ia->ri_id);
@@ -826,10 +824,7 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
 	cancel_delayed_work_sync(&ep->rep_connect_worker);
 
 	if (ia->ri_id->qp) {
-		rc = rpcrdma_ep_disconnect(ep, ia);
-		if (rc)
-			dprintk("RPC:       %s: rpcrdma_ep_disconnect"
-				" returned %i\n", __func__, rc);
+		rpcrdma_ep_disconnect(ep, ia);
 		rdma_destroy_qp(ia->ri_id);
 		ia->ri_id->qp = NULL;
 	}
@@ -859,7 +854,7 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
 int
 rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
 {
-	struct rdma_cm_id *id;
+	struct rdma_cm_id *id, *old;
 	int rc = 0;
 	int retry_count = 0;
 
@@ -867,13 +862,12 @@ rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
 		struct rpcrdma_xprt *xprt;
 retry:
 		dprintk("RPC:       %s: reconnecting...\n", __func__);
-		rc = rpcrdma_ep_disconnect(ep, ia);
-		if (rc && rc != -ENOTCONN)
-			dprintk("RPC:       %s: rpcrdma_ep_disconnect"
-				" status %i\n", __func__, rc);
 
-		rpcrdma_clean_cq(ep->rep_attr.recv_cq);
-		rpcrdma_clean_cq(ep->rep_attr.send_cq);
+		rpcrdma_ep_disconnect(ep, ia);
+		rpcrdma_flush_cqs(ep);
+
+		if (ia->ri_memreg_strategy == RPCRDMA_FRMR)
+			rpcrdma_reset_frmrs(ia);
 
 		xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
 		id = rpcrdma_create_id(xprt, ia,
@@ -905,9 +899,14 @@ retry:
 			rc = -ENETUNREACH;
 			goto out;
 		}
-		rdma_destroy_qp(ia->ri_id);
-		rdma_destroy_id(ia->ri_id);
+
+		write_lock(&ia->ri_qplock);
+		old = ia->ri_id;
 		ia->ri_id = id;
+		write_unlock(&ia->ri_qplock);
+
+		rdma_destroy_qp(old);
+		rdma_destroy_id(old);
 	} else {
 		dprintk("RPC:       %s: connecting...\n", __func__);
 		rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
@@ -974,13 +973,12 @@ out:
  * This call is not reentrant, and must not be made in parallel
  * on the same endpoint.
  */
-int
+void
 rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
 {
 	int rc;
 
-	rpcrdma_clean_cq(ep->rep_attr.recv_cq);
-	rpcrdma_clean_cq(ep->rep_attr.send_cq);
+	rpcrdma_flush_cqs(ep);
 	rc = rdma_disconnect(ia->ri_id);
 	if (!rc) {
 		/* returns without wait if not connected */
@@ -992,12 +990,93 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
 		dprintk("RPC:       %s: rdma_disconnect %i\n", __func__, rc);
 		ep->rep_connected = rc;
 	}
+}
+
+static int
+rpcrdma_init_fmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf)
+{
+	int mr_access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ;
+	struct ib_fmr_attr fmr_attr = {
+		.max_pages	= RPCRDMA_MAX_DATA_SEGS,
+		.max_maps	= 1,
+		.page_shift	= PAGE_SHIFT
+	};
+	struct rpcrdma_mw *r;
+	int i, rc;
+
+	i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
+	dprintk("RPC:       %s: initalizing %d FMRs\n", __func__, i);
+
+	while (i--) {
+		r = kzalloc(sizeof(*r), GFP_KERNEL);
+		if (r == NULL)
+			return -ENOMEM;
+
+		r->r.fmr = ib_alloc_fmr(ia->ri_pd, mr_access_flags, &fmr_attr);
+		if (IS_ERR(r->r.fmr)) {
+			rc = PTR_ERR(r->r.fmr);
+			dprintk("RPC:       %s: ib_alloc_fmr failed %i\n",
+				__func__, rc);
+			goto out_free;
+		}
+
+		list_add(&r->mw_list, &buf->rb_mws);
+		list_add(&r->mw_all, &buf->rb_all);
+	}
+	return 0;
+
+out_free:
+	kfree(r);
+	return rc;
+}
+
+static int
+rpcrdma_init_frmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf)
+{
+	struct rpcrdma_frmr *f;
+	struct rpcrdma_mw *r;
+	int i, rc;
+
+	i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
+	dprintk("RPC:       %s: initalizing %d FRMRs\n", __func__, i);
+
+	while (i--) {
+		r = kzalloc(sizeof(*r), GFP_KERNEL);
+		if (r == NULL)
+			return -ENOMEM;
+		f = &r->r.frmr;
+
+		f->fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
+						ia->ri_max_frmr_depth);
+		if (IS_ERR(f->fr_mr)) {
+			rc = PTR_ERR(f->fr_mr);
+			dprintk("RPC:       %s: ib_alloc_fast_reg_mr "
+				"failed %i\n", __func__, rc);
+			goto out_free;
+		}
+
+		f->fr_pgl = ib_alloc_fast_reg_page_list(ia->ri_id->device,
+							ia->ri_max_frmr_depth);
+		if (IS_ERR(f->fr_pgl)) {
+			rc = PTR_ERR(f->fr_pgl);
+			dprintk("RPC:       %s: ib_alloc_fast_reg_page_list "
+				"failed %i\n", __func__, rc);
+
+			ib_dereg_mr(f->fr_mr);
+			goto out_free;
+		}
+
+		list_add(&r->mw_list, &buf->rb_mws);
+		list_add(&r->mw_all, &buf->rb_all);
+	}
+
+	return 0;
+
+out_free:
+	kfree(r);
 	return rc;
 }
 
-/*
- * Initialize buffer memory
- */
 int
 rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
 	struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata)
@@ -1005,7 +1084,6 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
 	char *p;
 	size_t len, rlen, wlen;
 	int i, rc;
-	struct rpcrdma_mw *r;
 
 	buf->rb_max_requests = cdata->max_requests;
 	spin_lock_init(&buf->rb_lock);
@@ -1016,28 +1094,12 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
 	 *   2.  arrays of struct rpcrdma_req to fill in pointers
 	 *   3.  array of struct rpcrdma_rep for replies
 	 *   4.  padding, if any
-	 *   5.  mw's, fmr's or frmr's, if any
 	 * Send/recv buffers in req/rep need to be registered
 	 */
-
 	len = buf->rb_max_requests *
 		(sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));
 	len += cdata->padding;
-	switch (ia->ri_memreg_strategy) {
-	case RPCRDMA_FRMR:
-		len += buf->rb_max_requests * RPCRDMA_MAX_SEGS *
-				sizeof(struct rpcrdma_mw);
-		break;
-	case RPCRDMA_MTHCAFMR:
-		/* TBD we are perhaps overallocating here */
-		len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
-				sizeof(struct rpcrdma_mw);
-		break;
-	default:
-		break;
-	}
 
-	/* allocate 1, 4 and 5 in one shot */
 	p = kzalloc(len, GFP_KERNEL);
 	if (p == NULL) {
 		dprintk("RPC:       %s: req_t/rep_t/pad kzalloc(%zd) failed\n",
@@ -1064,51 +1126,17 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
 	p += cdata->padding;
 
 	INIT_LIST_HEAD(&buf->rb_mws);
-	r = (struct rpcrdma_mw *)p;
+	INIT_LIST_HEAD(&buf->rb_all);
 	switch (ia->ri_memreg_strategy) {
 	case RPCRDMA_FRMR:
-		for (i = buf->rb_max_requests * RPCRDMA_MAX_SEGS; i; i--) {
-			r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
-						ia->ri_max_frmr_depth);
-			if (IS_ERR(r->r.frmr.fr_mr)) {
-				rc = PTR_ERR(r->r.frmr.fr_mr);
-				dprintk("RPC:       %s: ib_alloc_fast_reg_mr"
-					" failed %i\n", __func__, rc);
-				goto out;
-			}
-			r->r.frmr.fr_pgl = ib_alloc_fast_reg_page_list(
-						ia->ri_id->device,
-						ia->ri_max_frmr_depth);
-			if (IS_ERR(r->r.frmr.fr_pgl)) {
-				rc = PTR_ERR(r->r.frmr.fr_pgl);
-				dprintk("RPC:       %s: "
-					"ib_alloc_fast_reg_page_list "
-					"failed %i\n", __func__, rc);
-
-				ib_dereg_mr(r->r.frmr.fr_mr);
-				goto out;
-			}
-			list_add(&r->mw_list, &buf->rb_mws);
-			++r;
-		}
+		rc = rpcrdma_init_frmrs(ia, buf);
+		if (rc)
+			goto out;
 		break;
 	case RPCRDMA_MTHCAFMR:
-		/* TBD we are perhaps overallocating here */
-		for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
-			static struct ib_fmr_attr fa =
-				{ RPCRDMA_MAX_DATA_SEGS, 1, PAGE_SHIFT };
-			r->r.fmr = ib_alloc_fmr(ia->ri_pd,
-				IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ,
-				&fa);
-			if (IS_ERR(r->r.fmr)) {
-				rc = PTR_ERR(r->r.fmr);
-				dprintk("RPC:       %s: ib_alloc_fmr"
-					" failed %i\n", __func__, rc);
-				goto out;
-			}
-			list_add(&r->mw_list, &buf->rb_mws);
-			++r;
-		}
+		rc = rpcrdma_init_fmrs(ia, buf);
+		if (rc)
+			goto out;
 		break;
 	default:
 		break;
@@ -1176,24 +1204,57 @@ out:
 	return rc;
 }
 
-/*
- * Unregister and destroy buffer memory. Need to deal with
- * partial initialization, so it's callable from failed create.
- * Must be called before destroying endpoint, as registrations
- * reference it.
- */
+static void
+rpcrdma_destroy_fmrs(struct rpcrdma_buffer *buf)
+{
+	struct rpcrdma_mw *r;
+	int rc;
+
+	while (!list_empty(&buf->rb_all)) {
+		r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
+		list_del(&r->mw_all);
+		list_del(&r->mw_list);
+
+		rc = ib_dealloc_fmr(r->r.fmr);
+		if (rc)
+			dprintk("RPC:       %s: ib_dealloc_fmr failed %i\n",
+				__func__, rc);
+
+		kfree(r);
+	}
+}
+
+static void
+rpcrdma_destroy_frmrs(struct rpcrdma_buffer *buf)
+{
+	struct rpcrdma_mw *r;
+	int rc;
+
+	while (!list_empty(&buf->rb_all)) {
+		r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
+		list_del(&r->mw_all);
+		list_del(&r->mw_list);
+
+		rc = ib_dereg_mr(r->r.frmr.fr_mr);
+		if (rc)
+			dprintk("RPC:       %s: ib_dereg_mr failed %i\n",
+				__func__, rc);
+		ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
+
+		kfree(r);
+	}
+}
+
 void
 rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
 {
-	int rc, i;
 	struct rpcrdma_ia *ia = rdmab_to_ia(buf);
-	struct rpcrdma_mw *r;
+	int i;
 
 	/* clean up in reverse order from create
 	 *   1.  recv mr memory (mr free, then kfree)
 	 *   2.  send mr memory (mr free, then kfree)
-	 *   3.  padding (if any) [moved to rpcrdma_ep_destroy]
-	 *   4.  arrays
+	 *   3.  MWs
 	 */
 	dprintk("RPC:       %s: entering\n", __func__);
 
@@ -1212,34 +1273,217 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
 		}
 	}
 
+	switch (ia->ri_memreg_strategy) {
+	case RPCRDMA_FRMR:
+		rpcrdma_destroy_frmrs(buf);
+		break;
+	case RPCRDMA_MTHCAFMR:
+		rpcrdma_destroy_fmrs(buf);
+		break;
+	default:
+		break;
+	}
+
+	kfree(buf->rb_pool);
+}
+
+/* After a disconnect, a flushed FAST_REG_MR can leave an FRMR in
+ * an unusable state. Find FRMRs in this state and dereg / reg
+ * each.  FRMRs that are VALID and attached to an rpcrdma_req are
+ * also torn down.
+ *
+ * This gives all in-use FRMRs a fresh rkey and leaves them INVALID.
+ *
+ * This is invoked only in the transport connect worker in order
+ * to serialize with rpcrdma_register_frmr_external().
+ */
+static void
+rpcrdma_reset_frmrs(struct rpcrdma_ia *ia)
+{
+	struct rpcrdma_xprt *r_xprt =
+				container_of(ia, struct rpcrdma_xprt, rx_ia);
+	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+	struct list_head *pos;
+	struct rpcrdma_mw *r;
+	int rc;
+
+	list_for_each(pos, &buf->rb_all) {
+		r = list_entry(pos, struct rpcrdma_mw, mw_all);
+
+		if (r->r.frmr.fr_state == FRMR_IS_INVALID)
+			continue;
+
+		rc = ib_dereg_mr(r->r.frmr.fr_mr);
+		if (rc)
+			dprintk("RPC:       %s: ib_dereg_mr failed %i\n",
+				__func__, rc);
+		ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
+
+		r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
+					ia->ri_max_frmr_depth);
+		if (IS_ERR(r->r.frmr.fr_mr)) {
+			rc = PTR_ERR(r->r.frmr.fr_mr);
+			dprintk("RPC:       %s: ib_alloc_fast_reg_mr"
+				" failed %i\n", __func__, rc);
+			continue;
+		}
+		r->r.frmr.fr_pgl = ib_alloc_fast_reg_page_list(
+					ia->ri_id->device,
+					ia->ri_max_frmr_depth);
+		if (IS_ERR(r->r.frmr.fr_pgl)) {
+			rc = PTR_ERR(r->r.frmr.fr_pgl);
+			dprintk("RPC:       %s: "
+				"ib_alloc_fast_reg_page_list "
+				"failed %i\n", __func__, rc);
+
+			ib_dereg_mr(r->r.frmr.fr_mr);
+			continue;
+		}
+		r->r.frmr.fr_state = FRMR_IS_INVALID;
+	}
+}
+
+/* "*mw" can be NULL when rpcrdma_buffer_get_mrs() fails, leaving
+ * some req segments uninitialized.
+ */
+static void
+rpcrdma_buffer_put_mr(struct rpcrdma_mw **mw, struct rpcrdma_buffer *buf)
+{
+	if (*mw) {
+		list_add_tail(&(*mw)->mw_list, &buf->rb_mws);
+		*mw = NULL;
+	}
+}
+
+/* Cycle mw's back in reverse order, and "spin" them.
+ * This delays and scrambles reuse as much as possible.
+ */
+static void
+rpcrdma_buffer_put_mrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
+{
+	struct rpcrdma_mr_seg *seg = req->rl_segments;
+	struct rpcrdma_mr_seg *seg1 = seg;
+	int i;
+
+	for (i = 1, seg++; i < RPCRDMA_MAX_SEGS; seg++, i++)
+		rpcrdma_buffer_put_mr(&seg->mr_chunk.rl_mw, buf);
+	rpcrdma_buffer_put_mr(&seg1->mr_chunk.rl_mw, buf);
+}
+
+static void
+rpcrdma_buffer_put_sendbuf(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
+{
+	buf->rb_send_bufs[--buf->rb_send_index] = req;
+	req->rl_niovs = 0;
+	if (req->rl_reply) {
+		buf->rb_recv_bufs[--buf->rb_recv_index] = req->rl_reply;
+		req->rl_reply->rr_func = NULL;
+		req->rl_reply = NULL;
+	}
+}
+
+/* rpcrdma_unmap_one() was already done by rpcrdma_deregister_frmr_external().
+ * Redo only the ib_post_send().
+ */
+static void
+rpcrdma_retry_local_inv(struct rpcrdma_mw *r, struct rpcrdma_ia *ia)
+{
+	struct rpcrdma_xprt *r_xprt =
+				container_of(ia, struct rpcrdma_xprt, rx_ia);
+	struct ib_send_wr invalidate_wr, *bad_wr;
+	int rc;
+
+	dprintk("RPC:       %s: FRMR %p is stale\n", __func__, r);
+
+	/* When this FRMR is re-inserted into rb_mws, it is no longer stale */
+	r->r.frmr.fr_state = FRMR_IS_INVALID;
+
+	memset(&invalidate_wr, 0, sizeof(invalidate_wr));
+	invalidate_wr.wr_id = (unsigned long)(void *)r;
+	invalidate_wr.opcode = IB_WR_LOCAL_INV;
+	invalidate_wr.ex.invalidate_rkey = r->r.frmr.fr_mr->rkey;
+	DECR_CQCOUNT(&r_xprt->rx_ep);
+
+	dprintk("RPC:       %s: frmr %p invalidating rkey %08x\n",
+		__func__, r, r->r.frmr.fr_mr->rkey);
+
+	read_lock(&ia->ri_qplock);
+	rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
+	read_unlock(&ia->ri_qplock);
+	if (rc) {
+		/* Force rpcrdma_buffer_get() to retry */
+		r->r.frmr.fr_state = FRMR_IS_STALE;
+		dprintk("RPC:       %s: ib_post_send failed, %i\n",
+			__func__, rc);
+	}
+}
+
+static void
+rpcrdma_retry_flushed_linv(struct list_head *stale,
+			   struct rpcrdma_buffer *buf)
+{
+	struct rpcrdma_ia *ia = rdmab_to_ia(buf);
+	struct list_head *pos;
+	struct rpcrdma_mw *r;
+	unsigned long flags;
+
+	list_for_each(pos, stale) {
+		r = list_entry(pos, struct rpcrdma_mw, mw_list);
+		rpcrdma_retry_local_inv(r, ia);
+	}
+
+	spin_lock_irqsave(&buf->rb_lock, flags);
+	list_splice_tail(stale, &buf->rb_mws);
+	spin_unlock_irqrestore(&buf->rb_lock, flags);
+}
+
+static struct rpcrdma_req *
+rpcrdma_buffer_get_frmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf,
+			 struct list_head *stale)
+{
+	struct rpcrdma_mw *r;
+	int i;
+
+	i = RPCRDMA_MAX_SEGS - 1;
 	while (!list_empty(&buf->rb_mws)) {
 		r = list_entry(buf->rb_mws.next,
-			struct rpcrdma_mw, mw_list);
+			       struct rpcrdma_mw, mw_list);
 		list_del(&r->mw_list);
-		switch (ia->ri_memreg_strategy) {
-		case RPCRDMA_FRMR:
-			rc = ib_dereg_mr(r->r.frmr.fr_mr);
-			if (rc)
-				dprintk("RPC:       %s:"
-					" ib_dereg_mr"
-					" failed %i\n",
-					__func__, rc);
-			ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
-			break;
-		case RPCRDMA_MTHCAFMR:
-			rc = ib_dealloc_fmr(r->r.fmr);
-			if (rc)
-				dprintk("RPC:       %s:"
-					" ib_dealloc_fmr"
-					" failed %i\n",
-					__func__, rc);
-			break;
-		default:
-			break;
+		if (r->r.frmr.fr_state == FRMR_IS_STALE) {
+			list_add(&r->mw_list, stale);
+			continue;
 		}
+		req->rl_segments[i].mr_chunk.rl_mw = r;
+		if (unlikely(i-- == 0))
+			return req;	/* Success */
 	}
 
-	kfree(buf->rb_pool);
+	/* Not enough entries on rb_mws for this req */
+	rpcrdma_buffer_put_sendbuf(req, buf);
+	rpcrdma_buffer_put_mrs(req, buf);
+	return NULL;
+}
+
+static struct rpcrdma_req *
+rpcrdma_buffer_get_fmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
+{
+	struct rpcrdma_mw *r;
+	int i;
+
+	i = RPCRDMA_MAX_SEGS - 1;
+	while (!list_empty(&buf->rb_mws)) {
+		r = list_entry(buf->rb_mws.next,
+			       struct rpcrdma_mw, mw_list);
+		list_del(&r->mw_list);
+		req->rl_segments[i].mr_chunk.rl_mw = r;
+		if (unlikely(i-- == 0))
+			return req;	/* Success */
+	}
+
+	/* Not enough entries on rb_mws for this req */
+	rpcrdma_buffer_put_sendbuf(req, buf);
+	rpcrdma_buffer_put_mrs(req, buf);
+	return NULL;
 }
 
 /*
@@ -1254,10 +1498,10 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
 struct rpcrdma_req *
 rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
 {
+	struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
+	struct list_head stale;
 	struct rpcrdma_req *req;
 	unsigned long flags;
-	int i;
-	struct rpcrdma_mw *r;
 
 	spin_lock_irqsave(&buffers->rb_lock, flags);
 	if (buffers->rb_send_index == buffers->rb_max_requests) {
@@ -1277,16 +1521,21 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
 		buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
 	}
 	buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;
-	if (!list_empty(&buffers->rb_mws)) {
-		i = RPCRDMA_MAX_SEGS - 1;
-		do {
-			r = list_entry(buffers->rb_mws.next,
-					struct rpcrdma_mw, mw_list);
-			list_del(&r->mw_list);
-			req->rl_segments[i].mr_chunk.rl_mw = r;
-		} while (--i >= 0);
+
+	INIT_LIST_HEAD(&stale);
+	switch (ia->ri_memreg_strategy) {
+	case RPCRDMA_FRMR:
+		req = rpcrdma_buffer_get_frmrs(req, buffers, &stale);
+		break;
+	case RPCRDMA_MTHCAFMR:
+		req = rpcrdma_buffer_get_fmrs(req, buffers);
+		break;
+	default:
+		break;
 	}
 	spin_unlock_irqrestore(&buffers->rb_lock, flags);
+	if (!list_empty(&stale))
+		rpcrdma_retry_flushed_linv(&stale, buffers);
 	return req;
 }
 
@@ -1299,34 +1548,14 @@ rpcrdma_buffer_put(struct rpcrdma_req *req)
 {
 	struct rpcrdma_buffer *buffers = req->rl_buffer;
 	struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
-	int i;
 	unsigned long flags;
 
 	spin_lock_irqsave(&buffers->rb_lock, flags);
-	buffers->rb_send_bufs[--buffers->rb_send_index] = req;
-	req->rl_niovs = 0;
-	if (req->rl_reply) {
-		buffers->rb_recv_bufs[--buffers->rb_recv_index] = req->rl_reply;
-		req->rl_reply->rr_func = NULL;
-		req->rl_reply = NULL;
-	}
+	rpcrdma_buffer_put_sendbuf(req, buffers);
 	switch (ia->ri_memreg_strategy) {
 	case RPCRDMA_FRMR:
 	case RPCRDMA_MTHCAFMR:
-		/*
-		 * Cycle mw's back in reverse order, and "spin" them.
-		 * This delays and scrambles reuse as much as possible.
-		 */
-		i = 1;
-		do {
-			struct rpcrdma_mw **mw;
-			mw = &req->rl_segments[i].mr_chunk.rl_mw;
-			list_add_tail(&(*mw)->mw_list, &buffers->rb_mws);
-			*mw = NULL;
-		} while (++i < RPCRDMA_MAX_SEGS);
-		list_add_tail(&req->rl_segments[0].mr_chunk.rl_mw->mw_list,
-					&buffers->rb_mws);
-		req->rl_segments[0].mr_chunk.rl_mw = NULL;
+		rpcrdma_buffer_put_mrs(req, buffers);
 		break;
 	default:
 		break;
@@ -1388,6 +1617,9 @@ rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
 	 */
 	iov->addr = ib_dma_map_single(ia->ri_id->device,
 			va, len, DMA_BIDIRECTIONAL);
+	if (ib_dma_mapping_error(ia->ri_id->device, iov->addr))
+		return -ENOMEM;
+
 	iov->length = len;
 
 	if (ia->ri_have_dma_lkey) {
@@ -1483,8 +1715,10 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
 			struct rpcrdma_xprt *r_xprt)
 {
 	struct rpcrdma_mr_seg *seg1 = seg;
-	struct ib_send_wr invalidate_wr, frmr_wr, *bad_wr, *post_wr;
-
+	struct rpcrdma_mw *mw = seg1->mr_chunk.rl_mw;
+	struct rpcrdma_frmr *frmr = &mw->r.frmr;
+	struct ib_mr *mr = frmr->fr_mr;
+	struct ib_send_wr fastreg_wr, *bad_wr;
 	u8 key;
 	int len, pageoff;
 	int i, rc;
@@ -1502,8 +1736,7 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
 		rpcrdma_map_one(ia, seg, writing);
 		pa = seg->mr_dma;
 		for (seg_len = seg->mr_len; seg_len > 0; seg_len -= PAGE_SIZE) {
-			seg1->mr_chunk.rl_mw->r.frmr.fr_pgl->
-				page_list[page_no++] = pa;
+			frmr->fr_pgl->page_list[page_no++] = pa;
 			pa += PAGE_SIZE;
 		}
 		len += seg->mr_len;
@@ -1515,65 +1748,51 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
 			break;
 	}
 	dprintk("RPC:       %s: Using frmr %p to map %d segments\n",
-		__func__, seg1->mr_chunk.rl_mw, i);
-
-	if (unlikely(seg1->mr_chunk.rl_mw->r.frmr.state == FRMR_IS_VALID)) {
-		dprintk("RPC:       %s: frmr %x left valid, posting invalidate.\n",
-			__func__,
-			seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey);
-		/* Invalidate before using. */
-		memset(&invalidate_wr, 0, sizeof invalidate_wr);
-		invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
-		invalidate_wr.next = &frmr_wr;
-		invalidate_wr.opcode = IB_WR_LOCAL_INV;
-		invalidate_wr.send_flags = IB_SEND_SIGNALED;
-		invalidate_wr.ex.invalidate_rkey =
-			seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
-		DECR_CQCOUNT(&r_xprt->rx_ep);
-		post_wr = &invalidate_wr;
-	} else
-		post_wr = &frmr_wr;
-
-	/* Prepare FRMR WR */
-	memset(&frmr_wr, 0, sizeof frmr_wr);
-	frmr_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
-	frmr_wr.opcode = IB_WR_FAST_REG_MR;
-	frmr_wr.send_flags = IB_SEND_SIGNALED;
-	frmr_wr.wr.fast_reg.iova_start = seg1->mr_dma;
-	frmr_wr.wr.fast_reg.page_list = seg1->mr_chunk.rl_mw->r.frmr.fr_pgl;
-	frmr_wr.wr.fast_reg.page_list_len = page_no;
-	frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
-	frmr_wr.wr.fast_reg.length = page_no << PAGE_SHIFT;
-	if (frmr_wr.wr.fast_reg.length < len) {
-		while (seg1->mr_nsegs--)
-			rpcrdma_unmap_one(ia, seg++);
-		return -EIO;
+		__func__, mw, i);
+
+	frmr->fr_state = FRMR_IS_VALID;
+
+	memset(&fastreg_wr, 0, sizeof(fastreg_wr));
+	fastreg_wr.wr_id = (unsigned long)(void *)mw;
+	fastreg_wr.opcode = IB_WR_FAST_REG_MR;
+	fastreg_wr.wr.fast_reg.iova_start = seg1->mr_dma;
+	fastreg_wr.wr.fast_reg.page_list = frmr->fr_pgl;
+	fastreg_wr.wr.fast_reg.page_list_len = page_no;
+	fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
+	fastreg_wr.wr.fast_reg.length = page_no << PAGE_SHIFT;
+	if (fastreg_wr.wr.fast_reg.length < len) {
+		rc = -EIO;
+		goto out_err;
 	}
 
 	/* Bump the key */
-	key = (u8)(seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey & 0x000000FF);
-	ib_update_fast_reg_key(seg1->mr_chunk.rl_mw->r.frmr.fr_mr, ++key);
+	key = (u8)(mr->rkey & 0x000000FF);
+	ib_update_fast_reg_key(mr, ++key);
 
-	frmr_wr.wr.fast_reg.access_flags = (writing ?
+	fastreg_wr.wr.fast_reg.access_flags = (writing ?
 				IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
 				IB_ACCESS_REMOTE_READ);
-	frmr_wr.wr.fast_reg.rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
+	fastreg_wr.wr.fast_reg.rkey = mr->rkey;
 	DECR_CQCOUNT(&r_xprt->rx_ep);
 
-	rc = ib_post_send(ia->ri_id->qp, post_wr, &bad_wr);
-
+	rc = ib_post_send(ia->ri_id->qp, &fastreg_wr, &bad_wr);
 	if (rc) {
 		dprintk("RPC:       %s: failed ib_post_send for register,"
 			" status %i\n", __func__, rc);
-		while (i--)
-			rpcrdma_unmap_one(ia, --seg);
+		ib_update_fast_reg_key(mr, --key);
+		goto out_err;
 	} else {
-		seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
+		seg1->mr_rkey = mr->rkey;
 		seg1->mr_base = seg1->mr_dma + pageoff;
 		seg1->mr_nsegs = i;
 		seg1->mr_len = len;
 	}
 	*nsegs = i;
+	return 0;
+out_err:
+	frmr->fr_state = FRMR_IS_INVALID;
+	while (i--)
+		rpcrdma_unmap_one(ia, --seg);
 	return rc;
 }
 
@@ -1585,20 +1804,25 @@ rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg,
 	struct ib_send_wr invalidate_wr, *bad_wr;
 	int rc;
 
-	while (seg1->mr_nsegs--)
-		rpcrdma_unmap_one(ia, seg++);
+	seg1->mr_chunk.rl_mw->r.frmr.fr_state = FRMR_IS_INVALID;
 
 	memset(&invalidate_wr, 0, sizeof invalidate_wr);
 	invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
 	invalidate_wr.opcode = IB_WR_LOCAL_INV;
-	invalidate_wr.send_flags = IB_SEND_SIGNALED;
 	invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
 	DECR_CQCOUNT(&r_xprt->rx_ep);
 
+	read_lock(&ia->ri_qplock);
+	while (seg1->mr_nsegs--)
+		rpcrdma_unmap_one(ia, seg++);
 	rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
-	if (rc)
+	read_unlock(&ia->ri_qplock);
+	if (rc) {
+		/* Force rpcrdma_buffer_get() to retry */
+		seg1->mr_chunk.rl_mw->r.frmr.fr_state = FRMR_IS_STALE;
 		dprintk("RPC:       %s: failed ib_post_send for invalidate,"
 			" status %i\n", __func__, rc);
+	}
 	return rc;
 }
 
@@ -1656,8 +1880,10 @@ rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg,
 
 	list_add(&seg1->mr_chunk.rl_mw->r.fmr->list, &l);
 	rc = ib_unmap_fmr(&l);
+	read_lock(&ia->ri_qplock);
 	while (seg1->mr_nsegs--)
 		rpcrdma_unmap_one(ia, seg++);
+	read_unlock(&ia->ri_qplock);
 	if (rc)
 		dprintk("RPC:       %s: failed ib_unmap_fmr,"
 			" status %i\n", __func__, rc);
@@ -1673,7 +1899,6 @@ rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
 
 	switch (ia->ri_memreg_strategy) {
 
-#if RPCRDMA_PERSISTENT_REGISTRATION
 	case RPCRDMA_ALLPHYSICAL:
 		rpcrdma_map_one(ia, seg, writing);
 		seg->mr_rkey = ia->ri_bind_mem->rkey;
@@ -1681,7 +1906,6 @@ rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
 		seg->mr_nsegs = 1;
 		nsegs = 1;
 		break;
-#endif
 
 	/* Registration using frmr registration */
 	case RPCRDMA_FRMR:
@@ -1711,11 +1935,11 @@ rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
 
 	switch (ia->ri_memreg_strategy) {
 
-#if RPCRDMA_PERSISTENT_REGISTRATION
 	case RPCRDMA_ALLPHYSICAL:
+		read_lock(&ia->ri_qplock);
 		rpcrdma_unmap_one(ia, seg);
+		read_unlock(&ia->ri_qplock);
 		break;
-#endif
 
 	case RPCRDMA_FRMR:
 		rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt);
@@ -1809,3 +2033,44 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
 			rc);
 	return rc;
 }
+
+/* Physical mapping means one Read/Write list entry per-page.
+ * All list entries must fit within an inline buffer
+ *
+ * NB: The server must return a Write list for NFS READ,
+ *     which has the same constraint. Factor in the inline
+ *     rsize as well.
+ */
+static size_t
+rpcrdma_physical_max_payload(struct rpcrdma_xprt *r_xprt)
+{
+	struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
+	unsigned int inline_size, pages;
+
+	inline_size = min_t(unsigned int,
+			    cdata->inline_wsize, cdata->inline_rsize);
+	inline_size -= RPCRDMA_HDRLEN_MIN;
+	pages = inline_size / sizeof(struct rpcrdma_segment);
+	return pages << PAGE_SHIFT;
+}
+
+static size_t
+rpcrdma_mr_max_payload(struct rpcrdma_xprt *r_xprt)
+{
+	return RPCRDMA_MAX_DATA_SEGS << PAGE_SHIFT;
+}
+
+size_t
+rpcrdma_max_payload(struct rpcrdma_xprt *r_xprt)
+{
+	size_t result;
+
+	switch (r_xprt->rx_ia.ri_memreg_strategy) {
+	case RPCRDMA_ALLPHYSICAL:
+		result = rpcrdma_physical_max_payload(r_xprt);
+		break;
+	default:
+		result = rpcrdma_mr_max_payload(r_xprt);
+	}
+	return result;
+}
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 89e7cd479705..c419498b8f46 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -59,6 +59,7 @@
  * Interface Adapter -- one per transport instance
  */
 struct rpcrdma_ia {
+	rwlock_t		ri_qplock;
 	struct rdma_cm_id 	*ri_id;
 	struct ib_pd		*ri_pd;
 	struct ib_mr		*ri_bind_mem;
@@ -98,6 +99,14 @@ struct rpcrdma_ep {
 #define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit)
 #define DECR_CQCOUNT(ep) atomic_sub_return(1, &(ep)->rep_cqcount)
 
+enum rpcrdma_chunktype {
+	rpcrdma_noch = 0,
+	rpcrdma_readch,
+	rpcrdma_areadch,
+	rpcrdma_writech,
+	rpcrdma_replych
+};
+
 /*
  * struct rpcrdma_rep -- this structure encapsulates state required to recv
  * and complete a reply, asychronously. It needs several pieces of
@@ -137,6 +146,40 @@ struct rpcrdma_rep {
 };
 
 /*
+ * struct rpcrdma_mw - external memory region metadata
+ *
+ * An external memory region is any buffer or page that is registered
+ * on the fly (ie, not pre-registered).
+ *
+ * Each rpcrdma_buffer has a list of free MWs anchored in rb_mws. During
+ * call_allocate, rpcrdma_buffer_get() assigns one to each segment in
+ * an rpcrdma_req. Then rpcrdma_register_external() grabs these to keep
+ * track of registration metadata while each RPC is pending.
+ * rpcrdma_deregister_external() uses this metadata to unmap and
+ * release these resources when an RPC is complete.
+ */
+enum rpcrdma_frmr_state {
+	FRMR_IS_INVALID,	/* ready to be used */
+	FRMR_IS_VALID,		/* in use */
+	FRMR_IS_STALE,		/* failed completion */
+};
+
+struct rpcrdma_frmr {
+	struct ib_fast_reg_page_list	*fr_pgl;
+	struct ib_mr			*fr_mr;
+	enum rpcrdma_frmr_state		fr_state;
+};
+
+struct rpcrdma_mw {
+	union {
+		struct ib_fmr		*fmr;
+		struct rpcrdma_frmr	frmr;
+	} r;
+	struct list_head	mw_list;
+	struct list_head	mw_all;
+};
+
+/*
  * struct rpcrdma_req -- structure central to the request/reply sequence.
  *
  * N of these are associated with a transport instance, and stored in
@@ -163,17 +206,7 @@ struct rpcrdma_rep {
 struct rpcrdma_mr_seg {		/* chunk descriptors */
 	union {				/* chunk memory handles */
 		struct ib_mr	*rl_mr;		/* if registered directly */
-		struct rpcrdma_mw {		/* if registered from region */
-			union {
-				struct ib_fmr	*fmr;
-				struct {
-					struct ib_fast_reg_page_list *fr_pgl;
-					struct ib_mr *fr_mr;
-					enum { FRMR_IS_INVALID, FRMR_IS_VALID  } state;
-				} frmr;
-			} r;
-			struct list_head mw_list;
-		} *rl_mw;
+		struct rpcrdma_mw *rl_mw;	/* if registered from region */
 	} mr_chunk;
 	u64		mr_base;	/* registration result */
 	u32		mr_rkey;	/* registration result */
@@ -191,6 +224,7 @@ struct rpcrdma_req {
 	unsigned int	rl_niovs;	/* 0, 2 or 4 */
 	unsigned int	rl_nchunks;	/* non-zero if chunks */
 	unsigned int	rl_connect_cookie;	/* retry detection */
+	enum rpcrdma_chunktype	rl_rtype, rl_wtype;
 	struct rpcrdma_buffer *rl_buffer; /* home base for this structure */
 	struct rpcrdma_rep	*rl_reply;/* holder for reply buffer */
 	struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];/* chunk segments */
@@ -214,6 +248,7 @@ struct rpcrdma_buffer {
 	atomic_t	rb_credits;	/* most recent server credits */
 	int		rb_max_requests;/* client max requests */
 	struct list_head rb_mws;	/* optional memory windows/fmrs/frmrs */
+	struct list_head rb_all;
 	int		rb_send_index;
 	struct rpcrdma_req	**rb_send_bufs;
 	int		rb_recv_index;
@@ -306,7 +341,7 @@ int rpcrdma_ep_create(struct rpcrdma_ep *, struct rpcrdma_ia *,
 				struct rpcrdma_create_data_internal *);
 void rpcrdma_ep_destroy(struct rpcrdma_ep *, struct rpcrdma_ia *);
 int rpcrdma_ep_connect(struct rpcrdma_ep *, struct rpcrdma_ia *);
-int rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *);
+void rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *);
 
 int rpcrdma_ep_post(struct rpcrdma_ia *, struct rpcrdma_ep *,
 				struct rpcrdma_req *);
@@ -346,7 +381,9 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *);
 /*
  * RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c
  */
+ssize_t rpcrdma_marshal_chunks(struct rpc_rqst *, ssize_t);
 int rpcrdma_marshal_req(struct rpc_rqst *);
+size_t rpcrdma_max_payload(struct rpcrdma_xprt *);
 
 /* Temporary NFS request map cache. Created in svc_rdma.c  */
 extern struct kmem_cache *svc_rdma_map_cachep;
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index be8bbd5d65ec..43cd89eacfab 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -594,6 +594,7 @@ static int xs_local_send_request(struct rpc_task *task)
 	}
 
 	switch (status) {
+	case -ENOBUFS:
 	case -EAGAIN:
 		status = xs_nospace(task);
 		break;
@@ -661,6 +662,7 @@ static int xs_udp_send_request(struct rpc_task *task)
 		dprintk("RPC:       sendmsg returned unrecognized error %d\n",
 			-status);
 	case -ENETUNREACH:
+	case -ENOBUFS:
 	case -EPIPE:
 	case -ECONNREFUSED:
 		/* When the server has died, an ICMP port unreachable message
@@ -758,6 +760,7 @@ static int xs_tcp_send_request(struct rpc_task *task)
 		status = -ENOTCONN;
 		/* Should we call xs_close() here? */
 		break;
+	case -ENOBUFS:
 	case -EAGAIN:
 		status = xs_nospace(task);
 		break;
@@ -1946,6 +1949,7 @@ static int xs_local_setup_socket(struct sock_xprt *transport)
 		dprintk("RPC:       xprt %p connected to %s\n",
 				xprt, xprt->address_strings[RPC_DISPLAY_ADDR]);
 		xprt_set_connected(xprt);
+	case -ENOBUFS:
 		break;
 	case -ENOENT:
 		dprintk("RPC:       xprt %p: socket %s does not exist\n",
@@ -2281,6 +2285,7 @@ static void xs_tcp_setup_socket(struct work_struct *work)
 	case -ECONNREFUSED:
 	case -ECONNRESET:
 	case -ENETUNREACH:
+	case -ENOBUFS:
 		/* retry with existing socket, after a delay */
 		goto out;
 	}
@@ -3054,12 +3059,12 @@ static int param_set_uint_minmax(const char *val,
 		const struct kernel_param *kp,
 		unsigned int min, unsigned int max)
 {
-	unsigned long num;
+	unsigned int num;
 	int ret;
 
 	if (!val)
 		return -EINVAL;
-	ret = strict_strtoul(val, 0, &num);
+	ret = kstrtouint(val, 0, &num);
 	if (ret == -EINVAL || num < min || num > max)
 		return -EINVAL;
 	*((unsigned int *)kp->arg) = num;